ref: 439e51bc11cd27569f205cda9809bb30986b6488
parent: ef1098be9ce373840e5c744b7c8fe533bdc5bfca
parent: cff49f5e45c3ccc849a5d7b6dfd6e0048859107b
author: zhilwang <zhilwang@cisco.com>
date: Fri Aug 8 05:19:51 EDT 2014
Merge pull request #1249 from dongzha/addArm32SCCNew add arm 32/64 code and UT for SVC SCC motion estimation
--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@@ -45,6 +45,8 @@
4CE4472918BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446F818BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp */; };
4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446FA18BC605C0017DF25 /* wels_preprocess.cpp */; };
4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4470618BC605C0017DF25 /* welsEncoderExt.cpp */; };
+ 6CA38DA31991CACE003EAAE0 /* svc_motion_estimation.S in Sources */ = {isa = PBXBuildFile; fileRef = 6CA38DA21991CACE003EAAE0 /* svc_motion_estimation.S */; };
+ 6CA38DA51991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 6CA38DA41991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S */; };
9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */; };
9AED66661946A2B3009A3567 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66651946A2B3009A3567 /* utils.cpp */; };
F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */; };
@@ -154,6 +156,8 @@
4CE446FE18BC605C0017DF25 /* welsEncoderExt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = welsEncoderExt.h; sourceTree = "<group>"; };
4CE4470418BC605C0017DF25 /* wels_enc_export.def */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = wels_enc_export.def; sourceTree = "<group>"; };
4CE4470618BC605C0017DF25 /* welsEncoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsEncoderExt.cpp; sourceTree = "<group>"; };
+ 6CA38DA21991CACE003EAAE0 /* svc_motion_estimation.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = svc_motion_estimation.S; sourceTree = "<group>"; };
+ 6CA38DA41991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = svc_motion_estimation_aarch64_neon.S; path = arm64/svc_motion_estimation_aarch64_neon.S; sourceTree = "<group>"; };
9AED664819469FAF009A3567 /* welsCodecTrace.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = welsCodecTrace.h; path = ../../../common/inc/welsCodecTrace.h; sourceTree = "<group>"; };
9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = welsCodecTrace.cpp; path = ../../../common/src/welsCodecTrace.cpp; sourceTree = "<group>"; };
9AED66651946A2B3009A3567 /* utils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = utils.cpp; path = ../../../common/src/utils.cpp; sourceTree = "<group>"; };
@@ -177,6 +181,7 @@
4C34066418C57D0400DFA14A /* arm */ = {
isa = PBXGroup;
children = (
+ 6CA38DA21991CACE003EAAE0 /* svc_motion_estimation.S */,
4C34066618C57D0400DFA14A /* intra_pred_neon.S */,
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */,
4C34066918C57D0400DFA14A /* memory_neon.S */,
@@ -189,6 +194,7 @@
4CB8F2B219235FAC005D6386 /* arm64 */ = {
isa = PBXGroup;
children = (
+ 6CA38DA41991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S */,
F5BE8004196B913200ED02ED /* memory_aarch64_neon.S */,
F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */,
4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */,
@@ -423,6 +429,7 @@
4CE4471D18BC605C0017DF25 /* property.cpp in Sources */,
4CE4471018BC605C0017DF25 /* decode_mb_aux.cpp in Sources */,
4CE4472018BC605C0017DF25 /* sample.cpp in Sources */,
+ 6CA38DA31991CACE003EAAE0 /* svc_motion_estimation.S in Sources */,
4CE4471318BC605C0017DF25 /* encoder_data_tables.cpp in Sources */,
4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */,
9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */,
@@ -455,6 +462,7 @@
4CE4471218BC605C0017DF25 /* encoder.cpp in Sources */,
4CE4471618BC605C0017DF25 /* get_intra_predictor.cpp in Sources */,
4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */,
+ 6CA38DA51991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S in Sources */,
4CE4471418BC605C0017DF25 /* encoder_ext.cpp in Sources */,
4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */,
);
--- /dev/null
+++ b/codec/encoder/core/arm/svc_motion_estimation.S
@@ -1,0 +1,168 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+
+// int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride)
+// Adds up the 64 bytes of one 8x8 block: eight rows of 8 bytes, kiRefStride bytes apart.
+// In:  r0 = pRef, r1 = kiRefStride (argument order of the C prototype).
+// Out: r0 = sum. r1 and d0-d7 are overwritten.
+WELS_ASM_FUNC_BEGIN SumOf8x8SingleBlock_neon
+ vld1.64 {d0}, [r0], r1 // load one 8-byte row, then step r0 by the stride
+ vld1.64 {d1}, [r0], r1
+ vld1.64 {d2}, [r0], r1
+ vld1.64 {d3}, [r0], r1
+ vld1.64 {d4}, [r0], r1
+ vld1.64 {d5}, [r0], r1
+ vld1.64 {d6}, [r0], r1
+ vld1.64 {d7}, [r0] // last row: no post-increment
+ vpaddl.u8 q0, q0 // rows 0-1: widen byte pairs into eight u16 partial sums
+ vpadal.u8 q0, q1 // accumulate rows 2-3
+ vpadal.u8 q0, q2 // accumulate rows 4-5
+ vpadal.u8 q0, q3 // accumulate rows 6-7
+
+ vpaddl.u16 q0, q0 // eight u16 -> four u32
+ vpadd.i32 d0, d1 // four u32 -> two u32
+ vpadd.i32 d0, d0 // both lanes of d0 now hold the total
+ vmov r0, r1, d0 // r0 = low lane (the return value), r1 = high lane
+WELS_ASM_FUNC_END
+
+
+// int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride)
+// Adds up the 256 bytes of one 16x16 block: sixteen rows of 16 bytes, kiRefStride bytes apart.
+// In:  r0 = pRef, r1 = kiRefStride (argument order of the C prototype).
+// Out: r0 = sum. r1, q0 and q1 are overwritten.
+WELS_ASM_FUNC_BEGIN SumOf16x16SingleBlock_neon
+ vld1.64 {q0}, [r0], r1 // row 0
+ vpaddl.u8 q0, q0 // widen byte pairs into eight u16 partial sums
+.rept 15
+ vld1.64 {q1}, [r0], r1
+ vpadal.u8 q0, q1
+.endr
+ vpaddl.u16 q0, q0 // eight u16 -> four u32
+ vpadd.i32 d0, d1 // four u32 -> two u32
+ vpadd.i32 d0, d0 // both lanes of d0 now hold the total
+ vmov r0, r1, d0 // r0 = low lane (the return value), r1 = high lane
+WELS_ASM_FUNC_END
+
+
+// For every position (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight this routine sums the
+// 8x8 block starting at pRefPicture + y * kiRefStride + x, stores the low 16 bits of the sum to
+// pFeatureOfBlock[y * kiWidth + x] and increments pTimesOfFeatureValue[sum].
+// In: r0 = pRefPicture, r1 = kiWidth, r2 = kiHeight, r3 = kiRefStride, the last two arguments
+//     are read from the stack.
+// Both loops decrement their counter and test it afterwards, so kiWidth and kiHeight are
+// expected to be at least 1.
+WELS_ASM_FUNC_BEGIN SumOf8x8BlockOfFrame_neon
+//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+ stmdb sp!, {r4-r8} // five registers saved = 20 bytes, hence the +20/+24 offsets below
+ ldr r5, [sp, #24] // r5 = pTimesOfFeatureValue
+ ldr r4, [sp, #20] // r4 = pFeatureOfBlock
+
+ mov r8, r0
+ mov r6, r1 // r6 = kiWidth, kept for every row
+ add r8, r6 // r8 = end of the current reference row span (row start + kiWidth)
+ add r4, r6, lsl #1 // r4 = end of the current output row (uint16_t entries)
+
+_height_loop8x8:
+ mov r7, r6 // r7 counts kiWidth..1, so column index = kiWidth - r7
+_width_loop8x8:
+ subs r0, r8, r7 // r0 = address of the top-left byte of this block
+ vld1.64 {d0}, [r0], r3
+ vld1.64 {d1}, [r0], r3
+ vld1.64 {d2}, [r0], r3
+ vld1.64 {d3}, [r0], r3
+ vld1.64 {d4}, [r0], r3
+ vld1.64 {d5}, [r0], r3
+ vld1.64 {d6}, [r0], r3
+ vld1.64 {d7}, [r0]
+
+ vpaddl.u8 q0, q0
+ vpadal.u8 q0, q1
+ vpadal.u8 q0, q2
+ vpadal.u8 q0, q3
+
+ vpaddl.u16 q0, q0
+ vpadd.i32 d0, d1
+ vpadd.i32 d0, d0 // both lanes of d0 = block sum
+
+ subs r1, r4, r7, lsl #1 // r1 = address of the output entry for this column
+ vst1.16 {d0[0]}, [r1] // sum -> pFeatureOfBlock[i]
+ vmov r0, r1, d0 // r0 = sum (r1 is reused right below)
+ add r1, r5, r0, lsl #2 // r1 = &pTimesOfFeatureValue[sum]
+ ldr r0, [r1]
+ add r0, #1
+ str r0, [r1] // histogram counter incremented
+
+ subs r7, #1
+ bne _width_loop8x8
+
+ add r8, r3 // next reference row
+ add r4, r6, lsl #1 // next output row
+ subs r2, #1 // r2 = remaining rows
+ bne _height_loop8x8
+
+ ldmia sp!, {r4-r8}
+WELS_ASM_FUNC_END
+
+// For every position (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight this routine sums the
+// 16x16 block starting at pRefPicture + y * kiRefStride + x, stores the low 16 bits of the sum
+// to pFeatureOfBlock[y * kiWidth + x] and increments pTimesOfFeatureValue[sum].
+// In: r0 = pRefPicture, r1 = kiWidth, r2 = kiHeight, r3 = kiRefStride, the last two arguments
+//     are read from the stack.
+// Both loops decrement their counter and test it afterwards, so kiWidth and kiHeight are
+// expected to be at least 1.
+WELS_ASM_FUNC_BEGIN SumOf16x16BlockOfFrame_neon
+//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+ stmdb sp!, {r4-r8} // five registers saved = 20 bytes, hence the +20/+24 offsets below
+ ldr r5, [sp, #24] // r5 = pTimesOfFeatureValue
+ ldr r4, [sp, #20] // r4 = pFeatureOfBlock
+
+ mov r8, r0
+ mov r6, r1 // r6 = kiWidth, kept for every row
+ add r8, r6 // r8 = end of the current reference row span (row start + kiWidth)
+ add r4, r6, lsl #1 // r4 = end of the current output row (uint16_t entries)
+
+_height_loop16x16:
+ mov r7, r6 // r7 counts kiWidth..1, so column index = kiWidth - r7
+_width_loop16x16:
+ subs r0, r8, r7 // r0 = address of the top-left byte of this block
+ vld1.64 {q0}, [r0], r3
+ vpaddl.u8 q0, q0
+.rept 15
+ vld1.64 {q1}, [r0], r3
+ vpadal.u8 q0, q1
+.endr
+ vpaddl.u16 q0, q0
+ vpadd.i32 d0, d1
+ vpadd.i32 d0, d0 // both lanes of d0 = block sum
+
+ subs r1, r4, r7, lsl #1 // r1 = address of the output entry for this column
+ vst1.16 {d0[0]}, [r1] // sum -> pFeatureOfBlock[i]
+ vmov r0, r1, d0 // r0 = sum (r1 is reused right below)
+ add r1, r5, r0, lsl #2 // r1 = &pTimesOfFeatureValue[sum]
+ ldr r0, [r1]
+ add r0, #1
+ str r0, [r1] // histogram counter incremented
+
+ subs r7, #1
+ bne _width_loop16x16
+
+ add r8, r3 // next reference row
+ add r4, r6, lsl #1 // next output row
+ subs r2, #1 // r2 = remaining rows
+ bne _height_loop16x16
+
+ ldmia sp!, {r4-r8}
+WELS_ASM_FUNC_END
+#endif
\ No newline at end of file
--- /dev/null
+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
@@ -1,0 +1,151 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+// int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride)
+// Adds up the 64 bytes of one 8x8 block: eight rows of 8 bytes, kiRefStride bytes apart.
+// In:  x0 = pRef, w1 = kiRefStride.
+// Out: x0 = sum. x1 and v0-v3 are overwritten.
+WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8SingleBlock_AArch64_neon
+    // kiRefStride is a 32-bit argument: only w1 is defined on entry, so widen it before
+    // using x1 as the 64-bit post-index register.
+    sxtw    x1, w1
+    ld1     {v0.d}[0], [x0], x1     // two rows per vector register
+    ld1     {v0.d}[1], [x0], x1
+    ld1     {v1.d}[0], [x0], x1
+    ld1     {v1.d}[1], [x0], x1
+    ld1     {v2.d}[0], [x0], x1
+    ld1     {v2.d}[1], [x0], x1
+    ld1     {v3.d}[0], [x0], x1
+    ld1     {v3.d}[1], [x0]         // last row: no post-increment
+    uaddlp  v0.8h, v0.16b           // widen byte pairs into eight u16 partial sums
+    uadalp  v0.8h, v1.16b
+    uadalp  v0.8h, v2.16b
+    uadalp  v0.8h, v3.16b
+    uaddlv  s0, v0.8h               // add the eight u16 lanes into one 32-bit total
+    mov     x0, v0.d[0]             // return value
+WELS_ASM_AARCH64_FUNC_END
+
+// int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride)
+// Adds up the 256 bytes of one 16x16 block: sixteen rows of 16 bytes, kiRefStride bytes apart.
+// In:  x0 = pRef, w1 = kiRefStride.
+// Out: x0 = sum. x1, v0 and v1 are overwritten.
+WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16SingleBlock_AArch64_neon
+    // kiRefStride is a 32-bit argument: only w1 is defined on entry, so widen it before
+    // using x1 as the 64-bit post-index register.
+    sxtw    x1, w1
+    ld1     {v0.16b}, [x0], x1      // row 0
+    uaddlp  v0.8h, v0.16b           // widen byte pairs into eight u16 partial sums
+.rept 15
+    ld1     {v1.16b}, [x0], x1
+    uadalp  v0.8h, v1.16b
+.endr
+    uaddlv  s0, v0.8h               // add the eight u16 lanes into one 32-bit total
+    mov     x0, v0.d[0]             // return value
+WELS_ASM_AARCH64_FUNC_END
+
+// For every position (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight this routine sums the
+// 8x8 block starting at pRefPicture + y * kiRefStride + x, stores the low 16 bits of the sum to
+// pFeatureOfBlock[y * kiWidth + x] and increments pTimesOfFeatureValue[sum].
+// Both loops decrement their counter and test it afterwards, so kiWidth and kiHeight are
+// expected to be at least 1.
+WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8BlockOfFrame_AArch64_neon
+//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+    //x0: pRefPicture
+    //w1: kiWidth, w2: kiHeight, w3: kiRefStride
+    //x4: pFeatureOfBlock
+    //x5: pTimesOfFeatureValue
+
+    // The three int32_t arguments only define the low halves of x1-x3; widen them before
+    // they take part in 64-bit address arithmetic and loop counting.
+    sxtw    x1, w1
+    sxtw    x2, w2
+    sxtw    x3, w3
+
+    mov     x8, x0
+    mov     x6, x1                  // x6 = kiWidth, kept for every row
+    add     x8, x8, x6              // x8 = row start + kiWidth
+    add     x4, x4, x6, lsl #1      // x4 = end of the current output row (uint16_t entries)
+
+_height_loop8x8:
+    mov     x7, x6                  // x7 counts kiWidth..1, column index = kiWidth - x7
+_width_loop8x8:
+    subs    x0, x8, x7              // x0 = address of the top-left byte of this block
+    ld1     {v0.d}[0], [x0], x3
+    ld1     {v0.d}[1], [x0], x3
+    ld1     {v1.d}[0], [x0], x3
+    ld1     {v1.d}[1], [x0], x3
+    ld1     {v2.d}[0], [x0], x3
+    ld1     {v2.d}[1], [x0], x3
+    ld1     {v3.d}[0], [x0], x3
+    ld1     {v3.d}[1], [x0]
+    uaddlp  v0.8h, v0.16b
+    uadalp  v0.8h, v1.16b
+    uadalp  v0.8h, v2.16b
+    uadalp  v0.8h, v3.16b
+    uaddlv  s0, v0.8h               // s0 = block sum
+
+    subs    x1, x4, x7, lsl #1      // x1 = address of the output entry for this column
+    st1     {v0.h}[0], [x1]         // sum -> pFeatureOfBlock[i]
+    mov     w0, #0
+    ins     v0.s[1], w0             // clear lane 1 so the 64-bit read below is just the sum
+    mov     x0, v0.d[0]
+    add     x1, x5, x0, lsl #2      // x1 = &pTimesOfFeatureValue[sum]
+    ldr     w0, [x1]
+    add     w0, w0, #1
+    str     w0, [x1]                // histogram counter incremented
+    subs    x7, x7, #1
+    cbnz    x7, _width_loop8x8
+
+    add     x8, x8, x3              // next reference row
+    add     x4, x4, x6, lsl #1      // next output row
+    subs    x2, x2, #1              // x2 = remaining rows
+    cbnz    x2, _height_loop8x8
+
+WELS_ASM_AARCH64_FUNC_END
+
+// For every position (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight this routine sums the
+// 16x16 block starting at pRefPicture + y * kiRefStride + x, stores the low 16 bits of the sum
+// to pFeatureOfBlock[y * kiWidth + x] and increments pTimesOfFeatureValue[sum].
+// Both loops decrement their counter and test it afterwards, so kiWidth and kiHeight are
+// expected to be at least 1.
+WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16BlockOfFrame_AArch64_neon
+//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+    //x0: pRefPicture
+    //w1: kiWidth, w2: kiHeight, w3: kiRefStride
+    //x4: pFeatureOfBlock
+    //x5: pTimesOfFeatureValue
+
+    // The three int32_t arguments only define the low halves of x1-x3; widen them before
+    // they take part in 64-bit address arithmetic and loop counting.
+    sxtw    x1, w1
+    sxtw    x2, w2
+    sxtw    x3, w3
+
+    mov     x8, x0
+    mov     x6, x1                  // x6 = kiWidth, kept for every row
+    add     x8, x8, x6              // x8 = row start + kiWidth
+    add     x4, x4, x6, lsl #1      // x4 = end of the current output row (uint16_t entries)
+
+_height_loop16x16:
+    mov     x7, x6                  // x7 counts kiWidth..1, column index = kiWidth - x7
+_width_loop16x16:
+    subs    x0, x8, x7              // x0 = address of the top-left byte of this block
+    ld1     {v0.16b}, [x0], x3
+    uaddlp  v0.8h, v0.16b
+.rept 15
+    ld1     {v1.16b}, [x0], x3
+    uadalp  v0.8h, v1.16b
+.endr
+    uaddlv  s0, v0.8h               // s0 = block sum
+
+    subs    x1, x4, x7, lsl #1      // x1 = address of the output entry for this column
+    st1     {v0.h}[0], [x1]         // sum -> pFeatureOfBlock[i]
+    mov     w0, #0
+    ins     v0.s[1], w0             // clear lane 1 so the 64-bit read below is just the sum
+    mov     x0, v0.d[0]
+    add     x1, x5, x0, lsl #2      // x1 = &pTimesOfFeatureValue[sum]
+    ldr     w0, [x1]
+    add     w0, w0, #1
+    str     w0, [x1]                // histogram counter incremented
+    subs    x7, x7, #1
+    cbnz    x7, _width_loop16x16
+
+    add     x8, x8, x3              // next reference row
+    add     x4, x4, x6, lsl #1      // next output row
+    subs    x2, x2, #1              // x2 = remaining rows
+    cbnz    x2, _height_loop16x16
+WELS_ASM_AARCH64_FUNC_END
+#endif
\ No newline at end of file
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -244,6 +244,33 @@
void SumOf16x16BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
const int32_t kiRefStride,
uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+#ifdef HAVE_NEON
+// ARM NEON assembly versions of the block-sum routines (core/arm/svc_motion_estimation.S).
+// Declared with C linkage so the names match the symbols emitted by the assembly file.
+extern "C"
+{
+// Sum of the pixels of one 8x8 / 16x16 block whose rows are kiRefStride bytes apart.
+int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
+int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
+// Block sum for every position of a kiWidth x kiHeight grid: each sum is written to
+// pFeatureOfBlock and counted in pTimesOfFeatureValue[sum].
+void SumOf8x8BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+ const int32_t kiRefStride,
+ uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+void SumOf16x16BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+ const int32_t kiRefStride,
+ uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+}
+#endif
+
+#ifdef HAVE_NEON_AARCH64
+// AArch64 NEON assembly versions of the block-sum routines
+// (core/arm64/svc_motion_estimation_aarch64_neon.S).
+// Declared with C linkage so the names match the symbols emitted by the assembly file.
+extern "C"
+{
+// Sum of the pixels of one 8x8 / 16x16 block whose rows are kiRefStride bytes apart.
+int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
+int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
+// Block sum for every position of a kiWidth x kiHeight grid: each sum is written to
+// pFeatureOfBlock and counted in pTimesOfFeatureValue[sum].
+void SumOf8x8BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+ const int32_t kiRefStride,
+ uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+void SumOf16x16BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+ const int32_t kiRefStride,
+ uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+}
+#endif
int32_t RequestScreenBlockFeatureStorage (CMemoryAlign* pMa, const int32_t kiFrameWidth, const int32_t kiFrameHeight,
const int32_t iNeedFeatureStorage,
SScreenBlockFeatureStorage* pScreenBlockFeatureStorage);
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -102,6 +102,23 @@
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
+#if defined (HAVE_NEON)
+ // ARM NEON build: point the feature-search entries at the NEON assembly routines,
+ // replacing the C single-block functions assigned just above.
+ // NOTE(review): the choice here is compile-time only; no runtime CPU-capability test is
+ // visible in this hunk - confirm the enclosing code guarantees NEON is available.
+ //for feature search
+ pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
+ pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
+ //TODO: could widths that are a multiple of 8 be handled separately to speed those cases up?
+ pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon;
+ pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon;
+#endif
+
+#if defined (HAVE_NEON_AARCH64)
+ // AArch64 NEON build: point the feature-search entries at the AArch64 assembly routines,
+ // replacing whatever was assigned to the same slots earlier in this function.
+ // NOTE(review): the choice here is compile-time only; no runtime CPU-capability test is
+ // visible in this hunk - confirm the enclosing code guarantees NEON is available.
+ //for feature search
+ pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
+ pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
+ //TODO: could widths that are a multiple of 8 be handled separately to speed those cases up?
+ pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon;
+ pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon;
+#endif
}
}
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -53,6 +53,7 @@
$(ENCODER_SRCDIR)/core/arm/memory_neon.S\
$(ENCODER_SRCDIR)/core/arm/pixel_neon.S\
$(ENCODER_SRCDIR)/core/arm/reconstruct_neon.S\
+ $(ENCODER_SRCDIR)/core/arm/svc_motion_estimation.S\
ENCODER_OBJS += $(ENCODER_ASM_ARM_SRCS:.S=.$(OBJ))
endif
@@ -64,6 +65,7 @@
$(ENCODER_SRCDIR)/core/arm64/memory_aarch64_neon.S\
$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\
$(ENCODER_SRCDIR)/core/arm64/reconstruct_aarch64_neon.S\
+ $(ENCODER_SRCDIR)/core/arm64/svc_motion_estimation_aarch64_neon.S\
ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))
endif
--- /dev/null
+++ b/test/encoder/EncUT_SVC_me.cpp
@@ -1,0 +1,157 @@
+#include <gtest/gtest.h>
+#include <math.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "cpu_core.h"
+#include "cpu.h"
+#include "macros.h"
+#include "svc_motion_estimate.h"
+
+using namespace WelsSVCEnc;
+#define SVC_ME_TEST_NUM 10
+// Overwrites p[0 .. Len-1] with pseudo-random bytes, one rand() call per byte in index order.
+static void FillWithRandomData (uint8_t* p, int32_t Len) {
+  int32_t iIdx = 0;
+  while (iIdx < Len) {
+    p[iIdx] = rand() % 256;
+    ++iIdx;
+  }
+}
+
+//preprocess related
+// Reference implementation: sum of the 64 pixels of the 8x8 block at pRef, rows being
+// kiRefStride bytes apart.
+int32_t SumOf8x8SingleBlock_ref (uint8_t* pRef, const int32_t kiRefStride) {
+  int32_t iTotal = 0;
+  const uint8_t* pRow = pRef;
+  for (int32_t iRow = 0; iRow < 8; ++iRow) {
+    for (int32_t iCol = 0; iCol < 8; ++iCol) {
+      iTotal += pRow[iCol];
+    }
+    pRow += kiRefStride;
+  }
+  return iTotal;
+}
+// Reference implementation: sum of the 256 pixels of the 16x16 block at pRef, rows being
+// kiRefStride bytes apart.
+int32_t SumOf16x16SingleBlock_ref (uint8_t* pRef, const int32_t kiRefStride) {
+  int32_t iTotal = 0;
+  const uint8_t* pRow = pRef;
+  for (int32_t iRow = 0; iRow < 16; ++iRow) {
+    for (int32_t iCol = 0; iCol < 16; ++iCol) {
+      iTotal += pRow[iCol];
+    }
+    pRow += kiRefStride;
+  }
+  return iTotal;
+}
+
+// Reference implementation of the frame-level 8x8 feature calculation.
+// For every (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight the 8x8 block sum at
+// pRefPicture + y * kiRefStride + x is stored to pFeatureOfBlock[y * kiWidth + x] and counted
+// in pTimesOfFeatureValue[sum].
+// The per-block sum comes from SumOf8x8SingleBlock_ref so that this reference does not depend
+// on the implementation under test.
+void SumOf8x8BlockOfFrame_ref (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                               const int32_t kiRefStride,
+                               uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]) {
+  for (int32_t y = 0; y < kiHeight; y++) {
+    uint8_t* pRef = pRefPicture + kiRefStride * y;
+    uint16_t* pBuffer = pFeatureOfBlock + kiWidth * y;
+    for (int32_t x = 0; x < kiWidth; x++) {
+      const int32_t iSum = SumOf8x8SingleBlock_ref (pRef + x, kiRefStride);
+
+      pBuffer[x] = static_cast<uint16_t> (iSum);
+      pTimesOfFeatureValue[iSum]++;
+    }
+  }
+}
+
+// Reference implementation of the frame-level 16x16 feature calculation.
+// For every (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight the 16x16 block sum at
+// pRefPicture + y * kiRefStride + x is stored to pFeatureOfBlock[y * kiWidth + x] and counted
+// in pTimesOfFeatureValue[sum].
+// The per-block sum comes from SumOf16x16SingleBlock_ref so that this reference does not
+// depend on the implementation under test.
+void SumOf16x16BlockOfFrame_ref (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                                 const int32_t kiRefStride,
+                                 uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]) {
+  //TODO: this is similar to SumOf8x8BlockOfFrame_ref except for the single block func called, refactor-able?
+  for (int32_t y = 0; y < kiHeight; y++) {
+    uint8_t* pRef = pRefPicture + kiRefStride * y;
+    uint16_t* pBuffer = pFeatureOfBlock + kiWidth * y;
+    for (int32_t x = 0; x < kiWidth; x++) {
+      const int32_t iSum = SumOf16x16SingleBlock_ref (pRef + x, kiRefStride);
+
+      pBuffer[x] = static_cast<uint16_t> (iSum);
+      pTimesOfFeatureValue[iSum]++;
+    }
+  }
+}
+
+// Generates a gtest case named after `method` that compares `method` against `anchor`.
+// Each of the SVC_ME_TEST_NUM rounds refills a 16-row, 320-byte-stride buffer with random
+// bytes, calls both functions on it with stride 320 and requires equal results.
+#define GENERATE_SumOfSingleBlock(anchor, method) \
+TEST (SVC_ME_FunTest, method) {\
+ ENFORCE_STACK_ALIGN_1D (uint8_t, uiRefBuf, 16*320, 16);\
+ int32_t iRes[2];\
+ for (int32_t k = 0; k < SVC_ME_TEST_NUM; k++) {\
+ FillWithRandomData (uiRefBuf,16*320);\
+ iRes[0] = anchor (uiRefBuf,320);\
+ iRes[1] = method (uiRefBuf,320);\
+ ASSERT_EQ (iRes[0], iRes[1]);\
+ }\
+}
+
+// Single-block tests: the C implementations against the local references.
+GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_c)
+GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_c)
+
+// Same comparison for the ARM NEON routines, only when built with HAVE_NEON.
+#ifdef HAVE_NEON
+GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_neon)
+GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_neon)
+#endif
+
+// Same comparison for the AArch64 NEON routines, only when built with HAVE_NEON_AARCH64.
+#ifdef HAVE_NEON_AARCH64
+GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_AArch64_neon)
+GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_AArch64_neon)
+#endif
+
+
+// Declares two variables: _nbuff, a new[]-allocated array of (_sz + _al - 1) elements of
+// type _tp, and _nm, a pointer into that array adjusted so that its address is a multiple
+// of _al. The caller releases the storage with delete[] _nbuff.
+// The expansion already ends with ';', so invocations are written without one.
+// NOTE(review): the adjustment divides the byte misalignment by sizeof(_tp), which presumes
+// that misalignment is a multiple of sizeof(_tp) - confirm for any new element type.
+#define ENFORCE_NEW_ALIGN_1D(_tp, _nm, _nbuff, _sz, _al) \
+_tp *_nbuff = new _tp[(_sz)+(_al)-1]; \
+_tp *_nm = _nbuff + ((_al)-1) - (((uintptr_t)(_nbuff + ((_al)-1)) & ((_al)-1))/sizeof(_tp));
+
+// Generates a gtest case named <method>_<kiWidth>x<kiHeight> that runs `anchor` and `method`
+// on the same random reference picture and requires identical feature arrays and identical
+// feature histograms.
+// The reference picture has a stride of (kiWidth rounded up to a multiple of 16) + 16 bytes
+// and kiHeight + 16 rows, so every 16x16 block read stays inside the buffer.
+// Both 65536-entry histograms are heap-allocated (512 KiB in total) rather than placed on
+// the stack. A mismatch is reported with a non-fatal EXPECT_EQ and then stops the comparison,
+// so all buffers are always released before the test body returns.
+#define GENERATE_SumOfFrame(anchor, method, kiWidth, kiHeight) \
+TEST (SVC_ME_FunTest, method##_##kiWidth##x##kiHeight) {\
+  const int32_t iRefStride = ((((kiWidth)+15)>>4)<<4)+16;\
+  const int32_t iRefSize = ((kiHeight)+16)*iRefStride;\
+  const int32_t iBlockNum = (kiWidth)*(kiHeight);\
+  const int32_t iFeatureNum = 65536;\
+  ENFORCE_NEW_ALIGN_1D (uint8_t, pRefPicture, pRefPictureBuff, iRefSize, 16) \
+  ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock1, pFeatureOfBlockBuff1, iBlockNum, 16) \
+  ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock2, pFeatureOfBlockBuff2, iBlockNum, 16) \
+  uint32_t* pTimesOfFeatureValue1 = new uint32_t[iFeatureNum];\
+  uint32_t* pTimesOfFeatureValue2 = new uint32_t[iFeatureNum];\
+  bool bMatch = true;\
+  for (int32_t k = 0; bMatch && k < SVC_ME_TEST_NUM; k++) {\
+    FillWithRandomData (pRefPicture, iRefSize);\
+    memset (pTimesOfFeatureValue1, 0, iFeatureNum*sizeof(uint32_t));\
+    memset (pTimesOfFeatureValue2, 0, iFeatureNum*sizeof(uint32_t));\
+    anchor (pRefPicture, kiWidth, kiHeight, iRefStride, pFeatureOfBlock1, pTimesOfFeatureValue1);\
+    method (pRefPicture, kiWidth, kiHeight, iRefStride, pFeatureOfBlock2, pTimesOfFeatureValue2);\
+    for (int32_t j = 0; bMatch && j < iBlockNum; j++) {\
+      EXPECT_EQ (pFeatureOfBlock1[j], pFeatureOfBlock2[j]) << "feature mismatch at block index " << j;\
+      bMatch = (pFeatureOfBlock1[j] == pFeatureOfBlock2[j]);\
+    }\
+    for (int32_t j = 0; bMatch && j < iFeatureNum; j++) {\
+      EXPECT_EQ (pTimesOfFeatureValue1[j], pTimesOfFeatureValue2[j]) << "histogram mismatch at feature value " << j;\
+      bMatch = (pTimesOfFeatureValue1[j] == pTimesOfFeatureValue2[j]);\
+    }\
+  }\
+  delete[] pTimesOfFeatureValue2;\
+  delete[] pTimesOfFeatureValue1;\
+  delete[] pRefPictureBuff;\
+  delete[] pFeatureOfBlockBuff1;\
+  delete[] pFeatureOfBlockBuff2;\
+}
+
+// Frame-level tests for the C implementations at three grid sizes: 1x1, 1x320 and 640x320
+// (width x height of the position grid).
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 1)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 1)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 320)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 640, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 640, 320)
+
+// Same grid sizes for the ARM NEON routines, only when built with HAVE_NEON.
+#ifdef HAVE_NEON
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 1)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 1)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 320)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 640, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 640, 320)
+#endif
+
+// Same grid sizes for the AArch64 NEON routines, only when built with HAVE_NEON_AARCH64.
+#ifdef HAVE_NEON_AARCH64
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 1, 1)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 1)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 1, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 320)
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 640, 320)
+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 640, 320)
+#endif
--- a/test/encoder/targets.mk
+++ b/test/encoder/targets.mk
@@ -1,7 +1,7 @@
ENCODER_UNITTEST_SRCDIR=test/encoder
ENCODER_UNITTEST_CPP_SRCS=\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_DecodeMbAux.cpp\
- $(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderExt.cpp\
+ $(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderExt.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMb.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMbAux.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_ExpGolomb.cpp\
@@ -13,6 +13,7 @@
$(ENCODER_UNITTEST_SRCDIR)/EncUT_MotionEstimate.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_Reconstruct.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_Sample.cpp\
+ $(ENCODER_UNITTEST_SRCDIR)/EncUT_SVC_me.cpp\
ENCODER_UNITTEST_OBJS += $(ENCODER_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))