ref: e25a82b3d6f0ff687d67d771223b22f712b3d9c0
parent: 4b5e893fcc54ca46cab4335ed98a75ecf41fd3fc
parent: 0f95fac4caf6c9f285582a93be9bf5e3433c44ba
author: zhilwang <zhilwang@cisco.com>
date: Fri Aug 15 10:14:51 EDT 2014
Merge pull request #1279 from dongzha/NewAddARMHash add arm32/64 code for InitHash
--- a/codec/encoder/core/arm/svc_motion_estimation.S
+++ b/codec/encoder/core/arm/svc_motion_estimation.S
@@ -235,4 +235,133 @@
_SumOf16x16BlockOfFrame_neon_end:
ldmia sp!, {r4-r12}
WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN InitializeHashforFeature_neon
+// void (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize, uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList); arguments arrive in r0-r3 plus one stack slot. Entry i of both pointer lists receives pBuf + 4 * (sum of counts 0..i-1).
+ stmdb sp!, {r4-r7}
+ ldr r4, [sp, #16] // r4 = pFeatureValuePointerList, located above the 16 bytes just pushed
+ bics r5, r2, #3 // r5 = kiListSize rounded down to a multiple of 4
+ beq _hash_assign_tail // no full group of 4: the loop below tests its counter only after the body
+_hash_assign_loop_x4:
+ vld1.64 {q0}, [r0]! // four 32-bit counts
+ vshl.u32 q0, q0, #2 // counts -> sizes in bytes (4 bytes per count)
+ vceq.u32 q1, q0, #0
+ vand.i32 d2, d2, d3
+ vmov r6, r7, d2
+ and r6, r6, r7 // r6 is all-ones only when all four sizes are zero
+ cmp r6, #0xffffffff
+ beq _hash_assign_with_copy_x4
+// Prefix sum of the four sizes. q8 is the scratch register because q4-q7 (d8-d15) are callee-saved under the AAPCS and are not saved by this function.
+ veor q1, q1
+ vext.32 q2, q1, q0, #3 // {0, a, b, c}
+ vext.32 q3, q1, q0, #2 // {0, 0, a, b}
+ vext.32 q8, q1, q0, #1 // {0, 0, 0, a}
+ vadd.u32 q0, q0, q2
+ vadd.u32 q0, q0, q3
+ vadd.u32 q0, q0, q8 // q0 = {a, a+b, a+b+c, a+b+c+d}
+ vext.32 q2, q1, q0, #3 // exclusive form: {0, a, a+b, a+b+c}
+ vdup.32 q3, r1
+ vadd.u32 q2, q2, q3 // start address of each of the four entries
+ vst1.64 {q2}, [r3]!
+ vst1.64 {q2}, [r4]!
+ vmov.32 r6, d1[1] // total size of this group of four
+ add r1, r1, r6
+ b _assign_next
+
+_hash_assign_with_copy_x4:
+ vdup.32 q2, r1 // all four counts are zero: every entry gets the current address
+ vst1.64 {q2}, [r3]!
+ vst1.64 {q2}, [r4]!
+
+_assign_next:
+ subs r5, r5, #4
+ bne _hash_assign_loop_x4
+_hash_assign_tail:
+ ands r5, r2, #3 // r5 = kiListSize % 4 entries left for the scalar loop
+ beq _hash_assign_end
+_hash_assign_loop_x4_rem:
+ str r1, [r3], #4
+ str r1, [r4], #4
+ ldr r7, [r0], #4
+ lsl r7, r7, #2 // count -> size in bytes
+ add r1, r1, r7
+ subs r5, r5, #1
+ bne _hash_assign_loop_x4_rem
+_hash_assign_end:
+
+ ldmia sp!, {r4-r7}
+WELS_ASM_FUNC_END
+
+.align 4 // the argument is a power-of-two exponent on ARM: 2^4 = 16 bytes (an argument of 16 would request 64 KiB)
+mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00 // step added to the four x lanes after each group of 4 blocks
+mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00 // step added to the four y lanes after each row
+mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00 // x lanes at the start of every row
+
+WELS_ASM_FUNC_BEGIN FillQpelLocationByFeatureValue_neon
+// void (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList); arguments arrive in r0-r3. Both loops test their counters after the body, so kiWidth must be a positive multiple of 4 and kiHeight must be positive.
+ stmdb sp!, {r4-r7}
+ vpush {q4-q7}
+ adr r7, mv_x_inc_x4
+ vld1.64 {q7}, [r7] // q7 = x step per group of 4 blocks
+ adr r7, mv_y_inc_x4
+ vld1.64 {q6}, [r7] // q6 = y step per row
+ adr r7, mx_x_offset_x4
+ vld1.64 {q5}, [r7] // q5 = x lanes at the start of a row
+// q4 needs no clearing here: it is rewritten from q3 before every vzip below
+ veor q3, q3 // q3 = y lanes, zero for the first row
+ vdup.32 q8, r3 // q8 = list base address in every lane
+_hash_height_loop:
+ mov r7, r1 // r7 = blocks left in this row
+ vmov q2, q5 //mx_x_offset_x4
+_hash_width_loop:
+ vld1.64 {d0}, [r0]! // four 16-bit feature values
+ vshll.u16 q0, d0, #2 // widen and scale by 4, the size of one list entry
+ vadd.u32 q0, q8 // q0 = addresses of the four list entries
+ vmov q1, q2
+ vmov q4, q3
+ vzip.16 q1, q4 // q1 = {x0, y0, x1, y1, x2, y2, x3, y3}
+// Each stanza below: load the write cursor from the list entry, store one 32-bit word (x in bits 0-15, y in bits 16-31), then store the cursor back advanced by 4 bytes.
+ vmov.32 r4, d0[0]
+ ldr r5, [r4]
+ vmov.32 r6, d2[0]
+ str r6, [r5]
+ add r5, r5, #4
+ pld [r5] // cache miss?
+ str r5, [r4]
+
+ vmov.32 r4, d0[1]
+ ldr r5, [r4]
+ vmov.32 r6, d2[1]
+ str r6, [r5]
+ add r5, r5, #4
+ pld [r5] // cache miss?
+ str r5, [r4]
+
+ vmov.32 r4, d1[0]
+ ldr r5, [r4]
+ vmov.32 r6, d3[0]
+ str r6, [r5]
+ add r5, r5, #4
+ pld [r5] // cache miss?
+ str r5, [r4]
+
+ vmov.32 r4, d1[1]
+ ldr r5, [r4]
+ vmov.32 r6, d3[1]
+ str r6, [r5]
+ add r5, r5, #4
+ pld [r5] // cache miss?
+ str r5, [r4]
+
+ vadd.u16 q2, q2, q7 // advance the x lanes to the next group of 4 blocks
+ subs r7, #4
+ bne _hash_width_loop
+
+ vadd.u16 q3, q3, q6 // advance the y lanes to the next row
+ subs r2, #1
+ bne _hash_height_loop
+
+ vpop {q4-q7}
+ ldmia sp!, {r4-r7}
+WELS_ASM_FUNC_END
#endif
--- a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
@@ -217,4 +217,121 @@
cbnz x2, _height_loop16x16
_SumOf16x16BlockOfFrame_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END
+
+WELS_ASM_AARCH64_FUNC_BEGIN InitializeHashforFeature_AArch64_neon
+// void (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize, uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList); arguments arrive in x0, x1, w2, x3, x4. Entry i of both pointer lists receives pBuf + 4 * (sum of counts 0..i-1).
+ and w9, w2, #3 // x9 = kiListSize % 4; read through w2 because the upper half of x2 is unspecified for an int32_t argument
+ sub w5, w2, w9 // x5 = kiListSize rounded down to a multiple of 4
+ cbz w5, _hash_assign_tail // no full group of 4: the loop below tests its counter only after the body
+_hash_assign_loop_x4:
+ ld1 {v0.16b}, [x0], #16 // four 32-bit counts
+ shl v0.4s, v0.4s, #2 // counts -> sizes in bytes (4 bytes per count)
+ addv s1, v0.4s
+ umov w7, v1.s[0] // w7 = sum of the four sizes; zero selects the copy path
+ cbz w7, _hash_assign_with_copy_x4
+
+ ins v2.d[0], x1 // entry 0 starts at the current address
+ umov w8, v0.s[0] // a write to w8 also clears the upper half of x8
+ add x1, x1, x8
+ ins v2.d[1], x1
+ umov w8, v0.s[1]
+ add x1, x1, x8
+ ins v3.d[0], x1
+ umov w8, v0.s[2]
+ add x1, x1, x8
+ ins v3.d[1], x1
+ umov w8, v0.s[3]
+ add x1, x1, x8 // x1 = start address for the next group
+ st1 {v2.16b, v3.16b}, [x3], #32
+ st1 {v2.16b, v3.16b}, [x4], #32
+ b _assign_next
+_hash_assign_with_copy_x4:
+ dup v2.2d, x1 // every entry of this group gets the current address
+ dup v3.2d, x1
+ st1 {v2.16b, v3.16b}, [x3], #32
+ st1 {v2.16b, v3.16b}, [x4], #32
+
+_assign_next:
+ subs x5, x5, #4
+ cbnz x5, _hash_assign_loop_x4
+_hash_assign_tail:
+ mov x5, x9 // x5 = kiListSize % 4 entries left for the scalar loop
+ cbz x5, _hash_assign_end
+
+
+_hash_assign_loop_x4_rem:
+ str x1, [x3], #8
+ str x1, [x4], #8
+ ldr w8, [x0], #4
+ lsl w8, w8, #2 // count -> size in bytes
+ add x1, x1, x8
+ subs x5, x5, #1
+ cbnz x5, _hash_assign_loop_x4_rem
+
+_hash_assign_end:
+WELS_ASM_AARCH64_FUNC_END
+
+.align 4 // the argument is a power-of-two exponent on AArch64: 2^4 = 16 bytes (an argument of 16 would request 64 KiB)
+mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00 // step added to the four x lanes after each group of 4 blocks
+mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00 // step added to the four y lanes after each row
+mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00 // x lanes at the start of every row
+
+WELS_ASM_AARCH64_FUNC_BEGIN FillQpelLocationByFeatureValue_AArch64_neon
+// void (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList); arguments arrive in x0, w1, w2, x3. Both loops test their counters after the body, so kiWidth must be a positive multiple of 4 and kiHeight must be positive.
+ ldr q7, mv_x_inc_x4
+ ldr q6, mv_y_inc_x4
+ ldr q5, mx_x_offset_x4
+
+// v4 is not cleared: nothing in this function reads it
+ eor v3.16b, v3.16b, v3.16b // v3 = y lanes, zero for the first row
+ dup v16.2d, x3 // list base address in both lanes; v16 is caller-saved, unlike v8
+
+_hash_height_loop:
+ mov w7, w1 // x7 = blocks left in this row; w-register form because the upper half of x1 is unspecified for an int32_t argument
+ mov v2.16b, v5.16b //mx_x_offset_x4
+
+_hash_width_loop:
+ ld1 {v0.d}[0], [x0], #8 // four 16-bit feature values
+
+ ushll v0.4s, v0.4h, #3 // widen and scale by 8, the size of one list entry
+ uaddw v17.2d, v16.2d, v0.2s // addresses of list entries 0 and 1
+ uaddw2 v18.2d, v16.2d, v0.4s // addresses of list entries 2 and 3
+ zip1 v1.8h, v2.8h, v3.8h // v1 = {x0, y0, x1, y1, x2, y2, x3, y3}
+// Each stanza below: load the write cursor from the list entry, store one 32-bit word (x in bits 0-15, y in bits 16-31), then store the cursor back advanced by 4 bytes.
+ umov x4, v17.d[0]
+ ldr x5, [x4]
+ umov w6, v1.s[0]
+ str w6, [x5]
+ add x5, x5, #4
+ str x5, [x4]
+
+ umov x4, v17.d[1]
+ ldr x5, [x4]
+ umov w6, v1.s[1]
+ str w6, [x5]
+ add x5, x5, #4
+ str x5, [x4]
+
+ umov x4, v18.d[0]
+ ldr x5, [x4]
+ umov w6, v1.s[2]
+ str w6, [x5]
+ add x5, x5, #4
+ str x5, [x4]
+
+ umov x4, v18.d[1]
+ ldr x5, [x4]
+ umov w6, v1.s[3]
+ str w6, [x5]
+ add x5, x5, #4
+ str x5, [x4]
+
+ add v2.8h, v2.8h, v7.8h // advance the x lanes to the next group of 4 blocks
+ subs x7, x7, #4
+ cbnz x7, _hash_width_loop
+
+ add v3.8h, v3.8h, v6.8h // advance the y lanes to the next row
+ subs w2, w2, #1 // w-register form: only the low 32 bits of kiHeight are defined
+ cbnz w2, _hash_height_loop
+WELS_ASM_AARCH64_FUNC_END
#endif
\ No newline at end of file
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -271,6 +271,10 @@
#ifdef HAVE_NEON
extern "C"
{
+void InitializeHashforFeature_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+ uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+void FillQpelLocationByFeatureValue_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+ uint16_t** pFeatureValuePointerList);
int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
void SumOf8x8BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
@@ -285,6 +289,10 @@
#ifdef HAVE_NEON_AARCH64
extern "C"
{
+void InitializeHashforFeature_AArch64_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+ uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+void FillQpelLocationByFeatureValue_AArch64_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+ uint16_t** pFeatureValuePointerList);
int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
void SumOf8x8BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -125,6 +125,8 @@
#if defined (HAVE_NEON)
if (uiCpuFlag & WELS_CPU_NEON) {
//for feature search
+ pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_neon;
+ pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
@@ -136,6 +138,8 @@
#if defined (HAVE_NEON_AARCH64)
if (uiCpuFlag & WELS_CPU_NEON) {
//for feature search
+ pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_AArch64_neon;
+ pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_AArch64_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
--- a/test/encoder/EncUT_SVC_me.cpp
+++ b/test/encoder/EncUT_SVC_me.cpp
@@ -281,6 +281,10 @@
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 320)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 640, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 640, 320)
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_neon, 10, 10)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_neon, 16, 16)
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_neon, 640, 320)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_neon, 640, 320)
#endif
#ifdef HAVE_NEON_AARCH64
@@ -290,4 +294,8 @@
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 320)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 640, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 640, 320)
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_AArch64_neon, 10, 10)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_AArch64_neon, 16, 16)
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_AArch64_neon, 640, 320)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_AArch64_neon, 640, 320)
#endif