shithub: openh264

ref: e25a82b3d6f0ff687d67d771223b22f712b3d9c0
parent: 4b5e893fcc54ca46cab4335ed98a75ecf41fd3fc
parent: 0f95fac4caf6c9f285582a93be9bf5e3433c44ba
author: zhilwang <zhilwang@cisco.com>
date: Fri Aug 15 10:14:51 EDT 2014

Merge pull request #1279 from dongzha/NewAddARMHash

add arm32/64 code for InitHash

--- a/codec/encoder/core/arm/svc_motion_estimation.S
+++ b/codec/encoder/core/arm/svc_motion_estimation.S
@@ -235,4 +235,133 @@
 _SumOf16x16BlockOfFrame_neon_end:
     ldmia sp!, {r4-r12}
 WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN InitializeHashforFeature_neon
+// (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize, uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+    stmdb sp!, {r4-r7}
+    ldr	r4, [sp, #16] //pFeatureValuePointerList
+    bic r5, r2, #3
+_hash_assign_loop_x4:
+    vld1.64 {q0}, [r0]!
+    vshl.u32 q0, q0, #2
+    vceq.u32 q1, q0, #0
+    vand.i32 d2, d2, d3
+    vmov r6, r7, d2
+    and r6, r6, r7
+    cmp r6, #0xffffffff
+    beq _hash_assign_with_copy_x4
+
+    veor q1, q1
+    vext.32 q2, q1, q0, #3
+    vext.32 q3, q1, q0, #2
+    vext.32 q4, q1, q0, #1
+    vadd.u32 q0, q0, q2
+    vadd.u32 q0, q0, q3
+    vadd.u32 q0, q0, q4
+    vext.32 q2, q1, q0, #3
+    vdup.32  q3, r1
+    vadd.u32 q2, q2, q3
+    vst1.64 {q2}, [r3]!
+    vst1.64 {q2}, [r4]!
+    vmov.32 r6, d1[1]
+    add r1, r1, r6
+    b _assign_next
+
+_hash_assign_with_copy_x4:
+    vdup.32  q2, r1
+    vst1.64 {q2}, [r3]!
+    vst1.64 {q2}, [r4]!
+
+_assign_next:
+    subs r5, r5, #4
+    bne _hash_assign_loop_x4
+
+    and r5, r2, #3
+    cmp r5, #0
+    beq _hash_assign_end
+_hash_assign_loop_x4_rem:
+    str r1, [r3], #4
+    str r1, [r4], #4
+    ldr r7, [r0], #4
+    lsl r7, r7, #2
+    add r1, r1, r7
+    subs r5, r5, #1
+    bne _hash_assign_loop_x4_rem
+_hash_assign_end:
+
+    ldmia sp!, {r4-r7}
+WELS_ASM_FUNC_END
+
+.align 16
+mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
+mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
+mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00
+
+WELS_ASM_FUNC_BEGIN FillQpelLocationByFeatureValue_neon
+// void  (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
+    stmdb sp!, {r4-r8}
+    vpush		{q4-q7}
+    adr r7, mv_x_inc_x4
+    vld1.64 {q7}, [r7]
+    adr r7, mv_y_inc_x4
+    vld1.64 {q6}, [r7]
+    adr r7, mx_x_offset_x4
+    vld1.64 {q5}, [r7]
+    veor q4, q4
+    veor q3, q3
+    vdup.32 q8, r3
+_hash_height_loop:
+    mov r7, r1
+    vmov q2, q5 //mx_x_offset_x4
+_hash_width_loop:
+    vld1.64 {d0}, [r0]!
+    vshll.u16 q0, d0, #2
+    vadd.u32 q0, q8
+    vmov q1, q2
+    vmov q4, q3
+    vzip.16 q1, q4
+
+    vmov.32 r4, d0[0]
+    ldr r5, [r4]
+    vmov.32 r6, d2[0]
+    str r6, [r5]
+    add r5, r5, #4
+    pld [r5] // cache miss?
+    str r5, [r4]
+
+    vmov.32 r4, d0[1]
+    ldr r5, [r4]
+    vmov.32 r6, d2[1]
+    str r6, [r5]
+    add r5, r5, #4
+    pld [r5] // cache miss?
+    str r5, [r4]
+
+    vmov.32 r4, d1[0]
+    ldr r5, [r4]
+    vmov.32 r6, d3[0]
+    str r6, [r5]
+    add r5, r5, #4
+    pld [r5] // cache miss?
+    str r5, [r4]
+
+    vmov.32 r4, d1[1]
+    ldr r5, [r4]
+    vmov.32 r6, d3[1]
+    str r6, [r5]
+    add r5, r5, #4
+    pld [r5] // cache miss?
+    str r5, [r4]
+
+    vadd.u16 q2, q2, q7
+    subs r7, #4
+    bne _hash_width_loop
+
+    vadd.u16 q3, q3, q6
+    subs r2, #1
+    bne _hash_height_loop
+
+    vpop		{q4-q7}
+    ldmia sp!, {r4-r8}
+WELS_ASM_FUNC_END
 #endif
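
For reference, the scalar work that the x4 NEON loop above vectorizes (and that its remainder path performs one element at a time) can be sketched in C++. This is a minimal sketch inferred from the assembly and from the prototype added to svc_motion_estimate.h, not the encoder's own reference code: every list entry points at the current cursor in pBuf, and the cursor advances by two uint16_t per occurrence counted in pTimesOfFeatureValue (one x/y pair per occurrence).

    #include <cstdint>

    // Minimal scalar sketch of InitializeHashforFeature; the two-uint16_t
    // advance per occurrence (one packed x/y location) is an inference from
    // the "lsl #2" byte advance in the remainder loop above.
    static void InitializeHashforFeature_sketch (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf,
                                                 const int32_t kiListSize,
                                                 uint16_t** pLocationOfFeature,
                                                 uint16_t** pFeatureValuePointerList) {
      uint16_t* pBufPos = pBuf;
      for (int32_t i = 0; i < kiListSize; ++i) {
        pLocationOfFeature[i]       = pBufPos;      // start of the slot for feature value i
        pFeatureValuePointerList[i] = pBufPos;      // running write cursor, same start
        pBufPos += (pTimesOfFeatureValue[i] << 1);  // reserve times*2 uint16_t entries
      }
    }
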
--- a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
@@ -217,4 +217,121 @@
     cbnz x2, _height_loop16x16
 _SumOf16x16BlockOfFrame_AArch64_neon_end:
 WELS_ASM_AARCH64_FUNC_END
+
+WELS_ASM_AARCH64_FUNC_BEGIN InitializeHashforFeature_AArch64_neon
+// (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize, uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+    mov x9, #3
+    bic x5, x2, x9
+    mov x8, #0
+_hash_assign_loop_x4:
+    ld1 {v0.16b}, [x0], #16
+    shl v0.4s, v0.4s, #2
+    addv s1, v0.4s
+    umov w7, v1.s[0]
+    cbz w7, _hash_assign_with_copy_x4
+
+    ins v2.d[0], x1
+    umov w8, v0.s[0]
+    add x1, x1, x8
+    ins v2.d[1], x1
+    umov w8, v0.s[1]
+    add x1, x1, x8
+    ins v3.d[0], x1
+    umov w8, v0.s[2]
+    add x1, x1, x8
+    ins v3.d[1], x1
+    umov w8, v0.s[3]
+    add x1, x1, x8
+    st1 {v2.16b, v3.16b}, [x3], #32
+    st1 {v2.16b, v3.16b}, [x4], #32
+    b _assign_next
+_hash_assign_with_copy_x4:
+    dup  v2.2d, x1
+    dup  v3.2d, x1
+    st1 {v2.16b, v3.16b}, [x3], #32
+    st1 {v2.16b, v3.16b}, [x4], #32
+
+_assign_next:
+    subs x5, x5, #4
+    cbnz x5, _hash_assign_loop_x4
+
+    and x5, x2, x9
+    cbz x5, _hash_assign_end
+
+
+_hash_assign_loop_x4_rem:
+    str x1, [x3], #8
+    str x1, [x4], #8
+    ldr w8, [x0], #4
+    lsl w8, w8, #2
+    add x1, x1, x8
+    subs x5, x5, #1
+    cbnz x5, _hash_assign_loop_x4_rem
+
+_hash_assign_end:
+WELS_ASM_AARCH64_FUNC_END
+
+.align 16
+mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
+mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
+mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00
+
+WELS_ASM_AARCH64_FUNC_BEGIN FillQpelLocationByFeatureValue_AArch64_neon
+// void  (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
+    ldr q7, mv_x_inc_x4
+    ldr q6, mv_y_inc_x4
+    ldr q5, mx_x_offset_x4
+
+    eor v4.16b, v4.16b, v4.16b
+    eor v3.16b, v3.16b, v3.16b
+    dup v16.2d, x3 // v8->v16 (keep out of v8-v15, whose low 64 bits are callee-saved under AAPCS64)
+
+_hash_height_loop:
+    mov x7, x1
+    mov.16b v2, v5 //mx_x_offset_x4
+
+_hash_width_loop:
+    ld1 {v0.d}[0], [x0], #8
+
+    ushll v0.4s, v0.4h, #3
+    uaddw   v17.2d, v16.2d, v0.2s
+    uaddw2  v18.2d, v16.2d, v0.4s
+    zip1 v1.8h, v2.8h, v3.8h
+
+    umov x4, v17.d[0]
+    ldr x5, [x4]
+    umov w6, v1.s[0]
+    str w6, [x5]
+    add x5, x5, #4
+    str x5, [x4]
+
+    umov x4, v17.d[1]
+    ldr x5, [x4]
+    umov w6, v1.s[1]
+    str w6, [x5]
+    add x5, x5, #4
+    str x5, [x4]
+
+    umov x4, v18.d[0]
+    ldr x5, [x4]
+    umov w6, v1.s[2]
+    str w6, [x5]
+    add x5, x5, #4
+    str x5, [x4]
+
+    umov x4, v18.d[1]
+    ldr x5, [x4]
+    umov w6, v1.s[3]
+    str w6, [x5]
+    add x5, x5, #4
+    str x5, [x4]
+
+    add v2.8h, v2.8h, v7.8h
+    subs x7, x7, #4
+    cbnz x7, _hash_width_loop
+
+    add v3.8h, v3.8h, v6.8h
+    subs x2, x2, #1
+    cbnz x2, _hash_height_loop
+WELS_ASM_AARCH64_FUNC_END
 #endif
\ No newline at end of file
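
Both FillQpelLocationByFeatureValue implementations above walk the per-block feature map row by row and append each block's quarter-pel position to the list selected by its feature value. The following is a scalar sketch of that behaviour, inferred from the constant tables (mx_x_offset_x4, mv_x_inc_x4, mv_y_inc_x4) and from the interleave-and-store sequence; the packed ((y << 2) << 16) | (x << 2) layout and the two-uint16_t cursor advance are assumptions read off the assembly:

    #include <cstdint>
    #include <cstring>

    // Minimal scalar sketch of FillQpelLocationByFeatureValue; positions are
    // assumed to be stored as a packed 32-bit (qpel_y << 16) | qpel_x value,
    // matching the vzip.16 / zip1 interleave and the 4-byte pointer advance.
    static void FillQpelLocationByFeatureValue_sketch (uint16_t* pFeatureOfBlock, const int32_t kiWidth,
                                                       const int32_t kiHeight,
                                                       uint16_t** pFeatureValuePointerList) {
      for (int32_t y = 0; y < kiHeight; ++y) {
        for (int32_t x = 0; x < kiWidth; ++x) {
          const uint16_t uiFeature = pFeatureOfBlock[y * kiWidth + x];
          const uint32_t uiQpelPos = ((uint32_t) (y << 2) << 16) | (uint32_t) (x << 2);
          std::memcpy (pFeatureValuePointerList[uiFeature], &uiQpelPos, sizeof (uiQpelPos));
          pFeatureValuePointerList[uiFeature] += 2;  // one (x, y) pair written
        }
      }
    }
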
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -271,6 +271,10 @@
 #ifdef HAVE_NEON
 extern "C"
 {
+void InitializeHashforFeature_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+                                    uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+void FillQpelLocationByFeatureValue_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+                                          uint16_t** pFeatureValuePointerList);
 int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
 int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
 void SumOf8x8BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
@@ -285,6 +289,10 @@
 #ifdef HAVE_NEON_AARCH64
 extern "C"
 {
+void InitializeHashforFeature_AArch64_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+                                    uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+void FillQpelLocationByFeatureValue_AArch64_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+                                          uint16_t** pFeatureValuePointerList);
 int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
 int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
 void SumOf8x8BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -125,6 +125,8 @@
 #if defined (HAVE_NEON)
     if (uiCpuFlag & WELS_CPU_NEON) {
       //for feature search
+      pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_neon;
+      pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_neon;
       pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
       pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
       //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
@@ -136,6 +138,8 @@
 #if defined (HAVE_NEON_AARCH64)
     if (uiCpuFlag & WELS_CPU_NEON) {
       //for feature search
+      pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_AArch64_neon;
+      pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_AArch64_neon;
       pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
       pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
       //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
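
The two assignments in each branch above follow the encoder's usual runtime-dispatch pattern: a portable default is installed first and then overridden when WELS_CPU_NEON is reported. Below is a condensed, hypothetical illustration of that pattern; the _c fallback name and the trimmed-down struct are assumptions, and the real SWelsFuncPtrList carries many more entries.

    #include <cstdint>

    extern "C" void InitializeHashforFeature_c    (uint32_t*, uint16_t*, int32_t, uint16_t**, uint16_t**);
    extern "C" void InitializeHashforFeature_neon (uint32_t*, uint16_t*, int32_t, uint16_t**, uint16_t**);

    typedef void (*PInitializeHashforFeatureFunc) (uint32_t*, uint16_t*, int32_t, uint16_t**, uint16_t**);

    struct SFeatureSearchFuncsSketch {
      PInitializeHashforFeatureFunc pfInitializeHashforFeature;
      // ... other feature-search entries elided ...
    };

    static void InitFeatureSearchFuncsSketch (SFeatureSearchFuncsSketch* pFuncList, uint32_t uiCpuFlag) {
      pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_c;       // portable default
    #if defined (HAVE_NEON)
      if (uiCpuFlag & WELS_CPU_NEON)                                            // NEON detected at runtime (flag from cpu_core.h)
        pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_neon;
    #endif
    }
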
--- a/test/encoder/EncUT_SVC_me.cpp
+++ b/test/encoder/EncUT_SVC_me.cpp
@@ -281,6 +281,10 @@
 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 320)
 GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 640, 320)
 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 640, 320)
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_neon, 10, 10)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_neon, 16, 16)
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_neon, 640, 320)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_neon, 640, 320)
 #endif
 
 #ifdef HAVE_NEON_AARCH64
@@ -290,4 +294,8 @@
 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 320)
 GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 640, 320)
 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 640, 320)
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_AArch64_neon, 10, 10)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_AArch64_neon, 16, 16)
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_AArch64_neon, 640, 320)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_AArch64_neon, 640, 320)
 #endif
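
The GENERATE_* instantiations above cross-check the reference and NEON implementations on identical inputs. The macro bodies themselves are defined earlier in EncUT_SVC_me.cpp; the following is only a hypothetical, self-contained sketch of such a check for InitializeHashforFeature, with CrossCheckInitHash and the buffer sizing invented for illustration:

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>
    #include <vector>

    typedef void (*PInitHashFunc) (uint32_t*, uint16_t*, int32_t, uint16_t**, uint16_t**);

    // Hypothetical cross-check: run a reference and an optimized implementation
    // on the same input and require that every output pointer matches.
    static void CrossCheckInitHash (PInitHashFunc pfRef, PInitHashFunc pfOpt, int32_t kiListSize) {
      std::vector<uint32_t> times (kiListSize);
      for (int32_t i = 0; i < kiListSize; ++i)
        times[i] = (uint32_t) (std::rand() % 64);

      // Backing buffer large enough for every (x, y) pair the lists can reserve.
      std::vector<uint16_t>  buf (kiListSize * 2 * 64 + 16);
      std::vector<uint16_t*> locRef (kiListSize), valRef (kiListSize);
      std::vector<uint16_t*> locOpt (kiListSize), valOpt (kiListSize);

      pfRef (times.data(), buf.data(), kiListSize, locRef.data(), valRef.data());
      pfOpt (times.data(), buf.data(), kiListSize, locOpt.data(), valOpt.data());

      for (int32_t i = 0; i < kiListSize; ++i) {
        assert (locRef[i] == locOpt[i]);  // same cursor for every feature value
        assert (valRef[i] == valOpt[i]);
      }
    }
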