shithub: openh264

Download patch

ref: 711f5cabe727eeff386dc2d25d5c2e5f7f6d6785
parent: 1e34a61dd60fc45bc1a7f668162dc7d7da340fa7
parent: e14186b5354289e2504c505f9612fe455411b18e
author: zhilwang <zhilwang@cisco.com>
date: Fri Aug 15 05:02:15 EDT 2014

Merge pull request #1273 from dongzha/RefineArmCodeForSumBlock

Refine the ARM and AArch64 NEON code for SumOf8x8BlockOfFrame and SumOf16x16BlockOfFrame

--- a/codec/encoder/core/arm/svc_motion_estimation.S
+++ b/codec/encoder/core/arm/svc_motion_estimation.S
@@ -72,9 +72,9 @@
 
 WELS_ASM_FUNC_BEGIN SumOf8x8BlockOfFrame_neon
 //(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
-    stmdb sp!, {r4-r8}
-    ldr	r5, [sp, #24] //pTimesOfFeatureValue
-    ldr	r4, [sp, #20] //pFeatureOfBlock
+    stmdb sp!, {r4-r12}
+    ldr	r5, [sp, #40] //pTimesOfFeatureValue
+    ldr	r4, [sp, #36] //pFeatureOfBlock
 
     mov r8, r0
     mov r6, r1
@@ -81,9 +81,8 @@
     add r8, r6
     add r4, r4, r6, lsl #1
 
-_height_loop8x8:
     mov r7, r6
-_width_loop8x8:
+_width_loop8x8_1:
     subs r0, r8, r7
     vld1.64 {d0}, [r0], r3
     vld1.64 {d1}, [r0], r3
@@ -98,7 +97,6 @@
     vpadal.u8 q0, q1
     vpadal.u8 q0, q2
     vpadal.u8 q0, q3
-
     vpaddl.u16 q0, q0
     vpadd.i32 d0, d1
     vpadd.i32 d0, d0
@@ -112,21 +110,57 @@
     str r0, [r1]
 
     subs r7, #1
-    bne _width_loop8x8
+    bne _width_loop8x8_1
 
     add r8, r3
     add r4, r4, r6, lsl #1
     subs r2, #1
-    bne _height_loop8x8
+    beq _SumOf8x8BlockOfFrame_end
 
-    ldmia sp!, {r4-r8}
+
+_height_loop8x8:
+    mov r7, r6
+_width_loop8x8_2:
+    subs r0, r8, r7
+    subs r1, r4, r7, lsl #1
+
+    subs r9, r1, r6, lsl #1 // last line of pFeatureOfBlock[i]
+    ldrh  r10, [r9] // sum of last line of pFeatureOfBlock[i]
+
+    subs r11, r0, r3
+    vld1.64 {d1}, [r11]
+    add r0, r11, r3, lsl #3
+    vld1.64 {d0}, [r0] //
+
+    vpaddl.u8 q0, q0
+    vpadd.u16 d0, d0, d1
+    vpaddl.u16 d0, d0
+    vmov r11, r12, d0
+    subs r10, r12
+    add r0, r10, r11
+
+    strh r0, [r1] // sum -> pFeatureOfBlock[i]
+
+    add r1, r5, r0, lsl #2
+    ldr r0, [r1]
+    add r0, #1
+    str r0, [r1]
+    subs r7, #1
+    bne _width_loop8x8_2
+
+    add r8, r3
+    add r4, r4, r6, lsl #1
+    subs r2, #1
+    bne _height_loop8x8
+_SumOf8x8BlockOfFrame_end:
+    ldmia sp!, {r4-r12}
 WELS_ASM_FUNC_END
 
 WELS_ASM_FUNC_BEGIN SumOf16x16BlockOfFrame_neon
 //(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
-    stmdb sp!, {r4-r8}
-    ldr	r5, [sp, #24] //pTimesOfFeatureValue
-    ldr	r4, [sp, #20] //pFeatureOfBlock
+    stmdb sp!, {r4-r12}
+    ldr	r5, [sp, #40] //pTimesOfFeatureValue
+    ldr	r4, [sp, #36] //pFeatureOfBlock
 
     mov r8, r0
     mov r6, r1
@@ -133,9 +167,8 @@
     add r8, r6
     add r4, r4, r6, lsl #1
 
-_height_loop16x16:
     mov r7, r6
-_width_loop16x16:
+_width_loop16x16_1:
     subs r0, r8, r7
     vld1.64 {q0}, [r0], r3
     vpaddl.u8 q0, q0
@@ -156,13 +189,50 @@
     str r0, [r1]
 
     subs r7, #1
-    bne _width_loop16x16
+    bne _width_loop16x16_1
+    add r8, r3
+    add r4, r4, r6, lsl #1
+    subs r2, #1
+    beq _SumOf16x16BlockOfFrame_neon_end
 
+_height_loop16x16:
+    mov r7, r6
+_width_loop16x16_2:
+    subs r0, r8, r7
+    subs r1, r4, r7, lsl #1
+    subs r9, r1, r6, lsl #1 // last line of pFeatureOfBlock[i]
+    ldrh  r10, [r9] // sum of last line of pFeatureOfBlock[i]
+
+    subs r11, r0, r3
+    vld1.64 {q1}, [r11]
+    add r0, r11, r3, lsl #4
+    vld1.64 {q0}, [r0] //
+
+    vpaddl.u8 q0, q0
+    vpaddl.u8 q1, q1
+    vpadd.u16 d0, d0, d1
+    vpadd.u16 d1, d2, d3
+    vpadd.u16 d0, d0, d1
+    vpaddl.u16 d0, d0
+
+    vmov r11, r12, d0
+    subs r10, r12
+    add r0, r10, r11
+
+    strh r0, [r1] // sum -> pFeatureOfBlock[i]
+    add r1, r5, r0, lsl #2
+    ldr r0, [r1]
+    add r0, #1
+    str r0, [r1]
+
+    subs r7, #1
+    bne _width_loop16x16_2
+
     add r8, r3
     add r4, r4, r6, lsl #1
     subs r2, #1
     bne _height_loop16x16
-
-    ldmia sp!, {r4-r8}
+_SumOf16x16BlockOfFrame_neon_end:
+    ldmia sp!, {r4-r12}
 WELS_ASM_FUNC_END
 #endif
--- a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
@@ -72,9 +72,8 @@
     add x8, x8, x6
     add x4, x4, x6, lsl #1
 
-_height_loop8x8:
     mov x7, x6
-_width_loop8x8:
+_width_loop8x8_1:
     subs x0, x8, x7
     ld1 {v0.d}[0], [x0], x3
     ld1 {v0.d}[1], [x0], x3
@@ -100,13 +99,48 @@
     add w0, w0, #1
     str w0, [x1]
     subs x7, x7, #1
-    cbnz x7, _width_loop8x8
+    cbnz x7, _width_loop8x8_1
 
     add x8, x8, x3
     add x4, x4, x6, lsl #1
     subs x2, x2, #1
-    cbnz x2, _height_loop8x8
+    cbz x2, _SumOf8x8BlockOfFrame_AArch64_neon_end
 
+_height_loop8x8:
+    mov x7, x6
+_width_loop8x8_2:
+    subs x0, x8, x7
+    subs x1, x4, x7, lsl #1
+    subs x9, x1, x6, lsl #1 // last line of pFeatureOfBlock[i]
+    ldrh  w10, [x9] // sum of last line of pFeatureOfBlock[i]
+
+    subs x11, x0, x3
+    ld1 {v0.d}[1], [x11]
+    add x0, x11, x3, lsl #3
+    ld1 {v0.d}[0], [x0] //
+
+    uaddlp v0.8h, v0.16b
+    addp v0.8h, v0.8h, v1.8h
+    uaddlp v0.4s, v0.8h
+    umov w11, v0.s[0]
+    umov w12, v0.s[1]
+
+    subs w10, w10, w12
+    mov x0, #0
+    add w0, w10, w11
+    strh w0, [x1] // sum -> pFeatureOfBlock[i]
+    add x1, x5, x0, lsl #2
+    ldr w0, [x1]
+    add w0, w0, #1
+    str w0, [x1]
+    subs x7, x7, #1
+    cbnz x7, _width_loop8x8_2
+
+    add x8, x8, x3
+    add x4, x4, x6, lsl #1
+    subs x2, x2, #1
+    cbnz x2, _height_loop8x8
+_SumOf8x8BlockOfFrame_AArch64_neon_end:
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16BlockOfFrame_AArch64_neon
@@ -119,9 +153,8 @@
     add x8, x8, x6
     add x4, x4, x6, lsl #1
 
-_height_loop16x16:
     mov x7, x6
-_width_loop16x16:
+_width_loop16x16_1:
     subs x0, x8, x7
     ld1 {v0.16b}, [x0], x3
     uaddlp v0.8h, v0.16b
@@ -141,11 +174,47 @@
     add w0, w0, #1
     str w0, [x1]
     subs x7, x7, #1
-    cbnz x7, _width_loop16x16
+    cbnz x7, _width_loop16x16_1
 
     add x8, x8, x3
     add x4, x4, x6, lsl #1
     subs x2, x2, #1
+    cbz x2, _SumOf16x16BlockOfFrame_AArch64_neon_end
+
+_height_loop16x16:
+    mov x7, x6
+_width_loop16x16_2:
+    subs x0, x8, x7
+
+    subs x1, x4, x7, lsl #1
+    subs x9, x1, x6, lsl #1 // last line of pFeatureOfBlock[i]
+    ldrh  w10, [x9] // sum of last line of pFeatureOfBlock[i]
+
+    subs x11, x0, x3
+    ld1 {v1.16b}, [x11]
+    add x0, x11, x3, lsl #4
+    ld1 {v0.16b}, [x0] //
+
+    uaddlv h0, v0.16b
+    uaddlv h1, v1.16b
+    umov w11, v0.h[0]
+    umov w12, v1.h[0]
+
+    subs w10, w10, w12
+    mov x0, #0
+    add w0, w10, w11
+    strh w0, [x1] // sum -> pFeatureOfBlock[i]
+    add x1, x5, x0, lsl #2
+    ldr w0, [x1]
+    add w0, w0, #1
+    str w0, [x1]
+    subs x7, x7, #1
+    cbnz x7, _width_loop16x16_2
+
+    add x8, x8, x3
+    add x4, x4, x6, lsl #1
+    subs x2, x2, #1
     cbnz x2, _height_loop16x16
+_SumOf16x16BlockOfFrame_AArch64_neon_end:
 WELS_ASM_AARCH64_FUNC_END
 #endif
\ No newline at end of file