ref: 711f5cabe727eeff386dc2d25d5c2e5f7f6d6785
parent: 1e34a61dd60fc45bc1a7f668162dc7d7da340fa7
parent: e14186b5354289e2504c505f9612fe455411b18e
author: zhilwang <zhilwang@cisco.com>
date: Fri Aug 15 05:02:15 EDT 2014
Merge pull request #1273 from dongzha/RefineArmCodeForSumBlock refine arm code for sum of frame
--- a/codec/encoder/core/arm/svc_motion_estimation.S
+++ b/codec/encoder/core/arm/svc_motion_estimation.S
@@ -72,9 +72,9 @@
WELS_ASM_FUNC_BEGIN SumOf8x8BlockOfFrame_neon
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
- stmdb sp!, {r4-r8}
- ldr r5, [sp, #24] //pTimesOfFeatureValue
- ldr r4, [sp, #20] //pFeatureOfBlock
+ stmdb sp!, {r4-r12}
+ ldr r5, [sp, #40] //pTimesOfFeatureValue
+ ldr r4, [sp, #36] //pFeatureOfBlock
mov r8, r0
mov r6, r1
@@ -81,9 +81,8 @@
add r8, r6
add r4, r4, r6, lsl #1
-_height_loop8x8:
mov r7, r6
-_width_loop8x8:
+_width_loop8x8_1:
subs r0, r8, r7
vld1.64 {d0}, [r0], r3
vld1.64 {d1}, [r0], r3
@@ -98,7 +97,6 @@
vpadal.u8 q0, q1
vpadal.u8 q0, q2
vpadal.u8 q0, q3
-
vpaddl.u16 q0, q0
vpadd.i32 d0, d1
vpadd.i32 d0, d0
@@ -112,21 +110,57 @@
str r0, [r1]
subs r7, #1
- bne _width_loop8x8
+ bne _width_loop8x8_1
add r8, r3
add r4, r4, r6, lsl #1
subs r2, #1
- bne _height_loop8x8
+ beq _SumOf8x8BlockOfFrame_end
- ldmia sp!, {r4-r8}
+
+_height_loop8x8: // remaining rows: update each 8x8 sum incrementally from the row above
+ mov r7, r6 // r7 = column countdown, restarted at r6 (kiWidth copy) for every row
+_width_loop8x8_2:
+ subs r0, r8, r7 // r0 = top-left pixel of the 8x8 block at this column of the current row
+ subs r1, r4, r7, lsl #1 // r1 = address of this block's uint16_t slot in pFeatureOfBlock
+
+ subs r9, r1, r6, lsl #1 // r9 = slot of the same column one row up (r6 entries earlier)
+ ldrh r10, [r9] // r10 = block sum already stored for the row above
+
+ subs r11, r0, r3 // r11 = pixel row just above this block (top row of the previous block)
+ vld1.64 {d1}, [r11] // d1 = 8 pixels of the row leaving the window
+ add r0, r11, r3, lsl #3 // advance 8 rows (r3 << 3): bottom row of the current block
+ vld1.64 {d0}, [r0] // d0 = 8 pixels of the row entering the window
+
+ vpaddl.u8 q0, q0 // widen to u16 pair sums: d0 = entering row, d1 = leaving row
+ vpadd.u16 d0, d0, d1 // d0 = {2 partial sums of entering row, 2 partial sums of leaving row}
+ vpaddl.u16 d0, d0 // d0 = {entering-row total, leaving-row total} as two u32 lanes
+ vmov r11, r12, d0 // r11 = entering-row total, r12 = leaving-row total
+ subs r10, r12 // drop the row that left the window
+ add r0, r10, r11 // r0 = new block sum = previous sum - leaving row + entering row
+
+ strh r0, [r1] // sum -> pFeatureOfBlock[i]
+
+ add r1, r5, r0, lsl #2 // r1 = &pTimesOfFeatureValue[sum] (r5 base, 4-byte entries)
+ ldr r0, [r1]
+ add r0, #1 // count one more block with this sum
+ str r0, [r1]
+ subs r7, #1
+ bne _width_loop8x8_2
+
+ add r8, r3 // step the picture row pointer down by one stride
+ add r4, r4, r6, lsl #1 // step the feature pointer down by one row of r6 uint16_t entries
+ subs r2, #1 // r2 = rows still to process
+ bne _height_loop8x8
+_SumOf8x8BlockOfFrame_end:
+ ldmia sp!, {r4-r12}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN SumOf16x16BlockOfFrame_neon
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
- stmdb sp!, {r4-r8}
- ldr r5, [sp, #24] //pTimesOfFeatureValue
- ldr r4, [sp, #20] //pFeatureOfBlock
+ stmdb sp!, {r4-r12}
+ ldr r5, [sp, #40] //pTimesOfFeatureValue
+ ldr r4, [sp, #36] //pFeatureOfBlock
mov r8, r0
mov r6, r1
@@ -133,9 +167,8 @@
add r8, r6
add r4, r4, r6, lsl #1
-_height_loop16x16:
mov r7, r6
-_width_loop16x16:
+_width_loop16x16_1:
subs r0, r8, r7
vld1.64 {q0}, [r0], r3
vpaddl.u8 q0, q0
@@ -156,13 +189,50 @@
str r0, [r1]
subs r7, #1
- bne _width_loop16x16
+ bne _width_loop16x16_1
+ add r8, r3
+ add r4, r4, r6, lsl #1
+ subs r2, #1
+ beq _SumOf16x16BlockOfFrame_neon_end
+_height_loop16x16: // remaining rows: update each 16x16 sum incrementally from the row above
+ mov r7, r6 // r7 = column countdown, restarted at r6 (kiWidth copy) for every row
+_width_loop16x16_2:
+ subs r0, r8, r7 // r0 = top-left pixel of the 16x16 block at this column of the current row
+ subs r1, r4, r7, lsl #1 // r1 = address of this block's uint16_t slot in pFeatureOfBlock
+ subs r9, r1, r6, lsl #1 // r9 = slot of the same column one row up (r6 entries earlier)
+ ldrh r10, [r9] // r10 = block sum already stored for the row above
+
+ subs r11, r0, r3 // r11 = pixel row just above this block (top row of the previous block)
+ vld1.64 {q1}, [r11] // q1 = 16 pixels of the row leaving the window
+ add r0, r11, r3, lsl #4 // advance 16 rows (r3 << 4): bottom row of the current block
+ vld1.64 {q0}, [r0] // q0 = 16 pixels of the row entering the window
+
+ vpaddl.u8 q0, q0 // entering row widened to 8 u16 pair sums
+ vpaddl.u8 q1, q1 // leaving row widened to 8 u16 pair sums
+ vpadd.u16 d0, d0, d1 // d0 = 4 partial sums of the entering row
+ vpadd.u16 d1, d2, d3 // d1 = 4 partial sums of the leaving row
+ vpadd.u16 d0, d0, d1 // d0 = {2 partial sums entering, 2 partial sums leaving}
+ vpaddl.u16 d0, d0 // d0 = {entering-row total, leaving-row total} as two u32 lanes
+
+ vmov r11, r12, d0 // r11 = entering-row total, r12 = leaving-row total
+ subs r10, r12 // drop the row that left the window
+ add r0, r10, r11 // r0 = new block sum = previous sum - leaving row + entering row
+
+ strh r0, [r1] // sum -> pFeatureOfBlock[i]
+ add r1, r5, r0, lsl #2 // r1 = &pTimesOfFeatureValue[sum] (r5 base, 4-byte entries)
+ ldr r0, [r1]
+ add r0, #1 // count one more block with this sum
+ str r0, [r1]
+
+ subs r7, #1
+ bne _width_loop16x16_2
+
add r8, r3 // step the picture row pointer down by one stride
add r4, r4, r6, lsl #1 // step the feature pointer down by one row of r6 uint16_t entries
subs r2, #1 // r2 = rows still to process
bne _height_loop16x16
-
- ldmia sp!, {r4-r8}
+_SumOf16x16BlockOfFrame_neon_end:
+ ldmia sp!, {r4-r12}
WELS_ASM_FUNC_END
#endif
--- a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
@@ -72,9 +72,8 @@
add x8, x8, x6
add x4, x4, x6, lsl #1
-_height_loop8x8:
mov x7, x6
-_width_loop8x8:
+_width_loop8x8_1:
subs x0, x8, x7
ld1 {v0.d}[0], [x0], x3
ld1 {v0.d}[1], [x0], x3
@@ -100,13 +99,48 @@
add w0, w0, #1
str w0, [x1]
subs x7, x7, #1
- cbnz x7, _width_loop8x8
+ cbnz x7, _width_loop8x8_1
add x8, x8, x3
add x4, x4, x6, lsl #1
subs x2, x2, #1
- cbnz x2, _height_loop8x8
+ cbz x2, _SumOf8x8BlockOfFrame_AArch64_neon_end
+_height_loop8x8: // remaining rows: update each 8x8 sum incrementally from the row above
+ mov x7, x6 // x7 = column countdown, restarted from x6 for every row (x6 presumably the width; setup not shown here)
+_width_loop8x8_2:
+ subs x0, x8, x7 // x0 = top-left pixel of the 8x8 block at this column of the current row
+ subs x1, x4, x7, lsl #1 // x1 = address of this block's 16-bit slot in the feature array
+ subs x9, x1, x6, lsl #1 // x9 = slot of the same column one row up (x6 entries earlier)
+ ldrh w10, [x9] // w10 = block sum already stored for the row above
+
+ subs x11, x0, x3 // x11 = pixel row just above this block (x3 presumably the stride; setup not shown here)
+ ld1 {v0.d}[1], [x11] // high half of v0 = 8 pixels of the row leaving the window
+ add x0, x11, x3, lsl #3 // advance 8 rows (x3 << 3): bottom row of the current block
+ ld1 {v0.d}[0], [x0] // low half of v0 = 8 pixels of the row entering the window
+
+ uaddlp v0.8h, v0.16b // widen to u16 pair sums: h[0..3] entering row, h[4..7] leaving row
+ addp v0.8h, v0.8h, v1.8h // h[0..1] entering, h[2..3] leaving; h[4..7] come from v1 and are not read below
+ uaddlp v0.4s, v0.8h // s[0] = entering-row total, s[1] = leaving-row total
+ umov w11, v0.s[0] // w11 = entering-row total
+ umov w12, v0.s[1] // w12 = leaving-row total
+
+ subs w10, w10, w12 // drop the row that left the window
+ mov x0, #0 // NOTE(review): the w0 write on the next line already zeroes the upper half of x0; looks redundant
+ add w0, w10, w11 // w0 = new block sum = previous sum - leaving row + entering row
+ strh w0, [x1] // sum -> pFeatureOfBlock[i]
+ add x1, x5, x0, lsl #2 // x1 = x5 + sum * 4 (x5 presumably pTimesOfFeatureValue; setup not shown here)
+ ldr w0, [x1]
+ add w0, w0, #1 // count one more block with this sum
+ str w0, [x1]
+ subs x7, x7, #1
+ cbnz x7, _width_loop8x8_2
+
+ add x8, x8, x3 // step the picture row pointer down by x3
+ add x4, x4, x6, lsl #1 // step the feature pointer down by one row of x6 16-bit entries
+ subs x2, x2, #1 // x2 = rows still to process
+ cbnz x2, _height_loop8x8
+_SumOf8x8BlockOfFrame_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16BlockOfFrame_AArch64_neon
@@ -119,9 +153,8 @@
add x8, x8, x6
add x4, x4, x6, lsl #1
-_height_loop16x16:
mov x7, x6
-_width_loop16x16:
+_width_loop16x16_1:
subs x0, x8, x7
ld1 {v0.16b}, [x0], x3
uaddlp v0.8h, v0.16b
@@ -141,11 +174,47 @@
add w0, w0, #1
str w0, [x1]
subs x7, x7, #1
- cbnz x7, _width_loop16x16
+ cbnz x7, _width_loop16x16_1
add x8, x8, x3
add x4, x4, x6, lsl #1
subs x2, x2, #1
+ cbz x2, _SumOf16x16BlockOfFrame_AArch64_neon_end
+
+_height_loop16x16: // remaining rows: update each 16x16 sum incrementally from the row above
+ mov x7, x6 // x7 = column countdown, restarted from x6 for every row (x6 presumably the width; setup not shown here)
+_width_loop16x16_2:
+ subs x0, x8, x7 // x0 = top-left pixel of the 16x16 block at this column of the current row
+
+ subs x1, x4, x7, lsl #1 // x1 = address of this block's 16-bit slot in the feature array
+ subs x9, x1, x6, lsl #1 // x9 = slot of the same column one row up (x6 entries earlier)
+ ldrh w10, [x9] // w10 = block sum already stored for the row above
+
+ subs x11, x0, x3 // x11 = pixel row just above this block (x3 presumably the stride; setup not shown here)
+ ld1 {v1.16b}, [x11] // v1 = 16 pixels of the row leaving the window
+ add x0, x11, x3, lsl #4 // advance 16 rows (x3 << 4): bottom row of the current block
+ ld1 {v0.16b}, [x0] // v0 = 16 pixels of the row entering the window
+
+ uaddlv h0, v0.16b // h0 = sum of the 16 entering pixels
+ uaddlv h1, v1.16b // h1 = sum of the 16 leaving pixels
+ umov w11, v0.h[0] // w11 = entering-row total
+ umov w12, v1.h[0] // w12 = leaving-row total
+
+ subs w10, w10, w12 // drop the row that left the window
+ mov x0, #0 // NOTE(review): the w0 write on the next line already zeroes the upper half of x0; looks redundant
+ add w0, w10, w11 // w0 = new block sum = previous sum - leaving row + entering row
+ strh w0, [x1] // sum -> pFeatureOfBlock[i]
+ add x1, x5, x0, lsl #2 // x1 = x5 + sum * 4 (x5 presumably pTimesOfFeatureValue; setup not shown here)
+ ldr w0, [x1]
+ add w0, w0, #1 // count one more block with this sum
+ str w0, [x1]
+ subs x7, x7, #1
+ cbnz x7, _width_loop16x16_2
+
+ add x8, x8, x3 // step the picture row pointer down by x3
+ add x4, x4, x6, lsl #1 // step the feature pointer down by one row of x6 16-bit entries
+ subs x2, x2, #1 // x2 = rows still to process
cbnz x2, _height_loop16x16
+_SumOf16x16BlockOfFrame_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END
#endif
\ No newline at end of file