ref: d4df861993010586fdf61794f12ae923891872ac
parent: b704a993f61b1b07b1f3ac478935992239383084
author: B Krishnan Iyer <krishnaniyer97@gmail.com>
date: Tue Jul 23 12:07:11 EDT 2019
arm: mc: neon: Reduce usage of general purpose registers in blend/blend_v functions

                                A73               A53
                          Current  Earlier  Current  Earlier
blend_h_w2_8bpc_neon:        74.1     74.1    137.5    137.5
blend_h_w4_8bpc_neon:        65.8     65.8    147.1    147.1
blend_h_w8_8bpc_neon:        68.9     68.7    131.7    131.7
blend_h_w16_8bpc_neon:       86       85.6    190.3    190.4
blend_h_w32_8bpc_neon:      149.2    149.8    358      358.3
blend_h_w64_8bpc_neon:      263.1    264.1    629.8    630.3
blend_h_w128_8bpc_neon:     571      575.4   1404.5   1404.2
blend_v_w2_8bpc_neon:       118.7    120.1    195.3    196.4
blend_v_w4_8bpc_neon:       245.8    247.2    357.3    358.4
blend_v_w8_8bpc_neon:       202      204.2    357.2    358.4
blend_v_w16_8bpc_neon:      234.8    238.5    591.3    591.8
blend_v_w32_8bpc_neon:      344.4    347.2    994.7    997.2
blend_w4_8bpc_neon:          37.5     38.3     96.7     98.7
blend_w8_8bpc_neon:          53       54.8    123.3    125.3
blend_w16_8bpc_neon:        151      150.8    332.4    334.5
blend_w32_8bpc_neon:        370.9    361.6    908.4    910.7
--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -451,15 +451,15 @@
function blend_8bpc_neon, export=1
- push {r4-r8,lr}
- ldr r4, [sp, #24]
- ldr r5, [sp, #28]
- clz r6, r3
- adr r7, L(blend_tbl)
- sub r6, r6, #26
- ldr r6, [r7, r6, lsl #2]
- add r7, r7, r6
- bx r7
+ push {r4-r5,lr}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ clz lr, r3
+ adr r3, L(blend_tbl)
+ sub lr, lr, #26
+ ldr lr, [r3, lr, lsl #2]
+ add r3, r3, lr
+ bx r3
.align 2
L(blend_tbl):
.word 320f - L(blend_tbl) + CONFIG_THUMB
@@ -486,7 +486,7 @@
vst1.32 {d20[0]}, [r0], r1
vst1.32 {d20[1]}, [r12], r1
bgt 4b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
80:
vmov.i8 d16, #64
add r12, r0, r1
@@ -510,7 +510,7 @@
vst1.u8 {d22}, [r0], r1
vst1.u8 {d23}, [r12], r1
bgt 8b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
160:
vmov.i8 q12, #64
add r12, r0, r1
@@ -540,8 +540,7 @@
vst1.u8 {q9}, [r0], r1
vst1.u8 {q10}, [r12], r1
bgt 16b
- pop {r4-r8,pc}
-
+ pop {r4-r5,pc}
320:
vmov.i8 q10, #64
32:
@@ -565,7 +564,7 @@
vrshrn.i16 d27, q14, #6
vst1.u8 {q12, q13}, [r0], r1
bgt 32b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
endfunc
function blend_h_8bpc_neon, export=1
@@ -719,16 +718,16 @@
endfunc
function blend_v_8bpc_neon, export=1
- push {r4-r8,lr}
- ldr r4, [sp, #24]
+ push {r4-r5,lr}
+ ldr r4, [sp, #12]
movrel r5, X(obmc_masks)
add r5, r5, r3
- clz r8, r3
- adr r7, L(blend_v_tbl)
- sub r8, r8, #26
- ldr r8, [r7, r8, lsl #2]
- add r7, r7, r8
- bx r7
+ clz lr, r3
+ adr r3, L(blend_v_tbl)
+ sub lr, lr, #26
+ ldr lr, [r3, lr, lsl #2]
+ add r3, r3, lr
+ bx r3
.align 2
L(blend_v_tbl):
.word 320f - L(blend_v_tbl) + CONFIG_THUMB
@@ -756,7 +755,7 @@
vst1.8 {d6[0]}, [r0], r1
vst1.8 {d6[1]}, [r12], r1
bgt 2b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
40:
vmov.i8 d22, #64
vld1.32 {d4[]}, [r5]
@@ -780,7 +779,7 @@
add r0, r0, r1
add r12, r12, r1
bgt 4b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
80:
vmov.i8 d16, #64
vld1.u8 {d2}, [r5]
@@ -807,7 +806,7 @@
add r0, r0, r1
add r12, r12, r1
bgt 8b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
160:
vmov.i8 q12, #64
vld1.u8 {q2}, [r5]
@@ -840,7 +839,7 @@
add r0, r0, r1
add r12, r12, r1
bgt 16b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
320:
vmov.i8 q10, #64
vld1.u8 {q2, q3}, [r5]
@@ -861,7 +860,7 @@
vrshrn.i16 d2, q15, #6
vst1.u8 {d0, d1, d2}, [r0], r1
bgt 32b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
endfunc