ref: b704a993f61b1b07b1f3ac478935992239383084
parent: 5ab6d23190edd767d98ef565398aba9938aa6afb
author: B Krishnan Iyer <krishnaniyer97@gmail.com>
date: Mon Jul 22 17:08:57 EDT 2019
arm: mc: neon: Use vld with ! post-increment instead of a register in blend/blend_h/blend_v functions

                            A73                A53
                        Current Earlier   Current Earlier
blend_h_w2_8bpc_neon:      74.1    74.6     137.5   137
blend_h_w4_8bpc_neon:      65.8    66       147.1   146.6
blend_h_w8_8bpc_neon:      68.7    68.6     131.7   131.2
blend_h_w16_8bpc_neon:     85.6    85.9     190.4   192
blend_h_w32_8bpc_neon:    149.8   149.8     358.3   357.6
blend_h_w64_8bpc_neon:    264.1   262.8     630.3   629.5
blend_h_w128_8bpc_neon:   575.4   577      1404.2  1402
blend_v_w2_8bpc_neon:     120.1   121.3     196.4   195.5
blend_v_w4_8bpc_neon:     247.2   247.5     358.4   358.5
blend_v_w8_8bpc_neon:     204.2   205.2     358.4   358.5
blend_v_w16_8bpc_neon:    238.5   237.1     591.8   590.5
blend_v_w32_8bpc_neon:    347.2   345.8     997.2   994.1
blend_w4_8bpc_neon:        38.3    38.6      98.7    99.2
blend_w8_8bpc_neon:        54.8    55.1     125.3   125.8
blend_w16_8bpc_neon:      150.8   150.1     334.5   344
blend_w32_8bpc_neon:      361.6   360.4     910.7   910.9
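(Not part of the patch: a minimal sketch of the two NEON addressing modes this change swaps, using r2 as the source pointer and r3 as the stride, as in the code below.)

    @ Register post-increment: after the load completes,
    @ r2 += r3 (the stride held in a register).
    vld1.32         {d1[]},  [r2],  r3

    @ "!" post-increment: after the load completes, r2 advances by the
    @ number of bytes transferred (4 here); no stride register is read.
    vld1.32         {d1[]},  [r2]!

The two forms advance the pointer by the same amount whenever the stride equals the number of bytes loaded per row, i.e. when the rows are stored contiguously, which is what the patch relies on for the buffers read here. Where a row is loaded in pieces (the w2 case in blend_v), an explicit add r2, r2, #2 keeps the pointer in step.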
--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -472,12 +472,12 @@
add r12, r0, r1
lsl r1, r1, #1
4:
- vld1.32 {d2[]}, [r5], r3
- vld1.32 {d1[]}, [r2], r3
+ vld1.32 {d2[]}, [r5]!
+ vld1.32 {d1[]}, [r2]!
vld1.32 {d0[]}, [r0]
subs r4, r4, #2
- vld1.32 {d2[1]}, [r5], r3
- vld1.32 {d1[1]}, [r2], r3
+ vld1.32 {d2[1]}, [r5]!
+ vld1.32 {d1[1]}, [r2]!
vld1.32 {d0[1]}, [r12]
vsub.i8 d3, d22, d2
vmull.u8 q8, d1, d2
@@ -492,12 +492,12 @@
add r12, r0, r1
lsl r1, r1, #1
8:
- vld1.u8 {d2}, [r5], r3
- vld1.u8 {d4}, [r2], r3
+ vld1.u8 {d2}, [r5]!
+ vld1.u8 {d4}, [r2]!
vld1.u8 {d0}, [r0]
vsub.i8 d17, d16, d2
- vld1.u8 {d3}, [r5], r3
- vld1.u8 {d5}, [r2], r3
+ vld1.u8 {d3}, [r5]!
+ vld1.u8 {d5}, [r2]!
vld1.u8 {d1}, [r12]
subs r4, r4, #2
vsub.i8 d18, d16, d3
@@ -516,13 +516,13 @@
add r12, r0, r1
lsl r1, r1, #1
16:
- vld1.u8 {q2}, [r5], r3
- vld1.u8 {q1}, [r2], r3
+ vld1.u8 {q2}, [r5]!
+ vld1.u8 {q1}, [r2]!
vld1.u8 {q0}, [r0]
subs r4, r4, #2
vsub.i8 q11, q12, q2
- vld1.u8 {q15}, [r5], r3
- vld1.u8 {q14}, [r2], r3
+ vld1.u8 {q15}, [r5]!
+ vld1.u8 {q14}, [r2]!
vld1.u8 {q13}, [r12]
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d22
@@ -545,8 +545,8 @@
320:
vmov.i8 q10, #64
32:
- vld1.u8 {q2, q3}, [r5], r3
- vld1.u8 {q8, q9}, [r2], r3
+ vld1.u8 {q2, q3}, [r5]!
+ vld1.u8 {q8, q9}, [r2]!
vld1.u8 {q0, q1}, [r0]
subs r4, r4, #1
vsub.i8 q11, q10, q2
@@ -596,11 +596,11 @@
lsl r1, r1, #1
2:
vld1.16 {d2[], d3[]}, [r5]!
- vld1.16 {d1[]}, [r2], r3
+ vld1.16 {d1[]}, [r2]!
subs r4, r4, #2
vld1.16 {d0[]}, [r0]
vzip.8 d2, d3
- vld1.16 {d1[1]}, [r2], r3
+ vld1.16 {d1[1]}, [r2]!
vsub.i8 d4, d22, d2
vld1.16 {d0[1]}, [r12]
vmull.u8 q8, d1, d2
@@ -616,10 +616,10 @@
lsl r1, r1, #1
4:
vld1.u8 {d2[]}, [r5]!
- vld1.32 {d1[]}, [r2], r3
+ vld1.32 {d1[]}, [r2]!
subs r4, r4, #2
vld1.u8 {d6[]}, [r5]!
- vld1.32 {d1[1]}, [r2], r3
+ vld1.32 {d1[1]}, [r2]!
vext.u8 d2, d2, d6, #4
vld1.32 {d0[]}, [r0]
vsub.i8 d3, d22, d2
@@ -637,11 +637,11 @@
lsl r1, r1, #1
8:
vld1.u8 {d2[]}, [r5]!
- vld1.u8 {d4}, [r2], r3
+ vld1.u8 {d4}, [r2]!
vld1.u8 {d0}, [r0]
vsub.i8 d17, d16, d2
vld1.u8 {d3[]}, [r5]!
- vld1.u8 {d5}, [r2], r3
+ vld1.u8 {d5}, [r2]!
vld1.u8 {d1}, [r12]
subs r4, r4, #2
vsub.i8 d18, d16, d3
@@ -661,12 +661,12 @@
lsl r1, r1, #1
16:
vld1.u8 {d4[]}, [r5]!
- vld1.u8 {q1}, [r2], r3
+ vld1.u8 {q1}, [r2]!
vsub.i8 d5, d24, d4
vld1.u8 {q0}, [r0]
subs r4, r4, #2
vld1.u8 {d30[]}, [r5]!
- vld1.u8 {q14}, [r2], r3
+ vld1.u8 {q14}, [r2]!
vsub.i8 d31, d24, d30
vld1.u8 {q13}, [r12]
vmull.u8 q3, d2, d4
@@ -744,14 +744,15 @@
lsl r1, r1, #1
vsub.i8 d3, d22, d2
2:
- vld1.8 {d1[]}, [r2], r3
+ vld1.16 {d1[]}, [r2]!
vld1.8 {d0[]}, [r0]
subs r4, r4, #2
- vld1.8 {d1[1]}, [r2], r3
+ vld1.8 {d1[1]}, [r2]
vld1.8 {d0[1]}, [r12]
vmull.u8 q2, d1, d2
vmlal.u8 q2, d0, d3
vrshrn.i16 d6, q2, #6
+ add r2, r2, #2
vst1.8 {d6[0]}, [r0], r1
vst1.8 {d6[1]}, [r12], r1
bgt 2b
@@ -764,9 +765,9 @@
vsub.i8 d5, d22, d4
sub r1, r1, #3
4:
- vld1.32 {d2[]}, [r2], r3
+ vld1.32 {d2[]}, [r2]!
vld1.32 {d0[]}, [r0]
- vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d2[1]}, [r2]!
vld1.32 {d0[1]}, [r12]
subs r4, r4, #2
vmull.u8 q3, d2, d4
@@ -788,9 +789,9 @@
vsub.i8 d17, d16, d2
sub r1, r1, #6
8:
- vld1.u8 {d4}, [r2], r3
+ vld1.u8 {d4}, [r2]!
vld1.u8 {d0}, [r0]
- vld1.u8 {d5}, [r2], r3
+ vld1.u8 {d5}, [r2]!
vld1.u8 {d1}, [r12]
subs r4, r4, #2
vmull.u8 q3, d2, d4
@@ -815,10 +816,10 @@
vsub.i8 q11, q12, q2
sub r1, r1, #12
16:
- vld1.u8 {q1}, [r2], r3
+ vld1.u8 {q1}, [r2]!
vld1.u8 {q0}, [r0]
subs r4, r4, #2
- vld1.u8 {q14}, [r2], r3
+ vld1.u8 {q14}, [r2]!
vld1.u8 {q13}, [r12]
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d22
@@ -846,7 +847,7 @@
vsub.i8 q11, q10, q2
vsub.i8 q12, q10, q3
32:
- vld1.u8 {q8, q9}, [r2], r3
+ vld1.u8 {q8, q9}, [r2]!
vld1.u8 {q0, q1}, [r0]
subs r4, r4, #1
vmull.u8 q15, d16, d4