ref: 52e9b4353f968fd27e2bd912b0e2302509063068
parent: a7f6fe32989ed5c13cc588f17db59b747d4a5fd5
	author: Martin Storsjö <martin@martin.st>
	date: Wed Mar  4 05:51:50 EST 2020
	
arm: mc: Optimize blend_v

Use a post-increment with a register on the last increment, avoiding
a separate increment.

Avoid processing the last 8 pixels in the w32 case, when we only
output 24 pixels.

Before:          ARM32
                        Cortex A7      A8      A9     A53     A72     A73
blend_v_w4_8bpc_neon:       450.4   574.7   538.7   374.6   199.3   260.5
blend_v_w8_8bpc_neon:       559.6   351.3   552.5   357.6   214.8   204.3
blend_v_w16_8bpc_neon:      926.3   511.6   787.9   593.0   271.0   246.8
blend_v_w32_8bpc_neon:     1482.5   917.0  1149.5   991.9   354.0   368.9
                 ARM64
blend_v_w4_8bpc_neon:                               351.1   200.0   224.1
blend_v_w8_8bpc_neon:                               333.0   212.4   203.8
blend_v_w16_8bpc_neon:                              495.2   302.0   247.0
blend_v_w32_8bpc_neon:                              840.0   557.8   514.0
After:           ARM32
blend_v_w4_8bpc_neon:       435.5   575.8   537.6   356.2   198.3   259.5
blend_v_w8_8bpc_neon:       545.2   347.9   553.5   339.1   207.8   204.2
blend_v_w16_8bpc_neon:      913.7   511.0   788.1   573.7   275.4   243.3
blend_v_w32_8bpc_neon:     1445.3   951.2  1079.1   920.4   352.2   361.6
                 ARM64
blend_v_w4_8bpc_neon:                               333.0   191.3   225.9
blend_v_w8_8bpc_neon:                               314.9   199.3   203.5
blend_v_w16_8bpc_neon:                              476.9   301.3   241.1
blend_v_w32_8bpc_neon:                              766.9   432.8   416.9
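(Illustrative sketch, not part of the patch itself: for the w4 case the
store sequence goes from an immediate post-increment plus a separate add
to the register post-index form of st1, with the stride bias in x1
reduced by the byte that is no longer advanced separately; the same
reshaping applies to the other widths, and the w32 case additionally
drops the multiply/shift work for the 8 pixels that are never stored.)

        // before: last byte stored with an immediate post-increment,
        // then the pre-biased row stride applied with a separate add
        st1             {v5.b}[2],   [x0],  #1
        add             x0,  x0,  x1            // x1 = 2*stride - 3

        // after: the register post-index form advances x0 in the store
        st1             {v5.b}[2],   [x0],  x1  // x1 = 2*stride - 2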
--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -753,7 +753,7 @@
         add             r12, r0,  r1
         lsl             r1,  r1,  #1
         vsub.i8         d5,  d22, d4
-        sub             r1,  r1,  #3
+        sub             r1,  r1,  #2
 4:
         vld1.u8         {d2},     [r2,  :64]!
         vld1.32         {d0[]},   [r0,  :32]
@@ -764,10 +764,8 @@
         vrshrn.i16      d20, q3,  #6
         vst1.16         {d20[0]}, [r0,  :16]!
         vst1.16         {d20[2]}, [r12, :16]!
-        vst1.8          {d20[2]}, [r0]!
-        vst1.8          {d20[6]}, [r12]!
-        add             r0,  r0,  r1
-        add             r12, r12, r1
+        vst1.8          {d20[2]}, [r0],  r1
+        vst1.8          {d20[6]}, [r12], r1
         bgt             4b
         pop             {r4-r5,pc}
 80:
@@ -776,7 +774,7 @@
         add             r12, r0,  r1
         lsl             r1,  r1,  #1
         vsub.i8         d17, d16, d2
-        sub             r1,  r1,  #6
+        sub             r1,  r1,  #4
 8:
         vld1.u8         {d4,  d5},  [r2,  :128]!
         vld1.u8         {d0},  [r0,  :64]
@@ -790,10 +788,8 @@
         vrshrn.i16      d23, q10, #6
         vst1.32         {d22[0]}, [r0,  :32]!
         vst1.32         {d23[0]}, [r12, :32]!
-        vst1.16         {d22[2]}, [r0,  :16]!
-        vst1.16         {d23[2]}, [r12, :16]!
-        add             r0,  r0,  r1
-        add             r12, r12, r1
+        vst1.16         {d22[2]}, [r0,  :16], r1
+        vst1.16         {d23[2]}, [r12, :16], r1
         bgt             8b
         pop             {r4-r5,pc}
 160:
@@ -802,7 +798,7 @@
         add             r12, r0,  r1
         lsl             r1,  r1,  #1
         vsub.i8         q11, q12, q14
-        sub             r1,  r1,  #12
+        sub             r1,  r1,  #8
 16:
         vld1.u8         {q1,  q2},  [r2,  :128]!
         vld1.u8         {q0},  [r0,  :128]
@@ -822,10 +818,8 @@
         vrshrn.i16      d21, q8,  #6
         vst1.u8         {d18},    [r0,  :64]!
         vst1.u8         {d20},    [r12, :64]!
-        vst1.32         {d19[0]}, [r0,  :32]!
-        vst1.32         {d21[0]}, [r12, :32]!
-        add             r0,  r0,  r1
-        add             r12, r12, r1
+        vst1.32         {d19[0]}, [r0,  :32], r1
+        vst1.32         {d21[0]}, [r12, :32], r1
         bgt             16b
         pop             {r4-r5,pc}
 320:
@@ -832,10 +826,10 @@
         vmov.i8         q10, #64
         vld1.u8         {q2,  q3},  [r5,  :128]
         vsub.i8         q11, q10, q2
-        vsub.i8         q12, q10, q3
+        vsub.i8         d24, d20, d6
 32:
         vld1.u8         {q8,  q9},  [r2,  :128]!
-        vld1.u8         {q0,  q1},  [r0,  :128]
+        vld1.u8         {d0,  d1,  d2},  [r0,  :64]
         subs            r4,  r4,  #1
         vmull.u8        q15, d16, d4
         vmlal.u8        q15, d0,  d22
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -709,8 +709,8 @@
         ret
 40:
         ld1r            {v0.2s},   [x5]
+        sub             x1,  x1,  #2
         sub             v1.8b,   v4.8b,   v0.8b
-        sub             x1,  x1,  #3
 4:
         ld1             {v2.8b},   [x2],  #8
         ld1             {v3.s}[0],   [x0]
@@ -721,16 +721,14 @@
         rshrn           v5.8b,   v5.8h,   #6
         st1             {v5.h}[0],   [x0],  #2
         st1             {v5.h}[2],   [x8],  #2
-        st1             {v5.b}[2],   [x0],  #1
-        st1             {v5.b}[6],   [x8],  #1
-        add             x0,  x0,  x1
-        add             x8,  x8,  x1
+        st1             {v5.b}[2],   [x0],  x1
+        st1             {v5.b}[6],   [x8],  x1
         b.gt            4b
         ret
 80:
         ld1r            {v0.2d},   [x5]
+        sub             x1,  x1,  #4
         sub             v1.16b,  v4.16b,  v0.16b
-        sub             x1,  x1,  #6
 8:
         ld1             {v2.16b},  [x2],  #16
         ld1             {v3.d}[0],   [x0]
@@ -744,16 +742,14 @@
         rshrn2          v7.16b,  v6.8h,   #6
         st1             {v7.s}[0],   [x0],  #4
         st1             {v7.s}[2],   [x8],  #4
-        st1             {v7.h}[2],   [x0],  #2
-        st1             {v7.h}[6],   [x8],  #2
-        add             x0,  x0,  x1
-        add             x8,  x8,  x1
+        st1             {v7.h}[2],   [x0],  x1
+        st1             {v7.h}[6],   [x8],  x1
         b.gt            8b
         ret
 160:
         ld1             {v0.16b},  [x5]
+        sub             x1,  x1,  #8
         sub             v2.16b,  v4.16b,  v0.16b
-        sub             x1,  x1,  #12
 16:
         ld1             {v5.16b,  v6.16b},  [x2],  #32
         ld1             {v7.16b},  [x0]
@@ -773,17 +769,15 @@
         rshrn2          v22.16b, v21.8h,  #6
         st1             {v19.8b},  [x0],  #8
         st1             {v22.8b},  [x8],  #8
-        st1             {v19.s}[2],  [x0],  #4
-        st1             {v22.s}[2],  [x8],  #4
-        add             x0,  x0,  x1
-        add             x8,  x8,  x1
+        st1             {v19.s}[2],  [x0],  x1
+        st1             {v22.s}[2],  [x8],  x1
         b.gt            16b
         ret
 320:
         ld1             {v0.16b,  v1.16b},  [x5]
+        sub             x1,  x1,  #16
         sub             v2.16b,  v4.16b,  v0.16b
-        sub             v3.16b,  v4.16b,  v1.16b
-        sub             x1,  x1,  #24
+        sub             v3.8b,   v4.8b,   v1.8b
 32:
         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
         ld1             {v5.16b,  v6.16b},  [x0]
@@ -795,8 +789,6 @@
         umlal2          v23.8h,  v5.16b,  v2.16b
         umull           v28.8h,  v17.8b,  v1.8b
         umlal           v28.8h,  v6.8b,   v3.8b
-        umull2          v29.8h,  v17.16b, v1.16b
-        umlal2          v29.8h,  v6.16b,  v3.16b
         umull           v30.8h,  v18.8b,  v0.8b
         umlal           v30.8h,  v20.8b,  v2.8b
         umull2          v31.8h,  v18.16b, v0.16b
@@ -803,22 +795,16 @@
         umlal2          v31.8h,  v20.16b, v2.16b
         umull           v25.8h,  v19.8b,  v1.8b
         umlal           v25.8h,  v21.8b,  v3.8b
-        umull2          v26.8h,  v19.16b, v1.16b
-        umlal2          v26.8h,  v21.16b, v3.16b
         rshrn           v24.8b,  v22.8h,  #6
         rshrn2          v24.16b, v23.8h,  #6
         rshrn           v28.8b,  v28.8h,  #6
-        rshrn2          v28.16b, v29.8h,  #6
         rshrn           v30.8b,  v30.8h,  #6
         rshrn2          v30.16b, v31.8h,  #6
         rshrn           v27.8b,  v25.8h,  #6
-        rshrn2          v27.16b, v26.8h,  #6
         st1             {v24.16b}, [x0],  #16
         st1             {v30.16b}, [x8],  #16
-        st1             {v28.8b},  [x0],  #8
-        st1             {v27.8b},  [x8],  #8
-        add             x0,  x0,  x1
-        add             x8,  x8,  x1
+        st1             {v28.8b},  [x0],  x1
+        st1             {v27.8b},  [x8],  x1
         b.gt            32b
         ret
 L(blend_v_tbl):
--