shithub: dav1d

Download patch

ref: d4df861993010586fdf61794f12ae923891872ac
parent: b704a993f61b1b07b1f3ac478935992239383084
author: B Krishnan Iyer <krishnaniyer97@gmail.com>
date: Tue Jul 23 12:07:11 EDT 2019

arm: mc: neon: Reduce usage of general purpose registers in blend/blend_v functions

                	A73		A53
                	Current	Earlier	Current	Earlier
blend_h_w2_8bpc_neon:	74.1	74.1	137.5	137.5
blend_h_w4_8bpc_neon:	65.8	65.8	147.1	147.1
blend_h_w8_8bpc_neon:	68.9	68.7	131.7	131.7
blend_h_w16_8bpc_neon:	86	85.6	190.3	190.4
blend_h_w32_8bpc_neon:	149.2	149.8	358	358.3
blend_h_w64_8bpc_neon:	263.1	264.1	629.8	630.3
blend_h_w128_8bpc_neon:	571	575.4	1404.5	1404.2
blend_v_w2_8bpc_neon:	118.7	120.1	195.3	196.4
blend_v_w4_8bpc_neon:	245.8	247.2	357.3	358.4
blend_v_w8_8bpc_neon:	202	204.2	357.2	358.4
blend_v_w16_8bpc_neon:	234.8	238.5	591.3	591.8
blend_v_w32_8bpc_neon:	344.4	347.2	994.7	997.2
blend_w4_8bpc_neon:	37.5	38.3	96.7	98.7
blend_w8_8bpc_neon:	53	54.8	123.3	125.3
blend_w16_8bpc_neon:	151	150.8	332.4	334.5
blend_w32_8bpc_neon:	370.9	361.6	908.4	910.7

--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -451,15 +451,15 @@
 
 
 function blend_8bpc_neon, export=1
-        push            {r4-r8,lr}
-        ldr             r4,  [sp, #24]
-        ldr             r5,  [sp, #28]
-        clz             r6,  r3
-        adr             r7,  L(blend_tbl)
-        sub             r6,  r6,  #26
-        ldr             r6,  [r7, r6, lsl #2]
-        add             r7,  r7,  r6
-        bx              r7
+        push            {r4-r5,lr}
+        ldr             r4,  [sp, #12]
+        ldr             r5,  [sp, #16]
+        clz             lr,  r3
+        adr             r3,  L(blend_tbl)
+        sub             lr,  lr,  #26
+        ldr             lr,  [r3, lr, lsl #2]
+        add             r3,  r3,  lr
+        bx              r3
         .align 2
 L(blend_tbl):
         .word 320f  - L(blend_tbl) + CONFIG_THUMB
@@ -486,7 +486,7 @@
         vst1.32         {d20[0]}, [r0],  r1
         vst1.32         {d20[1]}, [r12], r1
         bgt             4b
-        pop             {r4-r8,pc}
+        pop             {r4-r5,pc}
 80:
         vmov.i8         d16, #64
         add             r12, r0,  r1
@@ -510,7 +510,7 @@
         vst1.u8         {d22}, [r0],  r1
         vst1.u8         {d23}, [r12], r1
         bgt             8b
-        pop             {r4-r8,pc}
+        pop             {r4-r5,pc}
 160:
         vmov.i8         q12, #64
         add             r12, r0,  r1
@@ -540,8 +540,7 @@
         vst1.u8         {q9},  [r0],  r1
         vst1.u8         {q10}, [r12], r1
         bgt             16b
-        pop             {r4-r8,pc}
-
+        pop             {r4-r5,pc}
 320:
         vmov.i8         q10, #64
 32:
@@ -565,7 +564,7 @@
         vrshrn.i16      d27, q14, #6
         vst1.u8         {q12, q13}, [r0],  r1
         bgt             32b
-        pop             {r4-r8,pc}
+        pop             {r4-r5,pc}
 endfunc
 
 function blend_h_8bpc_neon, export=1
@@ -719,16 +718,16 @@
 endfunc
 
 function blend_v_8bpc_neon, export=1
-        push            {r4-r8,lr}
-        ldr             r4,  [sp, #24]
+        push            {r4-r5,lr}
+        ldr             r4,  [sp, #12]
         movrel          r5,  X(obmc_masks)
         add             r5,  r5,  r3
-        clz             r8,  r3
-        adr             r7,  L(blend_v_tbl)
-        sub             r8,  r8,  #26
-        ldr             r8,  [r7, r8, lsl #2]
-        add             r7,  r7,  r8
-        bx              r7
+        clz             lr,  r3
+        adr             r3,  L(blend_v_tbl)
+        sub             lr,  lr,  #26
+        ldr             lr,  [r3, lr, lsl #2]
+        add             r3,  r3,  lr
+        bx              r3
         .align 2
 L(blend_v_tbl):
         .word 320f  - L(blend_v_tbl) + CONFIG_THUMB
@@ -756,7 +755,7 @@
         vst1.8          {d6[0]}, [r0],  r1
         vst1.8          {d6[1]}, [r12], r1
         bgt             2b
-        pop             {r4-r8,pc}
+        pop             {r4-r5,pc}
 40:
         vmov.i8         d22, #64
         vld1.32         {d4[]},  [r5]
@@ -780,7 +779,7 @@
         add             r0,  r0,  r1
         add             r12, r12, r1
         bgt             4b
-        pop             {r4-r8,pc}
+        pop             {r4-r5,pc}
 80:
         vmov.i8         d16, #64
         vld1.u8         {d2}, [r5]
@@ -807,7 +806,7 @@
         add             r0,  r0,  r1
         add             r12, r12, r1
         bgt             8b
-        pop             {r4-r8,pc}
+        pop             {r4-r5,pc}
 160:
         vmov.i8         q12, #64
         vld1.u8         {q2},  [r5]
@@ -840,7 +839,7 @@
         add             r0,  r0,  r1
         add             r12, r12, r1
         bgt             16b
-        pop             {r4-r8,pc}
+        pop             {r4-r5,pc}
 320:
         vmov.i8         q10, #64
         vld1.u8         {q2, q3},  [r5]
@@ -861,7 +860,7 @@
         vrshrn.i16      d2,  q15, #6
         vst1.u8         {d0, d1, d2}, [r0], r1
         bgt             32b
-        pop             {r4-r8,pc}
+        pop             {r4-r5,pc}
 endfunc