ref: 65d9599807330af74033b50f3063b2bafd539995
parent: 5df6c0458555dd03fd5796e9d9342d1394ade446
	author: James Zern <jzern@google.com>
	date: Wed Jun  3 14:51:13 EDT 2015
	
vp9_reconintra_neon_asm/tm4x4: simplify left load
use vld1.8 {d0[]}, [r0] rather than ldrb+vdup; mildly faster
Change-Id: Ia5ffc736bcb0f5497b7d9e55a93bf5a5f5f6928c
--- a/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm
@@ -298,8 +298,7 @@
|vp9_tm_predictor_4x4_neon| PROC
; Load ytop_left = above[-1];
sub r12, r2, #1
- ldrb r12, [r12]
- vdup.u8 d0, r12
+    vld1.u8             {d0[]}, [r12]

; Load above 4 pixels
     vld1.32             {d2[0]}, [r2]
@@ -309,10 +308,10 @@

; Load left row by row and compute left + (above - ytop_left)
; 1st row and 2nd row
- ldrb r12, [r3], #1
- ldrb r2, [r3], #1
- vdup.u16 q1, r12
- vdup.u16 q2, r2
+    vld1.u8             {d2[]}, [r3]!
+    vld1.u8             {d4[]}, [r3]!
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
vadd.s16 q1, q1, q3
vadd.s16 q2, q2, q3
vqmovun.s16 d0, q1
@@ -321,10 +320,10 @@
     vst1.32             {d1[0]}, [r0], r1

; 3rd row and 4th row
- ldrb r12, [r3], #1
- ldrb r2, [r3], #1
- vdup.u16 q1, r12
- vdup.u16 q2, r2
+    vld1.u8             {d2[]}, [r3]!
+    vld1.u8             {d4[]}, [r3]
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
vadd.s16 q1, q1, q3
vadd.s16 q2, q2, q3
vqmovun.s16 d0, q1
--
⑨