shithub: dav1d

Download patch

ref: b4291523d926658c68c29f7f5d2b270e19ed39c2
parent: 8fd0bc90ba4ce34e62eea90b9df7c105cccf2886
author: Martin Storsjö <martin@martin.st>
date: Fri Jun 26 20:37:52 EDT 2020

arm32: ipred: Optimize ipred_dc_w32

Do the horizontal summing in the same way as for other cases of
32 pixel summing.

This doesn't seem to affect the runtime significantly (checkasm
benchmarks vary by a couple of cycles), but it's 3 instructions shorter
at least (5 removed, 2 added).

--- a/src/arm/32/ipred.S
+++ b/src/arm/32/ipred.S
@@ -718,16 +718,13 @@
         add             r2,  r2,  #1
         vld1.8          {d2,  d3,  d4,  d5},  [r2]
         vadd.s16        d0,  d0,  d30
-        vaddl.u8        q2,  d4,  d5
-        vadd.u16        d4,  d4,  d5
         vaddl.u8        q1,  d2,  d3
+        vaddl.u8        q2,  d4,  d5
+        vadd.u16        q1,  q1,  q2
         vadd.u16        d2,  d2,  d3
-        vpadd.u16       d4,  d4
         vpadd.u16       d2,  d2
-        vpadd.u16       d4,  d4
         vpadd.u16       d2,  d2
         cmp             r4,  #32
-        vadd.s16        d0,  d0,  d4
         vadd.s16        d0,  d0,  d2
         vshl.u16        d4,  d0,  d28
         beq             1f