shithub: dav1d

Download patch

ref: 010eae8b2d6ad8713df22de7e7603fb30d4d8ceb
parent: ef64567e1d6c5444c92097200680769b67a37da4
author: Martin Storsjö <martin@martin.st>
date: Wed Jan 15 07:41:08 EST 2020

arm64: itx: Fix overflow/clipping in negation in idct16

Don't assume we can do a clipped negation in 16 bits before the
multiplication (as that might affect the end result); instead, do the
multiplication first and negate in 32 bits, just like in the reference.

--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -1070,9 +1070,12 @@
         rshrn_sz        v27, v6,  v7,  #12, \sz                   // t14a
 
         smull_smlsl     v4,  v5,  v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
-        neg             v29\sz,  v29\sz
-        smull_smlsl     v6,  v7,  v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
+        smull_smlal     v6,  v7,  v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
         rshrn_sz        v29, v4,  v5,  #12, \sz                   // t13a
+        neg             v6.4s,   v6.4s
+.ifc \sz, .8h
+        neg             v7.4s,   v7.4s
+.endif
         rshrn_sz        v23, v6,  v7,  #12, \sz                   // t10a
 
         sqsub           v2\sz,   v17\sz,  v19\sz  // t11a