ref: 010eae8b2d6ad8713df22de7e7603fb30d4d8ceb
parent: ef64567e1d6c5444c92097200680769b67a37da4
author: Martin Storsjö <martin@martin.st>
date: Wed Jan 15 07:41:08 EST 2020
arm64: itx: Fix overflow/clipping in negation in idct16. Don't assume we can do a clipped negation in 16 bit before the multiplication (as it might affect the end result), but do the multiplication first and negate in 32 bit, just like in the reference.
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -1070,9 +1070,12 @@
rshrn_sz v27, v6, v7, #12, \sz // t14a
smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
- neg v29\sz, v29\sz
- smull_smlsl v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
+ smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
rshrn_sz v29, v4, v5, #12, \sz // t13a
+ neg v6.4s, v6.4s
+.ifc \sz, .8h
+ neg v7.4s, v7.4s
+.endif
rshrn_sz v23, v6, v7, #12, \sz // t10a
sqsub v2\sz, v17\sz, v19\sz // t11a