ref: 679754e77dd809a607fbd057109be59e7039bd7d
parent: ce6f1f77eb29bf96f946e9529ca5caa29a898f95
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Fri Oct 19 06:28:54 EDT 2018
Fix wiener overflow (#79).
--- a/src/x86/looprestoration.asm
+++ b/src/x86/looprestoration.asm
@@ -36,7 +36,6 @@
pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
pb_15: times 16 db 15
-pw_128: times 2 dw 128
pw_2048: times 2 dw 2048
pw_16380: times 2 dw 16380
pw_0_128: dw 0, 128
@@ -50,8 +49,6 @@
vpbroadcastb m14, [fhq+2]
vpbroadcastb m13, [fhq+4]
vpbroadcastw m12, [fhq+6]
- vpbroadcastd m9, [pw_128]
- paddw m12, m9
vpbroadcastd m11, [pw_2048]
vpbroadcastd m10, [pw_16380]
lea r11, [pb_right_ext_mask]
@@ -153,17 +150,20 @@
pmaddubsw m3, m14
pmaddubsw m7, m13
pmaddubsw m4, m13
- pmullw m6, m12
- pmullw m5, m12
- ; note that m6/5 are unsigned here, whereas the others are signed
- psubw m0, m10
- psubw m2, m10
paddw m0, m8
paddw m2, m3
+ psllw m8, m6, 7
+ psllw m3, m5, 7
+ psubw m8, m10
+ psubw m3, m10
+ pmullw m6, m12
+ pmullw m5, m12
paddw m0, m7
paddw m2, m4
paddw m0, m6
paddw m2, m5
+ paddsw m0, m8
+ paddsw m2, m3
psraw m0, 3
psraw m2, 3
paddw m0, m11