ref: 8394990b2749608ea710a9fbfe82bb4bba1529c9
parent: ef5918098d5c7f8ffda960274e3f8e38f02cb487
author: Kyle Siefring <kylesiefring@gmail.com>
date: Mon May 1 05:15:29 EDT 2017
block error sse2: sum in 32 bits when possible. Add 31-bit pairs before unpacking in the x86 block error code. BUG=webm:1210 Change-Id: I5ca8c7f7775585a17fe09d6bbfc25e1f2955eb0a
--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -39,23 +39,18 @@
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
+ ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
+ paddd m2, m3
; accumulate in 64bit
punpckldq m7, m0, m5
punpckhdq m0, m5
paddq m4, m7
- punpckldq m7, m1, m5
- paddq m4, m0
- punpckhdq m1, m5
- paddq m4, m7
punpckldq m7, m2, m5
- paddq m4, m1
+ paddq m4, m0
punpckhdq m2, m5
paddq m6, m7
- punpckldq m7, m3, m5
paddq m6, m2
- punpckhdq m3, m5
- paddq m6, m7
- paddq m6, m3
jg .loop
; accumulate horizontally and store in return value
@@ -98,15 +93,13 @@
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
pmaddwd m0, m0
pmaddwd m1, m1
+ ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
; accumulate in 64bit
punpckldq m3, m0, m5
punpckhdq m0, m5
paddq m4, m3
- punpckldq m3, m1, m5
paddq m4, m0
- punpckhdq m1, m5
- paddq m4, m3
- paddq m4, m1
jnz .loop
; accumulate horizontally and store in return value