ref: abff678866f9f74c58616705b01d460efb994fe2
parent: 166dc85bed481decb5fdd572a3300d8c6f83887e
	author: Jingning Han <jingning@google.com>
	date: Mon Aug 26 12:12:16 EDT 2013
	
Fix overflow issue in SSSE3 32x32 quantization The 32x32 quantization process can potentially have the intermediate stacks over 16-bit range, thereby causing enc/dec mismatch. This commit fixes this overflow issue in the SSSE3 implementation, as well as the prototype, of 32x32 quantization. This fixes issue 607 from webm@googlecode. Change-Id: I85635e6ca236b90c3dcfc40d449215c7b9caa806
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -701,7 +701,7 @@
specialize vp9_quantize_b $ssse3_x86_64
prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
-specialize vp9_quantize_b_32x32
+specialize vp9_quantize_b_32x32 $ssse3_x86_64
#
# Structured Similarity (SSIM)
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -84,7 +84,6 @@
*eob_ptr = eob + 1;
}
-// This function works well for large transform size.
void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
int16_t *zbin_ptr, int16_t *round_ptr,
@@ -105,8 +104,8 @@
eob = -1;
// Base ZBIN
- zbins[0] = zbin_ptr[0] + zbin_oq_value;
- zbins[1] = zbin_ptr[1] + zbin_oq_value;
+ zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);
+ zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);
nzbins[0] = zbins[0] * -1;
nzbins[1] = zbins[1] * -1;
@@ -114,7 +113,7 @@
// Pre-scan pass
     for (i = 0; i < n_coeffs; i++) {rc = scan[i];
- z = coeff_ptr[rc] * 2;
+ z = coeff_ptr[rc];
// If the coefficient is out of the base ZBIN range, keep it for
// quantization.
@@ -130,14 +129,14 @@
// Calculate ZBIN
zbin = (zbins[rc != 0]);
- z = coeff_ptr[rc] * 2;
+ z = coeff_ptr[rc];
sz = (z >> 31); // sign of z
x = (z ^ sz) - sz; // x = abs(z)
       if (x >= zbin) {- x += (round_ptr[rc != 0]);
+ x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
- quant_shift_ptr[rc != 0]) >> 16; // quantize (x)
+ quant_shift_ptr[rc != 0]) >> 15; // quantize (x)
x = (y ^ sz) - sz; // get the sign back
qcoeff_ptr[rc] = x; // write to destination
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -36,6 +36,14 @@
pshufd m4, m4, 0
mova m2, [quantq] ; m2 = quant
paddw m0, m4 ; m0 = zbin + zbin_oq
+%ifidn %1, b_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15
+ paddw m0, m5
+ paddw m1, m5
+ psrlw m0, 1 ; m0 = (m0 + 1) / 2
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
mova m3, [r2q] ; m3 = dequant
psubw m0, [pw_1]
mov r2, shiftmp
@@ -43,6 +51,9 @@
mova m4, [r2] ; m4 = shift
mov r4, dqcoeffmp
mov r5, iscanmp
+%ifidn %1, b_32x32
+ psllw m4, 1
+%endif
pxor m5, m5 ; m5 = dedicated zero
DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
lea coeffq, [ coeffq+ncoeffq*2]
@@ -56,10 +67,6 @@
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
-%ifidn %1, b_32x32
- paddw m6, m6
- paddw m11, m11
-%endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
punpckhqdq m0, m0
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
@@ -112,10 +119,6 @@
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
-%ifidn %1, b_32x32
- paddw m6, m6
- paddw m11, m11
-%endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
@@ -164,6 +167,7 @@
pmaxsw m8, m13
add ncoeffq, mmsize
jl .ac_only_loop
+
%ifidn %1, b_32x32
jmp .accumulate_eob
.skip_iter:
--
⑨