ref: b9c1dcc5fa3674f6d4fdcfb5d7d0e324216d6bb3
parent: 75752ab7c0a09365cd4d6d94ec5b72b688773f67
author: Johann <johannkoenig@google.com>
date: Tue Aug 22 10:25:27 EDT 2017
quantize ssse3: copy style from sse2 Change-Id: I53f8a160e640c674ea035fc112e207b6dca42598
--- a/vpx_dsp/x86/quantize_ssse3.c
+++ b/vpx_dsp/x86/quantize_ssse3.c
@@ -23,104 +23,88 @@
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan_ptr, const int16_t *iscan_ptr) {
const __m128i zero = _mm_setzero_si128();
+ intptr_t index = 16;
+
+ __m128i zbin, round, quant, dequant, shift;
__m128i coeff0, coeff1;
- __m128i eob;
- __m128i zbin;
- __m128i round, quant, dequant, shift;
- intptr_t index = 0;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i qtmp0, qtmp1;
+ __m128i zero_coeff0, zero_coeff1, iscan0, iscan1;
+ __m128i eob, eob0, eob1;
+
(void)scan_ptr;
(void)skip_block;
assert(!skip_block);
- // Setup global values
- {
- const __m128i one = _mm_set1_epi16(1);
- zbin = _mm_load_si128((const __m128i *)zbin_ptr);
- // x86 has no "greater *or equal* comparison. Subtract 1 from zbin so
- // it is a strict "greater" comparison.
- zbin = _mm_sub_epi16(zbin, one);
- round = _mm_load_si128((const __m128i *)round_ptr);
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
- }
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+ // it is a strict "greater" comparison.
+ zbin = _mm_sub_epi16(zbin, _mm_set1_epi16(1));
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
- {
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- __m128i cmp_mask0, cmp_mask1;
- __m128i zero_coeff0, zero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
- // Do DC and first 15 AC
- coeff0 = load_tran_low(coeff_ptr + index);
- coeff1 = load_tran_low(coeff_ptr + index + 8);
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
- qcoeff0 = _mm_abs_epi16(coeff0);
- qcoeff1 = _mm_abs_epi16(coeff1);
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
- cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
- // Overwrite DC component.
- zbin = _mm_unpackhi_epi64(zbin, zbin);
- cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+ qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+ qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
- qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
- qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+ qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
- qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
- shift = _mm_unpackhi_epi64(shift, shift);
- qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+ // Reinsert signs
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
- // Reinsert signs
- qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
- qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
- // Mask out zbin threshold coeffs
- qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
- qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
- store_tran_low(qcoeff0, qcoeff_ptr + index);
- store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+ store_tran_low(coeff0, dqcoeff_ptr);
+ store_tran_low(coeff1, dqcoeff_ptr + 8);
- store_tran_low(coeff0, dqcoeff_ptr + index);
- store_tran_low(coeff1, dqcoeff_ptr + index + 8);
+ // Scan for eob.
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8));
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, cmp_mask0);
+ iscan1 = _mm_sub_epi16(iscan1, cmp_mask1);
+ eob = _mm_andnot_si128(zero_coeff0, iscan0);
+ eob1 = _mm_andnot_si128(zero_coeff1, iscan1);
+ eob = _mm_max_epi16(eob, eob1);
- // Scan for eob
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + index));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + index + 8));
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, cmp_mask0);
- iscan1 = _mm_sub_epi16(iscan1, cmp_mask1);
- eob = _mm_andnot_si128(zero_coeff0, iscan0);
- eob1 = _mm_andnot_si128(zero_coeff1, iscan1);
- eob = _mm_max_epi16(eob, eob1);
- }
- index += 16;
-
- // AC only loop
+ // AC only loop.
while (index < n_coeffs) {
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- __m128i cmp_mask0, cmp_mask1;
- __m128i zero_coeff0, zero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
-
coeff0 = load_tran_low(coeff_ptr + index);
coeff1 = load_tran_low(coeff_ptr + index + 8);
@@ -142,11 +126,9 @@
qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
- // Reinsert signs
qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
- // Mask out zbin threshold coeffs
qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
@@ -159,12 +141,10 @@
store_tran_low(coeff0, dqcoeff_ptr + index);
store_tran_low(coeff1, dqcoeff_ptr + index + 8);
- // Scan for eob
zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + index));
iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + index + 8));
- // Add one to convert from indices to counts
iscan0 = _mm_sub_epi16(iscan0, cmp_mask0);
iscan1 = _mm_sub_epi16(iscan1, cmp_mask1);
eob0 = _mm_andnot_si128(zero_coeff0, iscan0);
@@ -175,7 +155,7 @@
index += 16;
}
- // Accumulate EOB
+ // Accumulate eob.
{
__m128i eob_shuffled;
eob_shuffled = _mm_shuffle_epi32(eob, 0xe);