ref: 85fbf6569c3c34d9c091d2c09d7374c66e7b3b59
parent: 50f0dd8ee96458c34401a63d5d843b12be47557f
author: chiyotsai <chiyotsai@google.com>
date: Mon Mar 4 05:40:14 EST 2019
Optimize SSE4_1 lowbd temporal filter implementation - Change some unaligned loads to aligned loads - Preload filter weights BUG=webm:1591 Change-Id: I4e5e755e1fa5613d1c14191265bf80b0bfd0b75c
--- a/vp9/encoder/x86/temporal_filter_sse4.c
+++ b/vp9/encoder/x86/temporal_filter_sse4.c
@@ -75,11 +75,11 @@
// by weight.
static INLINE __m128i average_8(__m128i sum, const __m128i *mul_constants,
const int strength, const int rounding,
- const int weight) {
+ const __m128i *weight) {
// _mm_srl_epi16 uses the lower 64 bit value for the shift.
const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
const __m128i rounding_u16 = _mm_set1_epi16(rounding);
- const __m128i weight_u16 = _mm_set1_epi16(weight);
+ const __m128i weight_u16 = *weight;
const __m128i sixteen = _mm_set1_epi16(16);
// modifier * 3 / index;
@@ -98,62 +98,6 @@
return _mm_mullo_epi16(sum, weight_u16);
}
-static __m128i average_4_4(__m128i sum, const __m128i *mul_constants,
- const int strength, const int rounding,
- const int weight_0, const int weight_1) {
- // _mm_srl_epi16 uses the lower 64 bit value for the shift.
- const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
- const __m128i rounding_u16 = _mm_set1_epi16(rounding);
- const __m128i weight_u16 =
- _mm_setr_epi16(weight_0, weight_0, weight_0, weight_0, weight_1, weight_1,
- weight_1, weight_1);
- const __m128i sixteen = _mm_set1_epi16(16);
-
- // modifier * 3 / index;
- sum = _mm_mulhi_epu16(sum, *mul_constants);
-
- sum = _mm_adds_epu16(sum, rounding_u16);
- sum = _mm_srl_epi16(sum, strength_u128);
-
- // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
- // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
- // So this needs to use the epu16 version which did not come until SSE4.
- sum = _mm_min_epu16(sum, sixteen);
-
- sum = _mm_sub_epi16(sixteen, sum);
-
- return _mm_mullo_epi16(sum, weight_u16);
-}
-
-static INLINE void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
- const __m128i *mul_constants_0,
- const __m128i *mul_constants_1,
- const int strength, const int rounding,
- const int weight) {
- const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
- const __m128i rounding_u16 = _mm_set1_epi16(rounding);
- const __m128i weight_u16 = _mm_set1_epi16(weight);
- const __m128i sixteen = _mm_set1_epi16(16);
- __m128i input_0, input_1;
-
- input_0 = _mm_mulhi_epu16(*sum_0_u16, *mul_constants_0);
- input_0 = _mm_adds_epu16(input_0, rounding_u16);
-
- input_1 = _mm_mulhi_epu16(*sum_1_u16, *mul_constants_1);
- input_1 = _mm_adds_epu16(input_1, rounding_u16);
-
- input_0 = _mm_srl_epi16(input_0, strength_u128);
- input_1 = _mm_srl_epi16(input_1, strength_u128);
-
- input_0 = _mm_min_epu16(input_0, sixteen);
- input_1 = _mm_min_epu16(input_1, sixteen);
- input_0 = _mm_sub_epi16(sixteen, input_0);
- input_1 = _mm_sub_epi16(sixteen, input_1);
-
- *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
- *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
-}
-
// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
uint16_t *count, uint32_t *accumulator) {
@@ -336,7 +280,7 @@
const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
const int *blk_fw) {
const int rounding = (1 << strength) >> 1;
- int weight = top_weight;
+ __m128i weight_first, weight_second;
__m128i mul_first, mul_second;
@@ -360,9 +304,18 @@
(void)block_width;
+ // Initialize the weights
+ if (blk_fw) {
+ weight_first = _mm_set1_epi16(blk_fw[0]);
+ weight_second = _mm_set1_epi16(blk_fw[1]);
+ } else {
+ weight_first = _mm_set1_epi16(top_weight);
+ weight_second = weight_first;
+ }
+
// First row
- mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
- mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+ mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+ mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
// Add luma values
get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
@@ -382,15 +335,10 @@
sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
// Get modifier and store result
- if (blk_fw) {
- sum_row_first =
- average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
- sum_row_second =
- average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
- } else {
- average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
- strength, rounding, weight);
- }
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+ sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+ &weight_second);
accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
y_accum);
@@ -408,16 +356,18 @@
v_dist += DIST_STRIDE;
// Then all the rows except the last one
- mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
- mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
+ mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]);
+ mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]);
for (h = 1; h < block_height - 1; ++h) {
// Move the weight to bottom half
if (!use_whole_blk && h == block_height / 2) {
if (blk_fw) {
- blk_fw += 2;
+ weight_first = _mm_set1_epi16(blk_fw[2]);
+ weight_second = _mm_set1_epi16(blk_fw[3]);
} else {
- weight = bottom_weight;
+ weight_first = _mm_set1_epi16(bottom_weight);
+ weight_second = weight_first;
}
}
// Shift the rows up
@@ -456,15 +406,10 @@
sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
// Get modifier and store result
- if (blk_fw) {
- sum_row_first =
- average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
- sum_row_second =
- average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
- } else {
- average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
- strength, rounding, weight);
- }
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+ sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+ &weight_second);
accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
y_accum);
@@ -476,8 +421,8 @@
}
// The last row
- mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
- mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+ mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+ mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
// Shift the rows up
sum_row_1_first = sum_row_2_first;
@@ -503,15 +448,10 @@
sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
// Get modifier and store result
- if (blk_fw) {
- sum_row_first =
- average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
- sum_row_second =
- average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
- } else {
- average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
- strength, rounding, weight);
- }
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+ sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+ &weight_second);
accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
y_accum);
}
@@ -634,8 +574,9 @@
const int16_t *const *neighbors, int top_weight, int bottom_weight,
const int *blk_fw) {
const int rounding = (1 << strength) >> 1;
- int weight = top_weight;
+ __m128i weight;
+
__m128i mul;
__m128i u_sum_row_1, u_sum_row_2, u_sum_row_3;
@@ -648,8 +589,16 @@
(void)uv_block_width;
+ // Initialize weight
+ if (blk_fw) {
+ weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0],
+ blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]);
+ } else {
+ weight = _mm_set1_epi16(top_weight);
+ }
+
// First row
- mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
+ mul = _mm_load_si128((const __m128i *)neighbors[0]);
// Add chroma values
get_sum_8(u_dist, &u_sum_row_2);
@@ -666,15 +615,9 @@
add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
// Get modifier and store result
- if (blk_fw) {
- u_sum_row =
- average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
- v_sum_row =
- average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
- } else {
- u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
- v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
- }
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
@@ -694,15 +637,16 @@
y_dist += DIST_STRIDE * (1 + ss_y);
// Then all the rows except the last one
- mul = _mm_loadu_si128((const __m128i *)neighbors[1]);
+ mul = _mm_load_si128((const __m128i *)neighbors[1]);
for (h = 1; h < uv_block_height - 1; ++h) {
// Move the weight pointer to the bottom half of the blocks
if (h == uv_block_height / 2) {
if (blk_fw) {
- blk_fw += 2;
+ weight = _mm_setr_epi16(blk_fw[2], blk_fw[2], blk_fw[2], blk_fw[2],
+ blk_fw[3], blk_fw[3], blk_fw[3], blk_fw[3]);
} else {
- weight = bottom_weight;
+ weight = _mm_set1_epi16(bottom_weight);
}
}
@@ -726,15 +670,8 @@
add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
// Get modifier and store result
- if (blk_fw) {
- u_sum_row = average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0],
- blk_fw[1]);
- v_sum_row = average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0],
- blk_fw[1]);
- } else {
- u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
- v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
- }
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
@@ -756,7 +693,7 @@
}
// The last row
- mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
+ mul = _mm_load_si128((const __m128i *)neighbors[0]);
// Shift the rows up
u_sum_row_1 = u_sum_row_2;
@@ -773,15 +710,8 @@
add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
// Get modifier and store result
- if (blk_fw) {
- u_sum_row =
- average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
- v_sum_row =
- average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
- } else {
- u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
- v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
- }
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);