ref: 6dfeea6592b9a6327679f76a192f1dfd4eb02ea4
parent: 58fe1bde59ada9eb25b510e7be2e43dda5a5ddf6
author: Johann <johannkoenig@google.com>
date: Wed Mar 15 06:40:58 EDT 2017
vp9 temporal filter: sse4 implementation Approximates division using multiply and shift. Speeds up both sizes (8x8 and 16x16) by 30 times. Fix the call sites to use the RTCD function. Delete sse2 and mips implementation. They were based on a previous implementation of the filter. It was changed in Dec 2015: ece4fd5d2247c9512b31a93dd593de567beaf928 BUG=webm:1378 Change-Id: I0818e767a802966520b5c6e7999584ad13159276
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -141,9 +141,10 @@
count_chk.CopyFrom(count_ref);
reference_filter(a, b, width, height, filter_strength, filter_weight,
&accum_ref, &count_ref);
- filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
- height, filter_strength, filter_weight,
- accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
+ ASM_REGISTER_STATE_CHECK(
+ filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
+ height, filter_strength, filter_weight,
+ accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()));
EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
EXPECT_TRUE(count_chk.CheckValues(count_ref));
if (HasFailure()) {
@@ -150,7 +151,7 @@
printf("Width: %d Height: %d\n", width, height);
count_chk.PrintDifference(count_ref);
accum_chk.PrintDifference(accum_ref);
- ASSERT_TRUE(false);
+ return;
}
}
}
@@ -157,37 +158,43 @@
}
TEST_P(TemporalFilterTest, CompareReferenceRandom) {
- const int width = 16;
- const int height = 16;
- Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
- // The second buffer must not have any border.
- Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
- Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
- Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
- Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
- Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+ for (int width = 8; width <= 16; width += 8) {
+ for (int height = 8; height <= 16; height += 8) {
+ Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
+ // The second buffer must not have any border.
+ Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+ Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+ Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+ Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+ Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
- a.Set(&rnd_, &ACMRandom::Rand8);
- b.Set(&rnd_, &ACMRandom::Rand8);
+ for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
+ for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
+ for (int repeat = 0; repeat < 10; ++repeat) {
+ a.Set(&rnd_, &ACMRandom::Rand8);
+ b.Set(&rnd_, &ACMRandom::Rand8);
- for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
- for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
- accum_ref.Set(rnd_.Rand8());
- accum_chk.CopyFrom(accum_ref);
- count_ref.Set(rnd_.Rand8());
- count_chk.CopyFrom(count_ref);
- reference_filter(a, b, width, height, filter_strength, filter_weight,
- &accum_ref, &count_ref);
- filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
- height, filter_strength, filter_weight,
- accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
- EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
- EXPECT_TRUE(count_chk.CheckValues(count_ref));
- if (HasFailure()) {
- printf("Weight: %d Strength: %d\n", filter_weight, filter_strength);
- count_chk.PrintDifference(count_ref);
- accum_chk.PrintDifference(accum_ref);
- ASSERT_TRUE(false);
+ accum_ref.Set(rnd_.Rand8());
+ accum_chk.CopyFrom(accum_ref);
+ count_ref.Set(rnd_.Rand8());
+ count_chk.CopyFrom(count_ref);
+ reference_filter(a, b, width, height, filter_strength,
+ filter_weight, &accum_ref, &count_ref);
+ ASM_REGISTER_STATE_CHECK(filter_func_(
+ a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, height,
+ filter_strength, filter_weight, accum_chk.TopLeftPixel(),
+ count_chk.TopLeftPixel()));
+ EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
+ EXPECT_TRUE(count_chk.CheckValues(count_ref));
+ if (HasFailure()) {
+ printf("Weight: %d Strength: %d\n", filter_weight,
+ filter_strength);
+ count_chk.PrintDifference(count_ref);
+ accum_chk.PrintDifference(accum_ref);
+ return;
+ }
+ }
+ }
}
}
}
@@ -222,9 +229,8 @@
accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
}
vpx_usec_timer_mark(&timer);
- const int elapsed_time =
- static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
- printf("Temporal filter %dx%d time: %5d ms\n", width, height,
+ const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ printf("Temporal filter %dx%d time: %5d us\n", width, height,
elapsed_time);
}
}
@@ -233,10 +239,8 @@
INSTANTIATE_TEST_CASE_P(C, TemporalFilterTest,
::testing::Values(&vp9_temporal_filter_apply_c));
-/* TODO(johannkoenig): https://bugs.chromium.org/p/webm/issues/detail?id=1378
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(SSE4_1, TemporalFilterTest,
::testing::Values(&vp9_temporal_filter_apply_sse4_1));
#endif // HAVE_SSE4_1
-*/
} // namespace
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -198,7 +198,7 @@
specialize qw/vp9_diamond_search_sad avx/;
add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
-specialize qw/vp9_temporal_filter_apply sse2 msa/;
+specialize qw/vp9_temporal_filter_apply sse4_1/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
--- a/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
+++ /dev/null
@@ -1,285 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vpx_dsp/mips/macros_msa.h"
-
-static void temporal_filter_apply_8size_msa(const uint8_t *frm1_ptr,
- uint32_t stride,
- const uint8_t *frm2_ptr,
- int32_t filt_sth, int32_t filt_wgt,
- uint32_t *acc, uint16_t *cnt) {
- uint32_t row;
- uint64_t f0, f1, f2, f3;
- v16i8 frm2, frm1 = { 0 };
- v16i8 frm4, frm3 = { 0 };
- v16u8 frm_r, frm_l;
- v8i16 frm2_r, frm2_l;
- v8i16 diff0, diff1, mod0_h, mod1_h;
- v4i32 cnst3, cnst16, filt_wt, strength;
- v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
- v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
- v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
- v4i32 acc0, acc1, acc2, acc3;
- v8i16 cnt0, cnt1;
-
- filt_wt = __msa_fill_w(filt_wgt);
- strength = __msa_fill_w(filt_sth);
- cnst3 = __msa_ldi_w(3);
- cnst16 = __msa_ldi_w(16);
-
- for (row = 2; row--;) {
- LD4(frm1_ptr, stride, f0, f1, f2, f3);
- frm1_ptr += (4 * stride);
-
- LD_SB2(frm2_ptr, 16, frm2, frm4);
- frm2_ptr += 32;
-
- LD_SW2(acc, 4, acc0, acc1);
- LD_SW2(acc + 8, 4, acc2, acc3);
- LD_SH2(cnt, 8, cnt0, cnt1);
-
- INSERT_D2_SB(f0, f1, frm1);
- INSERT_D2_SB(f2, f3, frm3);
- ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
- HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
- UNPCK_SH_SW(diff0, diff0_r, diff0_l);
- UNPCK_SH_SW(diff1, diff1_r, diff1_l);
- MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
- mod0_w, mod1_w, mod2_w, mod3_w);
- MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
- mod1_w, mod2_w, mod3_w);
- SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
- diff0_r = (mod0_w < cnst16);
- diff0_l = (mod1_w < cnst16);
- diff1_r = (mod2_w < cnst16);
- diff1_l = (mod3_w < cnst16);
-
- SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
- mod1_w, mod2_w, mod3_w);
-
- mod0_w = diff0_r & mod0_w;
- mod1_w = diff0_l & mod1_w;
- mod2_w = diff1_r & mod2_w;
- mod3_w = diff1_l & mod3_w;
-
- MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
- mod0_w, mod1_w, mod2_w, mod3_w);
- PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
- ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
- ST_SH2(mod0_h, mod1_h, cnt, 8);
- cnt += 16;
-
- UNPCK_UB_SH(frm2, frm2_r, frm2_l);
- UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
- UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
- MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
- mod0_w, mod1_w, mod2_w, mod3_w);
- ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
- mod2_w, mod3_w);
-
- ST_SW2(mod0_w, mod1_w, acc, 4);
- acc += 8;
- ST_SW2(mod2_w, mod3_w, acc, 4);
- acc += 8;
-
- LD_SW2(acc, 4, acc0, acc1);
- LD_SW2(acc + 8, 4, acc2, acc3);
- LD_SH2(cnt, 8, cnt0, cnt1);
-
- ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
- HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
- UNPCK_SH_SW(diff0, diff0_r, diff0_l);
- UNPCK_SH_SW(diff1, diff1_r, diff1_l);
- MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
- mod0_w, mod1_w, mod2_w, mod3_w);
- MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
- mod1_w, mod2_w, mod3_w);
- SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
- diff0_r = (mod0_w < cnst16);
- diff0_l = (mod1_w < cnst16);
- diff1_r = (mod2_w < cnst16);
- diff1_l = (mod3_w < cnst16);
-
- SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
- mod1_w, mod2_w, mod3_w);
-
- mod0_w = diff0_r & mod0_w;
- mod1_w = diff0_l & mod1_w;
- mod2_w = diff1_r & mod2_w;
- mod3_w = diff1_l & mod3_w;
-
- MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
- mod0_w, mod1_w, mod2_w, mod3_w);
- PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
- ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
- ST_SH2(mod0_h, mod1_h, cnt, 8);
- cnt += 16;
- UNPCK_UB_SH(frm4, frm2_r, frm2_l);
- UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
- UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
- MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
- mod0_w, mod1_w, mod2_w, mod3_w);
- ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
- mod2_w, mod3_w);
-
- ST_SW2(mod0_w, mod1_w, acc, 4);
- acc += 8;
- ST_SW2(mod2_w, mod3_w, acc, 4);
- acc += 8;
- }
-}
-
-static void temporal_filter_apply_16size_msa(const uint8_t *frm1_ptr,
- uint32_t stride,
- const uint8_t *frm2_ptr,
- int32_t filt_sth, int32_t filt_wgt,
- uint32_t *acc, uint16_t *cnt) {
- uint32_t row;
- v16i8 frm1, frm2, frm3, frm4;
- v16u8 frm_r, frm_l;
- v16i8 zero = { 0 };
- v8u16 frm2_r, frm2_l;
- v8i16 diff0, diff1, mod0_h, mod1_h;
- v4i32 cnst3, cnst16, filt_wt, strength;
- v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
- v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
- v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
- v4i32 acc0, acc1, acc2, acc3;
- v8i16 cnt0, cnt1;
-
- filt_wt = __msa_fill_w(filt_wgt);
- strength = __msa_fill_w(filt_sth);
- cnst3 = __msa_ldi_w(3);
- cnst16 = __msa_ldi_w(16);
-
- for (row = 8; row--;) {
- LD_SB2(frm1_ptr, stride, frm1, frm3);
- frm1_ptr += stride;
-
- LD_SB2(frm2_ptr, 16, frm2, frm4);
- frm2_ptr += 16;
-
- LD_SW2(acc, 4, acc0, acc1);
- LD_SW2(acc, 4, acc2, acc3);
- LD_SH2(cnt, 8, cnt0, cnt1);
-
- ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
- HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
- UNPCK_SH_SW(diff0, diff0_r, diff0_l);
- UNPCK_SH_SW(diff1, diff1_r, diff1_l);
- MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
- mod0_w, mod1_w, mod2_w, mod3_w);
- MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
- mod1_w, mod2_w, mod3_w);
- SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
- diff0_r = (mod0_w < cnst16);
- diff0_l = (mod1_w < cnst16);
- diff1_r = (mod2_w < cnst16);
- diff1_l = (mod3_w < cnst16);
-
- SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
- mod1_w, mod2_w, mod3_w);
-
- mod0_w = diff0_r & mod0_w;
- mod1_w = diff0_l & mod1_w;
- mod2_w = diff1_r & mod2_w;
- mod3_w = diff1_l & mod3_w;
-
- MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
- mod0_w, mod1_w, mod2_w, mod3_w);
- PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
- ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
- ST_SH2(mod0_h, mod1_h, cnt, 8);
- cnt += 16;
-
- ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
- UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
- UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
- MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
- mod0_w, mod1_w, mod2_w, mod3_w);
- ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
- mod2_w, mod3_w);
-
- ST_SW2(mod0_w, mod1_w, acc, 4);
- acc += 8;
- ST_SW2(mod2_w, mod3_w, acc, 4);
- acc += 8;
-
- LD_SW2(acc, 4, acc0, acc1);
- LD_SW2(acc + 8, 4, acc2, acc3);
- LD_SH2(cnt, 8, cnt0, cnt1);
-
- ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
- HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
- UNPCK_SH_SW(diff0, diff0_r, diff0_l);
- UNPCK_SH_SW(diff1, diff1_r, diff1_l);
- MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
- mod0_w, mod1_w, mod2_w, mod3_w);
- MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
- mod1_w, mod2_w, mod3_w);
- SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
- diff0_r = (mod0_w < cnst16);
- diff0_l = (mod1_w < cnst16);
- diff1_r = (mod2_w < cnst16);
- diff1_l = (mod3_w < cnst16);
-
- SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
- mod1_w, mod2_w, mod3_w);
-
- mod0_w = diff0_r & mod0_w;
- mod1_w = diff0_l & mod1_w;
- mod2_w = diff1_r & mod2_w;
- mod3_w = diff1_l & mod3_w;
-
- MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
- mod0_w, mod1_w, mod2_w, mod3_w);
- PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
- ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
- ST_SH2(mod0_h, mod1_h, cnt, 8);
- cnt += 16;
-
- ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
- UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
- UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
- MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
- mod0_w, mod1_w, mod2_w, mod3_w);
- ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
- mod2_w, mod3_w);
- ST_SW2(mod0_w, mod1_w, acc, 4);
- acc += 8;
- ST_SW2(mod2_w, mod3_w, acc, 4);
- acc += 8;
-
- frm1_ptr += stride;
- frm2_ptr += 16;
- }
-}
-
-void vp9_temporal_filter_apply_msa(const uint8_t *frame1_ptr, uint32_t stride,
- const uint8_t *frame2_ptr, uint32_t blk_w,
- uint32_t blk_h, int32_t strength,
- int32_t filt_wgt, uint32_t *accu,
- uint16_t *cnt) {
- if (8 == (blk_w * blk_h)) {
- temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
- filt_wgt, accu, cnt);
- } else if (16 == (blk_w * blk_h)) {
- temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength,
- filt_wgt, accu, cnt);
- } else {
- vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
- strength, filt_wgt, accu, cnt);
- }
-}
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include <math.h>
#include <limits.h>
@@ -100,6 +101,12 @@
int byte = 0;
const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+ assert(strength >= 0);
+ assert(strength <= 6);
+
+ assert(filter_weight >= 0);
+ assert(filter_weight <= 2);
+
for (i = 0, k = 0; i < block_height; i++) {
for (j = 0; j < block_width; j++, k++) {
int pixel_value = *frame2;
@@ -376,45 +383,44 @@
if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
int adj_strength = strength + 2 * (mbd->bd - 8);
// Apply the filter (YUV)
- vp9_highbd_temporal_filter_apply_c(
+ vp9_highbd_temporal_filter_apply(
f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
adj_strength, filter_weight, accumulator, count);
- vp9_highbd_temporal_filter_apply_c(
+ vp9_highbd_temporal_filter_apply(
f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
mb_uv_width, mb_uv_height, adj_strength, filter_weight,
accumulator + 256, count + 256);
- vp9_highbd_temporal_filter_apply_c(
+ vp9_highbd_temporal_filter_apply(
f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
mb_uv_width, mb_uv_height, adj_strength, filter_weight,
accumulator + 512, count + 512);
} else {
// Apply the filter (YUV)
- vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
- predictor, 16, 16, strength,
- filter_weight, accumulator, count);
- vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
- predictor + 256, mb_uv_width,
- mb_uv_height, strength, filter_weight,
- accumulator + 256, count + 256);
- vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
- predictor + 512, mb_uv_width,
- mb_uv_height, strength, filter_weight,
- accumulator + 512, count + 512);
- }
-#else
- // Apply the filter (YUV)
- // TODO(jingning): Need SIMD optimization for this.
- vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+ vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
predictor, 16, 16, strength, filter_weight,
accumulator, count);
- vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
+ vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
predictor + 256, mb_uv_width, mb_uv_height,
strength, filter_weight, accumulator + 256,
count + 256);
- vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
+ vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
predictor + 512, mb_uv_width, mb_uv_height,
strength, filter_weight, accumulator + 512,
count + 512);
+ }
+#else
+ // Apply the filter (YUV)
+ vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+ predictor, 16, 16, strength, filter_weight,
+ accumulator, count);
+ vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+ predictor + 256, mb_uv_width, mb_uv_height,
+ strength, filter_weight, accumulator + 256,
+ count + 256);
+ vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+ predictor + 512, mb_uv_width, mb_uv_height,
+ strength, filter_weight, accumulator + 512,
+ count + 512);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
--- /dev/null
+++ b/vp9/encoder/x86/temporal_filter_sse4.c
@@ -1,0 +1,378 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+// Division using multiplication and shifting. The C implementation does:
+// modifier *= 3;
+// modifier /= index;
+// where 'modifier' is a set of summed values and 'index' is the number of
+// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values
+// which may be bound by the edges of the block being filtered.
+//
+// This equation works out to (m * 3) / i which reduces to:
+// m * 3/4
+// m * 1/2
+// m * 1/3
+//
+// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
+// m * C / 65536
+// we can create a C to replicate the division.
+//
+// m * 49152 / 65536 = m * 3/4
+// m * 32758 / 65536 = m * 1/2
+// m * 21846 / 65536 = m * 0.3333
+//
+// These are loaded using an instruction expecting int16_t values but are used
+// with _mm_mulhi_epu16(), which treats them as unsigned.
+#define NEIGHBOR_CONSTANT_4 (int16_t)49152
+#define NEIGHBOR_CONSTANT_6 (int16_t)32768
+#define NEIGHBOR_CONSTANT_9 (int16_t)21846
+
+// Load values from 'a' and 'b'. Compute the difference squared and sum
+// neighboring values such that:
+// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
+// Values to the left and right of the row are set to 0.
+// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values.
+static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
+ const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);
+
+ const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);
+ const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);
+
+ const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);
+ const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);
+
+ // Shift all the values one place to the left/right so we can efficiently sum
+ // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].
+ const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);
+ const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);
+
+ // It becomes necessary to treat the values as unsigned at this point. The
+ // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point
+ // forward since the filter is only applied to smooth small pixel changes.
+ // Once the value has saturated to uint16_t it is well outside the useful
+ // range.
+ __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);
+ sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
+
+ *sum = sum_u16;
+}
+
+static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
+ __m128i *sum_1) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);
+ const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);
+
+ const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);
+ const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);
+ const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8);
+ const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero);
+
+ const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16);
+ const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16);
+ const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16);
+ const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16);
+
+ __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2);
+ // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8].
+ __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2);
+
+ __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left);
+ sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
+
+ *sum_0 = sum_u16;
+
+ shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14);
+ shift_right = _mm_srli_si128(diff_sq_1_u16, 2);
+
+ sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left);
+ sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
+
+ *sum_1 = sum_u16;
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static __m128i average_8(__m128i sum, const __m128i mul_constants,
+ const int strength, const int rounding,
+ const int weight) {
+ // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+ const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+ const __m128i weight_u16 = _mm_set1_epi16(weight);
+ const __m128i sixteen = _mm_set1_epi16(16);
+
+ // modifier * 3 / index;
+ sum = _mm_mulhi_epu16(sum, mul_constants);
+
+ sum = _mm_adds_epu16(sum, rounding_u16);
+ sum = _mm_srl_epi16(sum, strength_u128);
+
+ // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+ // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
+ // So this needs to use the epu16 version which did not come until SSE4.
+ sum = _mm_min_epu16(sum, sixteen);
+
+ sum = _mm_sub_epi16(sixteen, sum);
+
+ return _mm_mullo_epi16(sum, weight_u16);
+}
+
+static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
+ const __m128i mul_constants_0,
+ const __m128i mul_constants_1, const int strength,
+ const int rounding, const int weight) {
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+ const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+ const __m128i weight_u16 = _mm_set1_epi16(weight);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ __m128i input_0, input_1;
+
+ input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0);
+ input_0 = _mm_adds_epu16(input_0, rounding_u16);
+
+ input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1);
+ input_1 = _mm_adds_epu16(input_1, rounding_u16);
+
+ input_0 = _mm_srl_epi16(input_0, strength_u128);
+ input_1 = _mm_srl_epi16(input_1, strength_u128);
+
+ input_0 = _mm_min_epu16(input_0, sixteen);
+ input_1 = _mm_min_epu16(input_1, sixteen);
+ input_0 = _mm_sub_epi16(sixteen, input_0);
+ input_1 = _mm_sub_epi16(sixteen, input_1);
+
+ *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
+ *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
+}
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
+ uint16_t *count, unsigned int *accumulator) {
+ const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+ __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
+ __m128i pred_0_u32, pred_1_u32;
+ __m128i accum_0_u32, accum_1_u32;
+
+ count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+ _mm_storeu_si128((__m128i *)count, count_u16);
+
+ pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
+
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
+ pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+
+ accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+ accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+ accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+ accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+ _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
+
+static void accumulate_and_store_16(const __m128i sum_0_u16,
+ const __m128i sum_1_u16,
+ const uint8_t *pred, uint16_t *count,
+ unsigned int *accumulator) {
+ const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
+ count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8));
+ __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8),
+ pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero);
+ __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32;
+ __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+ count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16);
+ _mm_storeu_si128((__m128i *)count, count_0_u16);
+
+ count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16);
+ _mm_storeu_si128((__m128i *)(count + 8), count_1_u16);
+
+ pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16);
+ pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16);
+
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16);
+ pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero);
+ pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16);
+ pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero);
+
+ accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+ accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+ accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
+ accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
+
+ accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+ accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+ accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32);
+ accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32);
+
+ _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
+}
+
+void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
+ const uint8_t *b, unsigned int width,
+ unsigned int height, int strength,
+ int weight, unsigned int *accumulator,
+ uint16_t *count) {
+ unsigned int h;
+ const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+ assert(strength >= 0);
+ assert(strength <= 6);
+
+ assert(weight >= 0);
+ assert(weight <= 2);
+
+ assert(width == 8 || width == 16);
+
+ // TODO(johannkoenig) Use uint32_t for accumulator.
+ assert(sizeof(*accumulator) == sizeof(uint32_t));
+
+ if (width == 8) {
+ __m128i sum_row_a, sum_row_b, sum_row_c;
+ __m128i mul_constants = _mm_setr_epi16(
+ NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
+
+ sum_8(a, b, &sum_row_a);
+ sum_8(a + stride, b + width, &sum_row_b);
+ sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
+ sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight);
+ accumulate_and_store_8(sum_row_c, b, count, accumulator);
+
+ a += stride + stride;
+ b += width;
+ count += width;
+ accumulator += width;
+
+ mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
+
+ for (h = 0; h < height - 2; ++h) {
+ sum_8(a, b + width, &sum_row_c);
+ sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
+ sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
+ sum_row_a =
+ average_8(sum_row_a, mul_constants, strength, rounding, weight);
+ accumulate_and_store_8(sum_row_a, b, count, accumulator);
+
+ a += stride;
+ b += width;
+ count += width;
+ accumulator += width;
+
+ sum_row_a = sum_row_b;
+ sum_row_b = sum_row_c;
+ }
+
+ mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
+ sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
+ sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight);
+ accumulate_and_store_8(sum_row_a, b, count, accumulator);
+
+ } else { // width == 16
+ __m128i sum_row_a_0, sum_row_a_1;
+ __m128i sum_row_b_0, sum_row_b_1;
+ __m128i sum_row_c_0, sum_row_c_1;
+ __m128i mul_constants_0 = _mm_setr_epi16(
+ NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6),
+ mul_constants_1 = _mm_setr_epi16(
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
+
+ sum_16(a, b, &sum_row_a_0, &sum_row_a_1);
+ sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);
+
+ sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
+ sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
+
+ average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
+ strength, rounding, weight);
+ accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
+
+ a += stride + stride;
+ b += width;
+ count += width;
+ accumulator += width;
+
+ mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);
+ mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
+ for (h = 0; h < height - 2; ++h) {
+ sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);
+
+ sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
+ sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);
+ sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
+ sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);
+
+ average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1,
+ strength, rounding, weight);
+ accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);
+
+ a += stride;
+ b += width;
+ count += width;
+ accumulator += width;
+
+ sum_row_a_0 = sum_row_b_0;
+ sum_row_a_1 = sum_row_b_1;
+ sum_row_b_0 = sum_row_c_0;
+ sum_row_b_1 = sum_row_c_1;
+ }
+
+ mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);
+ mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
+ sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
+ sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
+
+ average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
+ strength, rounding, weight);
+ accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
+ }
+}
--- a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
+++ /dev/null
@@ -1,212 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; void vp9_temporal_filter_apply_sse2 | arg
-; (unsigned char *frame1, | 0
-; unsigned int stride, | 1
-; unsigned char *frame2, | 2
-; unsigned int block_width, | 3
-; unsigned int block_height, | 4
-; int strength, | 5
-; int filter_weight, | 6
-; unsigned int *accumulator, | 7
-; unsigned short *count) | 8
-global sym(vp9_temporal_filter_apply_sse2) PRIVATE
-sym(vp9_temporal_filter_apply_sse2):
-
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ALIGN_STACK 16, rax
- %define block_width 0
- %define block_height 16
- %define strength 32
- %define filter_weight 48
- %define rounding_bit 64
- %define rbp_backup 80
- %define stack_size 96
- sub rsp, stack_size
- mov [rsp + rbp_backup], rbp
- ; end prolog
-
- mov edx, arg(3)
- mov [rsp + block_width], rdx
- mov edx, arg(4)
- mov [rsp + block_height], rdx
- movd xmm6, arg(5)
- movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
-
- ; calculate the rounding bit outside the loop
- ; 0x8000 >> (16 - strength)
- mov rdx, 16
- sub rdx, arg(5) ; 16 - strength
- movq xmm4, rdx ; can't use rdx w/ shift
- movdqa xmm5, [GLOBAL(_const_top_bit)]
- psrlw xmm5, xmm4
- movdqa [rsp + rounding_bit], xmm5
-
- mov rsi, arg(0) ; src/frame1
- mov rdx, arg(2) ; predictor frame
- mov rdi, arg(7) ; accumulator
- mov rax, arg(8) ; count
-
- ; dup the filter weight and store for later
- movd xmm0, arg(6) ; filter_weight
- pshuflw xmm0, xmm0, 0
- punpcklwd xmm0, xmm0
- movdqa [rsp + filter_weight], xmm0
-
- mov rbp, arg(1) ; stride
- pxor xmm7, xmm7 ; zero for extraction
-
- mov rcx, [rsp + block_width]
- imul rcx, [rsp + block_height]
- add rcx, rdx
- cmp dword ptr [rsp + block_width], 8
- jne .temporal_filter_apply_load_16
-
-.temporal_filter_apply_load_8:
- movq xmm0, [rsi] ; first row
- lea rsi, [rsi + rbp] ; += stride
- punpcklbw xmm0, xmm7 ; src[ 0- 7]
- movq xmm1, [rsi] ; second row
- lea rsi, [rsi + rbp] ; += stride
- punpcklbw xmm1, xmm7 ; src[ 8-15]
- jmp .temporal_filter_apply_load_finished
-
-.temporal_filter_apply_load_16:
- movdqa xmm0, [rsi] ; src (frame1)
- lea rsi, [rsi + rbp] ; += stride
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7 ; src[ 0- 7]
- punpckhbw xmm1, xmm7 ; src[ 8-15]
-
-.temporal_filter_apply_load_finished:
- movdqa xmm2, [rdx] ; predictor (frame2)
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm7 ; pred[ 0- 7]
- punpckhbw xmm3, xmm7 ; pred[ 8-15]
-
- ; modifier = src_byte - pixel_value
- psubw xmm0, xmm2 ; src - pred[ 0- 7]
- psubw xmm1, xmm3 ; src - pred[ 8-15]
-
- ; modifier *= modifier
- pmullw xmm0, xmm0 ; modifer[ 0- 7]^2
- pmullw xmm1, xmm1 ; modifer[ 8-15]^2
-
- ; modifier *= 3
- pmullw xmm0, [GLOBAL(_const_3w)]
- pmullw xmm1, [GLOBAL(_const_3w)]
-
- ; modifer += 0x8000 >> (16 - strength)
- paddw xmm0, [rsp + rounding_bit]
- paddw xmm1, [rsp + rounding_bit]
-
- ; modifier >>= strength
- psrlw xmm0, [rsp + strength]
- psrlw xmm1, [rsp + strength]
-
- ; modifier = 16 - modifier
- ; saturation takes care of modifier > 16
- movdqa xmm3, [GLOBAL(_const_16w)]
- movdqa xmm2, [GLOBAL(_const_16w)]
- psubusw xmm3, xmm1
- psubusw xmm2, xmm0
-
- ; modifier *= filter_weight
- pmullw xmm2, [rsp + filter_weight]
- pmullw xmm3, [rsp + filter_weight]
-
- ; count
- movdqa xmm4, [rax]
- movdqa xmm5, [rax+16]
- ; += modifier
- paddw xmm4, xmm2
- paddw xmm5, xmm3
- ; write back
- movdqa [rax], xmm4
- movdqa [rax+16], xmm5
- lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
-
- ; load and extract the predictor up to shorts
- pxor xmm7, xmm7
- movdqa xmm0, [rdx]
- lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7 ; pred[ 0- 7]
- punpckhbw xmm1, xmm7 ; pred[ 8-15]
-
- ; modifier *= pixel_value
- pmullw xmm0, xmm2
- pmullw xmm1, xmm3
-
- ; expand to double words
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm7 ; [ 0- 3]
- punpckhwd xmm2, xmm7 ; [ 4- 7]
- movdqa xmm3, xmm1
- punpcklwd xmm1, xmm7 ; [ 8-11]
- punpckhwd xmm3, xmm7 ; [12-15]
-
- ; accumulator
- movdqa xmm4, [rdi]
- movdqa xmm5, [rdi+16]
- movdqa xmm6, [rdi+32]
- movdqa xmm7, [rdi+48]
- ; += modifier
- paddd xmm4, xmm0
- paddd xmm5, xmm2
- paddd xmm6, xmm1
- paddd xmm7, xmm3
- ; write back
- movdqa [rdi], xmm4
- movdqa [rdi+16], xmm5
- movdqa [rdi+32], xmm6
- movdqa [rdi+48], xmm7
- lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
-
- cmp rdx, rcx
- je .temporal_filter_apply_epilog
- pxor xmm7, xmm7 ; zero for extraction
- cmp dword ptr [rsp + block_width], 16
- je .temporal_filter_apply_load_16
- jmp .temporal_filter_apply_load_8
-
-.temporal_filter_apply_epilog:
- ; begin epilog
- mov rbp, [rsp + rbp_backup]
- add rsp, stack_size
- pop rsp
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-_const_3w:
- times 8 dw 3
-align 16
-_const_top_bit:
- times 8 dw 1<<15
-align 16
-_const_16w
- times 8 dw 16
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -100,7 +100,8 @@
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c
+
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -135,6 +136,5 @@
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))