ref: f6a002f2a67579cce9d53a4314f0676228517c12
parent: 67c999876e9ae1a0f927c09ef2c64fc8b638eec1
author: sdeng <sdeng@google.com>
date: Tue Oct 30 10:08:07 EDT 2018
Add satd avx2 implementation Speed Test: C/SatdHighbdTest blocksize: 16 time: 138 us blocksize: 64 time: 315 us blocksize: 256 time: 1120 us blocksize: 1024 time: 3955 us AVX2/SatdHighbdTest blocksize: 16 time: 89 us blocksize: 64 time: 189 us blocksize: 256 time: 590 us blocksize: 1024 time: 1912 us Change-Id: I6357174462fccd589a475b13d8114b853cab5383
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -34,6 +34,18 @@
return (value >> 15) & 0xffff;
}
+ int32_t Rand20Signed(void) {
+ // Use 20 bits: values between 524287 and -524288.
+ const uint32_t value = random_.Generate(1048576);
+ return static_cast<int32_t>(value) - 524288;
+ }
+
+ int16_t Rand16Signed(void) {
+ // Use 16 bits: values between 32767 and -32768.
+ const uint32_t value = random_.Generate(65536);
+ return static_cast<int16_t>(value) - 32768;
+ }
+
int16_t Rand13Signed(void) {
// Use 13 bits: values between 4095 and -4096.
const uint32_t value = random_.Generate(8192);
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -250,12 +250,7 @@
for (int i = 0; i < satd_size_; ++i) src_[i] = val;
}
- void FillRandom() {
- for (int i = 0; i < satd_size_; ++i) {
- const int16_t tmp = rnd_.Rand16();
- src_[i] = (tran_low_t)tmp;
- }
- }
+ virtual void FillRandom() = 0;
void Check(const int expected) {
int total;
@@ -266,13 +261,23 @@
tran_low_t *GetCoeff() const { return src_; }
int satd_size_;
+ ACMRandom rnd_;
+ tran_low_t *src_;
private:
- tran_low_t *src_;
SatdFunc satd_func_;
- ACMRandom rnd_;
};
+class SatdLowbdTest : public SatdTest {
+ protected:
+ virtual void FillRandom() {
+ for (int i = 0; i < satd_size_; ++i) {
+ const int16_t tmp = rnd_.Rand16Signed();
+ src_[i] = (tran_low_t)tmp;
+ }
+ }
+};
+
typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
const tran_low_t *dqcoeff, int block_size);
typedef ::testing::tuple<int, BlockErrorFunc> BlockErrorTestFPParam;
@@ -402,7 +407,7 @@
RunComparison();
}
-TEST_P(SatdTest, MinValue) {
+TEST_P(SatdLowbdTest, MinValue) {
const int kMin = -32640;
const int expected = -kMin * satd_size_;
FillConstant(kMin);
@@ -409,7 +414,7 @@
Check(expected);
}
-TEST_P(SatdTest, MaxValue) {
+TEST_P(SatdLowbdTest, MaxValue) {
const int kMax = 32640;
const int expected = kMax * satd_size_;
FillConstant(kMax);
@@ -416,13 +421,13 @@
Check(expected);
}
-TEST_P(SatdTest, Random) {
+TEST_P(SatdLowbdTest, Random) {
int expected;
switch (satd_size_) {
- case 16: expected = 205298; break;
- case 64: expected = 1113950; break;
- case 256: expected = 4268415; break;
- case 1024: expected = 16954082; break;
+ case 16: expected = 263252; break;
+ case 64: expected = 1105420; break;
+ case 256: expected = 4252250; break;
+ case 1024: expected = 16876840; break;
default:
FAIL() << "Invalid satd size (" << satd_size_
<< ") valid: 16/64/256/1024";
@@ -431,7 +436,7 @@
Check(expected);
}
-TEST_P(SatdTest, DISABLED_Speed) {
+TEST_P(SatdLowbdTest, DISABLED_Speed) {
const int kCountSpeedTestBlock = 20000;
vpx_usec_timer timer;
const int blocksize = GET_PARAM(0);
@@ -447,6 +452,62 @@
printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+class SatdHighbdTest : public SatdTest {
+ protected:
+ virtual void FillRandom() {
+ for (int i = 0; i < satd_size_; ++i) {
+ src_[i] = rnd_.Rand20Signed();
+ }
+ }
+};
+
+TEST_P(SatdHighbdTest, MinValue) {
+ const int kMin = -524280;
+ const int expected = -kMin * satd_size_;
+ FillConstant(kMin);
+ Check(expected);
+}
+
+TEST_P(SatdHighbdTest, MaxValue) {
+ const int kMax = 524280;
+ const int expected = kMax * satd_size_;
+ FillConstant(kMax);
+ Check(expected);
+}
+
+TEST_P(SatdHighbdTest, Random) {
+ int expected;
+ switch (satd_size_) {
+ case 16: expected = 5249712; break;
+ case 64: expected = 18362120; break;
+ case 256: expected = 66100520; break;
+ case 1024: expected = 266094734; break;
+ default:
+ FAIL() << "Invalid satd size (" << satd_size_
+ << ") valid: 16/64/256/1024";
+ }
+ FillRandom();
+ Check(expected);
+}
+
+TEST_P(SatdHighbdTest, DISABLED_Speed) {
+ const int kCountSpeedTestBlock = 20000;
+ vpx_usec_timer timer;
+ const int blocksize = GET_PARAM(0);
+ FillRandom();
+ tran_low_t *coeff = GetCoeff();
+
+ vpx_usec_timer_start(&timer);
+ for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+ GET_PARAM(1)(coeff, blocksize);
+ }
+ vpx_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
TEST_P(BlockErrorTestFP, MinValue) {
const int64_t kMin = -32640;
const int64_t expected = kMin * kMin * txfm_size_;
@@ -512,9 +573,15 @@
::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_sse2),
make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2)));
#endif // HAVE_SSE2
+
+INSTANTIATE_TEST_CASE_P(C, SatdHighbdTest,
+ ::testing::Values(make_tuple(16, &vpx_satd_c),
+ make_tuple(64, &vpx_satd_c),
+ make_tuple(256, &vpx_satd_c),
+ make_tuple(1024, &vpx_satd_c)));
#endif // CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(C, SatdTest,
+INSTANTIATE_TEST_CASE_P(C, SatdLowbdTest,
::testing::Values(make_tuple(16, &vpx_satd_c),
make_tuple(64, &vpx_satd_c),
make_tuple(256, &vpx_satd_c),
@@ -551,7 +618,7 @@
make_tuple(64, &vpx_int_pro_col_sse2,
&vpx_int_pro_col_c)));
-INSTANTIATE_TEST_CASE_P(SSE2, SatdTest,
+INSTANTIATE_TEST_CASE_P(SSE2, SatdLowbdTest,
::testing::Values(make_tuple(16, &vpx_satd_sse2),
make_tuple(64, &vpx_satd_sse2),
make_tuple(256, &vpx_satd_sse2),
@@ -566,13 +633,22 @@
#endif // HAVE_SSE2
#if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2, SatdTest,
+INSTANTIATE_TEST_CASE_P(AVX2, SatdLowbdTest,
::testing::Values(make_tuple(16, &vpx_satd_avx2),
make_tuple(64, &vpx_satd_avx2),
make_tuple(256, &vpx_satd_avx2),
make_tuple(1024, &vpx_satd_avx2)));
+#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
+ AVX2, SatdHighbdTest,
+ ::testing::Values(make_tuple(16, &vpx_highbd_satd_avx2),
+ make_tuple(64, &vpx_highbd_satd_avx2),
+ make_tuple(256, &vpx_highbd_satd_avx2),
+ make_tuple(1024, &vpx_highbd_satd_avx2)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+INSTANTIATE_TEST_CASE_P(
AVX2, BlockErrorTestFP,
::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2),
make_tuple(64, &vp9_block_error_fp_avx2),
@@ -604,7 +680,7 @@
make_tuple(64, &vpx_int_pro_col_neon,
&vpx_int_pro_col_c)));
-INSTANTIATE_TEST_CASE_P(NEON, SatdTest,
+INSTANTIATE_TEST_CASE_P(NEON, SatdLowbdTest,
::testing::Values(make_tuple(16, &vpx_satd_neon),
make_tuple(64, &vpx_satd_neon),
make_tuple(256, &vpx_satd_neon),
@@ -649,7 +725,7 @@
// TODO(jingning): Remove the highbitdepth flag once the SIMD functions are
// in place.
#if !CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(MSA, SatdTest,
+INSTANTIATE_TEST_CASE_P(MSA, SatdLowbdTest,
::testing::Values(make_tuple(16, &vpx_satd_msa),
make_tuple(64, &vpx_satd_msa),
make_tuple(256, &vpx_satd_msa),
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -5973,8 +5973,7 @@
vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
dst_stride, xd->bd);
highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
- // TODO(sdeng): Implement SIMD based high bit-depth satd.
- intra_cost = vpx_satd_c(coeff, pix_num);
+ intra_cost = vpx_highbd_satd(coeff, pix_num);
} else {
vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
dst_stride);
@@ -6020,7 +6019,7 @@
bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
- inter_cost = vpx_satd_c(coeff, pix_num);
+ inter_cost = vpx_highbd_satd(coeff, pix_num);
} else {
vp9_build_inter_predictor(
ref_frame[rf_idx]->y_buffer + mb_y_offset,
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -314,6 +314,19 @@
}
}
+#if CONFIG_VP9_HIGHBITDEPTH
+// coeff: dynamic range 20 bit.
+// length: value range {16, 64, 256, 1024}.
+int vpx_highbd_satd_c(const tran_low_t *coeff, int length) {
+ int i;
+ int satd = 0;
+ for (i = 0; i < length; ++i) satd += abs(coeff[i]);
+
+ // satd: 30 bits
+ return satd;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
int vpx_satd_c(const tran_low_t *coeff, int length) {
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -796,6 +796,9 @@
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
specialize qw/vpx_satd avx2 sse2 neon/;
+
+ add_proto qw/int vpx_highbd_satd/, "const tran_low_t *coeff, int length";
+ specialize qw/vpx_highbd_satd avx2/;
} else {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -375,3 +375,26 @@
return _mm_cvtsi128_si32(accum_128);
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) {
+ __m256i accum = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < length; i += 8, coeff += 8) {
+ const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i abs = _mm256_abs_epi32(src_line);
+ accum = _mm256_add_epi32(accum, abs);
+ }
+
+ { // 32 bit horizontal add
+ const __m256i a = _mm256_srli_si256(accum, 8);
+ const __m256i b = _mm256_add_epi32(accum, a);
+ const __m256i c = _mm256_srli_epi64(b, 32);
+ const __m256i d = _mm256_add_epi32(b, c);
+ const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+ _mm256_extractf128_si256(d, 1));
+ return _mm_cvtsi128_si32(accum_128);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH