ref: 64c4cedd3a93549b2a9c490734a76c9a28542934
parent: b02ac73d8c543fdb04fed525d677059595e76188
author: sdeng <sdeng@google.com>
date: Tue Nov 6 11:20:41 EST 2018
Add high bit Hadamard 32x32 avx2 implementation

Speed test:
[ RUN      ] C/HadamardHighbdTest.DISABLED_Speed/2
Hadamard32x32[ 10 runs]: 9 us
Hadamard32x32[ 10000 runs]: 8914 us
Hadamard32x32[ 10000000 runs]: 8991776 us
[ RUN      ] AVX2/HadamardHighbdTest.DISABLED_Speed/2
Hadamard32x32[ 10 runs]: 5 us
Hadamard32x32[ 10000 runs]: 4582 us
Hadamard32x32[ 10000000 runs]: 4548203 us

Change-Id: Ied1b38b510bd033299f05869216d394e3b7f70f1
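For readers of the patch: the new routine keeps the structure of the scalar 32x32 path. The 32x32 residual block is split into four 16x16 quadrants, each quadrant is transformed with the existing 16x16 Hadamard, and a final butterfly stage combines the coefficient at the same position in each quadrant, shifting right by 2 to normalize. A rough scalar sketch of that structure follows; it is illustrative only, the helper name is made up, and the 16x16 transform is assumed to be provided elsewhere in vpx_dsp.

#include <stddef.h>
#include <stdint.h>

/* In high bitdepth builds, tran_low_t is a 32-bit coefficient type. */
typedef int32_t tran_low_t;

/* Provided by vpx_dsp (already specialized for AVX2 before this change). */
void vpx_highbd_hadamard_16x16(const int16_t *src_diff, ptrdiff_t src_stride,
                               tran_low_t *coeff);

/* Hypothetical scalar sketch of the 32x32 construction. */
static void highbd_hadamard_32x32_sketch(const int16_t *src_diff,
                                         ptrdiff_t src_stride,
                                         tran_low_t *coeff) {
  int idx;
  /* Stage 1: 16x16 Hadamard of each quadrant, written to its quarter of
   * the 1024-entry coeff[] buffer. */
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    vpx_highbd_hadamard_16x16(src_ptr, src_stride, coeff + idx * 256);
  }

  /* Stage 2: butterfly across the four quadrants with a >> 2 normalization. */
  for (idx = 0; idx < 256; ++idx) {
    const tran_low_t a0 = coeff[0];
    const tran_low_t a1 = coeff[256];
    const tran_low_t a2 = coeff[512];
    const tran_low_t a3 = coeff[768];

    const tran_low_t b0 = (a0 + a1) >> 2;
    const tran_low_t b1 = (a0 - a1) >> 2;
    const tran_low_t b2 = (a2 + a3) >> 2;
    const tran_low_t b3 = (a2 - a3) >> 2;

    coeff[0] = b0 + b2;
    coeff[256] = b1 + b3;
    coeff[512] = b0 - b2;
    coeff[768] = b1 - b3;
    ++coeff;
  }
}

The AVX2 version in the hunk below performs the same two stages, but the second loop processes eight 32-bit coefficients per iteration with 256-bit loads, adds, shifts, and stores.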
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -311,8 +311,9 @@
 INSTANTIATE_TEST_CASE_P(
     AVX2, HadamardHighbdTest,
     ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8),
-                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2,
-                                           16)));
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2, 16),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_avx2,
+                                           32)));
 #endif  // HAVE_AVX2
 #endif  // CONFIG_VP9_HIGHBITDEPTH
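A note on reproducing the timings in the commit message: the DISABLED_Speed cases are ordinary disabled googletest tests, so with the usual libvpx test binary they can be run by adding --gtest_also_run_disabled_tests together with a --gtest_filter that matches HadamardHighbdTest. The /2 suffix in the test names is the parameter index, which corresponds to the 32x32 entry in the Values() lists above.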
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -792,7 +792,7 @@
     specialize qw/vpx_highbd_hadamard_16x16 avx2/;

     add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-    specialize qw/vpx_highbd_hadamard_32x32/;
+    specialize qw/vpx_highbd_hadamard_32x32 avx2/;

     add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
     specialize qw/vpx_satd avx2 sse2 neon/;
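For context, the specialize line is what wires the new kernel into libvpx's run-time CPU dispatch: the generated vpx_dsp_rtcd.h resolves vpx_highbd_hadamard_32x32 to the C version by default and to the AVX2 version when the CPU reports AVX2 support. A rough, hand-written sketch of that generated logic (the real header is machine generated; x86_simd_caps() and HAS_AVX2 are assumed from vpx_ports/x86.h):

#include <stddef.h>
#include <stdint.h>

#include "vpx_ports/x86.h" /* x86_simd_caps(), HAS_AVX2 */

typedef int32_t tran_low_t; /* high bitdepth coefficient type */

void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
                                 tran_low_t *coeff);
void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff);

/* Function pointer the rest of the code base calls by the plain name. */
void (*vpx_highbd_hadamard_32x32)(const int16_t *src_diff,
                                  ptrdiff_t src_stride, tran_low_t *coeff);

static void setup_rtcd_internal(void) {
  const int flags = x86_simd_caps(); /* CPUID-derived feature flags */

  vpx_highbd_hadamard_32x32 = vpx_highbd_hadamard_32x32_c;
  if (flags & HAS_AVX2)
    vpx_highbd_hadamard_32x32 = vpx_highbd_hadamard_32x32_avx2;
}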
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -175,6 +175,47 @@
     t_coeff += 8;
   }
 }
+
+void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int idx;
+  tran_low_t *t_coeff = coeff;
+  for (idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
+  }
+
+  for (idx = 0; idx < 256; idx += 8) {
+    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+    b0 = _mm256_srai_epi32(b0, 2);
+    b1 = _mm256_srai_epi32(b1, 2);
+    b2 = _mm256_srai_epi32(b2, 2);
+    b3 = _mm256_srai_epi32(b3, 2);
+
+    coeff0 = _mm256_add_epi32(b0, b2);
+    coeff1 = _mm256_add_epi32(b1, b3);
+    coeff2 = _mm256_sub_epi32(b0, b2);
+    coeff3 = _mm256_sub_epi32(b1, b3);
+
+    _mm256_storeu_si256((__m256i *)coeff, coeff0);
+    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
+    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
+    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);
+
+    coeff += 8;
+    t_coeff += 8;
+  }
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH

 static void hadamard_col8x2_avx2(__m256i *in, int iter) {
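Finally, a usage note (not part of this change): in the encoder these Hadamard kernels are typically paired with vpx_satd, whose prototype appears in the same rtcd hunk above, to get a SATD-style cost for a residual block. A hypothetical caller, with the buffer size implied by the 32x32 transform:

#include <stddef.h>
#include <stdint.h>

typedef int32_t tran_low_t; /* high bitdepth coefficient type */

/* Prototypes as declared in vpx_dsp_rtcd_defs.pl (dispatched at run time). */
void vpx_highbd_hadamard_32x32(const int16_t *src_diff, ptrdiff_t src_stride,
                               tran_low_t *coeff);
int vpx_satd(const tran_low_t *coeff, int length);

/* Hypothetical helper: SATD of one 32x32 residual block. */
static int highbd_satd_32x32(const int16_t *src_diff, ptrdiff_t src_stride) {
  tran_low_t coeff[32 * 32];
  vpx_highbd_hadamard_32x32(src_diff, src_stride, coeff);
  return vpx_satd(coeff, 32 * 32);
}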