ref: e858863dda2e242ede57916dae4086a991f618dd
parent: 31f5369808a2c2155474cfb0adb6ea27f1147083
 parent: 94b96e4d163d406d8ac686efaa1c010aad8039ba
	author: Scott LaVarnway <slavarnway@google.com>
	date: Sun Jul 22 19:10:12 EDT 2018
	
Merge "VPX: Add vpx_hadamard_32x32_sse2"
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -302,6 +302,13 @@
 class Hadamard32x32Test : public HadamardTestBase {};+void HadamardSpeedTest32x32(HadamardFunc const func, int times) {+ DECLARE_ALIGNED(16, int16_t, input[1024]);  // 32x32 speed-test helper; input holds 1024 (32*32) int16_t values
+ DECLARE_ALIGNED(16, tran_low_t, output[1024]);  // output buffer with the same element count as input
+ memset(input, 1, sizeof(input));  // memset fills bytes, so every int16_t element ends up as 0x0101, not 1
+  HadamardSpeedTest("Hadamard32x32", func, input, 32, output, times);+}  // passes label, func, both buffers, the literal 32 (presumably the input stride -- confirm against HadamardSpeedTest) and times to the shared helper
+
 TEST_P(Hadamard32x32Test, CompareReferenceRandom) {CompareReferenceRandom<32>();  // calls the CompareReferenceRandom template with argument 32
}
@@ -308,6 +315,17 @@
 TEST_P(Hadamard32x32Test, VaryStride) { VaryStride<32>(); }+TEST_P(Hadamard32x32Test, DISABLED_Speed) {+ HadamardSpeedTest32x32(h_func_, 10);  // Speed test; the DISABLED_ prefix makes googletest skip it unless disabled tests are explicitly requested. First call: times = 10
+ HadamardSpeedTest32x32(h_func_, 10000);  // same helper and function under test, times = 10000
+ HadamardSpeedTest32x32(h_func_, 10000000);  // same again, times = 10000000
+}
+
INSTANTIATE_TEST_CASE_P(C, Hadamard32x32Test,
::testing::Values(&vpx_hadamard_32x32_c));  // "C" instantiation: the Hadamard32x32Test cases receive &vpx_hadamard_32x32_c as their parameter
+
+#if HAVE_SSE2  // the SSE2 instantiation below is compiled only when HAVE_SSE2 is non-zero
+INSTANTIATE_TEST_CASE_P(SSE2, Hadamard32x32Test,
+ ::testing::Values(&vpx_hadamard_32x32_sse2));  // same test cases, parameterized with &vpx_hadamard_32x32_sse2
+#endif // HAVE_SSE2
} // namespace
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -783,7 +783,7 @@
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_32x32/;
+ specialize qw/vpx_hadamard_32x32 sse2/; # adds sse2 to the specialization list for the prototype that takes tran_low_t *coeff
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
specialize qw/vpx_satd avx2 sse2 neon/;
@@ -795,7 +795,7 @@
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_32x32/;
+ specialize qw/vpx_hadamard_32x32 sse2/; # adds sse2 to the specialization list for the prototype that takes int16_t *coeff
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
specialize qw/vpx_satd avx2 sse2 neon msa/;
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -372,6 +372,45 @@
}
}
+void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,  // 32x32 Hadamard: four 16x16 SSE2 transforms followed by one combining pass
+                             tran_low_t *coeff) {+ int idx;  // coeff is written at offsets 0..1023 (four groups of 256)
+  for (idx = 0; idx < 4; ++idx) {+ const int16_t *src_ptr =  // one iteration per 16x16 quadrant of the source block
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;  // idx >> 1 gives a row offset of 0 or 16, idx & 1 a column offset of 0 or 16
+ vpx_hadamard_16x16_sse2(src_ptr, src_stride, coeff + idx * 256);  // quadrant idx writes its 256 results starting at coeff[idx * 256]
+ }
+
+  for (idx = 0; idx < 256; idx += 8) {+ __m128i coeff0 = load_tran_low(coeff);  // 32 iterations, 8 positions each; load_tran_low is defined elsewhere (presumably yields eight 16-bit lanes -- confirm)
+ __m128i coeff1 = load_tran_low(coeff + 256);  // same positions in the second quadrant's results
+ __m128i coeff2 = load_tran_low(coeff + 512);  // ... third quadrant
+ __m128i coeff3 = load_tran_low(coeff + 768);  // ... fourth quadrant
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);  // first stage: sum of quadrants 0 and 1, per 16-bit lane
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);  // difference of quadrants 0 and 1
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);  // sum of quadrants 2 and 3
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);  // difference of quadrants 2 and 3
+
+ b0 = _mm_srai_epi16(b0, 2);  // arithmetic shift right by 2 on each lane before the second stage
+ b1 = _mm_srai_epi16(b1, 2);  // NOTE(review): the add/sub above wrap at 16 bits before this shift -- confirm the 16x16 outputs cannot overflow here
+ b2 = _mm_srai_epi16(b2, 2);
+ b3 = _mm_srai_epi16(b3, 2);
+
+ coeff0 = _mm_add_epi16(b0, b2);  // second stage sums
+ coeff1 = _mm_add_epi16(b1, b3);
+ store_tran_low(coeff0, coeff);  // results overwrite the same slots that were loaded (in place)
+ store_tran_low(coeff1, coeff + 256);
+
+ coeff2 = _mm_sub_epi16(b0, b2);  // second stage differences
+ coeff3 = _mm_sub_epi16(b1, b3);
+ store_tran_low(coeff2, coeff + 512);
+ store_tran_low(coeff3, coeff + 768);
+
+ coeff += 8;  // step to the next 8 positions in all four quadrants
+ }
+}
+
 int vpx_satd_sse2(const tran_low_t *coeff, int length) {int i;
const __m128i zero = _mm_setzero_si128();
--
⑨