ref: 60d6eab63d70d6892eea86543f2c84227ec3ac9a
parent: 3e7ab9ff87760e0a1b9507cdd023d5cf4119fd5b
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Tue Jul 6 13:33:33 EDT 2021
Doing a bit of unrolling to speed things up
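For reference, the step being unrolled is the usual AVX2 8-bit dot-product kernel: _mm256_maddubs_epi16() multiplies unsigned 8-bit inputs against signed 8-bit weights into 16-bit pairs, and _mm256_madd_epi16() against a vector of ones widens those pairs into the 32-bit accumulator. Below is a minimal standalone sketch of the unroll-by-4 pattern; the function name, argument types, and pointer plumbing are illustrative, not the actual vec_avx.h interface.

#include <immintrin.h>
#include <stdint.h>

/* Hypothetical standalone version of the inner kernel; the real code in
   dnn/vec_avx.h works on its own sparse-matrix layout. Compile with -mavx2. */
static void sparse_madd_unrolled(int32_t *y, const uint8_t *x,
                                 const int8_t **wp, const int **idxp,
                                 int colblocks)
{
   const __m256i ones = _mm256_set1_epi16(1);
   const int8_t *w = *wp;
   const int *idx = *idxp;
   __m256i vy0 = _mm256_loadu_si256((const __m256i *)y);
   int j = 0;

   /* One column block: broadcast 4 input bytes, multiply them against 32
      weight bytes (u8 x s8 -> 16-bit pairs), widen the pairs to 32 bits
      by multiply-adding against ones, and accumulate into vy0. */
#define MADD_STEP() do { \
      __m256i vxj = _mm256_set1_epi32(*(const int *)&x[*idx++]); \
      __m256i vw = _mm256_loadu_si256((const __m256i *)w); \
      __m256i tmp = _mm256_maddubs_epi16(vxj, vw); \
      tmp = _mm256_madd_epi16(tmp, ones); \
      vy0 = _mm256_add_epi32(vy0, tmp); \
      w += 32; \
   } while (0)

   /* Manual unroll by 4: the loads and multiplies of consecutive blocks
      can overlap in flight, and the loop overhead is paid once per four
      blocks instead of once per block. */
   for (; j < colblocks - 3; j += 4)
   {
      MADD_STEP();
      MADD_STEP();
      MADD_STEP();
      MADD_STEP();
   }
   /* Remainder loop when colblocks is not a multiple of 4. */
   for (; j < colblocks; j++)
      MADD_STEP();
#undef MADD_STEP

   _mm256_storeu_si256((__m256i *)y, vy0);
   *wp = w;
   *idxp = idx;
}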
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -502,7 +502,40 @@
colblocks = *idx++;
y = &out[i];
vy0 = _mm256_loadu_si256((const __m256i *)&y[0]);
- for (j=0;j<colblocks;j++)
+ j=0;
+#if 1 /* Unrolling by 4 gives some gain, comment out if it does not. */
+ for (;j<colblocks-3;j+=4)
+ {
+ __m256i tmp;
+ __m256i vxj;
+ __m256i vw;
+ vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
+ vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
+ tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
+ vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
+ tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
+ vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
+ tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
+ vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
+ tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ }
+#endif
+ for (;j<colblocks;j++)
{
__m256i tmp;
__m256i vxj;
--