ref: 1ada7d4d6f838dc0842fc89159747755c516ce24
parent: 166a6c8e49fe1335feae6ffc450325f7f5f628c6
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Thu Nov 2 22:46:38 EDT 2023
Vectorizing sgemv for multiples of 4 with SSE
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -709,6 +709,23 @@
}
_mm256_storeu_ps (&y[0], vy0);
}
+ for (;i<rows-3;i+=4)
+ {
+ float *y;
+ __m128 vy0;
+ y = &out[i];
+ vy0 = _mm_setzero_ps();
+ for (j=0;j<cols;j++)
+ {
+ __m128 vxj;
+ __m128 vw;
+ vxj = _mm_broadcast_ss(&x[j]);
+
+ vw = _mm_loadu_ps(&weights[j*col_stride + i]);
+ vy0 = _mm_fmadd_ps(vw, vxj, vy0);
+ }
+ _mm_storeu_ps (&y[0], vy0);
+ }
for (;i<rows;i++)
{
out[i] = 0;
--
⑨