ref: a09815925a1116847ba6948ca8aa25052816b26f
parent: 2bc20e65c715300f813c71df272f095275a04e3a
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Wed Mar 20 08:57:59 EDT 2019
Neon: Make gcc actually generate VMLA instructions for sparse mul. Otherwise it was splitting the MLA into a mul and an add.
--- a/dnn/vec_neon.h
+++ b/dnn/vec_neon.h
@@ -187,13 +187,13 @@
for (j=0;j<cols;j++)
{
- float xj= x[*idx++];
+ float32x4_t xj= vld1q_dup_f32(&x[*idx++]);
float32x4_t wvec;
- wvec = vld1q_f32(&w[0]); y0_3 = vmlaq_n_f32(y0_3, wvec, xj);
- wvec = vld1q_f32(&w[4]); y4_7 = vmlaq_n_f32(y4_7, wvec, xj);
- wvec = vld1q_f32(&w[8]); y8_11 = vmlaq_n_f32(y8_11, wvec, xj);
- wvec = vld1q_f32(&w[12]); y12_15 = vmlaq_n_f32(y12_15, wvec, xj);
+ wvec = vld1q_f32(&w[0]); y0_3 = vmlaq_f32(y0_3, wvec, xj);
+ wvec = vld1q_f32(&w[4]); y4_7 = vmlaq_f32(y4_7, wvec, xj);
+ wvec = vld1q_f32(&w[8]); y8_11 = vmlaq_f32(y8_11, wvec, xj);
+ wvec = vld1q_f32(&w[12]); y12_15 = vmlaq_f32(y12_15, wvec, xj);
w += 16;
}
--
⑨