shithub: aubio

Download patch

ref: 20c876939927652944f45b2f3967bc9d59024e48
parent: 25cf9577c0f942143a95908d25d727cb43b4f819
parent: 799d992f0e2edac2a508919e6eb3b535fb8590c3
author: Paul Brossier <piem@piem.org>
date: Sun Oct 1 16:04:30 EDT 2017

Merge branch 'intel_ipp'

--- a/src/aubio_priv.h
+++ b/src/aubio_priv.h
@@ -123,7 +123,38 @@
 #endif /* HAVE_AUBIO_DOUBLE */
 #endif /* HAVE_ATLAS */
 
-#if !defined(HAVE_MEMCPY_HACKS) && !defined(HAVE_ACCELERATE) && !defined(HAVE_ATLAS)
+#if defined HAVE_INTEL_IPP
+#include <ippcore.h>
+#include <ippvm.h>
+#include <ipps.h>
+#ifndef HAVE_AUBIO_DOUBLE
+#define aubio_ippsSet         ippsSet_32f
+#define aubio_ippsZero        ippsZero_32f
+#define aubio_ippsCopy        ippsCopy_32f
+#define aubio_ippsMul         ippsMul_32f
+#define aubio_ippsMulC        ippsMulC_32f
+#define aubio_ippsAddC        ippsAddC_32f
+#define aubio_ippsLn          ippsLn_32f_A21
+#define aubio_ippsMean(a,b,c) ippsMean_32f(a, b, c, ippAlgHintFast)
+#define aubio_ippsSum(a,b,c)  ippsSum_32f(a, b, c, ippAlgHintFast)
+#define aubio_ippsMax         ippsMax_32f
+#define aubio_ippsMin         ippsMin_32f
+#else /* HAVE_AUBIO_DOUBLE */
+#define aubio_ippsSet         ippsSet_64f
+#define aubio_ippsZero        ippsZero_64f
+#define aubio_ippsCopy        ippsCopy_64f
+#define aubio_ippsMul         ippsMul_64f
+#define aubio_ippsMulC        ippsMulC_64f
+#define aubio_ippsAddC        ippsAddC_64f
+#define aubio_ippsLn          ippsLn_64f_A26
+#define aubio_ippsMean        ippsMean_64f
+#define aubio_ippsSum         ippsSum_64f
+#define aubio_ippsMax         ippsMax_64f
+#define aubio_ippsMin         ippsMin_64f
+#endif /* HAVE_AUBIO_DOUBLE */
+#endif
+
+#if !defined(HAVE_MEMCPY_HACKS) && !defined(HAVE_ACCELERATE) && !defined(HAVE_ATLAS) && !defined(HAVE_INTEL_IPP)
 #define HAVE_NOOPT 1
 #else
 #undef HAVE_NOOPT
--- a/src/cvec.c
+++ b/src/cvec.c
@@ -85,31 +85,40 @@
         s->length, t->length);
     return;
   }
-#ifdef HAVE_MEMCPY_HACKS
+#if defined(HAVE_INTEL_IPP)
+  aubio_ippsCopy(s->phas, t->phas, (int)s->length);
+  aubio_ippsCopy(s->norm, t->norm, (int)s->length);
+#elif defined(HAVE_MEMCPY_HACKS)
   memcpy(t->norm, s->norm, t->length * sizeof(smpl_t));
   memcpy(t->phas, s->phas, t->length * sizeof(smpl_t));
-#else /* HAVE_MEMCPY_HACKS */
+#else
   uint_t j;
   for (j=0; j< t->length; j++) {
     t->norm[j] = s->norm[j];
     t->phas[j] = s->phas[j];
   }
-#endif /* HAVE_MEMCPY_HACKS */
+#endif
 }
 
-void cvec_norm_set_all (cvec_t *s, smpl_t val) {
+void cvec_norm_set_all(cvec_t *s, smpl_t val) {
+#if defined(HAVE_INTEL_IPP)
+  aubio_ippsSet(val, s->norm, (int)s->length);
+#else
   uint_t j;
   for (j=0; j< s->length; j++) {
     s->norm[j] = val;
   }
+#endif
 }
 
 void cvec_norm_zeros(cvec_t *s) {
-#ifdef HAVE_MEMCPY_HACKS
+#if defined(HAVE_INTEL_IPP)
+  aubio_ippsZero(s->norm, (int)s->length);
+#elif defined(HAVE_MEMCPY_HACKS)
   memset(s->norm, 0, s->length * sizeof(smpl_t));
-#else /* HAVE_MEMCPY_HACKS */
+#else
   cvec_norm_set_all (s, 0.);
-#endif /* HAVE_MEMCPY_HACKS */
+#endif
 }
 
 void cvec_norm_ones(cvec_t *s) {
@@ -117,14 +126,20 @@
 }
 
 void cvec_phas_set_all (cvec_t *s, smpl_t val) {
+#if defined(HAVE_INTEL_IPP)
+  aubio_ippsSet(val, s->phas, (int)s->length);
+#else
   uint_t j;
   for (j=0; j< s->length; j++) {
     s->phas[j] = val;
   }
+#endif
 }
 
 void cvec_phas_zeros(cvec_t *s) {
-#ifdef HAVE_MEMCPY_HACKS
+#if defined(HAVE_INTEL_IPP)
+  aubio_ippsZero(s->phas, (int)s->length);
+#elif defined(HAVE_MEMCPY_HACKS)
   memset(s->phas, 0, s->length * sizeof(smpl_t));
 #else
   cvec_phas_set_all (s, 0.);
@@ -141,8 +156,14 @@
 }
 
 void cvec_logmag(cvec_t *s, smpl_t lambda) {
-  uint_t j;
-  for (j=0; j< s->length; j++) {
-    s->norm[j] = LOG(lambda * s->norm[j] + 1);
-  }
+  #if defined(HAVE_INTEL_IPP)
+    aubio_ippsMulC(s->norm, lambda, s->norm, (int)s->length);
+    aubio_ippsAddC(s->norm, 1.0, s->norm, (int)s->length);
+    aubio_ippsLn(s->norm, s->norm, (int)s->length);
+  #else
+    uint_t j;
+    for (j=0; j< s->length; j++) {
+      s->norm[j] = LOG(lambda * s->norm[j] + 1);
+    }
+  #endif
 }
--- a/src/fvec.c
+++ b/src/fvec.c
@@ -60,28 +60,31 @@
 }
 
 void fvec_set_all (fvec_t *s, smpl_t val) {
-#if !defined(HAVE_ACCELERATE) && !defined(HAVE_ATLAS)
-  uint_t j;
-  for (j=0; j< s->length; j++) {
-    s->data[j] = val;
-  }
+#if defined(HAVE_INTEL_IPP)
+  aubio_ippsSet(val, s->data, (int)s->length);
 #elif defined(HAVE_ATLAS)
   aubio_catlas_set(s->length, val, s->data, 1);
 #elif defined(HAVE_ACCELERATE)
   aubio_vDSP_vfill(&val, s->data, 1, s->length);
+#else
+  uint_t j;
+  for ( j = 0; j< s->length; j++ )
+  {
+    s->data[j] = val;
+  }
 #endif
 }
 
 void fvec_zeros(fvec_t *s) {
-#if !defined(HAVE_MEMCPY_HACKS) && !defined(HAVE_ACCELERATE)
-  fvec_set_all (s, 0.);
-#else
-#if defined(HAVE_MEMCPY_HACKS)
+#if defined(HAVE_INTEL_IPP)
+  aubio_ippsZero(s->data, (int)s->length);
+#elif defined(HAVE_ACCELERATE)
+  aubio_vDSP_vclr(s->data, 1, s->length);
+#elif defined(HAVE_MEMCPY_HACKS)
   memset(s->data, 0, s->length * sizeof(smpl_t));
 #else
-  aubio_vDSP_vclr(s->data, 1, s->length);
+  fvec_set_all(s, 0.);
 #endif
-#endif
 }
 
 void fvec_ones(fvec_t *s) {
@@ -96,27 +99,31 @@
 }
 
 void fvec_weight(fvec_t *s, const fvec_t *weight) {
-#ifndef HAVE_ACCELERATE
-  uint_t j;
   uint_t length = MIN(s->length, weight->length);
-  for (j=0; j< length; j++) {
+#if defined(HAVE_INTEL_IPP)
+  aubio_ippsMul(s->data, weight->data, s->data, (int)length);
+#elif defined(HAVE_ACCELERATE)
+  aubio_vDSP_vmul( s->data, 1, weight->data, 1, s->data, 1, length );
+#else
+  uint_t j;
+  for (j = 0; j < length; j++) {
     s->data[j] *= weight->data[j];
   }
-#else
-  aubio_vDSP_vmul(s->data, 1, weight->data, 1, s->data, 1, s->length);
 #endif /* HAVE_ACCELERATE */
 }
 
 void fvec_weighted_copy(const fvec_t *in, const fvec_t *weight, fvec_t *out) {
-#ifndef HAVE_ACCELERATE
+  uint_t length = MIN(in->length, MIN(out->length, weight->length));
+#if defined(HAVE_INTEL_IPP)
+  aubio_ippsMul(in->data, weight->data, out->data, (int)length);
+#elif defined(HAVE_ACCELERATE)
+  aubio_vDSP_vmul(in->data, 1, weight->data, 1, out->data, 1, length);
+#else
   uint_t j;
-  uint_t length = MIN(out->length, weight->length);
-  for (j=0; j< length; j++) {
+  for (j = 0; j < length; j++) {
     out->data[j] = in->data[j] * weight->data[j];
   }
-#else
-  aubio_vDSP_vmul(in->data, 1, weight->data, 1, out->data, 1, out->length);
-#endif /* HAVE_ACCELERATE */
+#endif
 }
 
 void fvec_copy(const fvec_t *s, fvec_t *t) {
@@ -125,16 +132,18 @@
         s->length, t->length);
     return;
   }
-#ifdef HAVE_NOOPT
-  uint_t j;
-  for (j=0; j< t->length; j++) {
-    t->data[j] = s->data[j];
-  }
-#elif defined(HAVE_MEMCPY_HACKS)
-  memcpy(t->data, s->data, t->length * sizeof(smpl_t));
+#if defined(HAVE_INTEL_IPP)
+  aubio_ippsCopy(s->data, t->data, (int)s->length);
 #elif defined(HAVE_ATLAS)
   aubio_cblas_copy(s->length, s->data, 1, t->data, 1);
 #elif defined(HAVE_ACCELERATE)
   aubio_vDSP_mmov(s->data, t->data, 1, s->length, 1, 1);
+#elif defined(HAVE_MEMCPY_HACKS)
+  memcpy(t->data, s->data, t->length * sizeof(smpl_t));
+#else
+  uint_t j;
+  for (j = 0; j < t->length; j++) {
+    t->data[j] = s->data[j];
+  }
 #endif
 }
--- a/src/mathutils.c
+++ b/src/mathutils.c
@@ -159,16 +159,19 @@
 fvec_mean (fvec_t * s)
 {
   smpl_t tmp = 0.0;
-#ifndef HAVE_ACCELERATE
+#if defined(HAVE_INTEL_IPP)
+  aubio_ippsMean(s->data, (int)s->length, &tmp);
+  return tmp;
+#elif defined(HAVE_ACCELERATE)
+  aubio_vDSP_meanv(s->data, 1, &tmp, s->length);
+  return tmp;
+#else
   uint_t j;
   for (j = 0; j < s->length; j++) {
     tmp += s->data[j];
   }
-  return tmp / (smpl_t) (s->length);
-#else
-  aubio_vDSP_meanv(s->data, 1, &tmp, s->length);
-  return tmp;
-#endif /* HAVE_ACCELERATE */
+  return tmp / (smpl_t)(s->length);
+#endif
 }
 
 smpl_t
@@ -175,14 +178,16 @@
 fvec_sum (fvec_t * s)
 {
   smpl_t tmp = 0.0;
-#ifndef HAVE_ACCELERATE
+#if defined(HAVE_INTEL_IPP)
+  aubio_ippsSum(s->data, (int)s->length, &tmp);
+#elif defined(HAVE_ACCELERATE)
+  aubio_vDSP_sve(s->data, 1, &tmp, s->length);
+#else
   uint_t j;
   for (j = 0; j < s->length; j++) {
     tmp += s->data[j];
   }
-#else
-  aubio_vDSP_sve(s->data, 1, &tmp, s->length);
-#endif /* HAVE_ACCELERATE */
+#endif
   return tmp;
 }
 
@@ -189,15 +194,18 @@
 smpl_t
 fvec_max (fvec_t * s)
 {
-#ifndef HAVE_ACCELERATE
+#if defined(HAVE_INTEL_IPP)
+  smpl_t tmp = 0.;
+  aubio_ippsMax( s->data, (int)s->length, &tmp);
+#elif defined(HAVE_ACCELERATE)
+  smpl_t tmp = 0.;
+  aubio_vDSP_maxv( s->data, 1, &tmp, s->length );
+#else
   uint_t j;
-  smpl_t tmp = 0.0;
-  for (j = 0; j < s->length; j++) {
+  smpl_t tmp = s->data[0];
+  for (j = 1; j < s->length; j++) {
     tmp = (tmp > s->data[j]) ? tmp : s->data[j];
   }
-#else
-  smpl_t tmp = 0.;
-  aubio_vDSP_maxv(s->data, 1, &tmp, s->length);
 #endif
   return tmp;
 }
@@ -205,15 +213,18 @@
 smpl_t
 fvec_min (fvec_t * s)
 {
-#ifndef HAVE_ACCELERATE
+#if defined(HAVE_INTEL_IPP)
+  smpl_t tmp = 0.;
+  aubio_ippsMin(s->data, (int)s->length, &tmp);
+#elif defined(HAVE_ACCELERATE)
+  smpl_t tmp = 0.;
+  aubio_vDSP_minv(s->data, 1, &tmp, s->length);
+#else
   uint_t j;
   smpl_t tmp = s->data[0];
-  for (j = 0; j < s->length; j++) {
+  for (j = 1; j < s->length; j++) {
     tmp = (tmp < s->data[j]) ? tmp : s->data[j];
   }
-#else
-  smpl_t tmp = 0.;
-  aubio_vDSP_minv(s->data, 1, &tmp, s->length);
 #endif
   return tmp;
 }
@@ -572,6 +583,17 @@
   uint_t i = 1;
   while (i < a) i <<= 1;
   return i;
+}
+
+uint_t
+aubio_power_of_two_order (uint_t a)
+{
+  int order = 0;
+  int temp = aubio_next_power_of_two(a);
+  while (temp >>= 1) {
+    ++order;
+  }
+  return order;
 }
 
 smpl_t
--- a/src/mathutils.h
+++ b/src/mathutils.h
@@ -312,6 +312,9 @@
 /** return the next power of power of 2 greater than a */
 uint_t aubio_next_power_of_two(uint_t a);
 
+/** return the base-2 logarithm of the smallest power of 2 greater than or equal to a */
+uint_t aubio_power_of_two_order(uint_t a);
+
 /** compute normalised autocorrelation function
 
   \param input vector to compute autocorrelation from
--- a/src/spectral/fft.c
+++ b/src/spectral/fft.c
@@ -77,8 +77,7 @@
 // a global mutex for FFTW thread safety
 pthread_mutex_t aubio_fftw_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-#else
-#ifdef HAVE_ACCELERATE        // using ACCELERATE
+#elif defined HAVE_ACCELERATE        // using ACCELERATE
 // https://developer.apple.com/library/mac/#documentation/Accelerate/Reference/vDSPRef/Reference/reference.html
 #include <Accelerate/Accelerate.h>
 
@@ -112,32 +111,65 @@
 #define aubio_vvsqrt                   vvsqrt
 #endif /* HAVE_AUBIO_DOUBLE */
 
-#else                         // using OOURA
+#elif defined HAVE_INTEL_IPP // using INTEL IPP
+
+#if !HAVE_AUBIO_DOUBLE
+#define aubio_IppFloat                 Ipp32f
+#define aubio_IppComplex               Ipp32fc
+#define aubio_FFTSpec                  FFTSpec_R_32f
+#define aubio_ippsMalloc_complex       ippsMalloc_32fc
+#define aubio_ippsFFTInit_R            ippsFFTInit_R_32f
+#define aubio_ippsFFTGetSize_R         ippsFFTGetSize_R_32f
+#define aubio_ippsFFTInv_CCSToR        ippsFFTInv_CCSToR_32f
+#define aubio_ippsFFTFwd_RToCCS        ippsFFTFwd_RToCCS_32f
+#define aubio_ippsAtan2                ippsAtan2_32f_A21
+#else /* HAVE_AUBIO_DOUBLE */
+#define aubio_IppFloat                 Ipp64f
+#define aubio_IppComplex               Ipp64fc
+#define aubio_FFTSpec                  FFTSpec_R_64f
+#define aubio_ippsMalloc_complex       ippsMalloc_64fc
+#define aubio_ippsFFTInit_R            ippsFFTInit_R_64f
+#define aubio_ippsFFTGetSize_R         ippsFFTGetSize_R_64f
+#define aubio_ippsFFTInv_CCSToR        ippsFFTInv_CCSToR_64f
+#define aubio_ippsFFTFwd_RToCCS        ippsFFTFwd_RToCCS_64f
+#define aubio_ippsAtan2                ippsAtan2_64f_A50
+#endif
+
+
+#else // using OOURA
 // let's use ooura instead
 extern void aubio_ooura_rdft(int, int, smpl_t *, int *, smpl_t *);
 
-#endif /* HAVE_ACCELERATE */
-#endif /* HAVE_FFTW3 */
+#endif
 
 struct _aubio_fft_t {
   uint_t winsize;
   uint_t fft_size;
+
 #ifdef HAVE_FFTW3             // using FFTW3
   real_t *in, *out;
   fftw_plan pfw, pbw;
-  fft_data_t * specdata;      /* complex spectral data */
-#else
-#ifdef HAVE_ACCELERATE        // using ACCELERATE
+  fft_data_t * specdata; /* complex spectral data */
+
+#elif defined HAVE_ACCELERATE  // using ACCELERATE
   int log2fftsize;
   aubio_FFTSetup fftSetup;
   aubio_DSPSplitComplex spec;
   smpl_t *in, *out;
+
+#elif defined HAVE_INTEL_IPP  // using Intel IPP
+  smpl_t *in, *out;
+  Ipp8u* memSpec;
+  Ipp8u* memInit;
+  Ipp8u* memBuffer;
+  struct aubio_FFTSpec* fftSpec;
+  aubio_IppComplex* complexOut;
 #else                         // using OOURA
   smpl_t *in, *out;
   smpl_t *w;
   int *ip;
-#endif /* HAVE_ACCELERATE */
-#endif /* HAVE_FFTW3 */
+#endif /* using OOURA */
+
   fvec_t * compspec;
 };
 
@@ -147,6 +179,7 @@
     AUBIO_ERR("fft: got winsize %d, but can not be < 2\n", winsize);
     goto beach;
   }
+
 #ifdef HAVE_FFTW3
   uint_t i;
   s->winsize  = winsize;
@@ -175,17 +208,55 @@
   for (i = 0; i < s->fft_size; i++) {
     s->specdata[i] = 0.;
   }
-#else
-#ifdef HAVE_ACCELERATE        // using ACCELERATE
+
+#elif defined HAVE_ACCELERATE  // using ACCELERATE
   s->winsize = winsize;
   s->fft_size = winsize;
   s->compspec = new_fvec(winsize);
-  s->log2fftsize = (uint_t)log2f(s->fft_size);
+  s->log2fftsize = aubio_power_of_two_order(s->fft_size);
   s->in = AUBIO_ARRAY(smpl_t, s->fft_size);
   s->out = AUBIO_ARRAY(smpl_t, s->fft_size);
   s->spec.realp = AUBIO_ARRAY(smpl_t, s->fft_size/2);
   s->spec.imagp = AUBIO_ARRAY(smpl_t, s->fft_size/2);
   s->fftSetup = aubio_vDSP_create_fftsetup(s->log2fftsize, FFT_RADIX2);
+
+#elif defined HAVE_INTEL_IPP  // using Intel IPP
+  const IppHintAlgorithm qualityHint = ippAlgHintAccurate; // OR ippAlgHintFast;
+  const int flags = IPP_FFT_NODIV_BY_ANY; // we're scaling manually afterwards
+  int order = aubio_power_of_two_order(winsize);
+  int sizeSpec, sizeInit, sizeBuffer;
+  IppStatus status;
+
+  if (winsize <= 4 || aubio_is_power_of_two(winsize) != 1)
+  {
+    AUBIO_ERR("intel IPP fft: can only create with sizes > 4 and power of two, requested %d,"
+      " try recompiling aubio with --enable-fftw3\n", winsize);
+    goto beach;
+  }
+
+  status = aubio_ippsFFTGetSize_R(order, flags, qualityHint,
+      &sizeSpec, &sizeInit, &sizeBuffer);
+  if (status != ippStsNoErr) {
+    AUBIO_ERR("fft: failed to initialize fft. IPP error: %d\n", status);
+    goto beach;
+  }
+  s->fft_size = s->winsize = winsize;
+  s->compspec = new_fvec(winsize);
+  s->in = AUBIO_ARRAY(smpl_t, s->winsize);
+  s->out = AUBIO_ARRAY(smpl_t, s->winsize);
+  s->memSpec = ippsMalloc_8u(sizeSpec);
+  s->memBuffer = ippsMalloc_8u(sizeBuffer);
+  if (sizeInit > 0 ) {
+    s->memInit = ippsMalloc_8u(sizeInit);
+  }
+  s->complexOut = aubio_ippsMalloc_complex(s->fft_size / 2 + 1);
+  status = aubio_ippsFFTInit_R(
+    &s->fftSpec, order, flags, qualityHint, s->memSpec, s->memInit);
+  if (status != ippStsNoErr) {
+    AUBIO_ERR("fft: failed to initialize. IPP error: %d\n", status);
+    goto beach;
+  }
+
 #else                         // using OOURA
   if (aubio_is_power_of_two(winsize) != 1) {
     AUBIO_ERR("fft: can only create with sizes power of two, requested %d,"
@@ -200,9 +271,10 @@
   s->ip    = AUBIO_ARRAY(int   , s->fft_size);
   s->w     = AUBIO_ARRAY(smpl_t, s->fft_size);
   s->ip[0] = 0;
-#endif /* HAVE_ACCELERATE */
-#endif /* HAVE_FFTW3 */
+#endif /* using OOURA */
+
   return s;
+
 beach:
   AUBIO_FREE(s);
   return NULL;
@@ -210,7 +282,6 @@
 
 void del_aubio_fft(aubio_fft_t * s) {
   /* destroy data */
-  del_fvec(s->compspec);
 #ifdef HAVE_FFTW3             // using FFTW3
   pthread_mutex_lock(&aubio_fftw_mutex);
   fftw_destroy_plan(s->pfw);
@@ -217,18 +288,26 @@
   fftw_destroy_plan(s->pbw);
   fftw_free(s->specdata);
   pthread_mutex_unlock(&aubio_fftw_mutex);
-#else /* HAVE_FFTW3 */
-#ifdef HAVE_ACCELERATE        // using ACCELERATE
+
+#elif defined HAVE_ACCELERATE // using ACCELERATE
   AUBIO_FREE(s->spec.realp);
   AUBIO_FREE(s->spec.imagp);
   aubio_vDSP_destroy_fftsetup(s->fftSetup);
+
+#elif defined HAVE_INTEL_IPP  // using Intel IPP
+  ippFree(s->memSpec);
+  ippFree(s->memInit);
+  ippFree(s->memBuffer);
+  ippFree(s->complexOut);
+
 #else                         // using OOURA
   AUBIO_FREE(s->w);
   AUBIO_FREE(s->ip);
-#endif /* HAVE_ACCELERATE */
-#endif /* HAVE_FFTW3 */
-  AUBIO_FREE(s->out);
+#endif
+
+  del_fvec(s->compspec);
   AUBIO_FREE(s->in);
+  AUBIO_FREE(s->out);
   AUBIO_FREE(s);
 }
 
@@ -251,6 +330,7 @@
 #else
   memcpy(s->in, input->data, s->winsize * sizeof(smpl_t));
 #endif /* HAVE_MEMCPY_HACKS */
+
 #ifdef HAVE_FFTW3             // using FFTW3
   fftw_execute(s->pfw);
 #ifdef HAVE_COMPLEX_H
@@ -265,8 +345,8 @@
     compspec->data[i] = s->specdata[i];
   }
 #endif /* HAVE_COMPLEX_H */
-#else /* HAVE_FFTW3 */
-#ifdef HAVE_ACCELERATE        // using ACCELERATE
+
+#elif defined HAVE_ACCELERATE // using ACCELERATE
   // convert real data to even/odd format used in vDSP
   aubio_vDSP_ctoz((aubio_DSPComplex*)s->in, 2, &s->spec, 1, s->fft_size/2);
   // compute the FFT
@@ -281,6 +361,19 @@
   // apply scaling
   smpl_t scale = 1./2.;
   aubio_vDSP_vsmul(compspec->data, 1, &scale, compspec->data, 1, s->fft_size);
+
+#elif defined HAVE_INTEL_IPP  // using Intel IPP
+
+  // apply fft
+  aubio_ippsFFTFwd_RToCCS(s->in, (aubio_IppFloat*)s->complexOut, s->fftSpec, s->memBuffer);
+  // convert complex buffer to [ r0, r1, ..., rN, iN-1, .., i2, i1]
+  compspec->data[0] = s->complexOut[0].re;
+  compspec->data[s->fft_size / 2] = s->complexOut[s->fft_size / 2].re;
+  for (i = 1; i < s->fft_size / 2; i++) {
+    compspec->data[i] = s->complexOut[i].re;
+    compspec->data[s->fft_size - i] = s->complexOut[i].im;
+  }
+
 #else                         // using OOURA
   aubio_ooura_rdft(s->winsize, 1, s->in, s->ip, s->w);
   compspec->data[0] = s->in[0];
@@ -289,8 +382,7 @@
     compspec->data[i] = s->in[2 * i];
     compspec->data[s->winsize - i] = - s->in[2 * i + 1];
   }
-#endif /* HAVE_ACCELERATE */
-#endif /* HAVE_FFTW3 */
+#endif /* using OOURA */
 }
 
 void aubio_fft_rdo_complex(aubio_fft_t * s, const fvec_t * compspec, fvec_t * output) {
@@ -313,8 +405,8 @@
   for (i = 0; i < output->length; i++) {
     output->data[i] = s->out[i]*renorm;
   }
-#else /* HAVE_FFTW3 */
-#ifdef HAVE_ACCELERATE        // using ACCELERATE
+
+#elif defined HAVE_ACCELERATE // using ACCELERATE
   // convert from real imag  [ r0, r1, ..., rN, iN-1, .., i2, i1]
   // to vDSP packed format   [ r0, rN, r1, i1, ..., rN-1, iN-1 ]
   s->out[0] = compspec->data[0];
@@ -332,6 +424,23 @@
   // apply scaling
   smpl_t scale = 1.0 / s->winsize;
   aubio_vDSP_vsmul(output->data, 1, &scale, output->data, 1, s->fft_size);
+
+#elif defined HAVE_INTEL_IPP  // using Intel IPP
+
+  // convert from real imag  [ r0, r1, ..., rN, iN-1, .., i2, i1 ] to complex format
+  s->complexOut[0].re = compspec->data[0];
+  s->complexOut[0].im = 0;
+  s->complexOut[s->fft_size / 2].re = compspec->data[s->fft_size / 2];
+  s->complexOut[s->fft_size / 2].im = 0.0;
+  for (i = 1; i < s->fft_size / 2; i++) {
+    s->complexOut[i].re = compspec->data[i];
+    s->complexOut[i].im = compspec->data[s->fft_size - i];
+  }
+  // apply fft
+  aubio_ippsFFTInv_CCSToR((const aubio_IppFloat *)s->complexOut, output->data, s->fftSpec, s->memBuffer);
+  // apply scaling
+  aubio_ippsMulC(output->data, 1.0 / s->winsize, output->data, s->fft_size);
+
 #else                         // using OOURA
   smpl_t scale = 2.0 / s->winsize;
   s->out[0] = compspec->data[0];
@@ -344,8 +453,7 @@
   for (i=0; i < s->winsize; i++) {
     output->data[i] = s->out[i] * scale;
   }
-#endif /* HAVE_ACCELERATE */
-#endif /* HAVE_FFTW3 */
+#endif
 }
 
 void aubio_fft_get_spectrum(const fvec_t * compspec, cvec_t * spectrum) {
@@ -365,10 +473,26 @@
   } else {
     spectrum->phas[0] = 0.;
   }
+#if defined(HAVE_INTEL_IPP)
+  // convert from real imag  [ r0, r1, ..., rN, iN-1, ..., i2, i1]
+  //                     to  [ r0, r1, ..., rN, i1, i2, ..., iN-1]
+  for (i = 1; i < spectrum->length / 2; i++) {
+    ELEM_SWAP(compspec->data[compspec->length - i],
+        compspec->data[spectrum->length + i - 1]);
+  }
+  aubio_ippsAtan2(compspec->data + spectrum->length,
+      compspec->data + 1, spectrum->phas + 1, spectrum->length - 1);
+  // revert the imaginary part back again
+  for (i = 1; i < spectrum->length / 2; i++) {
+    ELEM_SWAP(compspec->data[spectrum->length + i - 1],
+        compspec->data[compspec->length - i]);
+  }
+#else
   for (i=1; i < spectrum->length - 1; i++) {
     spectrum->phas[i] = ATAN2(compspec->data[compspec->length-i],
         compspec->data[i]);
   }
+#endif
   if (compspec->data[compspec->length/2] < 0) {
     spectrum->phas[spectrum->length - 1] = PI;
   } else {
--- a/src/wscript_build
+++ b/src/wscript_build
@@ -3,6 +3,7 @@
 uselib = []
 uselib += ['M']
 uselib += ['FFTW3', 'FFTW3F']
+uselib += ['INTEL_IPP']
 uselib += ['SAMPLERATE']
 uselib += ['SNDFILE']
 uselib += ['AVCODEC']
--- a/wscript
+++ b/wscript
@@ -50,6 +50,9 @@
     add_option_enable_disable(ctx, 'fftw3', default = False,
             help_str = 'compile with fftw3 instead of ooura',
             help_disable_str = 'do not compile with fftw3')
+    add_option_enable_disable(ctx, 'intelipp', default = False,
+            help_str = 'use Intel IPP libraries (auto)',
+            help_disable_str = 'do not use Intel IPP libraries')
     add_option_enable_disable(ctx, 'complex', default = False,
             help_str ='compile with C99 complex',
             help_disable_str = 'do not use C99 complex (default)' )
@@ -155,6 +158,10 @@
         ctx.env.LINKFLAGS += ['/DEBUG', '/INCREMENTAL:NO']
         # configure warnings
         ctx.env.CFLAGS += ['/W4', '/D_CRT_SECURE_NO_WARNINGS']
+        # ignore "possible loss of data" warnings
+        ctx.env.CFLAGS += ['/wd4305', '/wd4244', '/wd4245', '/wd4267']
+        # ignore "unreferenced formal parameter" warnings
+        ctx.env.CFLAGS += ['/wd4100']
         # set optimization level and runtime libs
         if (ctx.options.build_type == "release"):
             ctx.env.CFLAGS += ['/Ox']
@@ -244,7 +251,9 @@
         ctx.env.cstlib_PATTERN = '%s.a'
 
         # tell emscripten functions we want to expose
-        from python.lib.gen_external import get_c_declarations, get_cpp_objects_from_c_declarations, get_all_func_names_from_lib, generate_lib_from_c_declarations
+        from python.lib.gen_external import get_c_declarations, \
+                get_cpp_objects_from_c_declarations, get_all_func_names_from_lib, \
+                generate_lib_from_c_declarations
         c_decls = get_c_declarations(usedouble=False)  # emscripten can't use double
         objects = list(get_cpp_objects_from_c_declarations(c_decls))
         # ensure that aubio structs are exported
@@ -283,6 +292,21 @@
     else:
         ctx.msg('Checking if complex.h is enabled', 'no')
 
+    # check for Intel IPP
+    if (ctx.options.enable_intelipp != False):
+        has_ipp_headers = ctx.check(header_name=['ippcore.h', 'ippvm.h', 'ipps.h'],
+                mandatory = False)
+        has_ipp_libs = ctx.check(lib=['ippcore', 'ippvm', 'ipps'],
+                uselib_store='INTEL_IPP', mandatory = False)
+        if (has_ipp_headers and has_ipp_libs):
+            ctx.msg('Checking if Intel IPP is available', 'yes')
+            ctx.define('HAVE_INTEL_IPP', 1)
+            if ctx.env.CC_NAME == 'msvc':
+                # force linking single-threaded (sequential) static IPP libraries on Windows with msvc
+                ctx.define('_IPP_SEQUENTIAL_STATIC', 1)
+        else:
+            ctx.msg('Checking if Intel IPP is available', 'no')
+
     # check for fftw3
     if (ctx.options.enable_fftw3 != False or ctx.options.enable_fftw3f != False):
         # one of fftwf or fftw3f
@@ -306,7 +330,7 @@
                         mandatory = ctx.options.enable_fftw3)
         ctx.define('HAVE_FFTW3', 1)
 
-    # fftw not enabled, use vDSP or ooura
+    # fftw not enabled, use vDSP, intelIPP or ooura
     if 'HAVE_FFTW3F' in ctx.env.define_key:
         ctx.msg('Checking for FFT implementation', 'fftw3f')
     elif 'HAVE_FFTW3' in ctx.env.define_key:
@@ -313,6 +337,8 @@
         ctx.msg('Checking for FFT implementation', 'fftw3')
     elif 'HAVE_ACCELERATE' in ctx.env.define_key:
         ctx.msg('Checking for FFT implementation', 'vDSP')
+    elif 'HAVE_INTEL_IPP' in ctx.env.define_key:
+        ctx.msg('Checking for FFT implementation', 'Intel IPP')
     else:
         ctx.msg('Checking for FFT implementation', 'ooura')