shithub: sox

Download patch

ref: 35eebaf602ffd1477da1b604d345fab37aa8d42c
parent: 8131c24b679175a8a10ab4fd39be766c4625ab4a
author: Rob Sykes <robs@users.sourceforge.net>
date: Tue Aug 7 15:55:47 EDT 2012

1) Restore multi-threaded FFT performance lost in 14.3.1.
2) Use lrint and loop-unrolling to improve performance of rate & dft_filter.

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,6 +75,7 @@
 
 check_include_files("byteswap.h"         HAVE_BYTESWAP_H)
 check_include_files("inttypes.h"         HAVE_INTTYPES_H)
+check_include_files("fenv.h"             HAVE_FENV_H)
 check_include_files("glob.h"             HAVE_GLOB_H)
 check_include_files("io.h"               HAVE_IO_H)
 #check_include_files("ltdl.h"             HAVE_LTDL_H) # no plug-ins as yet
@@ -102,6 +103,9 @@
 optional(NEED_LIBM math.h m pow "")
 if(NEED_LIBM)
   set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} -lm)
+  optional(HAVE_LRINT math.h m lrint "")
+else(NEED_LIBM)
+  check_function_exists("lrint" HAVE_LRINT)
 endif(NEED_LIBM)
 optional(EXTERNAL_GSM gsm/gsm.h gsm gsm_create "")
 optional(EXTERNAL_LPC10 lpc10/lpc10.h lpc10 lpc10_create "")
--- a/configure.ac
+++ b/configure.ac
@@ -138,10 +138,13 @@
 
 dnl Checks for header files.
 AC_HEADER_STDC
-AC_CHECK_HEADERS(fcntl.h unistd.h byteswap.h sys/stat.h sys/time.h sys/timeb.h sys/types.h sys/utsname.h termios.h glob.h)
+AC_CHECK_HEADERS(fcntl.h unistd.h byteswap.h sys/stat.h sys/time.h sys/timeb.h sys/types.h sys/utsname.h termios.h glob.h fenv.h)
 
 dnl Checks for library functions.
-AC_CHECK_FUNCS(strcasecmp strdup popen vsnprintf gettimeofday mkstemp fmemopen)
+AC_CHECK_FUNCS(strcasecmp strdup popen vsnprintf gettimeofday mkstemp fmemopen lrint)
+if test "$ac_cv_func_lrint" = no; then
+  AC_CHECK_LIB(m, lrint, AC_DEFINE(HAVE_LRINT, 1))
+fi
 
 dnl Check if math library is needed.
 AC_CHECK_FUNC(pow)
--- a/src/dft_filter.c
+++ b/src/dft_filter.c
@@ -79,20 +79,16 @@
                 sox_sample_t * obuf, size_t * isamp, size_t * osamp)
 {
   priv_t * p = (priv_t *)effp->priv;
-  size_t i, odone = min(*osamp, (size_t)fifo_occupancy(&p->output_fifo));
-  double const * s = fifo_read(&p->output_fifo, (int)odone, NULL);
-  SOX_SAMPLE_LOCALS;
+  size_t odone = min(*osamp, (size_t)fifo_occupancy(&p->output_fifo));
 
-  for (i = 0; i < odone; ++i)
-    *obuf++ = SOX_FLOAT_64BIT_TO_SAMPLE(*s++, effp->clips);
+  double const * s = fifo_read(&p->output_fifo, (int)odone, NULL);
+  lsx_save_samples(obuf, s, odone, &effp->clips);
   p->samples_out += odone;
 
   if (*isamp && odone < *osamp) {
     double * t = fifo_write(&p->input_fifo, (int)*isamp, NULL);
     p->samples_in += *isamp;
-
-    for (i = *isamp; i; --i)
-      *t++ = SOX_SAMPLE_TO_FLOAT_64BIT(*ibuf++, effp->clips);
+    lsx_load_samples(t, ibuf, *isamp);
     filter(p);
   }
   else *isamp = 0;
--- a/src/effects_i_dsp.c
+++ b/src/effects_i_dsp.c
@@ -26,8 +26,69 @@
 #include <assert.h>
 #include <string.h>
 
-/* Numerical Recipes cubic spline */
+/* Concurrent Control with "Readers" and "Writers", P.J. Courtois et al, 1971:*/
 
+#if defined HAVE_OPENMP
+
+typedef struct {
+  int readcount, writecount; /* readers / writers currently registered; initial value = 0 */
+  omp_lock_t mutex_1, mutex_2, mutex_3, w, r; /* stand in for the paper's semaphores; initial value = 1 (i.e. unlocked) */
+} ccrw2_t; /* Problem #2: `writers-preference' */
+
+#define ccrw2_become_reader(p) do { /* acquire shared (read) access; p is a ccrw2_t lvalue */\
+  omp_set_lock(&(p).mutex_3);\
+    omp_set_lock(&(p).r);\
+      omp_set_lock(&(p).mutex_1);\
+        if (++(p).readcount == 1) omp_set_lock(&(p).w); /* first reader in takes w */\
+      omp_unset_lock(&(p).mutex_1);\
+    omp_unset_lock(&(p).r);\
+  omp_unset_lock(&(p).mutex_3);\
+} while (0)
+#define ccrw2_cease_reading(p) do { /* give up shared (read) access; p is a ccrw2_t lvalue */\
+  omp_set_lock(&(p).mutex_1);\
+    if (!--(p).readcount) omp_unset_lock(&(p).w); /* last reader out releases w; NOTE(review): w may have been set by another thread -- check OpenMP lock-ownership rules */\
+  omp_unset_lock(&(p).mutex_1);\
+} while (0)
+#define ccrw2_become_writer(p) do { /* acquire exclusive (write) access; p is a ccrw2_t lvalue */\
+  omp_set_lock(&(p).mutex_2);\
+    if (++(p).writecount == 1) omp_set_lock(&(p).r); /* first writer takes r, which ccrw2_become_reader must also pass */\
+  omp_unset_lock(&(p).mutex_2);\
+  omp_set_lock(&(p).w); /* blocks while readers or another writer hold w */\
+} while (0)
+#define ccrw2_cease_writing(p) do { /* give up exclusive (write) access; p is a ccrw2_t lvalue */\
+  omp_unset_lock(&(p).w);\
+  omp_set_lock(&(p).mutex_2);\
+    if (!--(p).writecount) omp_unset_lock(&(p).r); /* last writer out releases r; NOTE(review): r may have been set by another thread -- check OpenMP lock-ownership rules */\
+  omp_unset_lock(&(p).mutex_2);\
+} while (0)
+#define ccrw2_init(p) do { /* initialise the five locks; readcount/writecount are not set here */\
+  omp_init_lock(&(p).mutex_1);\
+  omp_init_lock(&(p).mutex_2);\
+  omp_init_lock(&(p).mutex_3);\
+  omp_init_lock(&(p).w);\
+  omp_init_lock(&(p).r);\
+} while (0)
+#define ccrw2_clear(p) do { /* destroy the five locks, in the reverse of ccrw2_init's order */\
+  omp_destroy_lock(&(p).r);\
+  omp_destroy_lock(&(p).w);\
+  omp_destroy_lock(&(p).mutex_3);\
+  omp_destroy_lock(&(p).mutex_2);\
+  omp_destroy_lock(&(p).mutex_1);\
+} while (0)
+
+#else
+
+#define ccrw2_become_reader(x) (void)0 /* without HAVE_OPENMP every ccrw2 operation is a no-op */
+#define ccrw2_cease_reading(x) (void)0
+#define ccrw2_become_writer(x) (void)0
+#define ccrw2_cease_writing(x) (void)0
+#define ccrw2_init(x) (void)0 /* the argument is never expanded, so it need not be declared */
+#define ccrw2_clear(x) (void)0
+
+#endif /* HAVE_OPENMP */
+
+/* Numerical Recipes cubic spline: */
+
 void lsx_prepare_spline3(double const * x, double const * y, int n,
     double start_1d, double end_1d, double * y_2d)
 {
@@ -98,8 +159,8 @@
 static int * lsx_fft_br;
 static double * lsx_fft_sc;
 static int fft_len = -1;
-#ifdef HAVE_OPENMP
-static omp_lock_t fft_cache_lock;
+#if defined HAVE_OPENMP
+static ccrw2_t fft_cache_ccrw;
 #endif
 
 void init_fft_cache(void)
@@ -107,7 +168,7 @@
   assert(lsx_fft_br == NULL);
   assert(lsx_fft_sc == NULL);
   assert(fft_len == -1);
-  omp_init_lock(&fft_cache_lock);
+  ccrw2_init(fft_cache_ccrw);
   fft_len = 0;
 }
 
@@ -114,7 +175,7 @@
 void clear_fft_cache(void)
 {
   assert(fft_len >= 0);
-  omp_destroy_lock(&fft_cache_lock);
+  ccrw2_clear(fft_cache_ccrw);
   free(lsx_fft_br);
   free(lsx_fft_sc);
   lsx_fft_sc = NULL;
@@ -122,33 +183,48 @@
   fft_len = -1;
 }
 
-static void update_fft_cache(int len)
+static sox_bool update_fft_cache(int len) /* returns sox_true if the caller is left holding write access, sox_false if read access */
 {
   assert(lsx_is_power_of_2(len));
   assert(fft_len >= 0);
-  omp_set_lock(&fft_cache_lock);
+  ccrw2_become_reader(fft_cache_ccrw); /* if the tables are already big enough, read access is all that is taken */
   if (len > fft_len) {
-    int old_n = fft_len;
-    fft_len = len;
-    lsx_fft_br = lsx_realloc(lsx_fft_br, dft_br_len(fft_len) * sizeof(*lsx_fft_br));
-    lsx_fft_sc = lsx_realloc(lsx_fft_sc, dft_sc_len(fft_len) * sizeof(*lsx_fft_sc));
-    if (!old_n)
-      lsx_fft_br[0] = 0;
+    ccrw2_cease_reading(fft_cache_ccrw); /* read access is dropped before write access is requested */
+    ccrw2_become_writer(fft_cache_ccrw);
+    if (len > fft_len) { /* re-test: fft_len may have changed while no access was held */
+      int old_n = fft_len;
+      fft_len = len;
+      lsx_fft_br = lsx_realloc(lsx_fft_br, dft_br_len(fft_len) * sizeof(*lsx_fft_br));
+      lsx_fft_sc = lsx_realloc(lsx_fft_sc, dft_sc_len(fft_len) * sizeof(*lsx_fft_sc));
+      if (!old_n)
+        lsx_fft_br[0] = 0;
+      return sox_true; /* write access is intentionally still held on return */
+    }
+    ccrw2_cease_writing(fft_cache_ccrw); /* growth no longer needed: swap write access back for read access */
+    ccrw2_become_reader(fft_cache_ccrw);
   }
+  return sox_false; /* read access is held on return */
 }
 
+static void done_with_fft_cache(sox_bool is_writer) /* releases FFT-cache access; is_writer says which kind is held */
+{
+  if (is_writer)
+    ccrw2_cease_writing(fft_cache_ccrw);
+  else ccrw2_cease_reading(fft_cache_ccrw);
+}
+
 void lsx_safe_rdft(int len, int type, double * d)
 {
-  update_fft_cache(len);
+  sox_bool is_writer = update_fft_cache(len); /* cache access (read or write) is held from here on */
   lsx_rdft(len, type, d, lsx_fft_br, lsx_fft_sc);
-  omp_unset_lock(&fft_cache_lock);
+  done_with_fft_cache(is_writer); /* release whichever kind of access was granted */
 }
 
 void lsx_safe_cdft(int len, int type, double * d)
 {
-  update_fft_cache(len);
+  sox_bool is_writer = update_fft_cache(len); /* cache access (read or write) is held from here on */
   lsx_cdft(len, type, d, lsx_fft_br, lsx_fft_sc);
-  omp_unset_lock(&fft_cache_lock);
+  done_with_fft_cache(is_writer); /* release whichever kind of access was granted */
 }
 
 void lsx_power_spectrum(int n, double const * in, double * out)
@@ -473,3 +549,82 @@
       printf("%24.16e\n", h[i]);
   }
 }
+
+#if HAVE_FENV_H
+  #include <fenv.h>
+  #if defined FE_INVALID /* the conversion code below tests this exception flag to decide when to clip */
+    #if HAVE_LRINT && LONG_MAX == 2147483647 /* long is 32 bits here, so lrint can serve as lrint32 directly */
+      #define lrint32 lrint
+    #elif defined __GNUC__ && defined __x86_64__
+      #define lrint32 lrint32 /* self-definition, so that `#if defined lrint32' is also true for the function below */
+      static __inline sox_int32_t lrint32(double input) {
+        sox_int32_t result;
+        __asm__ __volatile__("fistpl %0": "=m"(result): "t"(input): "st"); /* x87: store top of stack as a 32-bit integer and pop */
+        return result;
+      }
+    #endif /* otherwise lrint32 stays undefined */
+  #endif
+#endif
+
+#if defined lrint32
+#define _ dest[i] = lrint32(src[i]), ++i, /* one unrolled conversion step; the trailing comma lets steps be chained in one expression */
+#pragma STDC FENV_ACCESS ON
+
+static void rint_clip(sox_sample_t * const dest, double const * const src,
+    size_t i, size_t const n, sox_uint64_t * const clips)
+{ /* Per-sample path: convert src[i..n) into dest, testing FE_INVALID after every sample. */
+  for (; i < n; ++i) {
+    dest[i] = lrint32(src[i]);
+    if (fetestexcept(FE_INVALID)) { /* the conversion of this sample was flagged as invalid */
+      feclearexcept(FE_INVALID); /* reset so that the next sample is judged on its own */
+      dest[i] = src[i] > 0? SOX_SAMPLE_MAX : SOX_SAMPLE_MIN; /* clip by sign; anything not > 0 becomes SOX_SAMPLE_MIN */
+      ++*clips; /* one clip counted per replaced sample */
+    }
+  }
+}
+
+void lsx_save_samples(sox_sample_t * const dest, double const * const src,
+    size_t const n, sox_uint64_t * const clips)
+{ /* Convert n doubles from src into samples in dest via lrint32; *clips is incremented once per clipped sample. */
+  size_t i;
+  feclearexcept(FE_INVALID); /* start with the flag clear so that only this call's conversions are seen */
+  for (i = 0; i < (n & ~7);) { /* whole groups of 8 only; i is advanced inside the `_' macro */
+    _ _ _ _ _ _ _ _ 0; /* 8 unrolled conversions; the final 0 just terminates the comma chain */
+    if (fetestexcept(FE_INVALID)) { /* the flag is tested once per group rather than once per sample */
+      feclearexcept(FE_INVALID);
+      rint_clip(dest, src, i - 8, i, clips); /* redo just this group sample by sample */
+    }
+  }
+  rint_clip(dest, src, i, n, clips); /* the remaining n % 8 samples */
+}
+
+void lsx_load_samples(double * const dest, sox_sample_t const * const src,
+    size_t const n)
+{ /* Copy n samples from src into dest as doubles. */
+  size_t i;
+  for (i = 0; i < n; ++i)
+    dest[i] = src[i]; /* plain integer-to-double conversion; no conversion macro is applied in this variant */
+}
+
+#pragma STDC FENV_ACCESS OFF
+#undef _
+#else
+
+void lsx_save_samples(sox_sample_t * const dest, double const * const src,
+    size_t const n, sox_uint64_t * const clips)
+{ /* Variant compiled when lrint32 is not defined: convert n doubles from src into samples in dest. */
+  SOX_SAMPLE_LOCALS;
+  size_t i;
+  for (i = 0; i < n; ++i)
+    dest[i] = SOX_FLOAT_64BIT_TO_SAMPLE(src[i], *clips); /* the macro is handed the caller's clip counter */
+}
+
+void lsx_load_samples(double * const dest, sox_sample_t const * const src,
+    size_t const n)
+{ /* Variant compiled when lrint32 is not defined: convert n samples from src into doubles in dest. */
+  size_t i;
+  for (i = 0; i < n; ++i)
+    dest[i] = SOX_SAMPLE_TO_FLOAT_64BIT(src[i],); /* second macro argument left empty, as in the rate.c line this replaces */
+}
+
+#endif
--- a/src/rate.c
+++ b/src/rate.c
@@ -638,16 +638,14 @@
                 sox_sample_t * obuf, size_t * isamp, size_t * osamp)
 {
   priv_t * p = (priv_t *)effp->priv;
-  size_t i, odone = *osamp;
-  SOX_SAMPLE_LOCALS;
+  size_t odone = *osamp;
 
   sample_t const * s = rate_output(&p->rate, NULL, &odone);
-  for (i = 0; i < odone; ++i)
-    *obuf++ = SOX_FLOAT_64BIT_TO_SAMPLE(*s++, effp->clips);
+  lsx_save_samples(obuf, s, odone, &effp->clips);
 
   if (*isamp && odone < *osamp) {
     sample_t * t = rate_input(&p->rate, NULL, *isamp);
-    for (i = *isamp; i; --i) *t++ = SOX_SAMPLE_TO_FLOAT_64BIT(*ibuf++,);
+    lsx_load_samples(t, ibuf, *isamp);
     rate_process(&p->rate);
   }
   else *isamp = 0;
--- a/src/sox_i.h
+++ b/src/sox_i.h
@@ -115,6 +115,10 @@
 void lsx_fir_to_phase(double * * h, int * len,
     int * post_len, double phase0);
 void lsx_plot_fir(double * h, int num_points, sox_rate_t rate, sox_plot_t type, char const * title, double y1, double y2);
+void lsx_save_samples(sox_sample_t * const dest, double const * const src,
+    size_t const n, sox_uint64_t * const clips);
+void lsx_load_samples(double * const dest, sox_sample_t const * const src,
+    size_t const n);
 
 #ifdef HAVE_BYTESWAP_H
 #include <byteswap.h>
--- a/src/soxconfig.h.cmake
+++ b/src/soxconfig.h.cmake
@@ -8,6 +8,7 @@
 #cmakedefine HAVE_AO                  1
 #cmakedefine HAVE_BYTESWAP_H          1
 #cmakedefine HAVE_COREAUDIO           1
+#cmakedefine HAVE_FENV_H              1
 #cmakedefine HAVE_FFMPEG              1
 #cmakedefine HAVE_FLAC                1
 #cmakedefine HAVE_FMEMOPEN            1
@@ -21,6 +22,7 @@
 #cmakedefine HAVE_LAME_LAME_H         1
 #cmakedefine HAVE_LAME_SET_VBR_QUALITY 1
 #define HAVE_LPC10                    1
+#cmakedefine HAVE_LRINT               1
 #cmakedefine HAVE_LTDL_H              1
 #cmakedefine HAVE_MACHINE_SOUNDCARD_H 1
 #cmakedefine HAVE_MAD_H               1