ref: 35eebaf602ffd1477da1b604d345fab37aa8d42c
parent: 8131c24b679175a8a10ab4fd39be766c4625ab4a
author: Rob Sykes <robs@users.sourceforge.net>
date: Tue Aug 7 15:55:47 EDT 2012
1) Restore multi-threaded FFT performance lost in 14.3.1. 2) Use lrint and loop-unrolling to improve performance of rate & dft_filter.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,6 +75,7 @@
check_include_files("byteswap.h" HAVE_BYTESWAP_H)
check_include_files("inttypes.h" HAVE_INTTYPES_H)
+check_include_files("fenv.h" HAVE_FENV_H)
check_include_files("glob.h" HAVE_GLOB_H)
check_include_files("io.h" HAVE_IO_H)
#check_include_files("ltdl.h" HAVE_LTDL_H) # no plug-ins as yet
@@ -102,6 +103,9 @@
optional(NEED_LIBM math.h m pow "")
if(NEED_LIBM)
set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} -lm)
+ optional(HAVE_LRINT math.h m lrint "")
+else(NEED_LIBM)
+ check_function_exists("lrint" HAVE_LRINT)
endif(NEED_LIBM)
optional(EXTERNAL_GSM gsm/gsm.h gsm gsm_create "")
optional(EXTERNAL_LPC10 lpc10/lpc10.h lpc10 lpc10_create "")
--- a/configure.ac
+++ b/configure.ac
@@ -138,10 +138,13 @@
dnl Checks for header files.
AC_HEADER_STDC
-AC_CHECK_HEADERS(fcntl.h unistd.h byteswap.h sys/stat.h sys/time.h sys/timeb.h sys/types.h sys/utsname.h termios.h glob.h)
+AC_CHECK_HEADERS(fcntl.h unistd.h byteswap.h sys/stat.h sys/time.h sys/timeb.h sys/types.h sys/utsname.h termios.h glob.h fenv.h)
dnl Checks for library functions.
-AC_CHECK_FUNCS(strcasecmp strdup popen vsnprintf gettimeofday mkstemp fmemopen)
+AC_CHECK_FUNCS(strcasecmp strdup popen vsnprintf gettimeofday mkstemp fmemopen lrint)
+if test "$ac_cv_func_lrint" = no; then
+ AC_CHECK_LIB(m, lrint, AC_DEFINE(HAVE_LRINT, 1))
+fi
dnl Check if math library is needed.
AC_CHECK_FUNC(pow)
--- a/src/dft_filter.c
+++ b/src/dft_filter.c
@@ -79,20 +79,16 @@
sox_sample_t * obuf, size_t * isamp, size_t * osamp)
{
priv_t * p = (priv_t *)effp->priv;
- size_t i, odone = min(*osamp, (size_t)fifo_occupancy(&p->output_fifo));
- double const * s = fifo_read(&p->output_fifo, (int)odone, NULL);
- SOX_SAMPLE_LOCALS;
+ size_t odone = min(*osamp, (size_t)fifo_occupancy(&p->output_fifo));
- for (i = 0; i < odone; ++i)
- *obuf++ = SOX_FLOAT_64BIT_TO_SAMPLE(*s++, effp->clips);
+ double const * s = fifo_read(&p->output_fifo, (int)odone, NULL);
+ lsx_save_samples(obuf, s, odone, &effp->clips);
p->samples_out += odone;
if (*isamp && odone < *osamp) {
double * t = fifo_write(&p->input_fifo, (int)*isamp, NULL);
p->samples_in += *isamp;
-
- for (i = *isamp; i; --i)
- *t++ = SOX_SAMPLE_TO_FLOAT_64BIT(*ibuf++, effp->clips);
+ lsx_load_samples(t, ibuf, *isamp);
filter(p);
}
else *isamp = 0;
--- a/src/effects_i_dsp.c
+++ b/src/effects_i_dsp.c
@@ -26,8 +26,69 @@
#include <assert.h>
#include <string.h>
-/* Numerical Recipes cubic spline */
+/* Concurrent Control with "Readers" and "Writers", P.J. Courtois et al, 1971:*/
+#if defined HAVE_OPENMP
+
+typedef struct {
+ int readcount, writecount; /* readers currently inside / writers inside or queued; initial value = 0 */
+ omp_lock_t mutex_1, mutex_2, mutex_3, w, r; /* mutex_1 guards readcount, mutex_2 guards writecount; initial value = 1 (the paper's semaphore notation, i.e. unlocked) */
+} ccrw2_t; /* Problem #2: `writers-preference' */
+
+#define ccrw2_become_reader(p) do { /* enter a read section; p is a ccrw2_t object (members are reached with `.') */\
+ omp_set_lock(&p.mutex_3); /* outermost lock, taken on the reader entry path only */\
+ omp_set_lock(&p.r); /* the writer side holds r while writecount > 0, so a reader waits here meanwhile */\
+ omp_set_lock(&p.mutex_1);\
+ if (++p.readcount == 1) omp_set_lock(&p.w); /* first reader in shuts writers out */\
+ omp_unset_lock(&p.mutex_1);\
+ omp_unset_lock(&p.r);\
+ omp_unset_lock(&p.mutex_3);\
+} while (0)
+#define ccrw2_cease_reading(p) do { /* leave a read section */\
+ omp_set_lock(&p.mutex_1);\
+ if (!--p.readcount) omp_unset_lock(&p.w); /* last reader out re-admits writers; NOTE(review): this may run on a different thread than the one that set w -- confirm the OpenMP runtime tolerates unsetting a lock it does not own */\
+ omp_unset_lock(&p.mutex_1);\
+} while (0)
+#define ccrw2_become_writer(p) do { /* enter the exclusive write section */\
+ omp_set_lock(&p.mutex_2);\
+ if (++p.writecount == 1) omp_set_lock(&p.r); /* first writer to arrive holds off new readers */\
+ omp_unset_lock(&p.mutex_2);\
+ omp_set_lock(&p.w); /* wait until the readers inside (or another writer) have released w */\
+} while (0)
+#define ccrw2_cease_writing(p) do { /* leave the write section */\
+ omp_unset_lock(&p.w);\
+ omp_set_lock(&p.mutex_2);\
+ if (!--p.writecount) omp_unset_lock(&p.r); /* last writer out lets readers in again; NOTE(review): same cross-thread unset concern as for w */\
+ omp_unset_lock(&p.mutex_2);\
+} while (0)
+#define ccrw2_init(p) do { /* initialise all five locks; readcount and writecount are not touched here */\
+ omp_init_lock(&p.mutex_1);\
+ omp_init_lock(&p.mutex_2);\
+ omp_init_lock(&p.mutex_3);\
+ omp_init_lock(&p.w);\
+ omp_init_lock(&p.r);\
+} while (0)
+#define ccrw2_clear(p) do { /* destroy the five locks, in the reverse of the order ccrw2_init set them up */\
+ omp_destroy_lock(&p.r);\
+ omp_destroy_lock(&p.w);\
+ omp_destroy_lock(&p.mutex_3);\
+ omp_destroy_lock(&p.mutex_2);\
+ omp_destroy_lock(&p.mutex_1);\
+} while (0)
+
+#else
+
+#define ccrw2_become_reader(x) (void)0 /* built without HAVE_OPENMP: every ccrw2 operation expands to a no-op expression */
+#define ccrw2_cease_reading(x) (void)0
+#define ccrw2_become_writer(x) (void)0
+#define ccrw2_cease_writing(x) (void)0
+#define ccrw2_init(x) (void)0 /* the argument is discarded, so it need not name a declared object */
+#define ccrw2_clear(x) (void)0
+
+#endif /* HAVE_OPENMP */
+
+/* Numerical Recipes cubic spline: */
+
void lsx_prepare_spline3(double const * x, double const * y, int n,
double start_1d, double end_1d, double * y_2d)
{
@@ -98,8 +159,8 @@
static int * lsx_fft_br;
static double * lsx_fft_sc;
static int fft_len = -1;
-#ifdef HAVE_OPENMP
-static omp_lock_t fft_cache_lock;
+#if defined HAVE_OPENMP
+static ccrw2_t fft_cache_ccrw;
#endif
void init_fft_cache(void)
@@ -107,7 +168,7 @@
assert(lsx_fft_br == NULL);
assert(lsx_fft_sc == NULL);
assert(fft_len == -1);
- omp_init_lock(&fft_cache_lock);
+ ccrw2_init(fft_cache_ccrw);
fft_len = 0;
}
@@ -114,7 +175,7 @@
void clear_fft_cache(void)
{
assert(fft_len >= 0);
- omp_destroy_lock(&fft_cache_lock);
+ ccrw2_clear(fft_cache_ccrw);
free(lsx_fft_br);
free(lsx_fft_sc);
lsx_fft_sc = NULL;
@@ -122,33 +183,48 @@
fft_len = -1;
}
-static void update_fft_cache(int len)
+static sox_bool update_fft_cache(int len) /* returns with the cache still locked: sox_true = held as writer, sox_false = held as reader */
{
assert(lsx_is_power_of_2(len));
assert(fft_len >= 0);
- omp_set_lock(&fft_cache_lock);
+ ccrw2_become_reader(fft_cache_ccrw); /* start as a reader; upgrade only if the tables are too short for len */
if (len > fft_len) {
- int old_n = fft_len;
- fft_len = len;
- lsx_fft_br = lsx_realloc(lsx_fft_br, dft_br_len(fft_len) * sizeof(*lsx_fft_br));
- lsx_fft_sc = lsx_realloc(lsx_fft_sc, dft_sc_len(fft_len) * sizeof(*lsx_fft_sc));
- if (!old_n)
- lsx_fft_br[0] = 0;
+ ccrw2_cease_reading(fft_cache_ccrw); /* no in-place upgrade: drop the read lock first ... */
+ ccrw2_become_writer(fft_cache_ccrw); /* ... then queue for exclusive access */
+ if (len > fft_len) { /* re-check: another thread may have grown the cache while no lock was held */
+ int old_n = fft_len;
+ fft_len = len;
+ lsx_fft_br = lsx_realloc(lsx_fft_br, dft_br_len(fft_len) * sizeof(*lsx_fft_br));
+ lsx_fft_sc = lsx_realloc(lsx_fft_sc, dft_sc_len(fft_len) * sizeof(*lsx_fft_sc));
+ if (!old_n) /* first allocation only */
+ lsx_fft_br[0] = 0; /* presumably marks the tables as not yet set up for the FFT routines -- TODO confirm */
+ return sox_true; /* write lock deliberately kept; the caller releases it through done_with_fft_cache() */
+ }
+ ccrw2_cease_writing(fft_cache_ccrw); /* someone else already grew the cache: step back down ... */
+ ccrw2_become_reader(fft_cache_ccrw); /* ... to an ordinary reader */
}
+ return sox_false; /* read lock held */
}
+static void done_with_fft_cache(sox_bool is_writer) /* release the lock left held by update_fft_cache(); is_writer is that call's return value */
+{
+ if (is_writer)
+ ccrw2_cease_writing(fft_cache_ccrw); /* update_fft_cache() returned sox_true: the write lock is held */
+ else ccrw2_cease_reading(fft_cache_ccrw); /* otherwise only the read lock is held */
+}
+
void lsx_safe_rdft(int len, int type, double * d)
{
- update_fft_cache(len);
+ sox_bool is_writer = update_fft_cache(len); /* lock the shared tables, growing them first if len exceeds fft_len */
lsx_rdft(len, type, d, lsx_fft_br, lsx_fft_sc);
- omp_unset_lock(&fft_cache_lock);
+ done_with_fft_cache(is_writer); /* release whichever lock (read or write) update_fft_cache left held */
}
void lsx_safe_cdft(int len, int type, double * d)
{
- update_fft_cache(len);
+ sox_bool is_writer = update_fft_cache(len); /* lock the shared tables, growing them first if len exceeds fft_len */
lsx_cdft(len, type, d, lsx_fft_br, lsx_fft_sc);
- omp_unset_lock(&fft_cache_lock);
+ done_with_fft_cache(is_writer); /* release whichever lock (read or write) update_fft_cache left held */
}
void lsx_power_spectrum(int n, double const * in, double * out)
@@ -473,3 +549,82 @@
printf("%24.16e\n", h[i]);
}
}
+
+#if HAVE_FENV_H /* choose lrint32, a double -> 32-bit integer rounding primitive, if the platform offers one */
+ #include <fenv.h>
+ #if defined FE_INVALID /* the conversion code in this file tests FE_INVALID to spot failed conversions */
+ #if HAVE_LRINT && LONG_MAX == 2147483647 /* long is 32 bits wide, so lrint can be used as-is */
+ #define lrint32 lrint
+ #elif defined __GNUC__ && defined __x86_64__ /* otherwise use inline assembly on GCC targeting x86-64 */
+ #define lrint32 lrint32 /* self-definition so that `#if defined lrint32' also detects the function variant */
+ static __inline sox_int32_t lrint32(double input) {
+ sox_int32_t result;
+ __asm__ __volatile__("fistpl %0": "=m"(result): "t"(input): "st"); /* x87 store-integer-and-pop: input arrives on the x87 stack top ("t"), result goes to memory */
+ return result;
+ }
+ #endif /* neither case applies: lrint32 stays undefined and the macro-based fallback code is compiled instead */
+ #endif
+#endif
+
+#if defined lrint32
+#define _ dest[i] = lrint32(src[i]), ++i,
+#pragma STDC FENV_ACCESS ON
+
+static void rint_clip(sox_sample_t * const dest, double const * const src, /* slow path: round src[i] .. src[n-1] into dest one sample at a time, clipping failures */
+ size_t i, size_t const n, sox_uint64_t * const clips)
+{
+ for (; i < n; ++i) {
+ dest[i] = lrint32(src[i]);
+ if (fetestexcept(FE_INVALID)) { /* this conversion raised FE_INVALID; relies on the flag being clear on entry */
+ feclearexcept(FE_INVALID); /* re-arm the flag for the next sample */
+ dest[i] = src[i] > 0? SOX_SAMPLE_MAX : SOX_SAMPLE_MIN; /* saturate by sign; any value not > 0 (NaN included) becomes SOX_SAMPLE_MIN */
+ ++*clips; /* one clip counted per failed sample */
+ }
+ }
+}
+
+void lsx_save_samples(sox_sample_t * const dest, double const * const src, /* round n doubles from src into dest; conversions that raise FE_INVALID are saturated and counted in the clips counter */
+ size_t const n, sox_uint64_t * const clips)
+{ /* NOTE(review): values are rounded as-is with no rescaling, unlike the SOX_FLOAT_64BIT_TO_SAMPLE fallback variant -- confirm callers are indifferent to the scale */
+ size_t i;
+ feclearexcept(FE_INVALID); /* discard any stale flag so the tests below reflect only these conversions */
+ for (i = 0; i < (n & ~7);) { /* whole groups of 8 samples; i is advanced inside the `_' expansions */
+ _ _ _ _ _ _ _ _ 0; /* eight unrolled `dest[i] = lrint32(src[i]), ++i,' steps; the 0 terminates the comma expression */
+ if (fetestexcept(FE_INVALID)) { /* flag is tested once per group of 8 rather than once per sample */
+ feclearexcept(FE_INVALID);
+ rint_clip(dest, src, i - 8, i, clips); /* redo just this group sample-by-sample to locate and clip the offenders */
+ }
+ }
+ rint_clip(dest, src, i, n, clips); /* the remaining n % 8 samples */
+}
+
+void lsx_load_samples(double * const dest, sox_sample_t const * const src, /* copy n samples from src into dest as doubles */
+ size_t const n)
+{
+ size_t i;
+ for (i = 0; i < n; ++i)
+ dest[i] = src[i]; /* plain integer -> double conversion; no rescaling in this build variant */
+}
+
+#pragma STDC FENV_ACCESS OFF
+#undef _
+#else
+
+void lsx_save_samples(sox_sample_t * const dest, double const * const src, /* fallback when lrint32 is unavailable: convert n doubles to samples via the SoX macro */
+ size_t const n, sox_uint64_t * const clips)
+{
+ SOX_SAMPLE_LOCALS; /* presumably declares temporaries used by the conversion macro below -- TODO confirm */
+ size_t i;
+ for (i = 0; i < n; ++i)
+ dest[i] = SOX_FLOAT_64BIT_TO_SAMPLE(src[i], *clips); /* the macro is handed the caller's counter as an lvalue; presumably bumps it on clipping -- TODO confirm */
+}
+
+void lsx_load_samples(double * const dest, sox_sample_t const * const src, /* fallback when lrint32 is unavailable: convert n samples to doubles via the SoX macro */
+ size_t const n)
+{
+ size_t i;
+ for (i = 0; i < n; ++i)
+ dest[i] = SOX_SAMPLE_TO_FLOAT_64BIT(src[i],); /* second macro argument intentionally left empty: no clip counter is supplied */
+}
+
+#endif
--- a/src/rate.c
+++ b/src/rate.c
@@ -638,16 +638,14 @@
sox_sample_t * obuf, size_t * isamp, size_t * osamp)
{
priv_t * p = (priv_t *)effp->priv;
- size_t i, odone = *osamp;
- SOX_SAMPLE_LOCALS;
+ size_t odone = *osamp;
sample_t const * s = rate_output(&p->rate, NULL, &odone);
- for (i = 0; i < odone; ++i)
- *obuf++ = SOX_FLOAT_64BIT_TO_SAMPLE(*s++, effp->clips);
+ lsx_save_samples(obuf, s, odone, &effp->clips);
if (*isamp && odone < *osamp) {
sample_t * t = rate_input(&p->rate, NULL, *isamp);
- for (i = *isamp; i; --i) *t++ = SOX_SAMPLE_TO_FLOAT_64BIT(*ibuf++,);
+ lsx_load_samples(t, ibuf, *isamp);
rate_process(&p->rate);
}
else *isamp = 0;
--- a/src/sox_i.h
+++ b/src/sox_i.h
@@ -115,6 +115,10 @@
void lsx_fir_to_phase(double * * h, int * len,
int * post_len, double phase0);
void lsx_plot_fir(double * h, int num_points, sox_rate_t rate, sox_plot_t type, char const * title, double y1, double y2);
+void lsx_save_samples(sox_sample_t * const dest, double const * const src, /* convert n doubles in src to samples in dest; clipped conversions are tallied through clips */
+ size_t const n, sox_uint64_t * const clips);
+void lsx_load_samples(double * const dest, sox_sample_t const * const src, /* convert n samples in src to doubles in dest */
+ size_t const n);
#ifdef HAVE_BYTESWAP_H
#include <byteswap.h>
--- a/src/soxconfig.h.cmake
+++ b/src/soxconfig.h.cmake
@@ -8,6 +8,7 @@
#cmakedefine HAVE_AO 1
#cmakedefine HAVE_BYTESWAP_H 1
#cmakedefine HAVE_COREAUDIO 1
+#cmakedefine HAVE_FENV_H 1
#cmakedefine HAVE_FFMPEG 1
#cmakedefine HAVE_FLAC 1
#cmakedefine HAVE_FMEMOPEN 1
@@ -21,6 +22,7 @@
#cmakedefine HAVE_LAME_LAME_H 1
#cmakedefine HAVE_LAME_SET_VBR_QUALITY 1
#define HAVE_LPC10 1
+#cmakedefine HAVE_LRINT 1
#cmakedefine HAVE_LTDL_H 1
#cmakedefine HAVE_MACHINE_SOUNDCARD_H 1
#cmakedefine HAVE_MAD_H 1