shithub: sox

Download patch

ref: 5a3edce57e604c0a54d7373619b65c2d9ec46881
parent: fc724b42fe0d335f2124f49bcfc1c38e887072bc
author: robs <robs>
date: Sat Jun 13 11:23:08 EDT 2009

vad updates

--- a/ChangeLog
+++ b/ChangeLog
@@ -85,6 +85,7 @@
   o New `fir' filter effect using external coefficients/file.  (robs)
   o New `biquad' filter effect using external coefficients.  (robs)
   o New `overdrive' effect.  (robs)
+  o New `vad' Voice Activity Detector effect (undocumented as yet).  (robs)
   o `synth' enhancements: can now set common parameters for multiple
     channels, new `pluck' and `tpdf' types, `scientific' note
     notation, [2778142] just intonation.  (robs)
--- a/FEATURES.in
+++ b/FEATURES.in
@@ -92,7 +92,7 @@
 ** silence: Remove portions of silence from the audio
 ** splice: Perform the equivalent of a cross-faded tape splice
 ** trim: Trim the ends of the audio
-** vad+: Prototype `silence' variant (W.I.P.)
+** vad: Voice activity detector
 
 * Mixing effects
 ** channels: Auto mix or duplicate to change number of channels
--- a/src/vad.c
+++ b/src/vad.c
@@ -20,58 +20,72 @@
 #include <string.h>
 
 typedef struct {
-  double        last_meas;
-  double        meas, slope1, slope2; /* TC -controlled */
+  double    * dft_buf, * noise_buf, * spectrum, * meas_buf, mean_meas; /* per-channel work arrays (allocated in start(), freed in stop()) and the smoothed measurement compared against trigger_level1 */
 } chan_t;
 
 typedef struct {                /* Configuration parameters: */
-  double        hp_freq, lp_freq, measure_freq, search_step_time;
-  double        measure_duration, search_time, pre_trigger_time, trigger_level;
-  double        trigger_tc, slope_tc1, slope_tc2;
+  double    noise_tc_up, noise_tc_down, noise_reduction_amount; /* options -N, -n, -r */
+  double    measure_freq, measure_duration, measure_tc, pre_trigger_time; /* options -f, -m, -M, -p */
+  double    hp_filter_freq, lp_filter_freq, hp_lifter_freq, lp_lifter_freq; /* options -h, -l, -H, -L */
+  double    trigger_tc, trigger_level1, search_time, gap_time; /* options -T, -t, -s, -g */
                                 /* Working variables: */
   sox_sample_t  * buffer;
-  unsigned      search_len, buffer_len, buffer_ptr, flush_done, search_step_len;
-
-  double        * dft_buf, * window1, * window2;
-  unsigned      dft_len, measure_period, measure_timer, measure_len;
-  chan_t        * channels;
-  double        trigger_meas_tc_mult, trigger_slope_tc_mult1, trigger_slope_tc_mult2;
-  double        search_slope_tc_mult1, search_slope_tc_mult2;
-  unsigned      start_bin, end_bin;
+  unsigned  dft_len, buffer_len, buffer_ptr, flush_done, gap_count; /* dft_len: power of two >= measure_len (at least 16); gap_count: gap_time in measurement periods */
+  unsigned  measure_period_len, measure_len, search_count, search_ptr; /* the two lengths are in samples times channels; search_count: entries in each meas_buf; search_ptr: current meas_buf slot */
+  unsigned  spectrum_start, spectrum_end, cepstrum_start, cepstrum_end; /* index ranges derived in start() from the filter and lifter frequencies */
+  int       measure_timer, booting; /* measure_timer: negative, counted up to 0 to fire a measurement; booting: steps up to 6, then becomes -1 */
+  double    measure_tc_mult, trigger_meas_tc_mult; /* exp(-1 / (tc * measure_freq)) smoothing multipliers */
+  double    noise_tc_up_mult, noise_tc_down_mult; /* same form, for the noise estimate rising and falling */
+  double    * spectrum_window, * cepstrum_window; /* scaled windows passed through lsx_apply_hann() in start() */
+  chan_t    * channels; /* one chan_t per input channel */
 } priv_t;
 
+#define GETOPT_FREQ(c, name, min) \
+    case c: p->name = lsx_parse_frequency(lsx_optarg, &parse_ptr); \
+      if (p->name < min || *parse_ptr) return lsx_usage(effp); \
+      break; /* option case: parse a frequency into p->name; usage error if it is below min or followed by unparsed text */
+
 static int create(sox_effect_t * effp, int argc, char * * argv)
 {
   priv_t * p = (priv_t *)effp->priv;
+  #define opt_str "+N:n:r:f:m:M:h:l:H:L:T:t:s:g:p:"
   int c;
 
-  p->hp_freq          = 120;
-  p->lp_freq          = 12500;
-  p->measure_duration = .2;
-  p->measure_freq     = 10;
-  p->trigger_tc       = .2;
-  p->trigger_level    = 33;
+  p->noise_tc_up      = .1;
+  p->noise_tc_down    = .01;
+  p->noise_reduction_amount = 1.35;
+
+  p->measure_freq     = 20;
+  p->measure_duration = 2 / p->measure_freq;
+  p->measure_tc       = .4;
+
+  p->hp_filter_freq   = 50;
+  p->lp_filter_freq   = 6000;
+  p->hp_lifter_freq   = 150;
+  p->lp_lifter_freq   = 2000;
+
+  p->trigger_tc       = .25;
+  p->trigger_level1   = 7;
+
   p->search_time      = 1;
-  p->search_step_time = .05;
-  p->slope_tc1        = .35;
-  p->slope_tc2        = .075;
+  p->gap_time         = .25;
 
-  while ((c = lsx_getopt(argc, argv, "+h:l:m:f:T:t:s:q:S:F:p:")) != -1) switch (c) {
+  while ((c = lsx_getopt(argc, argv, opt_str)) != -1) switch (c) {
     char * parse_ptr;
-    case 'h': p->hp_freq = lsx_parse_frequency(lsx_optarg, &parse_ptr);
-      if (p->hp_freq < 10 || *parse_ptr) return lsx_usage(effp);
-      break;
-    case 'l': p->lp_freq = lsx_parse_frequency(lsx_optarg, &parse_ptr);
-      if (p->lp_freq < 1000 || *parse_ptr) return lsx_usage(effp);
-      break;
-    GETOPT_NUMERIC('m', measure_duration,  .02, 2)
-    GETOPT_NUMERIC('f', measure_freq    ,   1 ,100)
-    GETOPT_NUMERIC('T', trigger_tc      , .001, 1)
-    GETOPT_NUMERIC('t', trigger_level   ,   0, 100)
-    GETOPT_NUMERIC('s', search_time     ,   0 , 4)
-    GETOPT_NUMERIC('q', search_step_time, .002, .02)
-    GETOPT_NUMERIC('S', slope_tc1       , .001, 1)
-    GETOPT_NUMERIC('F', slope_tc2       , .001, 1)
+    GETOPT_NUMERIC('N', noise_tc_up     ,  .1 , 10)
+    GETOPT_NUMERIC('n', noise_tc_down   ,.001 , .1)
+    GETOPT_NUMERIC('r', noise_reduction_amount   ,0 , 2)
+    GETOPT_NUMERIC('f', measure_freq    ,   5 , 50)
+    GETOPT_NUMERIC('m', measure_duration, .01 , 1)
+    GETOPT_NUMERIC('M', measure_tc      ,  .1 , 1)
+    GETOPT_FREQ(   'h', hp_filter_freq  ,  10)
+    GETOPT_FREQ(   'l', lp_filter_freq  ,  1000)
+    GETOPT_FREQ(   'H', hp_lifter_freq  ,  10)
+    GETOPT_FREQ(   'L', lp_lifter_freq  ,  1000)
+    GETOPT_NUMERIC('T', trigger_tc      , .01 , 1)
+    GETOPT_NUMERIC('t', trigger_level1  ,   0 , 10)
+    GETOPT_NUMERIC('s', search_time     ,  .1 , 4)
+    GETOPT_NUMERIC('g', gap_time        ,  .1 , 1)
     GETOPT_NUMERIC('p', pre_trigger_time,   0 , 4)
     default: lsx_fail("invalid option `-%c'", optopt); return lsx_usage(effp);
   }
@@ -81,48 +95,63 @@
 static int start(sox_effect_t * effp)
 {
   priv_t * p = (priv_t *)effp->priv;
-  unsigned i;
+  unsigned i, pre_trigger_len, search_len;
 
-  unsigned pre_trigger_len = p->pre_trigger_time * effp->in_signal.rate + .5;
+  pre_trigger_len = p->pre_trigger_time * effp->in_signal.rate + .5; /* seconds to sample frames, rounded; scaled by channel count on the next line */
   pre_trigger_len *= effp->in_signal.channels;
 
   p->measure_len = effp->in_signal.rate * p->measure_duration + .5;
   p->measure_len *= effp->in_signal.channels;
-  p->search_step_len = effp->in_signal.rate * p->search_step_time + .5;
-  p->search_step_len *= effp->in_signal.channels;
 
-  p->search_len = p->search_time * effp->in_signal.rate + .5;
-  p->search_len *= effp->in_signal.channels;
-  p->search_len += p->measure_len;
+  p->measure_period_len = effp->in_signal.rate / p->measure_freq + .5; /* sample frames between successive measurements, rounded */
+  p->measure_period_len *= effp->in_signal.channels;
+  p->search_count = ceil(p->search_time * p->measure_freq); /* number of measurements covering search_time, rounded up */
+  search_len = p->search_count * p->measure_period_len;
+  p->gap_count = p->gap_time * p->measure_freq + .5; /* gap_time expressed in measurement periods, rounded */
 
-  p->buffer_len = pre_trigger_len + p->search_len;
-  p->buffer = lsx_calloc(p->buffer_len, sizeof(*p->buffer));
+  p->buffer_len = pre_trigger_len + p->measure_len + search_len; /* room for pre-trigger, one measurement window and the search history */
+  lsx_Calloc(p->buffer, p->buffer_len);
 
   for (p->dft_len = 16; p->dft_len < p->measure_len; p->dft_len <<= 1);
-  p->dft_buf = lsx_calloc(p->dft_len, sizeof(*p->dft_buf));
+  lsx_debug("dft_len=%u measure_len=%u", p->dft_len, p->measure_len);
 
-  p->window1 = lsx_calloc(p->measure_len, sizeof(*p->window1));
+  lsx_Calloc(p->channels, effp->in_signal.channels); /* one chan_t per channel, each with its own work arrays */
+  for (i = 0; i < effp->in_signal.channels; ++i) {
+    chan_t * c = &p->channels[i];
+    lsx_Calloc(c->dft_buf, p->dft_len);
+    lsx_Calloc(c->spectrum, p->dft_len);
+    lsx_Calloc(c->noise_buf, p->dft_len);
+    lsx_Calloc(c->meas_buf, p->search_count); /* one slot per retained measurement */
+  }
+
+  lsx_Calloc(p->spectrum_window, p->measure_len);
   for (i = 0; i < p->measure_len; ++i)
-    p->window1[i] = -2. / SOX_SAMPLE_MIN / p->measure_len;
-  lsx_apply_hann(p->window1, (int)p->measure_len);
+    p->spectrum_window[i] = -2. / SOX_SAMPLE_MIN / sqrt((double)p->measure_len); /* constant scale factor; shaped by lsx_apply_hann() below */
+  lsx_apply_hann(p->spectrum_window, (int)p->measure_len);
 
-  p->start_bin = p->hp_freq / effp->in_signal.rate * p->dft_len + .5;
-  p->end_bin = p->lp_freq / effp->in_signal.rate * p->dft_len + .5;
-  p->end_bin = min(p->end_bin, p->dft_len / 2);
-  p->window2 = lsx_calloc(p->end_bin - p->start_bin, sizeof(*p->window2));
-  for (i = 0; i < p->end_bin - p->start_bin; ++i)
-    p->window2[i] = 2 * (p->dft_len / 2 + 1.) / (p->end_bin - p->start_bin);
-  lsx_apply_hann(p->window2, (int)(p->end_bin - p->start_bin));
+  p->spectrum_start = p->hp_filter_freq / effp->in_signal.rate * p->dft_len + .5; /* hp_filter_freq as an index scaled by dft_len, rounded */
+  p->spectrum_start = max(p->spectrum_start, 1); /* never start below index 1 */
+  p->spectrum_end = p->lp_filter_freq / effp->in_signal.rate * p->dft_len + .5;
+  p->spectrum_end = min(p->spectrum_end, p->dft_len / 2); /* clamp to dft_len / 2 */
 
+  lsx_Calloc(p->cepstrum_window, p->spectrum_end - p->spectrum_start); /* NOTE(review): both operands are unsigned, so this wraps if spectrum_end < spectrum_start (e.g. -h above -l) -- confirm this cannot happen */
+  for (i = 0; i < p->spectrum_end - p->spectrum_start; ++i)
+    p->cepstrum_window[i] = 2 / sqrt((double)p->spectrum_end - p->spectrum_start);
+  lsx_apply_hann(p->cepstrum_window, (int)(p->spectrum_end - p->spectrum_start));
+  
+  p->cepstrum_start = ceil(effp->in_signal.rate * .5 / p->lp_lifter_freq); /* lower summation index used by measure(), from lp_lifter_freq */
+  p->cepstrum_end = floor(effp->in_signal.rate * .5 / p->hp_lifter_freq); /* upper summation index, from hp_lifter_freq */
+  p->cepstrum_end = min(p->cepstrum_end, p->dft_len / 4); /* clamp to dft_len / 4 */
+  if (p->cepstrum_end <= p->cepstrum_start) /* empty lifter range: fail without a message */
+    return SOX_EOF;
+
+  p->noise_tc_up_mult     = exp(-1 / (p->noise_tc_up   * p->measure_freq)); /* per-measurement smoothing multipliers from the time constants */
+  p->noise_tc_down_mult   = exp(-1 / (p->noise_tc_down * p->measure_freq));
+  p->measure_tc_mult      = exp(-1 / (p->measure_tc    * p->measure_freq));
+  p->trigger_meas_tc_mult = exp(-1 / (p->trigger_tc    * p->measure_freq));
+
+  p->measure_timer = -p->measure_len; /* flow_trigger() counts this up and measures when it reaches 0 */
   p->flush_done = p->buffer_ptr = 0;
-  p->measure_period = effp->in_signal.rate / p->measure_freq + .5;
-  p->channels = lsx_calloc(effp->in_signal.channels, sizeof(*p->channels));
-  p->trigger_meas_tc_mult = exp(-1 / (p->trigger_tc * p->measure_freq));
-  p->trigger_slope_tc_mult1 = exp(-1 / (p->slope_tc1 * p->measure_freq));
-  p->trigger_slope_tc_mult2 = exp(-1 / (p->slope_tc2 * p->measure_freq));
-  p->search_slope_tc_mult1 = exp(-1 / (p->slope_tc1 / p->search_step_time));
-  p->search_slope_tc_mult2 = exp(-1 / (p->slope_tc2 / p->search_step_time));
-  lsx_debug("dft_len=%u measure_len=%u", p->dft_len, p->measure_len);
   return SOX_SUCCESS;
 }
 
@@ -148,37 +177,36 @@
   return SOX_SUCCESS;
 }
 
-static double measure(sox_effect_t * effp, size_t x)
+static double measure(
+    priv_t * p, chan_t * c, size_t index, size_t step, int booting) /* index: first position to read in p->buffer; step: stride between reads; booting: >= 0 selects the start-up weights below */
 {
-  priv_t * p = (priv_t *)effp->priv;
-  double * buf = p->dft_buf;
   double mult, result = 0;
-  size_t i, n;
+  size_t i;
 
-  for (i = 0; i < p->measure_len; ++i) {
-    buf[i] = p->buffer[x] * p->window1[i];
-    x = (x + effp->in_signal.channels) % p->buffer_len;
+  for (i = 0; i < p->measure_len; ++i, index = (index + step) % p->buffer_len) /* windowed copy out of the circular buffer */
+    c->dft_buf[i] = p->buffer[index] * p->spectrum_window[i];
+  memset(c->dft_buf + i, 0, (p->dft_len - i) * sizeof(*c->dft_buf)); /* zero-pad up to dft_len */
+  lsx_safe_rdft((int)p->dft_len, 1, c->dft_buf); /* presumably an in-place real DFT -- confirm against lsx_safe_rdft */
+
+  memset(c->dft_buf, 0, p->spectrum_start * sizeof(*c->dft_buf)); /* clear the outputs below spectrum_start */
+  for (i = p->spectrum_start; i < p->spectrum_end; ++i) {
+    double d = sqrt(sqr(c->dft_buf[2 * i]) + sqr(c->dft_buf[2 * i + 1])); /* magnitude of the value pair at 2i, 2i+1 */
+    mult = booting >= 0? booting / (1. + booting) : p->measure_tc_mult; /* start-up: weight booting/(1+booting), which is 0 on the first pass; afterwards the measure_tc multiplier */
+    c->spectrum[i] = c->spectrum[i] * mult + d * (1 - mult);
+    d = sqr(c->spectrum[i]); /* from here d is the squared smoothed magnitude */
+    mult = booting >= 0? 0 : /* start-up: noise_buf simply takes d; afterwards the multiplier depends on whether d is above the current estimate */
+        d > c->noise_buf[i]? p->noise_tc_up_mult : p->noise_tc_down_mult;
+    c->noise_buf[i] = c->noise_buf[i] * mult + d * (1 - mult);
+    d = sqrt(max(0, d - p->noise_reduction_amount * c->noise_buf[i])); /* subtract the scaled noise estimate, floor at 0 */
+    c->dft_buf[i] = d * p->cepstrum_window[i - p->spectrum_start];
   }
-  memset(buf + i, 0, (p->dft_len - i) * sizeof(*buf));
-  lsx_safe_rdft((int)p->dft_len, 1, buf);
+  memset(c->dft_buf + i, 0, ((p->dft_len >> 1) - i) * sizeof(*c->dft_buf)); /* zero the rest of the first dft_len / 2 entries */
+  lsx_safe_rdft((int)p->dft_len >> 1, 1, c->dft_buf); /* second transform, of length dft_len / 2 */
 
-  memset(buf, 0, p->start_bin * sizeof(*buf));
-  for (i = p->start_bin; i < p->end_bin; ++i)
-    buf[i] = (sqr(buf[2*i]) + sqr(buf[2*i+1])) * p->window2[i-p->start_bin];
-  memset(buf + i, 0, ((p->dft_len >> 1) - i) * sizeof(*buf));
-  lsx_safe_rdft((int)p->dft_len >> 1, 1, buf);
-
-  i = max(1, (size_t)(.006 * p->dft_len + .5));
-  n = (size_t)(.014 * p->dft_len + .5);
-  mult = (p->dft_len / 4 + 1.) / (n - i);
-  for (; i < n; ++i)
-    result += sqr(buf[2*i]) + sqr(buf[2*i+1]);
-  result = log(mult * result);
-  result = max(result + 50, 0);
-#if 0
-  fprintf(stderr, "%g\n", result);
-#endif
-  return result;
+  for (i = p->cepstrum_start; i < p->cepstrum_end; ++i) /* sum squared pairs over the lifter range */
+    result += sqr(c->dft_buf[2 * i]) + sqr(c->dft_buf[2 * i + 1]);
+  result = log(result / (p->cepstrum_end - p->cepstrum_start)); /* natural log of the mean over that range */
+  return max(0, 21 + result); /* offset by 21, never below 0 */
 }
 
 static int flow_trigger(sox_effect_t * effp, sox_sample_t const * ibuf,
@@ -189,59 +217,43 @@
   size_t i, idone = 0, to_flush = 0;
 
   while (idone < *ilen && !triggered) {
+    p->measure_timer += effp->in_signal.channels;
     for (i = 0; i < effp->in_signal.channels; ++i, ++idone) {
       chan_t * c = &p->channels[i];
       p->buffer[p->buffer_ptr++] = *ibuf++;
-      if (p->measure_timer == p->measure_period - 1) {
-        size_t flush = p->measure_len;
-        size_t x = (p->buffer_ptr + p->buffer_len - flush) % p->buffer_len;
-        double slope, meas, meas0 = measure(effp, x);
-        c->meas = c->meas * p->trigger_meas_tc_mult + meas0 *(1 - p->trigger_meas_tc_mult);
-        if (c->last_meas) {
-          slope = (meas0 - c->last_meas) * p->measure_freq;
-          c->slope1 = c->slope1? c->slope1 * p->trigger_slope_tc_mult1 + slope
-            * (1  - p->trigger_slope_tc_mult1) : slope;
-          c->slope2 = c->slope2? c->slope2 * p->trigger_slope_tc_mult2 + slope
-            * (1  - p->trigger_slope_tc_mult2) : slope;
+      if (!p->measure_timer) {
+        size_t x = (p->buffer_ptr + p->buffer_len - p->measure_len) % p->buffer_len;
+        double meas = measure(p, c, x, effp->in_signal.channels, p->booting);
+        c->meas_buf[p->search_ptr] = meas;
+        c->mean_meas = c->mean_meas * p->trigger_meas_tc_mult +
+            meas *(1 - p->trigger_meas_tc_mult);
+
+        if (triggered |= c->mean_meas > p->trigger_level1) {
+          unsigned n = p->search_count, ptr = p->search_ptr;
+          unsigned j, trigger_j = n, zero_j = n;
+          for (j = 0; j < n; ++j, ptr = (ptr + n - 1) % n)
+            if (c->meas_buf[ptr] > p->trigger_level1 && j <= trigger_j + p->gap_count)
+              zero_j = trigger_j = j;
+            else if (!c->meas_buf[ptr] && trigger_j >= zero_j)
+              zero_j = j;
+          j = min(j, zero_j);
+          to_flush = range_limit(j, to_flush, n);
         }
-        c->last_meas = meas0;
-#if 0
-        if (c->meas)
-          fprintf(stderr, "%g\n", c->meas);
-#endif
-        if (triggered |= c->meas > p->trigger_level) {
-          sox_bool started = sox_false;
-          do {
-            x = (x + p->buffer_len - p->search_step_len) % p->buffer_len;
-            flush += p->search_step_len;
-            meas = measure(effp, x);
-#if 0
-            fprintf(stderr, "%g %g %g\n", meas, c->slope1, c->slope2);
-#endif
-            slope = -(meas - c->last_meas) / p->search_step_time;
-            c->last_meas = meas;
-            if (slope > 0 || started) {
-              c->slope1 = c->slope1 * p->search_slope_tc_mult1 +
-                slope * (1  - p->search_slope_tc_mult1);
-              c->slope2 = c->slope2 * p->search_slope_tc_mult2 +
-                slope * (1  - p->search_slope_tc_mult2);
-              started = sox_true;
-            }
-          } while (flush < p->search_len && (
-                (meas > p->trigger_level - 12 && (c->slope1 > 4 || c->slope2 > 2)) ||
-                meas > p->trigger_level));
-          to_flush = range_limit(flush, to_flush, p->search_len);
-        }
+        lsx_debug_more("%12g %12g %u", meas, c->mean_meas, to_flush);
       }
     }
     if (p->buffer_ptr == p->buffer_len)
       p->buffer_ptr = 0;
-    if (++p->measure_timer == p->measure_period)
-      p->measure_timer = 0;
+    if (!p->measure_timer) {
+      p->measure_timer = -p->measure_period_len;
+      p->search_ptr = (p->search_ptr + 1) % p->search_count;
+      if (p->booting >= 0)
+        p->booting = p->booting == 6? -1 : p->booting + 1;
+    }
   }
   if (triggered) {
     size_t ilen1 = *ilen - idone;
-    p->flush_done = p->search_len - to_flush;
+    p->flush_done = (p->search_count - to_flush) * p->measure_period_len;
     p->buffer_ptr = (p->buffer_ptr + p->flush_done) % p->buffer_len;
     (effp->handler.flow = flow_flush)(effp, ibuf, obuf, &ilen1, olen);
     idone += ilen1;
@@ -260,10 +272,18 @@
 static int stop(sox_effect_t * effp)
 {
   priv_t * p = (priv_t *)effp->priv;
+  unsigned i;
+
+  for (i = 0; i < effp->in_signal.channels; ++i) { /* free each channel's work arrays before the channel array itself */
+    chan_t * c = &p->channels[i];
+    free(c->meas_buf);
+    free(c->noise_buf);
+    free(c->spectrum);
+    free(c->dft_buf);
+  }
   free(p->channels);
-  free(p->window2);
-  free(p->window1);
-  free(p->dft_buf);
+  free(p->cepstrum_window); /* the two windows allocated in start() */
+  free(p->spectrum_window);
   free(p->buffer);
   return SOX_SUCCESS;
 }
@@ -270,20 +290,29 @@
 
 sox_effect_handler_t const * lsx_vad_effect_fn(void)
 {
-  static sox_effect_handler_t handler = {"vad", "[options]"
-    "\n\t-h high-pass-filter         (300 Hz)"
-    "\n\t-l low-pass-filter          (12500 Hz)"
-    "\n\t-m measure-duration         (0.2 s)"
-    "\n\t-f measure-frequency        (10 Hz)"
-    "\n\t-T trigger-time-constant    (0.2 s)"
-    "\n\t-t trigger-level            (33)"
-    "\n\t-s search-time              (1 s)"
-    "\n\t-q search-step-time         (0.05 s)"
-    "\n\t-S slope-slow-time-constant (0.35 s)"
-    "\n\t-F slope-fast-time-constant (0.075 s)"
-    "\n\t-p pre-trigger-buffer       (0 s)"
-    , SOX_EFF_MCHAN | SOX_EFF_LENGTH | SOX_EFF_MODIFY | SOX_EFF_ALPHA,
+  static sox_effect_handler_t handler = {"vad", NULL, /* the usage pointer is assigned further down, before returning */
+    SOX_EFF_MCHAN | SOX_EFF_LENGTH | SOX_EFF_MODIFY,
     create, start, flow_trigger, drain, stop, NULL, sizeof(priv_t)
   };
+  static char const * lines[] = { /* usage text, one entry per line; the values in parentheses mirror the defaults set in create() */
+    "[options]",
+    "\t-N noise-tc-up              (0.1 s)",
+    "\t-n noise-tc-down            (0.01 s)",
+    "\t-r noise-reduction-amount   (1.35)",
+    "\t-f measure-frequency        (20 Hz)",
+    "\t-m measure-duration         (0.1 s)", /* create() sets this to 2 / measure_freq before options are parsed */
+    "\t-M measure-tc               (0.4 s)",
+    "\t-h high-pass-filter         (50 Hz)",
+    "\t-l low-pass-filter          (6000 Hz)",
+    "\t-H high-pass-lifter         (150 Hz)",
+    "\t-L low-pass-lifter          (2000 Hz)",
+    "\t-T trigger-time-constant    (0.25 s)",
+    "\t-t trigger-level            (7)",
+    "\t-s search-time              (1 s)",
+    "\t-g allowed-gap              (0.25 s)",
+    "\t-p pre-trigger-buffer       (0 s)",
+  };
+  static char * usage; /* static storage handed by address to lsx_usage_lines() */
+  handler.usage = lsx_usage_lines(&usage, lines, array_length(lines)); /* presumably joins lines[] into one usage string -- confirm against lsx_usage_lines */
   return &handler;
 }