shithub: sox

Download patch

ref: 46b32db78ce8fc0bc2e2e9fd66e9dcdda2a2e503
parent: b70926425660d226a8c5a9e4881d0ecd471126eb
author: robs <robs>
date: Sun Jun 7 15:47:22 EDT 2009

new algorithm

--- a/src/vad.c
+++ b/src/vad.c
@@ -20,19 +20,24 @@
 #include <string.h>
 
 typedef struct {
-  double mean_sqr, * log_mean_sqrs, min, held_min;
-  unsigned power_boot_done, trigger_done, count;
+  double        last_meas;
+  double        meas, slope1, slope2; /* TC -controlled */
 } chan_t;
 
-typedef struct {                /* Configuation parameters: */
-  double        power_boot_mult;
-  double        power_tc, buffer_time, power_dt, trigger_rise, trigger_time;
+typedef struct {                /* Configuration parameters: */
+  double        hp_freq, lp_freq, measure_freq, search_step_time;
+  double        measure_duration, search_time, pre_trigger_time, trigger_level;
+  double        trigger_tc, slope_tc1, slope_tc2;
                                 /* Working variables: */
-  double        tc_mult;   /* Multiplier for decay time constant */
   sox_sample_t  * buffer;
-  unsigned      buffer_len, buffer_ptr, flush_done, power_boot_len;
-  unsigned      trigger_len, log_mean_sqrs_len, log_mean_sqrs_ptr;
+  unsigned      search_len, buffer_len, buffer_ptr, flush_done, search_step_len;
+
+  double        * dft_buf, * window1, * window2;
+  unsigned      dft_len, measure_period, measure_timer, measure_len;
   chan_t        * channels;
+  double        trigger_meas_tc_mult, trigger_slope_tc_mult1, trigger_slope_tc_mult2;
+  double        search_slope_tc_mult1, search_slope_tc_mult2;
+  unsigned      start_bin, end_bin;
 } priv_t;
 
 static int create(sox_effect_t * effp, int argc, char * * argv)
@@ -40,20 +45,36 @@
   priv_t * p = (priv_t *)effp->priv;
   int c;
 
-  p->power_tc       = .01;  p->trigger_rise = 20;
-  p->power_boot_mult= 3;    p->trigger_time = .05;
-  p->power_dt       = .1;   p->buffer_time  = .05;
+  p->hp_freq          = 300;
+  p->lp_freq          = 12500;
+  p->measure_duration = .2;
+  p->measure_freq     = 10;
+  p->trigger_tc       = .2;
+  p->trigger_level    = 33;
+  p->search_time      = 1;
+  p->search_step_time = .05;
+  p->slope_tc1        = .35;
+  p->slope_tc2        = .075;
 
-  while ((c = lsx_getopt(argc, argv, "+c:b:d:r:u:p:")) != -1) switch (c) {
-    GETOPT_NUMERIC('c', power_tc        ,.001 , 10)
-    GETOPT_NUMERIC('b', power_boot_mult ,   0 , 10)
-    GETOPT_NUMERIC('d', power_dt        ,.001 , 10)
-    GETOPT_NUMERIC('r', trigger_rise    ,   1 , 100)
-    GETOPT_NUMERIC('u', trigger_time    ,   0 , 10)
-    GETOPT_NUMERIC('p', buffer_time     ,   0 , 10)
+  while ((c = lsx_getopt(argc, argv, "+h:l:m:f:T:t:s:q:S:F:p:")) != -1) switch (c) {
+    char * parse_ptr;
+    case 'h': p->hp_freq = lsx_parse_frequency(lsx_optarg, &parse_ptr);
+      if (p->hp_freq < 10 || *parse_ptr) return lsx_usage(effp);
+      break;
+    case 'l': p->lp_freq = lsx_parse_frequency(lsx_optarg, &parse_ptr);
+      if (p->lp_freq < 1000 || *parse_ptr) return lsx_usage(effp);
+      break;
+    GETOPT_NUMERIC('m', measure_duration,  .02, 2)
+    GETOPT_NUMERIC('f', measure_freq    ,   1 ,100)
+    GETOPT_NUMERIC('T', trigger_tc      , .001, 1)
+    GETOPT_NUMERIC('t', trigger_level   ,   0, 100)
+    GETOPT_NUMERIC('s', search_time     ,   0 , 4)
+    GETOPT_NUMERIC('q', search_step_time, .002, .02)
+    GETOPT_NUMERIC('S', slope_tc1       , .001, 1)
+    GETOPT_NUMERIC('F', slope_tc2       , .001, 1)
+    GETOPT_NUMERIC('p', pre_trigger_time,   0 , 4)
     default: lsx_fail("invalid option `-%c'", optopt); return lsx_usage(effp);
   }
-  p->trigger_rise *= .1 * log(10.); /* Convert to natural log */
   return lsx_optind !=argc? lsx_usage(effp) : SOX_SUCCESS;
 }
 
@@ -60,20 +81,48 @@
 static int start(sox_effect_t * effp)
 {
   priv_t * p = (priv_t *)effp->priv;
-  size_t i;
+  unsigned i;
 
-  p->tc_mult = exp(-1 / (p->power_tc * effp->in_signal.rate));
-  p->power_boot_len = (p->power_tc * p->power_boot_mult + p->power_dt) * effp->in_signal.rate + .5;
-  p->trigger_len = 1 + p->trigger_time * effp->in_signal.rate + .5;
+  unsigned pre_trigger_len = p->pre_trigger_time * effp->in_signal.rate + .5;
+  pre_trigger_len *= effp->in_signal.channels;
 
-  p->log_mean_sqrs_len = p->power_dt * effp->in_signal.rate + .5;
-  p->channels = lsx_calloc(effp->in_signal.channels, sizeof(*p->channels));
-  for (i = 0; i < effp->in_signal.channels; ++i)
-    lsx_Calloc(p->channels[i].log_mean_sqrs, p->log_mean_sqrs_len);
-  p->buffer_len = p->trigger_len + p->buffer_time * effp->in_signal.rate + .5;
-  p->buffer_len *= effp->in_signal.channels;
+  p->measure_len = effp->in_signal.rate * p->measure_duration + .5;
+  p->measure_len *= effp->in_signal.channels;
+  p->search_step_len = effp->in_signal.rate * p->search_step_time + .5;
+  p->search_step_len *= effp->in_signal.channels;
+
+  p->search_len = p->search_time * effp->in_signal.rate + .5;
+  p->search_len *= effp->in_signal.channels;
+  p->search_len += p->measure_len;
+
+  p->buffer_len = pre_trigger_len + p->search_len;
   p->buffer = lsx_calloc(p->buffer_len, sizeof(*p->buffer));
-  p->flush_done = p->log_mean_sqrs_ptr = p->buffer_ptr = 0;
+
+  for (p->dft_len = 16; p->dft_len < p->measure_len; p->dft_len <<= 1);
+  p->dft_buf = lsx_calloc(p->dft_len, sizeof(*p->dft_buf));
+
+  p->window1 = lsx_calloc(p->measure_len, sizeof(*p->window1));
+  for (i = 0; i < p->measure_len; ++i)
+    p->window1[i] = -2. / SOX_SAMPLE_MIN / p->measure_len;
+  lsx_apply_hann(p->window1, (int)p->measure_len);
+
+  p->start_bin = p->hp_freq / effp->in_signal.rate * p->dft_len + .5;
+  p->end_bin = p->lp_freq / effp->in_signal.rate * p->dft_len + .5;
+  p->end_bin = min(p->end_bin, p->dft_len / 2);
+  p->window2 = lsx_calloc(p->end_bin - p->start_bin, sizeof(*p->window2));
+  for (i = 0; i < p->end_bin - p->start_bin; ++i)
+    p->window2[i] = 2 * (p->dft_len / 2 + 1.) / (p->end_bin - p->start_bin);
+  lsx_apply_hann(p->window2, (int)(p->end_bin - p->start_bin));
+
+  p->flush_done = p->buffer_ptr = 0;
+  p->measure_period = effp->in_signal.rate / p->measure_freq + .5;
+  p->channels = lsx_calloc(effp->in_signal.channels, sizeof(*p->channels));
+  p->trigger_meas_tc_mult = exp(-1 / (p->trigger_tc * p->measure_freq));
+  p->trigger_slope_tc_mult1 = exp(-1 / (p->slope_tc1 * p->measure_freq));
+  p->trigger_slope_tc_mult2 = exp(-1 / (p->slope_tc2 * p->measure_freq));
+  p->search_slope_tc_mult1 = exp(-1 / (p->slope_tc1 / p->search_step_time));
+  p->search_slope_tc_mult2 = exp(-1 / (p->slope_tc2 / p->search_step_time));
+  lsx_warn("dft_len=%u measure_len=%u", p->dft_len, p->measure_len);
   return SOX_SUCCESS;
 }
 
@@ -99,53 +148,100 @@
   return SOX_SUCCESS;
 }
 
+static double measure(sox_effect_t * effp, size_t x)
+{
+  priv_t * p = (priv_t *)effp->priv;
+  double * buf = p->dft_buf;
+  double mult, result = 0;
+  size_t i;
+
+  for (i = 0; i < p->measure_len; ++i) {
+    buf[i] = p->buffer[x] * p->window1[i];
+    x = (x + effp->in_signal.channels) % p->buffer_len;
+  }
+  memset(buf + i, 0, (p->dft_len - i) * sizeof(*buf));
+  lsx_safe_rdft((int)p->dft_len, 1, buf);
+
+  memset(buf, 0, p->start_bin * sizeof(*buf));
+  for (i = p->start_bin; i < p->end_bin; ++i)
+    buf[i] = (sqr(buf[2*i]) + sqr(buf[2*i+1])) * p->window2[i-p->start_bin];
+  memset(buf + i, 0, ((p->dft_len >> 1) - i) * sizeof(*buf));
+  lsx_safe_rdft((int)p->dft_len >> 1, 1, buf);
+
+  i = max(1, (size_t)(.01 * p->dft_len + .5));
+  mult = (p->dft_len / 4 + 1.) / (p->dft_len / 4 - i);
+  for (; i < p->dft_len >> 2; ++i)
+    result += sqr(buf[2*i]) + sqr(buf[2*i+1]);
+  result = log(mult * result);
+  result = max(result + 50, 0);
+#if 0
+  fprintf(stderr, "%g\n", result);
+#endif
+  return result;
+}
+
 static int flow_trigger(sox_effect_t * effp, sox_sample_t const * ibuf,
     sox_sample_t * obuf, size_t * ilen, size_t * olen)
 {
   priv_t * p = (priv_t *)effp->priv;
   sox_bool triggered = sox_false;
-  size_t i, idone = 0;
+  size_t i, idone = 0, to_flush = 0;
 
   while (idone < *ilen && !triggered) {
     for (i = 0; i < effp->in_signal.channels; ++i, ++idone) {
       chan_t * c = &p->channels[i];
-      double tmp, d = SOX_SAMPLE_TO_FLOAT_64BIT(*ibuf,);
       p->buffer[p->buffer_ptr++] = *ibuf++;
-      /* Might need to add high-pass (e.g. for mains-hum or DC) and/or
-       * low-pass (e.g. for noise-shaped dither) filters at this point. */
-      c->mean_sqr = p->tc_mult * c->mean_sqr + (1 - p->tc_mult) * sqr(d);
+      if (p->measure_timer == p->measure_period - 1) {
+        size_t flush = p->measure_len;
+        size_t x = (p->buffer_ptr + p->buffer_len - flush) % p->buffer_len;
+        double slope, meas, meas0 = measure(effp, x);
+        c->meas = c->meas * p->trigger_meas_tc_mult + meas0 *(1 - p->trigger_meas_tc_mult);
+        if (c->last_meas) {
+          slope = (meas0 - c->last_meas) * p->measure_freq;
+          c->slope1 = c->slope1? c->slope1 * p->trigger_slope_tc_mult1 + slope
+            * (1  - p->trigger_slope_tc_mult1) : slope;
+          c->slope2 = c->slope2? c->slope2 * p->trigger_slope_tc_mult2 + slope
+            * (1  - p->trigger_slope_tc_mult2) : slope;
+        }
+        c->last_meas = meas0;
+#if 1
+        if (c->meas)
+          fprintf(stderr, "%g\n", c->meas);
+#endif
+        if (triggered |= c->meas > p->trigger_level) {
+          sox_bool started = sox_false;
+          do {
+            x = (x + p->buffer_len - p->search_step_len) % p->buffer_len;
+            flush += p->search_step_len;
+            meas = measure(effp, x);
 #if 0
-      if (++c->count == 48) {
-        fprintf(stderr, "%g\n", 10 * log10(c->mean_sqr));
-        c->count = 0;
-      }
+            fprintf(stderr, "%g %g %g\n", meas, c->slope1, c->slope2);
 #endif
-      if (c->mean_sqr >= sqr(1. / SOX_SAMPLE_MIN)) {
-        d = log(c->mean_sqr);
-        if (c->power_boot_done == p->power_boot_len) {
-          if (d - c->held_min < p->trigger_rise)
-            c->trigger_done = 0;
-          else triggered |= ++c->trigger_done == p->trigger_len;
+            slope = -(meas - c->last_meas) / p->search_step_time;
+            c->last_meas = meas;
+            if (slope > 0 || started) {
+              c->slope1 = c->slope1 * p->search_slope_tc_mult1 +
+                slope * (1  - p->search_slope_tc_mult1);
+              c->slope2 = c->slope2 * p->search_slope_tc_mult2 +
+                slope * (1  - p->search_slope_tc_mult2);
+              started = sox_true;
+            }
+          } while (flush < p->search_len && (
+                (meas > meas0 - 12 && (c->slope1 > 4 || c->slope2 > 2)) ||
+                meas > p->trigger_level));
+          to_flush = range_limit(flush, to_flush, p->search_len);
         }
-        else ++c->power_boot_done;
-        tmp = c->log_mean_sqrs[p->log_mean_sqrs_ptr];
-        c->log_mean_sqrs[p->log_mean_sqrs_ptr] = d;
-        if (tmp <= c->min)
-          for (c->min = i = 0; i < p->log_mean_sqrs_len; ++i)
-            c->min = min(c->min, c->log_mean_sqrs[i]);
-        else c->min = min(c->min, d);
-        if (!c->trigger_done)
-          c->held_min = c->min;
       }
-      else c->min = c->power_boot_done = c->trigger_done = 0;
     }
     if (p->buffer_ptr == p->buffer_len)
       p->buffer_ptr = 0;
-    if (++p->log_mean_sqrs_ptr == p->log_mean_sqrs_len)
-      p->log_mean_sqrs_ptr = 0;
+    if (++p->measure_timer == p->measure_period)
+      p->measure_timer = 0;
   }
   if (triggered) {
     size_t ilen1 = *ilen - idone;
+    p->flush_done = p->search_len - to_flush;
+    p->buffer_ptr = (p->buffer_ptr + p->flush_done) % p->buffer_len;
     (effp->handler.flow = flow_flush)(effp, ibuf, obuf, &ilen1, olen);
     idone += ilen1;
   }
@@ -163,12 +259,11 @@
 static int stop(sox_effect_t * effp)
 {
   priv_t * p = (priv_t *)effp->priv;
-  size_t i;
-
-  free(p->buffer);
-  for (i = 0; i < effp->in_signal.channels; ++i)
-    free(p->channels[i].log_mean_sqrs);
   free(p->channels);
+  free(p->window2);
+  free(p->window1);
+  free(p->dft_buf);
+  free(p->buffer);
   return SOX_SUCCESS;
 }
 
@@ -175,11 +270,17 @@
 sox_effect_handler_t const * lsx_vad_effect_fn(void)
 {
   static sox_effect_handler_t handler = {"vad", "[options]"
-    "\n\t-c power-time-constant      (0.01 s)"
-    "\n\t-d max. trigger-rise-time   (0.1 s)"
-    "\n\t-r trigger-rise             (20 dB)"
-    "\n\t-u trigger-up-time          (0.05 s)"
-    "\n\t-p pre-trigger-buffer       (0.05 s)"
+    "\n\t-h high-pass-filter         (300 Hz)"
+    "\n\t-l low-pass-filter          (12500 Hz)"
+    "\n\t-m measure-duration         (0.2 s)"
+    "\n\t-f measure-frequency        (10 Hz)"
+    "\n\t-T trigger-time-constant    (0.2 s)"
+    "\n\t-t trigger-level            (33)"
+    "\n\t-s search-time              (1 s)"
+    "\n\t-q search-step-time         (0.05 s)"
+    "\n\t-S slope-slow-time-constant (0.35 s)"
+    "\n\t-F slope-fast-time-constant (0.075 s)"
+    "\n\t-p pre-trigger-buffer       (0 s)"
     , SOX_EFF_MCHAN | SOX_EFF_LENGTH | SOX_EFF_MODIFY | SOX_EFF_ALPHA,
     create, start, flow_trigger, drain, stop, NULL, sizeof(priv_t)
   };