shithub: sox

ref: 2076827f848f595b5506bc5927db1489e8e896be
dir: /src/vad.c/

View raw version
/* libSoX effect: Voice Activity Detector  (c) 2009 robs@users.sourceforge.net
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
 * General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include "sox_i.h"
#include "sgetopt.h"
#include <string.h>

typedef struct {
  double        last_meas;
  double        meas, slope1, slope2; /* TC -controlled */
} chan_t;

typedef struct {                /* Configuration parameters: */
  double        hp_freq, lp_freq, measure_freq, search_step_time;
  double        measure_duration, search_time, pre_trigger_time, trigger_level;
  double        trigger_tc, slope_tc1, slope_tc2;
                                /* Working variables: */
  sox_sample_t  * buffer;
  unsigned      search_len, buffer_len, buffer_ptr, flush_done, search_step_len;

  double        * dft_buf, * window1, * window2;
  unsigned      dft_len, measure_period, measure_timer, measure_len;
  chan_t        * channels;
  double        trigger_meas_tc_mult, trigger_slope_tc_mult1, trigger_slope_tc_mult2;
  double        search_slope_tc_mult1, search_slope_tc_mult2;
  unsigned      start_bin, end_bin;
} priv_t;

static int create(sox_effect_t * effp, int argc, char * * argv)
{
  priv_t * p = (priv_t *)effp->priv;
  int c;

  p->hp_freq          = 300;
  p->lp_freq          = 12500;
  p->measure_duration = .2;
  p->measure_freq     = 10;
  p->trigger_tc       = .2;
  p->trigger_level    = 33;
  p->search_time      = 1;
  p->search_step_time = .05;
  p->slope_tc1        = .35;
  p->slope_tc2        = .075;

  while ((c = lsx_getopt(argc, argv, "+h:l:m:f:T:t:s:q:S:F:p:")) != -1) switch (c) {
    char * parse_ptr;
    case 'h': p->hp_freq = lsx_parse_frequency(lsx_optarg, &parse_ptr);
      if (p->hp_freq < 10 || *parse_ptr) return lsx_usage(effp);
      break;
    case 'l': p->lp_freq = lsx_parse_frequency(lsx_optarg, &parse_ptr);
      if (p->lp_freq < 1000 || *parse_ptr) return lsx_usage(effp);
      break;
    GETOPT_NUMERIC('m', measure_duration,  .02, 2)
    GETOPT_NUMERIC('f', measure_freq    ,   1 ,100)
    GETOPT_NUMERIC('T', trigger_tc      , .001, 1)
    GETOPT_NUMERIC('t', trigger_level   ,   0, 100)
    GETOPT_NUMERIC('s', search_time     ,   0 , 4)
    GETOPT_NUMERIC('q', search_step_time, .002, .02)
    GETOPT_NUMERIC('S', slope_tc1       , .001, 1)
    GETOPT_NUMERIC('F', slope_tc2       , .001, 1)
    GETOPT_NUMERIC('p', pre_trigger_time,   0 , 4)
    default: lsx_fail("invalid option `-%c'", optopt); return lsx_usage(effp);
  }
  return lsx_optind !=argc? lsx_usage(effp) : SOX_SUCCESS;
}

static int start(sox_effect_t * effp)
{
  priv_t * p = (priv_t *)effp->priv;
  unsigned i;

  unsigned pre_trigger_len = p->pre_trigger_time * effp->in_signal.rate + .5;
  pre_trigger_len *= effp->in_signal.channels;

  p->measure_len = effp->in_signal.rate * p->measure_duration + .5;
  p->measure_len *= effp->in_signal.channels;
  p->search_step_len = effp->in_signal.rate * p->search_step_time + .5;
  p->search_step_len *= effp->in_signal.channels;

  p->search_len = p->search_time * effp->in_signal.rate + .5;
  p->search_len *= effp->in_signal.channels;
  p->search_len += p->measure_len;

  p->buffer_len = pre_trigger_len + p->search_len;
  p->buffer = lsx_calloc(p->buffer_len, sizeof(*p->buffer));

  for (p->dft_len = 16; p->dft_len < p->measure_len; p->dft_len <<= 1);
  p->dft_buf = lsx_calloc(p->dft_len, sizeof(*p->dft_buf));

  p->window1 = lsx_calloc(p->measure_len, sizeof(*p->window1));
  for (i = 0; i < p->measure_len; ++i)
    p->window1[i] = -2. / SOX_SAMPLE_MIN / p->measure_len;
  lsx_apply_hann(p->window1, (int)p->measure_len);

  p->start_bin = p->hp_freq / effp->in_signal.rate * p->dft_len + .5;
  p->end_bin = p->lp_freq / effp->in_signal.rate * p->dft_len + .5;
  p->end_bin = min(p->end_bin, p->dft_len / 2);
  p->window2 = lsx_calloc(p->end_bin - p->start_bin, sizeof(*p->window2));
  for (i = 0; i < p->end_bin - p->start_bin; ++i)
    p->window2[i] = 2 * (p->dft_len / 2 + 1.) / (p->end_bin - p->start_bin);
  lsx_apply_hann(p->window2, (int)(p->end_bin - p->start_bin));

  p->flush_done = p->buffer_ptr = 0;
  p->measure_period = effp->in_signal.rate / p->measure_freq + .5;
  p->channels = lsx_calloc(effp->in_signal.channels, sizeof(*p->channels));
  p->trigger_meas_tc_mult = exp(-1 / (p->trigger_tc * p->measure_freq));
  p->trigger_slope_tc_mult1 = exp(-1 / (p->slope_tc1 * p->measure_freq));
  p->trigger_slope_tc_mult2 = exp(-1 / (p->slope_tc2 * p->measure_freq));
  p->search_slope_tc_mult1 = exp(-1 / (p->slope_tc1 / p->search_step_time));
  p->search_slope_tc_mult2 = exp(-1 / (p->slope_tc2 / p->search_step_time));
  lsx_debug("dft_len=%u measure_len=%u", p->dft_len, p->measure_len);
  return SOX_SUCCESS;
}

static int flow_flush(sox_effect_t * effp, sox_sample_t const * ibuf,
    sox_sample_t * obuf, size_t * ilen, size_t * olen)
{
  priv_t * p = (priv_t *)effp->priv;
  size_t odone = min(p->buffer_len - p->flush_done, *olen);
  size_t odone1 = min(odone, p->buffer_len - p->buffer_ptr);

  memcpy(obuf, p->buffer + p->buffer_ptr, odone1 * sizeof(*obuf));
  if ((p->buffer_ptr += odone1) == p->buffer_len) {
    memcpy(obuf + odone1, p->buffer, (odone - odone1) * sizeof(*obuf));
    p->buffer_ptr = odone - odone1;
  }
  if ((p->flush_done += odone) == p->buffer_len) {
    size_t olen1 = *olen - odone;
    (effp->handler.flow = lsx_flow_copy)(effp, ibuf, obuf +odone, ilen, &olen1);
    odone += olen1;
  }
  else *ilen = 0;
  *olen = odone;
  return SOX_SUCCESS;
}

static double measure(sox_effect_t * effp, size_t x)
{
  priv_t * p = (priv_t *)effp->priv;
  double * buf = p->dft_buf;
  double mult, result = 0;
  size_t i;

  for (i = 0; i < p->measure_len; ++i) {
    buf[i] = p->buffer[x] * p->window1[i];
    x = (x + effp->in_signal.channels) % p->buffer_len;
  }
  memset(buf + i, 0, (p->dft_len - i) * sizeof(*buf));
  lsx_safe_rdft((int)p->dft_len, 1, buf);

  memset(buf, 0, p->start_bin * sizeof(*buf));
  for (i = p->start_bin; i < p->end_bin; ++i)
    buf[i] = (sqr(buf[2*i]) + sqr(buf[2*i+1])) * p->window2[i-p->start_bin];
  memset(buf + i, 0, ((p->dft_len >> 1) - i) * sizeof(*buf));
  lsx_safe_rdft((int)p->dft_len >> 1, 1, buf);

  i = max(1, (size_t)(.01 * p->dft_len + .5));
  mult = (p->dft_len / 4 + 1.) / (p->dft_len / 4 - i);
  for (; i < p->dft_len >> 2; ++i)
    result += sqr(buf[2*i]) + sqr(buf[2*i+1]);
  result = log(mult * result);
  result = max(result + 50, 0);
#if 0
  fprintf(stderr, "%g\n", result);
#endif
  return result;
}

static int flow_trigger(sox_effect_t * effp, sox_sample_t const * ibuf,
    sox_sample_t * obuf, size_t * ilen, size_t * olen)
{
  priv_t * p = (priv_t *)effp->priv;
  sox_bool triggered = sox_false;
  size_t i, idone = 0, to_flush = 0;

  while (idone < *ilen && !triggered) {
    for (i = 0; i < effp->in_signal.channels; ++i, ++idone) {
      chan_t * c = &p->channels[i];
      p->buffer[p->buffer_ptr++] = *ibuf++;
      if (p->measure_timer == p->measure_period - 1) {
        size_t flush = p->measure_len;
        size_t x = (p->buffer_ptr + p->buffer_len - flush) % p->buffer_len;
        double slope, meas, meas0 = measure(effp, x);
        c->meas = c->meas * p->trigger_meas_tc_mult + meas0 *(1 - p->trigger_meas_tc_mult);
        if (c->last_meas) {
          slope = (meas0 - c->last_meas) * p->measure_freq;
          c->slope1 = c->slope1? c->slope1 * p->trigger_slope_tc_mult1 + slope
            * (1  - p->trigger_slope_tc_mult1) : slope;
          c->slope2 = c->slope2? c->slope2 * p->trigger_slope_tc_mult2 + slope
            * (1  - p->trigger_slope_tc_mult2) : slope;
        }
        c->last_meas = meas0;
#if 0
        if (c->meas)
          fprintf(stderr, "%g\n", c->meas);
#endif
        if (triggered |= c->meas > p->trigger_level) {
          sox_bool started = sox_false;
          do {
            x = (x + p->buffer_len - p->search_step_len) % p->buffer_len;
            flush += p->search_step_len;
            meas = measure(effp, x);
#if 0
            fprintf(stderr, "%g %g %g\n", meas, c->slope1, c->slope2);
#endif
            slope = -(meas - c->last_meas) / p->search_step_time;
            c->last_meas = meas;
            if (slope > 0 || started) {
              c->slope1 = c->slope1 * p->search_slope_tc_mult1 +
                slope * (1  - p->search_slope_tc_mult1);
              c->slope2 = c->slope2 * p->search_slope_tc_mult2 +
                slope * (1  - p->search_slope_tc_mult2);
              started = sox_true;
            }
          } while (flush < p->search_len && (
                (meas > meas0 - 12 && (c->slope1 > 4 || c->slope2 > 2)) ||
                meas > p->trigger_level));
          to_flush = range_limit(flush, to_flush, p->search_len);
        }
      }
    }
    if (p->buffer_ptr == p->buffer_len)
      p->buffer_ptr = 0;
    if (++p->measure_timer == p->measure_period)
      p->measure_timer = 0;
  }
  if (triggered) {
    size_t ilen1 = *ilen - idone;
    p->flush_done = p->search_len - to_flush;
    p->buffer_ptr = (p->buffer_ptr + p->flush_done) % p->buffer_len;
    (effp->handler.flow = flow_flush)(effp, ibuf, obuf, &ilen1, olen);
    idone += ilen1;
  }
  else *olen = 0;
  *ilen = idone;
  return SOX_SUCCESS;
}

static int drain(sox_effect_t * effp, sox_sample_t * obuf, size_t * olen)
{
  size_t ilen = 0;
  return effp->handler.flow(effp, NULL, obuf, &ilen, olen);
}

static int stop(sox_effect_t * effp)
{
  priv_t * p = (priv_t *)effp->priv;
  free(p->channels);
  free(p->window2);
  free(p->window1);
  free(p->dft_buf);
  free(p->buffer);
  return SOX_SUCCESS;
}

sox_effect_handler_t const * lsx_vad_effect_fn(void)
{
  static sox_effect_handler_t handler = {"vad", "[options]"
    "\n\t-h high-pass-filter         (300 Hz)"
    "\n\t-l low-pass-filter          (12500 Hz)"
    "\n\t-m measure-duration         (0.2 s)"
    "\n\t-f measure-frequency        (10 Hz)"
    "\n\t-T trigger-time-constant    (0.2 s)"
    "\n\t-t trigger-level            (33)"
    "\n\t-s search-time              (1 s)"
    "\n\t-q search-step-time         (0.05 s)"
    "\n\t-S slope-slow-time-constant (0.35 s)"
    "\n\t-F slope-fast-time-constant (0.075 s)"
    "\n\t-p pre-trigger-buffer       (0 s)"
    , SOX_EFF_MCHAN | SOX_EFF_LENGTH | SOX_EFF_MODIFY | SOX_EFF_ALPHA,
    create, start, flow_trigger, drain, stop, NULL, sizeof(priv_t)
  };
  return &handler;
}