shithub: sox

ref: dc40c19c801405e04f0202fcaffe87278c0dbe30
dir: /src/vad.c/

View raw version
/* libSoX effect: Voice Activity Detector  (c) 2009 robs@users.sourceforge.net
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
 * General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include "sox_i.h"
#include "sgetopt.h"
#include <string.h>

/* Per-channel analysis state; one instance per input channel. */
typedef struct {
  double * dft_buf;    /* FFT work buffer, dft_len entries */
  double * noise_buf;  /* running noise-power estimate for each spectrum bin */
  double * spectrum;   /* time-smoothed magnitude for each spectrum bin */
  double * meas_buf;   /* ring of the last search_count measurements */
  double   mean_meas;  /* exponentially smoothed measurement */
} chan_t;

/* Private state of the vad effect. */
typedef struct {
  /* Configuration parameters (set in create): */
  double    noise_tc_up;            /* -N: noise estimate time constant when level rises (s) */
  double    noise_tc_down;          /* -n: noise estimate time constant when level falls (s) */
  double    noise_reduction_amount; /* -r: multiple of the noise estimate that is subtracted */
  double    measure_freq;           /* -f: measurements per second (Hz) */
  double    measure_duration;       /* -m: length of each analysis window (s) */
  double    measure_tc;             /* -M: spectrum smoothing time constant (s) */
  double    pre_trigger_time;       /* -p: extra audio kept in the buffer (s) */
  double    hp_filter_freq;         /* -h: lower bound of the analysed spectrum (Hz) */
  double    lp_filter_freq;         /* -l: upper bound of the analysed spectrum (Hz) */
  double    hp_lifter_freq;         /* -H: sets the upper cepstrum bin bound (Hz) */
  double    lp_lifter_freq;         /* -L: sets the lower cepstrum bin bound (Hz) */
  double    trigger_tc;             /* -T: smoothing time constant for mean_meas (s) */
  double    trigger_level1;         /* -t: measurement level that fires the trigger */
  double    search_time;            /* -s: how far back to search once triggered (s) */
  double    gap_time;               /* -g: allowed gap between loud measurements (s) */

  /* Working variables (set in start and the flow handlers): */
  sox_sample_t  * buffer;           /* ring buffer of interleaved input samples */
  unsigned  dft_len;                /* power of two >= measure_len (minimum 16) */
  unsigned  buffer_len;             /* entries in buffer */
  unsigned  buffer_ptr;             /* ring write index; read index while flushing */
  unsigned  flush_done;             /* buffer entries already skipped or flushed */
  unsigned  gap_count;              /* gap_time expressed in measurements */
  unsigned  measure_period_len;     /* buffer entries between measurements */
  unsigned  measure_len;            /* analysis window length, as computed in start */
  unsigned  search_count;           /* measurements kept per channel in meas_buf */
  unsigned  search_ptr;             /* meas_buf slot written by the next measurement */
  unsigned  spectrum_start;         /* first analysed spectrum bin */
  unsigned  spectrum_end;           /* one past the last analysed spectrum bin */
  unsigned  cepstrum_start;         /* first cepstrum bin summed by measure */
  unsigned  cepstrum_end;           /* one past the last cepstrum bin summed */
  int       measure_timer;          /* counts up from a negative value; measure at 0 */
  int       booting;                /* 0..6 during the first measurements, then -1 */
  double    measure_tc_mult;        /* exp(-1 / (measure_tc * measure_freq)) */
  double    trigger_meas_tc_mult;   /* exp(-1 / (trigger_tc * measure_freq)) */
  double    noise_tc_up_mult;       /* exp(-1 / (noise_tc_up * measure_freq)) */
  double    noise_tc_down_mult;     /* exp(-1 / (noise_tc_down * measure_freq)) */
  double    * spectrum_window;      /* scaled Hann window, measure_len entries */
  double    * cepstrum_window;      /* scaled Hann window over the analysed bins */
  chan_t    * channels;             /* per-channel state, one per input channel */
} priv_t;

/* Switch-case helper for create(): parses the current option argument with
 * lsx_parse_frequency into p->field and returns the usage error when the
 * value is below `lo' or is followed by unparsed text.  Expects `p',
 * `effp' and `parse_ptr' to be in scope at the point of use. */
#define GETOPT_FREQ(opt, field, lo) \
    case opt: \
      p->field = lsx_parse_frequency(lsx_optarg, &parse_ptr); \
      if (p->field < (lo) || *parse_ptr) \
        return lsx_usage(effp); \
      break;

/* Option parser for the effect: loads the default configuration into the
 * private state, then overrides it from the command-line options.
 * Returns SOX_SUCCESS, or the result of lsx_usage() when an option is
 * invalid, out of range, or followed by stray arguments. */
static int create(sox_effect_t * effp, int argc, char * * argv)
{
  priv_t * p = (priv_t *)effp->priv;
  #define opt_str "+N:n:r:f:m:M:h:l:H:L:T:t:s:g:p:"
  char * parse_ptr;            /* end-of-parse position written by GETOPT_FREQ */
  int c;

  /* Noise estimation defaults. */
  p->noise_tc_up      = .1;
  p->noise_tc_down    = .01;
  p->noise_reduction_amount = 1.35;

  /* Measurement defaults; the window spans two measurement periods. */
  p->measure_freq     = 20;
  p->measure_duration = 2 / p->measure_freq;
  p->measure_tc       = .4;

  /* Spectrum filter and cepstrum lifter defaults (Hz). */
  p->hp_filter_freq   = 50;
  p->lp_filter_freq   = 6000;
  p->hp_lifter_freq   = 150;
  p->lp_lifter_freq   = 2000;

  /* Trigger defaults. */
  p->trigger_tc       = .25;
  p->trigger_level1   = 7;

  /* Search defaults (s). */
  p->search_time      = 1;
  p->gap_time         = .25;

  while ((c = lsx_getopt(argc, argv, opt_str)) != -1) {
    switch (c) {
      GETOPT_NUMERIC('N', noise_tc_up     ,  .1 , 10)
      GETOPT_NUMERIC('n', noise_tc_down   ,.001 , .1)
      GETOPT_NUMERIC('r', noise_reduction_amount   ,0 , 2)
      GETOPT_NUMERIC('f', measure_freq    ,   5 , 50)
      GETOPT_NUMERIC('m', measure_duration, .01 , 1)
      GETOPT_NUMERIC('M', measure_tc      ,  .1 , 1)
      GETOPT_FREQ(   'h', hp_filter_freq  ,  10)
      GETOPT_FREQ(   'l', lp_filter_freq  ,  1000)
      GETOPT_FREQ(   'H', hp_lifter_freq  ,  10)
      GETOPT_FREQ(   'L', lp_lifter_freq  ,  1000)
      GETOPT_NUMERIC('T', trigger_tc      , .01 , 1)
      GETOPT_NUMERIC('t', trigger_level1  ,   0 , 10)
      GETOPT_NUMERIC('s', search_time     ,  .1 , 4)
      GETOPT_NUMERIC('g', gap_time        ,  .1 , 1)
      GETOPT_NUMERIC('p', pre_trigger_time,   0 , 4)
      default:
        lsx_fail("invalid option `-%c'", optopt);
        return lsx_usage(effp);
    }
  }

  /* Anything left over after the options is an error. */
  if (lsx_optind != argc)
    return lsx_usage(effp);
  return SOX_SUCCESS;
}

/* Start handler: converts the configured times and frequencies into buffer
 * lengths and bin ranges for the input signal, allocates the working
 * buffers and windows, and primes the measurement timer.
 *
 * Returns SOX_SUCCESS, or SOX_EOF (after reporting why) when the spectrum
 * or cepstrum bin range is empty for this configuration.  Buffers that
 * were allocated before such a failure are not released here; stop() is
 * the function in this file that frees them. */
static int start(sox_effect_t * effp)
{
  priv_t * p = (priv_t *)effp->priv;
  unsigned i, pre_trigger_len, search_len, spectrum_len;
  double window_gain;

  /* All lengths are in interleaved buffer entries (samples * channels). */
  pre_trigger_len = p->pre_trigger_time * effp->in_signal.rate + .5;
  pre_trigger_len *= effp->in_signal.channels;

  p->measure_len = effp->in_signal.rate * p->measure_duration + .5;
  p->measure_len *= effp->in_signal.channels;

  p->measure_period_len = effp->in_signal.rate / p->measure_freq + .5;
  p->measure_period_len *= effp->in_signal.channels;
  p->search_count = ceil(p->search_time * p->measure_freq);
  search_len = p->search_count * p->measure_period_len;
  p->gap_count = p->gap_time * p->measure_freq + .5;

  p->buffer_len = pre_trigger_len + p->measure_len + search_len;
  lsx_Calloc(p->buffer, p->buffer_len);

  /* Smallest power of two (at least 16) that holds one analysis window. */
  for (p->dft_len = 16; p->dft_len < p->measure_len; p->dft_len <<= 1);
  lsx_debug("dft_len=%u measure_len=%u", p->dft_len, p->measure_len);

  lsx_Calloc(p->channels, effp->in_signal.channels);
  for (i = 0; i < effp->in_signal.channels; ++i) {
    chan_t * c = &p->channels[i];
    lsx_Calloc(c->dft_buf, p->dft_len);
    lsx_Calloc(c->spectrum, p->dft_len);
    lsx_Calloc(c->noise_buf, p->dft_len);
    lsx_Calloc(c->meas_buf, p->search_count);
  }

  /* Time-domain window: constant gain shaped by a Hann window. */
  lsx_Calloc(p->spectrum_window, p->measure_len);
  window_gain = -2. / SOX_SAMPLE_MIN / sqrt((double)p->measure_len);
  for (i = 0; i < p->measure_len; ++i)
    p->spectrum_window[i] = window_gain;
  lsx_apply_hann(p->spectrum_window, (int)p->measure_len);

  /* Spectrum bins selected by the high-pass / low-pass filter settings. */
  p->spectrum_start = p->hp_filter_freq / effp->in_signal.rate * p->dft_len + .5;
  p->spectrum_start = max(p->spectrum_start, 1);
  p->spectrum_end = p->lp_filter_freq / effp->in_signal.rate * p->dft_len + .5;
  p->spectrum_end = min(p->spectrum_end, p->dft_len / 2);

  /* Both bounds are unsigned: an empty or inverted range would make the
   * subtraction below wrap to a huge value (or give a zero-length window),
   * so reject it before sizing the cepstrum window. */
  if (p->spectrum_end <= p->spectrum_start) {
    lsx_fail("no spectrum bins between the high-pass and low-pass filter "
        "frequencies at this sample rate");
    return SOX_EOF;
  }
  spectrum_len = p->spectrum_end - p->spectrum_start;

  /* Frequency-domain window applied before the second transform. */
  lsx_Calloc(p->cepstrum_window, spectrum_len);
  window_gain = 2 / sqrt((double)spectrum_len);
  for (i = 0; i < spectrum_len; ++i)
    p->cepstrum_window[i] = window_gain;
  lsx_apply_hann(p->cepstrum_window, (int)spectrum_len);

  /* Cepstrum bins selected by the lifter settings. */
  p->cepstrum_start = ceil(effp->in_signal.rate * .5 / p->lp_lifter_freq);
  p->cepstrum_end = floor(effp->in_signal.rate * .5 / p->hp_lifter_freq);
  p->cepstrum_end = min(p->cepstrum_end, p->dft_len / 4);
  if (p->cepstrum_end <= p->cepstrum_start) {
    lsx_fail("no cepstrum bins between the low-pass and high-pass lifter "
        "frequencies for this sample rate and measure duration");
    return SOX_EOF;
  }

  /* Per-measurement smoothing multipliers derived from the time constants. */
  p->noise_tc_up_mult     = exp(-1 / (p->noise_tc_up   * p->measure_freq));
  p->noise_tc_down_mult   = exp(-1 / (p->noise_tc_down * p->measure_freq));
  p->measure_tc_mult      = exp(-1 / (p->measure_tc    * p->measure_freq));
  p->trigger_meas_tc_mult = exp(-1 / (p->trigger_tc    * p->measure_freq));

  /* First measurement happens once a full window has been buffered. */
  p->measure_timer = -p->measure_len;
  p->flush_done = p->buffer_ptr = 0;
  return SOX_SUCCESS;
}

/* Flow handler used once the trigger has fired: writes the not-yet-flushed
 * part of the ring buffer to obuf, as much as fits in *olen.  While buffer
 * data is still pending no input is consumed (*ilen is set to 0).  On the
 * call that empties the buffer, the effect's flow handler is replaced by
 * lsx_flow_copy, which is also invoked for the rest of this call.
 * *olen is updated to the number of samples written. */
static int flow_flush(sox_effect_t * effp, sox_sample_t const * ibuf,
    sox_sample_t * obuf, size_t * ilen, size_t * olen)
{
  priv_t * p = (priv_t *)effp->priv;
  size_t const pending = p->buffer_len - p->flush_done;  /* still buffered */
  size_t const to_end = p->buffer_len - p->buffer_ptr;   /* before ring wrap */
  size_t written = pending <= *olen ? pending : *olen;
  size_t const head = written <= to_end ? written : to_end;
  size_t const tail = written - head;

  /* Part up to the end of the ring... */
  memcpy(obuf, p->buffer + p->buffer_ptr, head * sizeof(*obuf));
  p->buffer_ptr += head;
  if (p->buffer_ptr == p->buffer_len) {
    /* ...then the wrapped part from its beginning. */
    memcpy(obuf + head, p->buffer, tail * sizeof(*obuf));
    p->buffer_ptr = tail;
  }

  p->flush_done += written;
  if (p->flush_done != p->buffer_len)
    *ilen = 0;
  else {
    /* Buffer drained: hand over to plain copying from now on. */
    size_t copied = *olen - written;
    effp->handler.flow = lsx_flow_copy;
    lsx_flow_copy(effp, ibuf, obuf + written, ilen, &copied);
    written += copied;
  }
  *olen = written;
  return SOX_SUCCESS;
}

/* Takes one measurement for channel state `c'.
 *
 * Reads measure_len entries from the ring buffer, starting at `index' and
 * advancing by `step' (modulo buffer_len), windows and transforms them,
 * updates the channel's smoothed spectrum and noise estimate for the bins
 * in [spectrum_start, spectrum_end), transforms the noise-reduced result
 * again and sums the power of the bins in [cepstrum_start, cepstrum_end).
 *
 * While `booting' is >= 0 the spectrum smoothing factor is
 * booting / (1 + booting) and the noise estimate simply tracks the
 * current power; otherwise the configured multipliers are used.
 *
 * Returns max(0, 21 + log(mean summed power)). */
static double measure(
    priv_t * p, chan_t * c, size_t index, unsigned step, int booting)
{
  double * const work = c->dft_buf;
  /* The spectrum smoothing factor is the same for every bin. */
  double const smooth =
      booting >= 0? booting / (1. + booting) : p->measure_tc_mult;
  double power_sum = 0;
  size_t n, bin;

  /* Windowed copy of the input, zero-padded up to the transform length. */
  for (n = 0; n < p->measure_len; ++n) {
    work[n] = p->buffer[index] * p->spectrum_window[n];
    index = (index + step) % p->buffer_len;
  }
  memset(work + n, 0, (p->dft_len - n) * sizeof(*work));
  lsx_safe_rdft((int)p->dft_len, 1, work);

  /* Overwrite the front of the work buffer with the processed magnitudes;
   * entry `bin' is written only after entries 2*bin and 2*bin+1 are read. */
  memset(work, 0, p->spectrum_start * sizeof(*work));
  for (bin = p->spectrum_start; bin < p->spectrum_end; ++bin) {
    double const re = work[2 * bin];
    double const im = work[2 * bin + 1];
    double level = sqrt(sqr(re) + sqr(im));
    double track;

    c->spectrum[bin] = c->spectrum[bin] * smooth + level * (1 - smooth);
    level = sqr(c->spectrum[bin]);

    if (booting >= 0)
      track = 0;
    else if (level > c->noise_buf[bin])
      track = p->noise_tc_up_mult;
    else
      track = p->noise_tc_down_mult;
    c->noise_buf[bin] = c->noise_buf[bin] * track + level * (1 - track);

    level = sqrt(max(0, level - p->noise_reduction_amount * c->noise_buf[bin]));
    work[bin] = level * p->cepstrum_window[bin - p->spectrum_start];
  }
  memset(work + bin, 0, ((p->dft_len >> 1) - bin) * sizeof(*work));
  lsx_safe_rdft((int)p->dft_len >> 1, 1, work);

  for (bin = p->cepstrum_start; bin < p->cepstrum_end; ++bin)
    power_sum += sqr(work[2 * bin]) + sqr(work[2 * bin + 1]);
  power_sum = log(power_sum / (p->cepstrum_end - p->cepstrum_start));
  return max(0, 21 + power_sum);
}

/* Flow handler used until the trigger fires.
 *
 * Input is stored in the ring buffer and nothing is output (*olen = 0).
 * Each time measure_timer reaches zero, every channel is measured; when a
 * channel's smoothed measurement exceeds trigger_level1, its recent
 * measurements are searched backwards to decide how many measurement
 * periods of buffered audio to keep.  The handler then skips the older
 * buffered data, switches the effect's flow handler to flow_flush and
 * calls it with the remaining input.
 *
 * On return *ilen holds the number of input samples consumed. */
static int flow_trigger(sox_effect_t * effp, sox_sample_t const * ibuf,
    sox_sample_t * obuf, size_t * ilen, size_t * olen)
{
  priv_t * p = (priv_t *)effp->priv;
  sox_bool triggered = sox_false;
  size_t i, idone = 0, to_flush = 0;

  /* One iteration per frame (one sample for each channel). */
  while (idone < *ilen && !triggered) {
    p->measure_timer += effp->in_signal.channels;
    for (i = 0; i < effp->in_signal.channels; ++i, ++idone) {
      chan_t * c = &p->channels[i];
      p->buffer[p->buffer_ptr++] = *ibuf++;
      if (!p->measure_timer) {
        /* Start measure_len entries behind the (already advanced) write
         * index, wrapping around the ring.
         * NOTE(review): measure() takes measure_len steps of `channels'
         * entries from x, and start() already multiplies measure_len by
         * the channel count -- confirm the intended span for
         * multi-channel input. */
        size_t x = (p->buffer_ptr + p->buffer_len - p->measure_len) % p->buffer_len;
        double meas = measure(p, c, x, effp->in_signal.channels, p->booting);
        c->meas_buf[p->search_ptr] = meas;
        /* Exponential smoothing of the measurement. */
        c->mean_meas = c->mean_meas * p->trigger_meas_tc_mult +
            meas *(1 - p->trigger_meas_tc_mult);

        /* `triggered' is sticky: once one channel fires, the remaining
         * channels of this frame also run the search below. */
        if (triggered |= c->mean_meas > p->trigger_level1) {
          unsigned n = p->search_count, ptr = p->search_ptr;
          unsigned j, trigger_j = n, zero_j = n;
          /* Walk back from the newest measurement (j = 0) to the oldest.
           * trigger_j: oldest above-threshold measurement reached so far
           * without a gap longer than gap_count; zero_j: that position,
           * or the first zero measurement found after it. */
          for (j = 0; j < n; ++j, ptr = (ptr + n - 1) % n)
            if (c->meas_buf[ptr] > p->trigger_level1 && j <= trigger_j + p->gap_count)
              zero_j = trigger_j = j;
            else if (!c->meas_buf[ptr] && trigger_j >= zero_j)
              zero_j = j;
          j = min(j, zero_j);
          /* Presumably clamps j to [to_flush, n], so to_flush ends up as
           * the largest value over the channels -- confirm against the
           * range_limit definition. */
          to_flush = range_limit(j, to_flush, n);
        }
        lsx_debug_more("%12g %12g %u", meas, c->mean_meas, (unsigned)to_flush);
      }
    }
    /* Wrap the ring write index. */
    if (p->buffer_ptr == p->buffer_len)
      p->buffer_ptr = 0;
    if (!p->measure_timer) {
      /* Schedule the next measurement and move to the next meas_buf slot. */
      p->measure_timer = -p->measure_period_len;
      p->search_ptr = (p->search_ptr + 1) % p->search_count;
      /* booting counts up on each measurement until it is 6, then
       * becomes -1 for good. */
      if (p->booting >= 0)
        p->booting = p->booting == 6? -1 : p->booting + 1;
    }
  }
  if (triggered) {
    size_t ilen1 = *ilen - idone;
    /* Count the measurement periods that are not kept as already
     * flushed, and move the ring index past them. */
    p->flush_done = (p->search_count - to_flush) * p->measure_period_len;
    p->buffer_ptr = (p->buffer_ptr + p->flush_done) % p->buffer_len;
    /* From now on flow_flush handles the stream; let it process the rest
     * of this call's input and output too. */
    (effp->handler.flow = flow_flush)(effp, ibuf, obuf, &ilen1, olen);
    idone += ilen1;
  }
  else *olen = 0;
  *ilen = idone;
  return SOX_SUCCESS;
}

/* Drain handler: runs whichever flow handler is currently installed with
 * no input, so that it can write any pending output to obuf, and returns
 * that handler's result. */
static int drain(sox_effect_t * effp, sox_sample_t * obuf, size_t * olen)
{
  size_t no_input = 0;

  return effp->handler.flow(effp, NULL, obuf, &no_input, olen);
}

/* Stop handler: frees the per-channel buffers, the channel array, both
 * windows and the sample ring buffer.  Always returns SOX_SUCCESS. */
static int stop(sox_effect_t * effp)
{
  priv_t * p = (priv_t *)effp->priv;
  unsigned ch = 0;

  while (ch < effp->in_signal.channels) {
    chan_t * chan = p->channels + ch++;
    free(chan->dft_buf);
    free(chan->spectrum);
    free(chan->noise_buf);
    free(chan->meas_buf);
  }

  free(p->buffer);
  free(p->spectrum_window);
  free(p->cepstrum_window);
  free(p->channels);
  return SOX_SUCCESS;
}

/* Returns the handler for the "vad" effect.  The usage text is built from
 * the option list below on each call and stored in the handler. */
sox_effect_handler_t const * lsx_vad_effect_fn(void)
{
  /* One usage line per option, with its default value. */
  static char const * option_help[] = {
    "[options]",
    "\t-N noise-tc-up              (0.1 s)",
    "\t-n noise-tc-down            (0.01 s)",
    "\t-r noise-reduction-amount   (1.35)",
    "\t-f measure-frequency        (20 Hz)",
    "\t-m measure-duration         (0.1 s)",
    "\t-M measure-tc               (0.4 s)",
    "\t-h high-pass-filter         (50 Hz)",
    "\t-l low-pass-filter          (6000 Hz)",
    "\t-H high-pass-lifter         (150 Hz)",
    "\t-L low-pass-lifter          (2000 Hz)",
    "\t-T trigger-time-constant    (0.25 s)",
    "\t-t trigger-level            (7)",
    "\t-s search-time              (1 s)",
    "\t-g allowed-gap              (0.25 s)",
    "\t-p pre-trigger-buffer       (0 s)",
  };
  static char * usage_text;
  static sox_effect_handler_t handler = {
    "vad",
    NULL,                       /* usage: filled in below */
    SOX_EFF_MCHAN | SOX_EFF_LENGTH | SOX_EFF_MODIFY,
    create,
    start,
    flow_trigger,               /* initial flow handler */
    drain,
    stop,
    NULL,
    sizeof(priv_t)
  };

  handler.usage =
      lsx_usage_lines(&usage_text, option_help, array_length(option_help));
  return &handler;
}