ref: 2076827f848f595b5506bc5927db1489e8e896be
dir: /src/vad.c/
/* libSoX effect: Voice Activity Detector  (c) 2009 robs@users.sourceforge.net
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
 * General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "sox_i.h"
#include "sgetopt.h"
#include <string.h>

/* Per-channel detector state. */
typedef struct {
  double last_meas;            /* Most recent raw measurement; 0 means "none yet" */
  double meas, slope1, slope2; /* Smoothed using the configured time constants */
} chan_t;

typedef struct {
  /* Configuration parameters: */
  double hp_freq, lp_freq, measure_freq, search_step_time;
  double measure_duration, search_time, pre_trigger_time, trigger_level;
  double trigger_tc, slope_tc1, slope_tc2;

  /* Working variables: */
  sox_sample_t * buffer;       /* Circular buffer of incoming samples */
  unsigned search_len, buffer_len, buffer_ptr, flush_done, search_step_len;
  double * dft_buf, * window1, * window2;
  unsigned dft_len, measure_period, measure_timer, measure_len;
  chan_t * channels;           /* One entry per input channel */
  double trigger_meas_tc_mult, trigger_slope_tc_mult1, trigger_slope_tc_mult2;
  double search_slope_tc_mult1, search_slope_tc_mult2;
  unsigned start_bin, end_bin; /* Half-open range of bins used by measure() */
} priv_t;

/*
 * Set the default configuration, then override it from the command-line
 * options.  Returns SOX_SUCCESS, or the result of lsx_usage() when an option
 * is invalid or non-option arguments remain.
 *
 * pre_trigger_time is not assigned a default here.
 * NOTE(review): this appears to rely on the private data being zero-filled by
 * the caller -- confirm.
 */
static int create(sox_effect_t * effp, int argc, char * * argv)
{
  priv_t * p = (priv_t *)effp->priv;
  int c;

  p->hp_freq          = 300;
  p->lp_freq          = 12500;
  p->measure_duration = .2;
  p->measure_freq     = 10;
  p->trigger_tc       = .2;
  p->trigger_level    = 33;
  p->search_time      = 1;
  p->search_step_time = .05;
  p->slope_tc1        = .35;
  p->slope_tc2        = .075;

  /* GETOPT_NUMERIC is defined outside this file; presumably it expands to a
   * `case' that parses lsx_optarg into p-><name> and checks it against the
   * given bounds -- verify against its definition.
   * NOTE(review): the -q upper bound (.02) is below the .05 default assigned
   * above -- confirm which of the two is intended. */
  while ((c = lsx_getopt(argc, argv, "+h:l:m:f:T:t:s:q:S:F:p:")) != -1) switch (c) {
    char * parse_ptr;
    case 'h':
      p->hp_freq = lsx_parse_frequency(lsx_optarg, &parse_ptr);
      if (p->hp_freq < 10 || *parse_ptr)
        return lsx_usage(effp);
      break;
    case 'l':
      p->lp_freq = lsx_parse_frequency(lsx_optarg, &parse_ptr);
      if (p->lp_freq < 1000 || *parse_ptr)
        return lsx_usage(effp);
      break;
    GETOPT_NUMERIC('m', measure_duration, .02, 2)
    GETOPT_NUMERIC('f', measure_freq    , 1  ,100)
    GETOPT_NUMERIC('T', trigger_tc      , .001, 1)
    GETOPT_NUMERIC('t', trigger_level   , 0, 100)
    GETOPT_NUMERIC('s', search_time     , 0  , 4)
    GETOPT_NUMERIC('q', search_step_time, .002, .02)
    GETOPT_NUMERIC('S', slope_tc1       , .001, 1)
    GETOPT_NUMERIC('F', slope_tc2       , .001, 1)
    GETOPT_NUMERIC('p', pre_trigger_time, 0  , 4)
    /* NOTE(review): every other getopt symbol used here carries the lsx_
     * prefix; confirm that plain `optopt' is what lsx_getopt() sets. */
    default: lsx_fail("invalid option `-%c'", optopt); return lsx_usage(effp);
  }
  return lsx_optind !=argc? lsx_usage(effp) : SOX_SUCCESS;
}

/*
 * Derive all sample-count lengths from the configuration and the input
 * signal's rate/channel count, allocate the working buffers and windows, and
 * reset the working state.
 *
 * Returns SOX_SUCCESS, or SOX_EOF (after reporting via lsx_fail) when the
 * configuration cannot work at this sample rate.  All validation is done
 * before the first allocation, so nothing is left allocated on failure.
 */
static int start(sox_effect_t * effp)
{
  priv_t * p = (priv_t *)effp->priv;
  unsigned i, num_bins;
  unsigned pre_trigger_len = p->pre_trigger_time * effp->in_signal.rate + .5;
  pre_trigger_len *= effp->in_signal.channels;

  p->measure_len = effp->in_signal.rate * p->measure_duration + .5;
  p->measure_len *= effp->in_signal.channels;

  p->search_step_len = effp->in_signal.rate * p->search_step_time + .5;
  p->search_step_len *= effp->in_signal.channels;

  p->search_len = p->search_time * effp->in_signal.rate + .5;
  p->search_len *= effp->in_signal.channels;
  p->search_len += p->measure_len;

  p->buffer_len = pre_trigger_len + p->search_len;

  /* Transform length: smallest power of two >= max(16, measure_len). */
  for (p->dft_len = 16; p->dft_len < p->measure_len; p->dft_len <<= 1);

  p->start_bin = p->hp_freq / effp->in_signal.rate * p->dft_len + .5;
  p->end_bin = p->lp_freq / effp->in_signal.rate * p->dft_len + .5;
  p->end_bin = min(p->end_bin, p->dft_len / 2);

  /* The bin indices are unsigned, so `end_bin - start_bin' would wrap to a
   * huge value if the high-pass bin were above the (clamped) low-pass bin;
   * that count sizes window2 and bounds the loops and memsets in measure().
   * Reject the configuration instead of letting it wrap. */
  if (p->start_bin >= p->end_bin) {
    lsx_fail("no analysis band: high-pass (%g Hz) must be below both "
        "low-pass (%g Hz) and half the sample rate", p->hp_freq, p->lp_freq);
    return SOX_EOF;
  }
  /* The back-search in flow_trigger() advances by search_step_len samples
   * per iteration; a zero step would never make progress. */
  if (!p->search_step_len) {
    lsx_fail("search-step-time is too short for this sample rate");
    return SOX_EOF;
  }
  num_bins = p->end_bin - p->start_bin;

  p->buffer = lsx_calloc(p->buffer_len, sizeof(*p->buffer));
  p->dft_buf = lsx_calloc(p->dft_len, sizeof(*p->dft_buf));

  /* Time-domain window: a constant that scales samples by
   * -2 / SOX_SAMPLE_MIN / measure_len, then shaped by lsx_apply_hann(). */
  p->window1 = lsx_calloc(p->measure_len, sizeof(*p->window1));
  for (i = 0; i < p->measure_len; ++i)
    p->window1[i] = -2. / SOX_SAMPLE_MIN / p->measure_len;
  lsx_apply_hann(p->window1, (int)p->measure_len);

  /* Window applied across the selected bins in measure(). */
  p->window2 = lsx_calloc(num_bins, sizeof(*p->window2));
  for (i = 0; i < num_bins; ++i)
    p->window2[i] = 2 * (p->dft_len / 2 + 1.) / num_bins;
  lsx_apply_hann(p->window2, (int)num_bins);

  /* Reset working counters, including the measurement timer, so that no
   * state carries over if the effect is started more than once. */
  p->flush_done = p->buffer_ptr = 0;
  p->measure_timer = 0;
  p->measure_period = effp->in_signal.rate / p->measure_freq + .5;
  p->channels = lsx_calloc(effp->in_signal.channels, sizeof(*p->channels));

  /* One-pole smoothing multipliers, exp(-1 / (time-constant * update-rate)):
   * the trigger_* set is applied once per measurement (measure_freq), the
   * search_* set once per back-search step (1 / search_step_time). */
  p->trigger_meas_tc_mult = exp(-1 / (p->trigger_tc * p->measure_freq));
  p->trigger_slope_tc_mult1 = exp(-1 / (p->slope_tc1 * p->measure_freq));
  p->trigger_slope_tc_mult2 = exp(-1 / (p->slope_tc2 * p->measure_freq));
  p->search_slope_tc_mult1 = exp(-1 / (p->slope_tc1 / p->search_step_time));
  p->search_slope_tc_mult2 = exp(-1 / (p->slope_tc2 / p->search_step_time));

  lsx_debug("dft_len=%u measure_len=%u", p->dft_len, p->measure_len);
  return SOX_SUCCESS;
}

/*
 * Flow handler used once the detector has triggered: copies the not yet
 * flushed part of the circular buffer to obuf.  No input is consumed while
 * buffered samples remain.  Once the whole buffer has been flushed, the
 * effect's flow handler is switched to lsx_flow_copy, which is also invoked
 * immediately for the remaining output space.
 */
static int flow_flush(sox_effect_t * effp, sox_sample_t const * ibuf,
    sox_sample_t * obuf, size_t * ilen, size_t * olen)
{
  priv_t * p = (priv_t *)effp->priv;
  /* Samples to emit in this call, and how many of them lie before the
   * physical end of the circular buffer. */
  size_t odone = min(p->buffer_len - p->flush_done, *olen);
  size_t odone1 = min(odone, p->buffer_len - p->buffer_ptr);

  memcpy(obuf, p->buffer + p->buffer_ptr, odone1 * sizeof(*obuf));
  p->buffer_ptr += odone1;
  if (p->buffer_ptr == p->buffer_len) {
    /* Wrapped: the remainder comes from the start of the buffer. */
    memcpy(obuf + odone1, p->buffer, (odone - odone1) * sizeof(*obuf));
    p->buffer_ptr = odone - odone1;
  }

  p->flush_done += odone;
  if (p->flush_done == p->buffer_len) {
    size_t olen1 = *olen - odone;
    (effp->handler.flow = lsx_flow_copy)(effp, ibuf, obuf +odone, ilen, &olen1);
    odone += olen1;
  }
  else *ilen = 0;

  *olen = odone;
  return SOX_SUCCESS;
}

/*
 * Compute the detector measurement for the measure_len buffer entries that
 * start at index x and advance by the channel count (wrapping at
 * buffer_len).  The result is never negative.  Overwrites p->dft_buf.
 */
static double measure(sox_effect_t * effp, size_t x)
{
  priv_t * p = (priv_t *)effp->priv;
  double * buf = p->dft_buf;
  double mult, result = 0;
  size_t i;

  /* Gather the windowed samples and zero-pad up to the transform length. */
  for (i = 0; i < p->measure_len; ++i) {
    buf[i] = p->buffer[x] * p->window1[i];
    x = (x + effp->in_signal.channels) % p->buffer_len;
  }
  memset(buf + i, 0, (p->dft_len - i) * sizeof(*buf));
  lsx_safe_rdft((int)p->dft_len, 1, buf);

  /* Keep only bins [start_bin, end_bin): each becomes the windowed sum of
   * squares of the pair (buf[2i], buf[2i+1]) -- presumably the real and
   * imaginary parts of bin i in the transform's packed output.  Working in
   * place is safe because the write index i never exceeds the read
   * indices 2i and 2i+1. */
  memset(buf, 0, p->start_bin * sizeof(*buf));
  for (i = p->start_bin; i < p->end_bin; ++i)
    buf[i] = (sqr(buf[2*i]) + sqr(buf[2*i+1])) * p->window2[i-p->start_bin];
  memset(buf + i, 0, ((p->dft_len >> 1) - i) * sizeof(*buf));

  /* Second, half-length transform, taken over the values computed above. */
  lsx_safe_rdft((int)p->dft_len >> 1, 1, buf);

  /* Sum the squared pairs from index max(1, ~1% of dft_len) up to
   * dft_len / 4; mult rescales for the entries that were skipped. */
  i = max(1, (size_t)(.01 * p->dft_len + .5));
  mult = (p->dft_len / 4 + 1.) / (p->dft_len / 4 - i);
  for (; i < p->dft_len >> 2; ++i)
    result += sqr(buf[2*i]) + sqr(buf[2*i+1]);

  /* Log scale, offset by 50 and floored at 0. */
  result = log(mult * result);
  result = max(result + 50, 0);
#if 0
  fprintf(stderr, "%g\n", result);
#endif
  return result;
}

/*
 * Initial flow handler: stores input in the circular buffer without
 * producing output and, once per measure_period frames, updates each
 * channel's smoothed measurement and slopes.  When a smoothed measurement
 * exceeds trigger_level, it steps backwards through the buffer to decide how
 * much buffered audio to keep, then switches the effect's flow handler to
 * flow_flush (invoking it immediately for the rest of this call).
 */
static int flow_trigger(sox_effect_t * effp, sox_sample_t const * ibuf,
    sox_sample_t * obuf, size_t * ilen, size_t * olen)
{
  priv_t * p = (priv_t *)effp->priv;
  sox_bool triggered = sox_false;
  size_t i, idone = 0, to_flush = 0;

  while (idone < *ilen && !triggered) {
    /* Consume one sample per channel. */
    for (i = 0; i < effp->in_signal.channels; ++i, ++idone) {
      chan_t * c = &p->channels[i];
      p->buffer[p->buffer_ptr++] = *ibuf++;
      if (p->measure_timer == p->measure_period - 1) {
        /* Measure the most recent measure_len buffer entries. */
        size_t flush = p->measure_len;
        size_t x = (p->buffer_ptr + p->buffer_len - flush) % p->buffer_len;
        double slope, meas, meas0 = measure(effp, x);
        c->meas = c->meas * p->trigger_meas_tc_mult + meas0 *(1 - p->trigger_meas_tc_mult);

        /* Slopes need a previous (non-zero) measurement; the first slope
         * seeds each filter directly instead of being smoothed. */
        if (c->last_meas) {
          slope = (meas0 - c->last_meas) * p->measure_freq;
          c->slope1 = c->slope1?
              c->slope1 * p->trigger_slope_tc_mult1 + slope * (1 - p->trigger_slope_tc_mult1) :
              slope;
          c->slope2 = c->slope2?
              c->slope2 * p->trigger_slope_tc_mult2 + slope * (1 - p->trigger_slope_tc_mult2) :
              slope;
        }
        c->last_meas = meas0;
#if 0
        if (c->meas)
          fprintf(stderr, "%g\n", c->meas);
#endif
        /* `|=' keeps the flag set, so once one channel has triggered the
         * back-search below also runs for the remaining channels of this
         * frame. */
        if (triggered |= c->meas > p->trigger_level) {
          sox_bool started = sox_false;
          /* Step backwards search_step_len samples at a time, until
           * search_len is reached or the measurement has fallen to the
           * trigger level or below and either dropped 12 or more under
           * meas0 or both smoothed slopes have flattened. */
          do {
            x = (x + p->buffer_len - p->search_step_len) % p->buffer_len;
            flush += p->search_step_len;
            meas = measure(effp, x);
#if 0
            fprintf(stderr, "%g %g %g\n", meas, c->slope1, c->slope2);
#endif
            /* Negated because the search runs backwards in time. */
            slope = -(meas - c->last_meas) / p->search_step_time;
            c->last_meas = meas;
            if (slope > 0 || started) {
              c->slope1 = c->slope1 * p->search_slope_tc_mult1 + slope * (1 - p->search_slope_tc_mult1);
              c->slope2 = c->slope2 * p->search_slope_tc_mult2 + slope * (1 - p->search_slope_tc_mult2);
              started = sox_true;
            }
          } while (flush < p->search_len && (
                (meas > meas0 - 12 && (c->slope1 > 4 || c->slope2 > 2)) ||
                meas > p->trigger_level));
          /* Keep the largest amount requested by any channel, capped at
           * search_len. */
          to_flush = range_limit(flush, to_flush, p->search_len);
        }
      }
    }
    if (p->buffer_ptr == p->buffer_len)
      p->buffer_ptr = 0;
    if (++p->measure_timer == p->measure_period)
      p->measure_timer = 0;
  }

  if (triggered) {
    size_t ilen1 = *ilen - idone;
    /* Mark the part of the search region older than the detected start as
     * already flushed and advance the read position past it. */
    p->flush_done = p->search_len - to_flush;
    p->buffer_ptr = (p->buffer_ptr + p->flush_done) % p->buffer_len;
    (effp->handler.flow = flow_flush)(effp, ibuf, obuf, &ilen1, olen);
    idone += ilen1;
  }
  else *olen = 0;

  *ilen = idone;
  return SOX_SUCCESS;
}

/*
 * Drain: call whichever flow handler is currently installed with no input,
 * so that anything it can still output is written to obuf.
 */
static int drain(sox_effect_t * effp, sox_sample_t * obuf, size_t * olen)
{
  size_t ilen = 0;
  return effp->handler.flow(effp, NULL, obuf, &ilen, olen);
}

/*
 * Release everything allocated by start().  The pointers are cleared so that
 * calling stop() again is harmless.
 */
static int stop(sox_effect_t * effp)
{
  priv_t * p = (priv_t *)effp->priv;

  free(p->channels); p->channels = NULL;
  free(p->window2);  p->window2 = NULL;
  free(p->window1);  p->window1 = NULL;
  free(p->dft_buf);  p->dft_buf = NULL;
  free(p->buffer);   p->buffer = NULL;
  return SOX_SUCCESS;
}

/* Return the handler describing the "vad" effect: name, usage text, flags,
 * callbacks and private-data size. */
sox_effect_handler_t const * lsx_vad_effect_fn(void)
{
  static sox_effect_handler_t handler = {"vad", "[options]"
    "\n\t-h high-pass-filter (300 Hz)"
    "\n\t-l low-pass-filter (12500 Hz)"
    "\n\t-m measure-duration (0.2 s)"
    "\n\t-f measure-frequency (10 Hz)"
    "\n\t-T trigger-time-constant (0.2 s)"
    "\n\t-t trigger-level (33)"
    "\n\t-s search-time (1 s)"
    "\n\t-q search-step-time (0.05 s)"
    "\n\t-S slope-slow-time-constant (0.35 s)"
    "\n\t-F slope-fast-time-constant (0.075 s)"
    "\n\t-p pre-trigger-buffer (0 s)"
    , SOX_EFF_MCHAN | SOX_EFF_LENGTH | SOX_EFF_MODIFY | SOX_EFF_ALPHA,
    create, start, flow_trigger, drain, stop, NULL, sizeof(priv_t)
  };
  return &handler;
}