ref: 252db62cc75ec8a82354e3184ce3272079493755
parent: 92cb3bdbce88273529348958e92649c94782fc2e
author: David Bryant <david@wavpack.com>
date: Tue Oct 18 16:57:40 EDT 2022
issue #9: experimental version with silence/gap detection and processing - allows specification of a different stretch ratio for detected gaps/silence - includes configurable threshold level (in dB) and window size (in ms) - silence look-ahead incorporated to reduce artifacts
--- a/main.c
+++ b/main.c
@@ -18,15 +18,21 @@
#include "stretch.h"
+#define SILENCE_THRESHOLD_DB -40
+#define AUDIO_WINDOW_MS 25
+
static const char *sign_on = "\n"
-" AUDIO-STRETCH Time Domain Harmonic Scaling Demo Version 0.3\n"
+" AUDIO-STRETCH Time Domain Harmonic Scaling Demo Version 0.4\n"
" Copyright (c) 2022 David Bryant. All Rights Reserved.\n\n";
static const char *usage =
" Usage: AUDIO-STRETCH [-options] infile.wav outfile.wav\n\n"
" Options: -r<n.n> = stretch ratio (0.25 to 4.0, default = 1.0)\n"
+" -g<n.n> = gap/silence stretch ratio (if different)\n"
" -u<n> = upper freq period limit (default = 333 Hz)\n"
" -l<n> = lower freq period limit (default = 55 Hz)\n"
+" -b<n> = audio buffer/window length (ms, default = 25)\n"
+" -t<n> = gap/silence threshold (dB re FS, default = -40)\n"
" -c = cycle through all ratios, starting higher\n"
" -cc = cycle through all ratios, starting lower\n"
" -d = force dual instance even for shallow ratios\n"
@@ -68,23 +74,23 @@
#define WAVE_FORMAT_EXTENSIBLE 0xfffe
static int write_pcm_wav_header (FILE *outfile, uint32_t num_samples, int num_channels, int bytes_per_sample, uint32_t sample_rate);
+double rms_level_dB (int16_t *audio, int samples, int channels);
-#define BUFFER_SAMPLES 1024
-
static int verbose_mode, quiet_mode;
int main (argc, argv) int argc; char **argv;
{
int asked_help = 0, overwrite = 0, scale_rate = 0, force_fast = 0, force_normal = 0, force_dual = 0, cycle_ratio = 0;
- int buffer_samples = BUFFER_SAMPLES, upper_frequency = 333, lower_frequency = 55, min_period, max_period;
+ float ratio = 1.0, silence_ratio = 0.0, silence_threshold_dB = SILENCE_THRESHOLD_DB;
uint32_t samples_to_process, insamples = 0, outsamples = 0;
+ int upper_frequency = 333, lower_frequency = 55;
char *infilename = NULL, *outfilename = NULL;
+ int audio_window_ms = AUDIO_WINDOW_MS;
RiffChunkHeader riff_chunk_header;
WaveHeader WaveHeader = { 0 };
ChunkHeader chunk_header;
StretchHandle stretcher;
FILE *infile, *outfile;
- float ratio = 1.0;
// loop through command-line arguments
@@ -119,6 +125,17 @@
--*argv;
break;
+ case 'B': case 'b':
+ audio_window_ms = strtol (++*argv, argv, 10);
+
+ if (audio_window_ms < 1 || audio_window_ms > 100) {
+ fprintf (stderr, "\naudio window is from 1 to 100 ms!\n");
+ return -1;
+ }
+
+ --*argv;
+ break;
+
case 'R': case 'r':
ratio = strtod (++*argv, argv);
@@ -130,6 +147,28 @@
--*argv;
break;
+ case 'G': case 'g':
+ silence_ratio = strtod (++*argv, argv);
+
+ if (silence_ratio < 0.25 || silence_ratio > 4.0) {
+ fprintf (stderr, "\ngap/silence ratio must be from 0.25 to 4.0!\n");
+ return -1;
+ }
+
+ --*argv;
+ break;
+
+ case 'T': case 't':
+ silence_threshold_dB = strtod (++*argv, argv);
+
+ if (silence_threshold_dB < -70 || silence_threshold_dB > -10) {
+ fprintf (stderr, "\nsilence threshold must be from -10 to -70 dB!\n");
+ return -1;
+ }
+
+ --*argv;
+ break;
+
case 'S': case 's':
scale_rate = 1;
break;
@@ -311,22 +350,28 @@
return 1;
}
- min_period = WaveHeader.SampleRate / upper_frequency;
- max_period = WaveHeader.SampleRate / lower_frequency;
- int flags = 0;
+ int flags = 0, silence_mode = silence_ratio && !cycle_ratio && silence_ratio != ratio;
+ int buffer_samples = WaveHeader.SampleRate * (audio_window_ms / 1000.0);
+ int min_period = WaveHeader.SampleRate / upper_frequency;
+ int max_period = WaveHeader.SampleRate / lower_frequency;
+ float max_ratio = ratio;
- if (force_dual || ratio < 0.5 || ratio > 2.0)
- flags |= STRETCH_DUAL_FLAG;
+ if (force_dual || ratio < 0.5 || ratio > 2.0 ||
+ (silence_mode && (silence_ratio < 0.5 || silence_ratio > 2.0)))
+ flags |= STRETCH_DUAL_FLAG;
if ((force_fast || WaveHeader.SampleRate >= 32000) && !force_normal)
flags |= STRETCH_FAST_FLAG;
- if (verbose_mode)
- fprintf (stderr, "initializing stretch library with period range = %d to %d, %d channels, %s, %s\n",
+ if (verbose_mode) {
+ fprintf (stderr, "file sample rate is %lu Hz (%s), buffer size is %d samples\n",
+ (unsigned long) WaveHeader.SampleRate, WaveHeader.NumChannels == 2 ? "stereo" : "mono", buffer_samples);
+ fprintf (stderr, "stretch period range = %d to %d, %d channels, %s, %s\n",
min_period, max_period, WaveHeader.NumChannels, (flags & STRETCH_FAST_FLAG) ? "fast mode" : "normal mode",
(flags & STRETCH_DUAL_FLAG) ? "dual instance" : "single instance");
+ }
- if (!quiet_mode && ratio == 1.0 && !cycle_ratio)
+ if (!quiet_mode && ratio == 1.0 && !silence_mode && !cycle_ratio)
fprintf (stderr, "warning: a ratio of 1.0 will do nothing but copy the WAV file!\n");
if (!quiet_mode && ratio != 1.0 && cycle_ratio && !scale_rate)
@@ -350,65 +395,127 @@
write_pcm_wav_header (outfile, 0, WaveHeader.NumChannels, 2, scaled_rate);
if (cycle_ratio)
- ratio = (flags & STRETCH_DUAL_FLAG) ? 4.0 : 2.0;
+ max_ratio = (flags & STRETCH_DUAL_FLAG) ? 4.0 : 2.0;
+ else if (silence_mode && silence_ratio > max_ratio)
+ max_ratio = silence_ratio;
- int max_expected_samples = stretch_output_capacity (stretcher, buffer_samples, ratio);
+ int max_expected_samples = stretch_output_capacity (stretcher, buffer_samples, max_ratio);
+ int16_t *inbuffer = malloc (buffer_samples * WaveHeader.BlockAlign), *prebuffer = NULL;
int16_t *outbuffer = malloc (max_expected_samples * WaveHeader.BlockAlign);
- int16_t *inbuffer = malloc (buffer_samples * WaveHeader.BlockAlign);
+ int non_silence_frames = 0, silence_frames = 0, used_silence_frames = 0;
int max_generated_stretch = 0, max_generated_flush = 0;
+ int samples_to_stretch = 0, consecutive_silence_frames = 1;
- if (!inbuffer || !outbuffer) {
+ /* in the gap/silence mode we need an additional buffer to scan the "next" buffer for level */
+
+ if (silence_mode)
+ prebuffer = malloc (buffer_samples * WaveHeader.BlockAlign);
+
+ if (!inbuffer || !outbuffer || (silence_mode && !prebuffer)) {
fprintf (stderr, "can't allocate required memory!\n");
fclose (infile);
return 1;
}
+ /* read the entire file in frames and process with stretch */
+
while (1) {
- int samples_read = fread (inbuffer, WaveHeader.BlockAlign,
+ int samples_read = fread (silence_mode ? prebuffer : inbuffer, WaveHeader.BlockAlign,
samples_to_process >= buffer_samples ? buffer_samples : samples_to_process, infile);
- int samples_generated;
+ if (!silence_mode && !samples_read)
+ break;
+
insamples += samples_read;
samples_to_process -= samples_read;
+ /* this is where we scan the frame we just read to see if it's below the silence threshold */
+
+ if (silence_mode) {
+ if (samples_read) {
+ double level = rms_level_dB (prebuffer, samples_read, WaveHeader.NumChannels);
+
+ if (level > silence_threshold_dB) {
+ consecutive_silence_frames = 0;
+ non_silence_frames++;
+ }
+ else {
+ consecutive_silence_frames++;
+ silence_frames++;
+ }
+ }
+ }
+ else
+ samples_to_stretch = samples_read;
+
if (cycle_ratio) {
if (flags & STRETCH_DUAL_FLAG)
- ratio = (sin ((double) outsamples / WaveHeader.SampleRate) * (cycle_ratio & 1 ? 1.875 : -1.875)) + 2.125;
+ ratio = (sin ((double) outsamples / WaveHeader.SampleRate / 2.0) * (cycle_ratio & 1 ? 1.875 : -1.875)) + 2.125;
else
ratio = (sin ((double) outsamples / WaveHeader.SampleRate) * (cycle_ratio & 1 ? 0.75 : -0.75)) + 1.25;
}
- if (samples_read) {
- samples_generated = stretch_samples (stretcher, inbuffer, samples_read, outbuffer, ratio);
+ if (samples_to_stretch) {
+ int samples_generated;
- if (samples_generated > max_generated_stretch)
- max_generated_stretch = samples_generated;
- }
- else {
- samples_generated = stretch_flush (stretcher, outbuffer);
+ /* we use the gap/silence stretch ratio if the current frame, and the ones on either side, measure below the threshold */
- if (samples_generated > max_generated_flush)
- max_generated_flush = samples_generated;
- }
+ if (consecutive_silence_frames >= 3) {
+ samples_generated = stretch_samples (stretcher, inbuffer, samples_to_stretch, outbuffer, silence_ratio);
+ used_silence_frames++;
+ }
+ else
+ samples_generated = stretch_samples (stretcher, inbuffer, samples_to_stretch, outbuffer, ratio);
- if (samples_generated) {
- fwrite (outbuffer, WaveHeader.BlockAlign, samples_generated, outfile);
- outsamples += samples_generated;
+ if (samples_generated) {
+ if (samples_generated > max_generated_stretch)
+ max_generated_stretch = samples_generated;
- if (samples_generated > max_expected_samples) {
- fprintf (stderr, "%s: generated samples (%d) exceeded expected (%d)!\n", samples_read ? "stretch" : "flush",
- samples_generated, max_expected_samples);
- fclose (infile);
- return 1;
+ fwrite (outbuffer, WaveHeader.BlockAlign, samples_generated, outfile);
+ outsamples += samples_generated;
+
+ if (samples_generated > max_expected_samples) {
+ fprintf (stderr, "stretch: generated samples (%d) exceeded expected (%d)!\n", samples_generated, max_expected_samples);
+ fclose (infile);
+ return 1;
+ }
}
}
- if (!samples_read && !samples_generated)
+ if (silence_mode) {
+ if (samples_read) {
+ memcpy (inbuffer, prebuffer, samples_read * WaveHeader.BlockAlign);
+ samples_to_stretch = samples_read;
+ }
+ else
+ break;
+ }
+ }
+
+ /* next call the stretch flush function until it returns zero */
+
+ while (1) {
+ int samples_flushed = stretch_flush (stretcher, outbuffer);
+
+ if (!samples_flushed)
break;
+
+ if (samples_flushed > max_generated_flush)
+ max_generated_flush = samples_flushed;
+
+ fwrite (outbuffer, WaveHeader.BlockAlign, samples_flushed, outfile);
+ outsamples += samples_flushed;
+
+ if (samples_flushed > max_expected_samples) {
+ fprintf (stderr, "flush: generated samples (%d) exceeded expected (%d)!\n", samples_flushed, max_expected_samples);
+ fclose (infile);
+ return 1;
+ }
}
free (inbuffer);
free (outbuffer);
+ free (prebuffer);
stretch_deinit (stretcher);
fclose (infile);
@@ -425,6 +532,12 @@
(unsigned long) WaveHeader.SampleRate, (unsigned long) scaled_rate);
fprintf (stderr, "max expected samples = %d, actually seen = %d stretch, %d flush\n",
max_expected_samples, max_generated_stretch, max_generated_flush);
+ if (silence_frames || non_silence_frames) {
+ int total_frames = silence_frames + non_silence_frames;
+ fprintf (stderr, "%d silence frames detected (%.2f%%), %d actually used (%.2f%%)\n",
+ silence_frames, silence_frames * 100.0 / total_frames,
+ used_silence_frames, used_silence_frames * 100.0 / total_frames);
+ }
}
return 0;
@@ -461,4 +574,21 @@
fwrite (&fmthdr, sizeof (fmthdr), 1, outfile) &&
fwrite (&wavhdr, wavhdrsize, 1, outfile) &&
fwrite (&datahdr, sizeof (datahdr), 1, outfile);
+}
+/*
+ * Measure the level of a run of 16-bit PCM audio and return it in dB.
+ *
+ * audio    - the PCM data to measure; it is only read here, never written
+ * samples  - number of loop iterations, i.e. sample frames: one int16 per
+ *            frame on the mono path, two per frame otherwise. Must be
+ *            non-zero, because the sum is divided by it below; the caller
+ *            in this file only calls when its read count is non-zero.
+ * channels - 1 selects the mono path; ANY other value takes the second
+ *            path, which indexes the data with a fixed stride of 2
+ *
+ * Returns 10 * log10 (mean-square / (32768 * 32767 / 2)). No square root is
+ * taken despite the name; 10 * log10 of a squared quantity equals
+ * 20 * log10 of the quantity itself. The reference is about half the square
+ * of 16-bit full scale, presumably the mean-square of a full-scale sine
+ * wave, so that such a sine reads about 0 dB (TODO confirm the intent).
+ * All-zero input gives log10 (0), negative infinity on IEEE systems,
+ * which compares below any finite threshold.
+ */
+double rms_level_dB (int16_t *audio, int samples, int channels)
+{
+ double rms_sum = 0.0; /* running sum of squared values, normalized only in the return */
+ int i;
+ /* mono: square each sample; the cast makes the product a double multiply */
+ if (channels == 1)
+ for (i = 0; i < samples; ++i)
+ rms_sum += (double) audio [i] * audio [i];
+ else
+ for (i = 0; i < samples; ++i) {
+ double average = (audio [i * 2] + audio [i * 2 + 1]) / 2.0; /* mix each adjacent pair BEFORE squaring, so equal-and-opposite values cancel to zero */
+ rms_sum += average * average;
+ }
+ /* mean-square over the fixed reference, then to dB (10x, since the ratio is already squared) */
+ return log10 (rms_sum / samples / (32768.0 * 32767.0 * 0.5)) * 10.0;
}