shithub: audio-stretch

Download patch

ref: 252db62cc75ec8a82354e3184ce3272079493755
parent: 92cb3bdbce88273529348958e92649c94782fc2e
author: David Bryant <david@wavpack.com>
date: Tue Oct 18 16:57:40 EDT 2022

issue #9: experimental version with silence/gap detection and processing

- allows specification of a different stretch ratio for detected gaps/silence
- includes configurable threshold level (in dB) and window size (in ms)
- silence look-ahead incorporated to reduce artifacts

--- a/main.c
+++ b/main.c
@@ -18,15 +18,21 @@
 
 #include "stretch.h"
 
+#define SILENCE_THRESHOLD_DB    -40
+#define AUDIO_WINDOW_MS         25
+
 static const char *sign_on = "\n"
-" AUDIO-STRETCH  Time Domain Harmonic Scaling Demo  Version 0.3\n"
+" AUDIO-STRETCH  Time Domain Harmonic Scaling Demo  Version 0.4\n"
 " Copyright (c) 2022 David Bryant. All Rights Reserved.\n\n";
 
 static const char *usage =
 " Usage:     AUDIO-STRETCH [-options] infile.wav outfile.wav\n\n"
 " Options:  -r<n.n> = stretch ratio (0.25 to 4.0, default = 1.0)\n"
+"           -g<n.n> = gap/silence stretch ratio (if different)\n"
 "           -u<n>   = upper freq period limit (default = 333 Hz)\n"
 "           -l<n>   = lower freq period limit (default = 55 Hz)\n"
+"           -b<n>   = audio buffer/window length (ms, default = 25)\n"
+"           -t<n>   = gap/silence threshold (dB re FS, default = -40)\n"
 "           -c      = cycle through all ratios, starting higher\n"
 "           -cc     = cycle through all ratios, starting lower\n"
 "           -d      = force dual instance even for shallow ratios\n"
@@ -68,23 +74,23 @@
 #define WAVE_FORMAT_EXTENSIBLE  0xfffe
 
 static int write_pcm_wav_header (FILE *outfile, uint32_t num_samples, int num_channels, int bytes_per_sample, uint32_t sample_rate);
+double rms_level_dB (int16_t *audio, int samples, int channels);
 
-#define BUFFER_SAMPLES 1024
-
 static int verbose_mode, quiet_mode;
 
 int main (argc, argv) int argc; char **argv;
 {
     int asked_help = 0, overwrite = 0, scale_rate = 0, force_fast = 0, force_normal = 0, force_dual = 0, cycle_ratio = 0;
-    int buffer_samples = BUFFER_SAMPLES, upper_frequency = 333, lower_frequency = 55, min_period, max_period;
+    float ratio = 1.0, silence_ratio = 0.0, silence_threshold_dB = SILENCE_THRESHOLD_DB;
     uint32_t samples_to_process, insamples = 0, outsamples = 0;
+    int upper_frequency = 333, lower_frequency = 55;
     char *infilename = NULL, *outfilename = NULL;
+    int audio_window_ms = AUDIO_WINDOW_MS;
     RiffChunkHeader riff_chunk_header;
     WaveHeader WaveHeader = { 0 };
     ChunkHeader chunk_header;
     StretchHandle stretcher;
     FILE *infile, *outfile;
-    float ratio = 1.0;
 
     // loop through command-line arguments
 
@@ -119,6 +125,17 @@
                         --*argv;
                         break;
 
+                    case 'B': case 'b':
+                        audio_window_ms = strtol (++*argv, argv, 10);
+
+                        if (audio_window_ms < 1 || audio_window_ms > 100) {
+                            fprintf (stderr, "\naudio window is from 1 to 100 ms!\n");
+                            return -1;
+                        }
+
+                        --*argv;
+                        break;
+
                     case 'R': case 'r':
                         ratio = strtod (++*argv, argv);
 
@@ -130,6 +147,28 @@
                         --*argv;
                         break;
 
+                    case 'G': case 'g':
+                        silence_ratio = strtod (++*argv, argv);
+
+                        if (silence_ratio < 0.25 || silence_ratio > 4.0) {
+                            fprintf (stderr, "\ngap/silence ratio must be from 0.25 to 4.0!\n");
+                            return -1;
+                        }
+
+                        --*argv;
+                        break;
+
+                    case 'T': case 't':
+                        silence_threshold_dB = strtod (++*argv, argv);
+
+                        if (silence_threshold_dB < -70 || silence_threshold_dB > -10) {
+                            fprintf (stderr, "\nsilence threshold must be from -10 to -70 dB!\n");
+                            return -1;
+                        }
+
+                        --*argv;
+                        break;
+
                     case 'S': case 's':
                         scale_rate = 1;
                         break;
@@ -311,22 +350,28 @@
         return 1;
     }
 
-    min_period = WaveHeader.SampleRate / upper_frequency;
-    max_period = WaveHeader.SampleRate / lower_frequency;
-    int flags = 0;
+    int flags = 0, silence_mode = silence_ratio && !cycle_ratio && silence_ratio != ratio;
+    int buffer_samples = WaveHeader.SampleRate * (audio_window_ms / 1000.0);
+    int min_period = WaveHeader.SampleRate / upper_frequency;
+    int max_period = WaveHeader.SampleRate / lower_frequency;
+    float max_ratio = ratio;
 
-    if (force_dual || ratio < 0.5 || ratio > 2.0)
-        flags |= STRETCH_DUAL_FLAG;
+    if (force_dual || ratio < 0.5 || ratio > 2.0 ||
+        (silence_mode && (silence_ratio < 0.5 || silence_ratio > 2.0)))
+            flags |= STRETCH_DUAL_FLAG;
 
     if ((force_fast || WaveHeader.SampleRate >= 32000) && !force_normal)
         flags |= STRETCH_FAST_FLAG;
 
-    if (verbose_mode)
-        fprintf (stderr, "initializing stretch library with period range = %d to %d, %d channels, %s, %s\n",
+    if (verbose_mode) {
+        fprintf (stderr, "file sample rate is %lu Hz (%s), buffer size is %d samples\n",
+            (unsigned long) WaveHeader.SampleRate, WaveHeader.NumChannels == 2 ? "stereo" : "mono", buffer_samples);
+        fprintf (stderr, "stretch period range = %d to %d, %d channels, %s, %s\n",
             min_period, max_period, WaveHeader.NumChannels, (flags & STRETCH_FAST_FLAG) ? "fast mode" : "normal mode",
             (flags & STRETCH_DUAL_FLAG) ? "dual instance" : "single instance");
+    }
 
-    if (!quiet_mode && ratio == 1.0 && !cycle_ratio)
+    if (!quiet_mode && ratio == 1.0 && !silence_mode && !cycle_ratio)
         fprintf (stderr, "warning: a ratio of 1.0 will do nothing but copy the WAV file!\n");
 
     if (!quiet_mode && ratio != 1.0 && cycle_ratio && !scale_rate)
@@ -350,65 +395,127 @@
     write_pcm_wav_header (outfile, 0, WaveHeader.NumChannels, 2, scaled_rate);
 
     if (cycle_ratio)
-        ratio = (flags & STRETCH_DUAL_FLAG) ? 4.0 : 2.0;
+        max_ratio = (flags & STRETCH_DUAL_FLAG) ? 4.0 : 2.0;
+    else if (silence_mode && silence_ratio > max_ratio)
+        max_ratio = silence_ratio;
 
-    int max_expected_samples = stretch_output_capacity (stretcher, buffer_samples, ratio);
+    int max_expected_samples = stretch_output_capacity (stretcher, buffer_samples, max_ratio);
+    int16_t *inbuffer = malloc (buffer_samples * WaveHeader.BlockAlign), *prebuffer = NULL;
     int16_t *outbuffer = malloc (max_expected_samples * WaveHeader.BlockAlign);
-    int16_t *inbuffer = malloc (buffer_samples * WaveHeader.BlockAlign);
+    int non_silence_frames = 0, silence_frames = 0, used_silence_frames = 0;
     int max_generated_stretch = 0, max_generated_flush = 0;
+    int samples_to_stretch = 0, consecutive_silence_frames = 1;
 
-    if (!inbuffer || !outbuffer) {
+    /* in the gap/silence mode we need an additional buffer to scan the "next" buffer for level */
+
+    if (silence_mode)
+        prebuffer = malloc (buffer_samples * WaveHeader.BlockAlign);
+
+    if (!inbuffer || !outbuffer || (silence_mode && !prebuffer)) {
         fprintf (stderr, "can't allocate required memory!\n");
         fclose (infile);
         return 1;
     }
 
+    /* read the entire file in frames and process with stretch */
+
     while (1) {
-        int samples_read = fread (inbuffer, WaveHeader.BlockAlign,
+        int samples_read = fread (silence_mode ? prebuffer : inbuffer, WaveHeader.BlockAlign,
             samples_to_process >= buffer_samples ? buffer_samples : samples_to_process, infile);
-        int samples_generated;
 
+        if (!silence_mode && !samples_read)
+            break;
+
         insamples += samples_read;
         samples_to_process -= samples_read;
 
+        /* this is where we scan the frame we just read to see if it's below the silence threshold */
+
+        if (silence_mode) {
+            if (samples_read) {
+                double level = rms_level_dB (prebuffer, samples_read, WaveHeader.NumChannels);
+
+                if (level > silence_threshold_dB) {
+                    consecutive_silence_frames = 0;
+                    non_silence_frames++;
+                }
+                else {
+                    consecutive_silence_frames++;
+                    silence_frames++;
+                }
+            }
+        }
+        else
+            samples_to_stretch = samples_read;
+
         if (cycle_ratio) {
             if (flags & STRETCH_DUAL_FLAG)
-                ratio = (sin ((double) outsamples / WaveHeader.SampleRate) * (cycle_ratio & 1 ? 1.875 : -1.875)) + 2.125;
+                ratio = (sin ((double) outsamples / WaveHeader.SampleRate / 2.0) * (cycle_ratio & 1 ? 1.875 : -1.875)) + 2.125;
             else
                 ratio = (sin ((double) outsamples / WaveHeader.SampleRate) * (cycle_ratio & 1 ? 0.75 : -0.75)) + 1.25;
         }
 
-        if (samples_read) {
-            samples_generated = stretch_samples (stretcher, inbuffer, samples_read, outbuffer, ratio);
+        if (samples_to_stretch) {
+            int samples_generated;
 
-            if (samples_generated > max_generated_stretch)
-                max_generated_stretch = samples_generated;
-        }
-        else {
-            samples_generated = stretch_flush (stretcher, outbuffer);
+            /* we use the gap/silence stretch ratio if the current frame, and the ones on either side, measure below the threshold */
 
-            if (samples_generated > max_generated_flush)
-                max_generated_flush = samples_generated;
-        }
+            if (consecutive_silence_frames >= 3) {
+                samples_generated = stretch_samples (stretcher, inbuffer, samples_to_stretch, outbuffer, silence_ratio);
+                used_silence_frames++;
+            }
+            else
+                samples_generated = stretch_samples (stretcher, inbuffer, samples_to_stretch, outbuffer, ratio);
 
-        if (samples_generated) {
-            fwrite (outbuffer, WaveHeader.BlockAlign, samples_generated, outfile);
-            outsamples += samples_generated;
+            if (samples_generated) {
+                if (samples_generated > max_generated_stretch)
+                    max_generated_stretch = samples_generated;
 
-            if (samples_generated > max_expected_samples) {
-                fprintf (stderr, "%s: generated samples (%d) exceeded expected (%d)!\n", samples_read ? "stretch" : "flush",
-                    samples_generated, max_expected_samples);
-                fclose (infile);
-                return 1;
+                fwrite (outbuffer, WaveHeader.BlockAlign, samples_generated, outfile);
+                outsamples += samples_generated;
+
+                if (samples_generated > max_expected_samples) {
+                    fprintf (stderr, "stretch: generated samples (%d) exceeded expected (%d)!\n", samples_generated, max_expected_samples);
+                    fclose (infile);
+                    return 1;
+                }
             }
         }
 
-        if (!samples_read && !samples_generated)
+        if (silence_mode) {
+            if (samples_read) {
+                memcpy (inbuffer, prebuffer, samples_read * WaveHeader.BlockAlign);
+                samples_to_stretch = samples_read;
+            }
+            else
+                break;
+        }
+    }
+
+    /* next call the stretch flush function until it returns zero */
+
+    while (1) {
+        int samples_flushed = stretch_flush (stretcher, outbuffer);
+
+        if (!samples_flushed)
             break;
+
+        if (samples_flushed > max_generated_flush)
+            max_generated_flush = samples_flushed;
+
+        fwrite (outbuffer, WaveHeader.BlockAlign, samples_flushed, outfile);
+        outsamples += samples_flushed;
+
+        if (samples_flushed > max_expected_samples) {
+            fprintf (stderr, "flush: generated samples (%d) exceeded expected (%d)!\n", samples_flushed, max_expected_samples);
+            fclose (infile);
+            return 1;
+        }
     }
 
     free (inbuffer);
     free (outbuffer);
+    free (prebuffer);
     stretch_deinit (stretcher);
 
     fclose (infile);
@@ -425,6 +532,12 @@
                 (unsigned long) WaveHeader.SampleRate, (unsigned long) scaled_rate);
         fprintf (stderr, "max expected samples = %d, actually seen = %d stretch, %d flush\n",
             max_expected_samples, max_generated_stretch, max_generated_flush);
+        if (silence_frames || non_silence_frames) {
+            int total_frames = silence_frames + non_silence_frames;
+            fprintf (stderr, "%d silence frames detected (%.2f%%), %d actually used (%.2f%%)\n",
+                silence_frames, silence_frames * 100.0 / total_frames,
+                used_silence_frames, used_silence_frames * 100.0 / total_frames); 
+        }
     }
 
     return 0;
@@ -461,4 +574,21 @@
         fwrite (&fmthdr, sizeof (fmthdr), 1, outfile) &&
         fwrite (&wavhdr, wavhdrsize, 1, outfile) &&
         fwrite (&datahdr, sizeof (datahdr), 1, outfile);
+}
+
+double rms_level_dB (int16_t *audio, int samples, int channels)
+{   /* returns 10 * log10 of the frame's mean-square level relative to (32768 * 32767 * 0.5) */
+    double rms_sum = 0.0;   /* running sum of squared values; divided by "samples" only at the end */
+    int i;                  /* frame index; "samples" counts frames, not individual int16_t values */
+
+    if (channels == 1)      /* mono: each sample is squared directly */
+        for (i = 0; i < samples; ++i)
+            rms_sum += (double) audio [i] * audio [i];   /* cast to double before multiplying */
+    else                    /* any other channel count is read as interleaved pairs (indexes i*2 and i*2+1) */
+        for (i = 0; i < samples; ++i) {
+            double average = (audio [i * 2] + audio [i * 2 + 1]) / 2.0;   /* the pair is averaged to one value before squaring */
+            rms_sum += average * average;
+        }
+
+    return log10 (rms_sum / samples / (32768.0 * 32767.0 * 0.5)) * 10.0;   /* "samples" must be non-zero; NOTE(review): reference looks like a full-scale sine's mean square - confirm */
 }