shithub: audio-stretch

ref: fdf38c9b21956b1aab29005f20bd9d1d5af7fe06
dir: /this.c/

View raw version
#include <u.h>
#include <libc.h>
#include <thread.h>
#include "stretch.h"

/* Fixed PCM stream format this program reads and writes. */
enum{
	Nrate = 44100,		/* sample rate, Hz */
	Nchan = 2,		/* interleaved channels per sample frame */
	Sampsz = 2 * Nchan,	/* bytes per sample frame (2 bytes per channel) */
};

/*
 * Print the command-line synopsis to standard error (fd 2) and
 * terminate the program with exit status "usage".  Never returns.
 *
 * The option letters listed here must match the ARGBEGIN switch in
 * threadmain: the window length is selected with -w and -v enables
 * the verbose report.
 */
void
usage(void)
{
	fprint(2, "usage: %s [OPTIONS] [FILE]\n"
		"-r F  stretch ratio, [0.25,4.0] default 1.0\n"
		"-g F  gap/silence stretch ratio (if different)\n"
		"-u N  upper freq period limit [40,44100[ default 333 Hz\n"
		"-l N  lower freq period limit, [20,[ default 55 Hz\n"
		"-w N  audio buffer/window length, [1,100] default 25 ms\n"
		"-t N  gap/silence threshold (dB re FS, default -40)\n"
		"-c	cycle through all ratios, starting higher\n"
		"-C	cycle through all ratios, starting lower\n"
		"-d	force dual instance even for shallow ratios\n"
		"-s	scale rate to preserve duration (not pitch)\n"
		"-f	fast pitch detection (default >= 32 kHz)\n"
		"-n	normal pitch detection (default < 32 kHz)\n"
		"-v	verbose, report settings and statistics on stderr\n",
		argv0);
	exits("usage");
}

double
rms_level_dB (int16_t *audio, int samples, int channels)
{
	double rms_sum = 0.0;
	int i;

	if (channels == 1)
		for (i = 0; i < samples; ++i)
			rms_sum += (double) audio [i] * audio [i];
	else
		for (i = 0; i < samples; ++i) {
			double average = (audio [i * 2] + audio [i * 2 + 1]) / 2.0;
			rms_sum += average * average;
		}

	return log10 (rms_sum / samples / (32768.0 * 32767.0 * 0.5)) * 10.0;
}

/*
 * Program entry point (libthread).
 *
 * Reads 16-bit, Nchan-channel PCM at Nrate Hz from FILE (or standard
 * input when no file is given), passes it window by window through the
 * stretch library and writes the result to standard output.
 *
 * Modes selected by the options:
 *  - plain:   every window is stretched by the -r ratio.
 *  - silence: when -g differs from -r (and no cycling is requested),
 *             windows are classified by rms_level_dB against the -t
 *             threshold one window ahead of the one being stretched;
 *             after three consecutive quiet windows the -g ratio is
 *             used instead of -r.
 *  - cycle:   -c/-C sweep the ratio sinusoidally as a function of the
 *             number of output frames produced so far.
 *  - scale:   -s pipes the output through /bin/audio/pcmconv, declared
 *             as running at Nrate*ratio Hz and converted to 44100 Hz.
 *
 * With -v, settings and end-of-run statistics go to standard error.
 * Any failure (bad option, open, allocation, read, write, overflow of
 * the output buffer) terminates the program.
 */
void
threadmain(int argc, char **argv)
{
	int n, m, fd, ofd, pfd[2];
	int cycle, dual, fast, normal, doscale, verbose;
	int lf, uf, wsz, flags, silence;
	int nibuf, maxnsamp, min_period, max_period;
	int non_silence_frames, silence_frames, used_silence_frames;
	int max_generated_stretch, max_generated_flush;
	int samples_to_stretch, consecutive_silence_frames;
	long nr;
	char *r;
	s16int *prebuf, *ibuf, *obuf;
	u32int insamp, outsamp;
	float max_ratio;
	double level, ratio, gap, smin;
	StretchHandle S;

	fd = 0;
	ofd = 1;
	cycle = 0;	/* cycle ratio, direction, 1: fw, 2: back */
	dual = 0;	/* force dual instance */
	fast = 0;	/* force fast pitch detection */
	normal = 0;	/* force normal pitch detection */
	doscale = 0;	/* scale rate to preserve duration */
	lf = 55;	/* freq lower bound */
	uf = 333;	/* freq upper bound */
	gap = 0.0;	/* gap/silence stretch ratio */
	ratio = 1.0;	/* stretch ratio */
	wsz = 25;	/* window size, ms */
	smin = -40;	/* gap/silence threshold */
	verbose = 0;
	ARGBEGIN{
	case 'C':
		cycle = 2;
		break;
	case 'c':
		cycle = 1;
		break;
	case 'd':
		dual = 1;
		break;
	case 'f':
		fast = 1;
		break;
	case 'g':
		gap = strtod(EARGF(usage()), nil);
		if(gap < 0.25 || gap > 4.0)
			usage();
		break;
	case 'l':
		lf = strtol(EARGF(usage()), nil, 10);
		if(lf < 20)
			usage();
		break;
	case 'n':
		normal = 1;
		break;
	case 'r':
		ratio = strtod(EARGF(usage()), nil);
		if(ratio < 0.25 || ratio > 4.0)
			usage();
		break;
	case 's':
		doscale = 1;
		break;
	case 't':
		smin = strtod(EARGF(usage()), nil);
		if(smin < -70 || smin > -10)
			usage();
		break;
	case 'u':
		uf = strtol(EARGF(usage()), nil, 10);
		if(uf < 40)
			usage();
		break;
	case 'v':
		verbose = 1;
		break;
	case 'b':	/* accepted as an alias of -w */
	case 'w':
		wsz = strtol(EARGF(usage()), nil, 10);
		if(wsz < 1 || wsz > 100)
			usage();
		break;
	case 'h':
	default:
		usage();
		break;
	}ARGEND
	/* report the real reason an input file cannot be opened */
	if(*argv != nil && (fd = open(*argv, OREAD)) < 0)
		sysfatal("open %s: %r", *argv);

	insamp = 0;
	outsamp = 0;
	/* pitch period search range, in samples */
	min_period = Nrate / uf;
	max_period = Nrate / lf;
	flags = 0;
	/* gap handling is only active with a distinct -g and no cycling */
	silence = gap != 0.0 && gap != ratio && !cycle;
	/* sample frames per window */
	nibuf = Nrate * (wsz / 1000.0);
	max_ratio = ratio;
	/* ratios outside [0.5,2.0] require the dual-instance mode */
	if(dual || ratio < 0.5 || ratio > 2.0 ||
		(silence && (gap < 0.5 || gap > 2.0)))
		flags |= STRETCH_DUAL_FLAG;
	if((fast || Nrate >= 32000) && !normal)
		flags |= STRETCH_FAST_FLAG;
	if(verbose){
		fprint(2, "file sample rate is %d Hz (%s), buffer size is %d samples\n",
			Nrate, Nchan == 2 ? "stereo" : "mono", nibuf);
		fprint(2, "stretch period range = %d to %d, %d channels, %s, %s\n",
			min_period, max_period, Nchan,
			(flags & STRETCH_FAST_FLAG) ? "fast mode" : "normal mode",
			(flags & STRETCH_DUAL_FLAG) ? "dual instance" : "single instance");
	}
	if((S = stretch_init(min_period, max_period, Nchan, flags)) == NULL)
		sysfatal("initialization failed");
	/* size the output buffer for the largest ratio that can occur */
	if(cycle)
		max_ratio = (flags & STRETCH_DUAL_FLAG) ? 4.0 : 2.0;
	else if(silence && gap > max_ratio)
		max_ratio = gap;
	maxnsamp = stretch_output_capacity(S, nibuf, max_ratio);
	obuf = prebuf = nil;
	/* prebuf (the look-ahead window) is only needed in silence mode */
	if((ibuf = malloc(nibuf * Sampsz)) == nil
	|| (obuf = malloc(maxnsamp * Sampsz)) == nil
	|| (silence && (prebuf = malloc(nibuf * Sampsz)) == nil))
		sysfatal("malloc: %r");
	non_silence_frames = 0;
	silence_frames = 0;
	used_silence_frames = 0;
	max_generated_stretch = 0;
	max_generated_flush = 0;
	samples_to_stretch = 0;
	/* starts at 1, so it stays below the threshold of 3 when silence mode is off */
	consecutive_silence_frames = 1;

	if(doscale){
		/*
		 * Route our output through pcmconv.  The child reads the
		 * pipe on fd 0 and keeps the original standard output; the
		 * parent writes to the other end of the pipe instead of fd 1.
		 */
		if(pipe(pfd) < 0)
			sysfatal("pipe: %r");
		switch(rfork(RFPROC|RFFDG)){
		case -1:
			sysfatal("rfork: %r");
		case 0:
			close(0);
			close(pfd[1]);
			dup(pfd[0], 0);
			if((r = smprint("s16c2r%d", (u32int)(Nrate * ratio + 0.5))) == nil)
				sysfatal("smprint: %r");
			execl("/bin/audio/pcmconv", "pcmconv", "-i", r, "-o", "s16c2r44100", nil);
			sysfatal("execl: %r");
		default:
			close(1);
			close(pfd[0]);
			ofd = pfd[1];
		}
	}
	for(;;){
		/*
		 * readn keeps reading until a whole window is available (or
		 * EOF), so a short read cannot split a sample frame.  The
		 * error check must precede the division: -1 / Sampsz is 0.
		 */
		nr = readn(fd, silence ? prebuf : ibuf, Sampsz * nibuf);
		if(nr < 0)
			sysfatal("read: %r");
		n = nr / Sampsz;
		if(!silence && n == 0)
			break;
		insamp += n;
		if(silence){
			/* classify the look-ahead window just read */
			if(n != 0){
				level = rms_level_dB(prebuf, n, Nchan);
				if(level > smin){
					consecutive_silence_frames = 0;
					non_silence_frames++;
				}
				else{
					consecutive_silence_frames++;
					silence_frames++;
				}
			}
		}
		else
			samples_to_stretch = n;
		if(cycle){
			/*
			 * Sweep the ratio with the output position:
			 * 2.125 +- 1.875 spans [0.25,4.0] (dual),
			 * 1.25 +- 0.75 spans [0.5,2.0] (single).
			 * cycle == 1 starts upward, cycle == 2 downward.
			 */
			if(flags & STRETCH_DUAL_FLAG)
				ratio = (sin((double) outsamp / Nrate / 2.0) * (cycle & 1 ? 1.875 : -1.875)) + 2.125;
			else
				ratio = (sin((double) outsamp / Nrate) * (cycle & 1 ? 0.75 : -0.75)) + 1.25;
		}
		if(samples_to_stretch){
			/* in silence mode ibuf holds the previous window */
			if(consecutive_silence_frames >= 3){
				m = stretch_samples(S, ibuf, samples_to_stretch, obuf, gap);
				used_silence_frames++;
			}else
				m = stretch_samples(S, ibuf, samples_to_stretch, obuf, ratio);
			if(m){
				if(m > max_generated_stretch)
					max_generated_stretch = m;
				/* detect the overrun before using the buffer */
				if(m > maxnsamp)
					sysfatal("sample generation overflow");
				if(write(ofd, obuf, Sampsz * m) != Sampsz * m)
					sysfatal("write: %r");
				outsamp += m;
			}
		}
		if(silence){
			if(n == 0)
				break;
			/* the look-ahead window becomes the next one to stretch */
			memcpy(ibuf, prebuf, n * Sampsz);
			samples_to_stretch = n;
		}
	}
	/* drain whatever the stretcher still holds */
	for(;;){
		if((n = stretch_flush(S, obuf)) == 0)
			break;
		if(n > max_generated_flush)
			max_generated_flush = n;
		if(n > maxnsamp)
			sysfatal("flush overflow");
		if(write(ofd, obuf, Sampsz * n) != Sampsz * n)
			sysfatal("write: %r");
		/* n is already a count of sample frames, like m above */
		outsamp += n;
	}
	if(insamp && verbose){
		fprint(2, "done, %ud samples --> %ud samples (ratio = %.3f)\n",
			insamp, outsamp, (double)outsamp / insamp);
		if(doscale)
			fprint(2, "sample rate changed from %d Hz to %ud Hz\n",
				Nrate, (u32int)(Nrate * ratio + 0.5));
		fprint(2, "max expected samples = %d, actually seen = %d stretch, %d flush\n",
			maxnsamp, max_generated_stretch, max_generated_flush);
		if(silence_frames || non_silence_frames){
			int total_frames = silence_frames + non_silence_frames;
			fprint(2, "%d silence frames detected (%.2f%%), %d actually used (%.2f%%)\n",
				silence_frames, silence_frames * 100.0 / total_frames,
				used_silence_frames, used_silence_frames * 100.0 / total_frames);
		}
	}
	stretch_deinit(S);
	exits(nil);
}