/*
 * shithub: hj264
 *
 * ref: 98a5c516f0d3c3a604958f27b9808ce63fd8df0a
 * dir: /hj264.c/
 *
 * View raw version
 */
#include "builtins.h"
#define MINIH264_IMPLEMENTATION
#define H264E_MAX_THREADS 7
#include "minih264e.h"
#include <thread.h>
#include <bio.h>
#include <draw.h>
#include <tos.h>
#include <npe.h>
#include "yuv.h"

/*
 * Function-like helpers.  Arguments may be evaluated more than once,
 * so only pass expressions without side effects.
 */
#define max(a,b) ((a)>(b)?(a):(b))
#define min(a,b) ((a)<(b)?(a):(b))
/* clamp v to the range [a, b] */
#define clp(v,a,b) min((b), max((v),(a)))
/*
 * Round pointer p up to the next multiple of a (a has to be a power of
 * two for the bit trick to work); an already aligned p is returned as is.
 * p and a are not parenthesized in the expansion: the (uintptr) cast binds
 * only to the first operand of p, so in align(ptr+n, a) the "+n" is added
 * in bytes regardless of the pointer type of ptr.  Callers in this file
 * depend on that; do not "fix" the parentheses without auditing them.
 */
#define align(p,a) (void*)((((uintptr)p - 1) | (a-1)) + 1)

enum {
	/* output container formats, stored in Hj264.fmt */
	FmtRaw, /* encoder output is written without any framing */
	FmtIVF, /* file header plus a 12-byte size/timestamp header per frame */

	Align = 64, /* alignment of the buffers carved out of Hj264.buf */
	Maxquality = 10, /* upper bound for -q; encode_speed is Maxquality - quality */

	/*
	 * Stored little-endian in the IVF file header written by main.
	 * Presumably the timebase denominator: frame timestamps are
	 * computed as ns / Nmsec — confirm against the IVF layout.
	 */
	TimedenumIVF = 1000ULL,
};

/* short names for the structures defined below */
typedef struct Hjob Hjob;
typedef struct Hjthread Hjthread;
typedef struct Hj264 Hj264;
typedef struct Img Img;

/* One unit of work for a worker: threadf calls run(arg). */
struct Hjob {
	void (*run)(void *);	/* function to execute */
	void *arg;		/* its only argument */
};

/* Per-worker state shared between hj264new/hjobsrun and threadf. */
struct Hjthread {
	int id;		/* index of the worker, used in its thread name */
	Channel *job;	/* Hjob pointers sent to the worker; nil ends its loop */
	Channel *done;	/* worker sends here once at startup and after each job */
};

/*
 * Encoder instance.  The structure is allocated together with the YUV
 * planes and the encoder's persistent and scratch memory, all of which
 * live in the storage that starts at buf (see hj264new).
 */
struct Hj264 {
	H264E_persist_t *persist;	/* encoder state, points into buf */
	H264E_scratch_t *scratch;	/* encoder scratch memory, points into buf */
	H264E_run_param_t rp;		/* per-call parameters passed to H264E_encode */
	H264E_io_yuv_t ioyuv;		/* plane pointers and strides given to the encoder */
	YUV yuv;			/* same planes, as filled by xrgb2yuv420 */
	int fmt;			/* FmtRaw or FmtIVF */
	Biobuf out;			/* buffered output, initialized on fd 1 by main */
	Channel *frame;			/* Img pointers from main to encthread */
	Channel *done;			/* encthread sends here when it is finished */
	Hjthread threads[H264E_MAX_THREADS];	/* worker procs, nthreads of them in use */
	Hjob jobs[H264E_MAX_THREADS];		/* jobs[t] is the job slot of threads[t] */
	int nthreads;			/* number of running workers */
	u8int buf[1];			/* start of the trailing storage; allocation extends past the struct */
};

/* One captured frame, allocated by imgread with w*h*4 bytes of pixels. */
struct Img {
	uvlong ns;	/* npe_nanosec() value taken right after the read completed */
	int w;		/* width in pixels */
	int h;		/* height in pixels */
	u8int bgrx[];	/* pixel data, 4 bytes per pixel */
};

/* set by -O; when zero, encthread drops frames identical to the previously encoded one */
static int nopt;

#pragma varargck type "ℏ" int
/*
 * Format verb handler: takes an int H264E status code from the argument
 * list and prints its description, or "error N" for codes not listed here.
 * Returns whatever fmtprint returns.
 */
static int
hjerror(Fmt *f)
{
	int status;
	char *msg;

	status = va_arg(f->args, int);
	switch(status){
	case H264E_STATUS_SUCCESS:
		msg = "success";
		break;
	case H264E_STATUS_BAD_ARGUMENT:
		msg = "bad argument";
		break;
	case H264E_STATUS_BAD_PARAMETER:
		msg = "bad parameter";
		break;
	case H264E_STATUS_BAD_FRAME_TYPE:
		msg = "bad frame type";
		break;
	case H264E_STATUS_SIZE_NOT_MULTIPLE_16:
		msg = "size not multiple of 16";
		break;
	case H264E_STATUS_SIZE_NOT_MULTIPLE_2:
		msg = "size not multiple of 2";
		break;
	case H264E_STATUS_BAD_LUMA_ALIGN:
		msg = "bad luma alignment";
		break;
	case H264E_STATUS_BAD_LUMA_STRIDE:
		msg = "bad luma stride";
		break;
	case H264E_STATUS_BAD_CHROMA_ALIGN:
		msg = "bad chroma alignment";
		break;
	case H264E_STATUS_BAD_CHROMA_STRIDE:
		msg = "bad chroma stride";
		break;
	default:
		msg = nil;
		break;
	}

	if(msg != nil)
		return fmtprint(f, "%s", msg);
	return fmtprint(f, "error %d", status);
}

/*
 * Worker proc body; p is the worker's Hjthread.
 * Sends nil on the done channel once to signal that it is up, then runs
 * every Hjob received on the job channel and sends the job back on done.
 * A nil from recvp (nil job, or failed receive) ends the loop.
 */
static void
threadf(void *p)
{
	Channel *jobc, *donec;
	Hjthread *self;
	Hjob *work;

	self = p;
	threadsetname("hj264/%d", self->id);

	/* keep private copies of the channel pointers for use after the loop */
	jobc = self->job;
	donec = self->done;

	sendp(donec, nil);
	while((work = recvp(jobc)) != nil){
		work->run(work->arg);
		sendp(donec, work);
	}

	chanfree(donec);
	chanfree(jobc);

	/* NOTE(review): threadexitsall ends every proc of the program, not just this worker — confirm that is intended */
	threadexitsall(nil);
}

/*
 * Callback installed as cp.run_func_in_thread: run run(arg[i]) for all
 * njob arguments on the worker procs and return once all have finished.
 * p is the Hj264 (the token given to the encoder).  Jobs are issued in
 * rounds of at most h->nthreads; each round is fully collected before
 * the next one starts.
 */
static void
hjobsrun(void *p, void (*run)(void *), void **arg, int njob)
{
	int next, w;
	Hj264 *h;
	Hjob *slot;

	h = p;
	next = 0;
	while(next < njob){
		/* hand one job to each worker while there are jobs left */
		for(w = 0; w < h->nthreads && next < njob; w++){
			slot = &h->jobs[w];
			slot->run = run;
			slot->arg = arg[next++];
			sendp(h->threads[w].job, slot);
		}

		/* wait for the workers used in this round, last one first */
		while(--w >= 0)
			recvp(h->threads[w].done);
	}
}

/*
 * Encode the frame currently held in h->ioyuv.
 * On success returns 0 with *data and *sz set by H264E_encode.
 * On failure sets errstr and returns -1.
 */
static int
hj264_encode(Hj264 *h, u8int **data, int *sz)
{
	int status;

	status = H264E_encode(h->persist, h->scratch, &h->rp, &h->ioyuv, data, sz);
	if(status == 0)
		return 0;

	werrstr("H264E_encode: %ℏ", status);
	return -1;
}

static Hj264 *
hj264new(int nthreads, int denoise, int kbps, int gop, int ww, int hh)
{
	int i, e, szscratch, szpersist, szyuv;
	H264E_create_param_t cp;
	Hjthread *t;
	u8int *p;
	Hj264 *h;

	nthreads = clp(nthreads, 1, H264E_MAX_THREADS);
	/* YUV logic requires alignment, allow height to be different (pad it) */
	hh = ((hh-1) | 15) + 1;

	memset(&cp, 0, sizeof(cp));
	cp.num_layers = 1;
	cp.gop = gop;
	cp.max_threads = nthreads;
	cp.temporal_denoise_flag = denoise;
	cp.max_long_term_reference_frames = MAX_LONG_TERM_FRAMES;
	cp.vbv_size_bytes = kbps/1000*8/2; /* 2 seconds */
	cp.width = ww;
	cp.height = hh;

	if((e = H264E_sizeof(&cp, &szpersist, &szscratch)) != 0){
		werrstr("H264E_sizeof: %ℏ", e);
		return nil;
	}

	/* FIXME not padding width yet, so it still has to be multiple of 16 */
	/* once we do that, put this line to where "hh" is aligned */
	ww = ((ww-1) | 15) + 1;

	szyuv = ww*hh*3/2;
	if((h = calloc(1, sizeof(*h) + Align+szyuv + Align+szpersist + Align+szscratch)) == nil)
		return nil;

	p = align(h->buf, Align);
	h->ioyuv.yuv[0] = p;
	h->ioyuv.stride[0] = ww;
	h->ioyuv.yuv[1] = p + ww*hh;
	h->ioyuv.stride[1] = ww/2;
	h->ioyuv.yuv[2] = p + ww*hh*5/4;
	h->ioyuv.stride[2] = ww/2;
	h->yuv.y = h->ioyuv.yuv[0]; h->yuv.ys = h->ioyuv.stride[0];
	h->yuv.u = h->ioyuv.yuv[1]; h->yuv.us = h->ioyuv.stride[1];
	h->yuv.v = h->ioyuv.yuv[2]; h->yuv.vs = h->ioyuv.stride[2];
	h->persist = align(p+szyuv, Align);
	h->scratch = align(h->persist+szpersist, Align);

	cp.token = h;
	cp.run_func_in_thread = hjobsrun;
	if((e = H264E_init(h->persist, &cp)) != 0){
		werrstr("H264E_init: %ℏ", e);
		return nil;
	}

	h->nthreads = nthreads;
	for(i = 0; i < nthreads; i++){
		t = &h->threads[i];
		t->id = i;
		t->job = chancreate(sizeof(void*), 0);
		t->done = chancreate(sizeof(void*), 0);
		procrfork(threadf, t, mainstacksize, RFCFDG|RFCENVG);
		recvp(t->done);
	}

	return h;
}

/*
 * Close the job and done channels of every worker (which makes the
 * workers' receive fail and ends their loops) and release the encoder.
 */
static void
hj264free(Hj264 *h)
{
	Hjthread *t, *end;

	end = h->threads + h->nthreads;
	for(t = h->threads; t < end; t++){
		chanclose(t->done);
		chanclose(t->job);
	}

	free(h);
}

/*
 * Encoder proc body; p is the Hj264.
 * Receives Img pointers on h->frame until nil arrives, converts each
 * frame to YUV, encodes it and writes the result to h->out (preceded by
 * a 12-byte size/timestamp header for FmtIVF).  Unless nopt is set, a
 * frame whose pixels equal the previously encoded frame is dropped.
 * Every received Img is freed here.  On exit the output is flushed,
 * h->frame is closed, h->done is signalled and the program terminates.
 */
static void
encthread(void *p)
{
	u8int *data, hdr[12];
	Img *img, *prev;
	uvlong ts;
	Hj264 *h;
	int sz, i;

	threadsetname("hj264/encthread");

	h = p;
	prev = nil;
	while((img = recvp(h->frame)) != nil){
		/* skip frames that did not change since the last encoded one */
		if(!nopt && prev != nil && memcmp(img->bgrx, prev->bgrx, img->w*img->h*4) == 0){
			free(img);
			continue;
		}

		xrgb2yuv420(img->bgrx, img->w, img->h, &h->yuv);
		ts = img->ns / Nmsec;
		if(nopt){
			/* no comparison wanted, nothing to keep */
			free(img);
		}else{
			/* keep this frame around for the next comparison */
			free(prev);
			prev = img;
		}

		if(hj264_encode(h, &data, &sz) != 0)
			sysfatal("hj264_encode: %r");
		if(h->fmt == FmtIVF){
			/* frame header: 4 bytes of size, 8 bytes of timestamp, low byte first */
			for(i = 0; i < 4; i++)
				hdr[i] = sz >> (8*i);
			for(i = 0; i < 8; i++)
				hdr[4+i] = ts >> (8*i);
			if(Bwrite(&h->out, hdr, 12) != 12)
				break;
		}
		if(Bwrite(&h->out, data, sz) != sz)
			break;
	}

	Bflush(&h->out);

	/* closing the channel makes further sends from main fail */
	chanclose(h->frame);
	if(h->done != nil)
		sendp(h->done, nil);

	free(prev);

	threadexitsall(nil);
}

/*
 * Read one w×h frame (4 bytes per pixel) from file descriptor f.
 * The pixels are read with pread starting at offset 5*12, i.e. right
 * after the 60-byte image header, so every call reads the same region
 * of the file again.  The frame is stamped with npe_nanosec() once the
 * read is complete.
 *
 * Returns a malloc'ed Img the caller has to free, or nil if memory
 * could not be allocated or the read failed or hit end of file.
 */
static Img *
imgread(int f, int w, int h)
{
	int r, n, e;
	Img *i;

	e = w*h*4;
	if((i = malloc(sizeof(*i) + e)) == nil)
		return nil;
	i->w = w;
	i->h = h;
	/* pread may return short counts; keep going until the frame is complete */
	for(n = 0; n < e; n += r){
		if((r = pread(f, i->bgrx+n, e-n, n+5*12)) <= 0){
			free(i);
			return nil;
		}
	}
	i->ns = npe_nanosec();

	return i;
}

/* Print the command line synopsis to fd 2 and terminate the program. */
static void
usage(void)
{
	fprint(2, "usage: %s [-d] [-D] [-f FPS] [-F FORMAT] [-g GOP] [-n THREADS] [-O] [-k KBPS] [-q 0…10] [-Q QP] FILE\n", argv0);
	threadexitsall("usage");
}

/*
 * nframes: frames handed to the encoder proc so far (loop counter in main)
 * tstart: npe_nanosec() value taken just before the capture loop starts
 * debug: non-zero (-d) makes done() print the average frame rate
 */
static uvlong nframes, tstart, debug;

/*
 * Final cleanup, registered as a note handler by main and also called
 * directly when main finishes.  With debug set, prints the average
 * frame rate since tstart (skipped while less than one whole Nsec unit
 * has elapsed), then flushes the output of the Hj264 stored in
 * procdata() and terminates the program with msg as the exit status.
 */
static int
done(void *, char *msg)
{
	uvlong elapsed;
	Hj264 *enc;

	if(debug){
		elapsed = (npe_nanosec() - tstart) / Nsec;
		if(elapsed != 0)
			fprint(2, "%llud fps\n", nframes / elapsed);
	}

	enc = *procdata();
	Bflush(&enc->out);
	threadexitsall(msg);
	return 1;
}

/*
 * Note handler: returns 1 for notes that start with "sys: write",
 * 0 for every other note.
 */
static int
pipeignore(void *, char *s)
{
	if(strncmp(s, "sys: write", 10) != 0)
		return 0;
	return 1;
}

int
main(int argc, char **argv)
{
	int nthreads, fps, kbps, denoise, quality, qp, gop;
	char *s, tmp[61], *f[5];
	uvlong fstart, fend;
	int ww, hh, in, fmt;
	u8int v[20];
	Img *img;
	Hj264 *h;

	/* use NPROC-1 threads by default */
	nthreads = ((s = getenv("NPROC")) != nil) ? atoi(s)-1 : 1;
	denoise = 0;
	quality = 0;
	kbps = 0;
	fps = 30;
	qp = 33;
	gop = 2*fps;
	fmt = FmtIVF;
	ARGBEGIN{
	case 'd':
		debug++;
		break;
	case 'D':
		denoise++;
		break;
	case 'f':
		fps = atoi(EARGF(usage()));
		break;
	case 'F':
		s = EARGF(usage());
		if(cistrcmp(s, "ivf") == 0)
			fmt = FmtIVF;
		else if(cistrcmp(s, "raw") == 0)
			fmt = FmtRaw;
		else
			sysfatal("unknown format %s", s);
		break;
	case 'g':
		gop = atoi(EARGF(usage()));
		break;
	case 'k':
		kbps = atoi(EARGF(usage()));
		break;
	case 'n':
		nthreads = atoi(EARGF(usage()));
		break;
	case 'O':
		nopt = 1;
		break;
	case 'q':
		quality = atoi(EARGF(usage()));
		break;
	case 'Q':
		qp = atoi(EARGF(usage()));
		break;
	default:
		usage();
	}ARGEND

	if(quality > Maxquality)
		quality = Maxquality;
	if(kbps < 0)
		kbps = 0;

	if(argc != 1)
		usage();
	if((in = open(*argv, OREAD)) < 0)
		sysfatal("input: %r");

	fmtinstall(L'ℏ', hjerror);

	tmp[60] = 0;
	if(readn(in, tmp, 60) != 60 || tokenize(tmp, f, 5) != 5)
		sysfatal("invalid image");
	if(strcmp(f[0], "x8r8g8b8") != 0)
		sysfatal("only x8r8g8b8 is supported");
	ww = atoi(f[3]) - atoi(f[1]);
	hh = atoi(f[4]) - atoi(f[2]);
	if(ww & 15)
		sysfatal("frame width has to be multiple of 16");
	if(ww < 16 || hh < 16)
		sysfatal("frame too small: %dx%d", ww, hh);

	if((h = hj264new(nthreads, denoise, kbps, gop, ww, hh)) == nil)
		sysfatal("hj264new: %r");
	if(Binit(&h->out, 1, OWRITE) < 0)
		sysfatal("Binit failed: %r");
	h->frame = chancreate(sizeof(void*), nthreads);
	h->done = chancreate(sizeof(void*), 0);
	h->fmt = fmt;

	/* FIXME how about changing these on the fly? */
	h->rp.encode_speed = Maxquality - quality;
	h->rp.qp_min = h->rp.qp_max = qp;
	if(kbps > 0){
		h->rp.qp_min = 10;
		h->rp.qp_max = 50;
		h->rp.desired_frame_bytes = kbps*1000/8/fps;
	}
	*procdata() = h;
	threadnotify(done, 1);
	atnotify(pipeignore, 1);
	procrfork(encthread, h, mainstacksize, RFCENVG);

	if(h->fmt == FmtIVF){
		Bwrite(&h->out, "DKIF\x00\x00\x20\x00AVC1", 12);
		v[0] = ww;
		v[1] = ww >> 8;
		v[2] = hh;
		v[3] = hh >> 8;
		v[4] = TimedenumIVF;
		v[5] = TimedenumIVF >> 8;
		v[6] = TimedenumIVF >> 16;
		v[7] = TimedenumIVF >> 24;
		v[8] = 1;
		v[9] = 0;
		v[10] = 0;
		v[11] = 0;
		memset(v+12, 0, 8); /* unknown duration */
		Bwrite(&h->out, v, sizeof(v));
		Bflush(&h->out);
	}

	tstart = npe_nanosec();
	for(nframes = 0;; nframes++){
		fstart = npe_nanosec();
		if((img = imgread(in, ww, hh)) == nil)
			break;
		if(sendp(h->frame, img) != 1)
			break;
		fend = npe_nanosec();

		if(Nsec/fps > (fend - fstart))
			npe_nsleep(Nsec/fps - (fend - fstart));
	}

	chanclose(h->frame);
	recvp(h->done);
	hj264free(h);

	return done(nil, nil);
}