shithub: hj264

ref: 440860780f767ae6af42ae2c9f49bcca46b6ceeb
dir: /hj264.c/

View raw version
#define MINIH264_IMPLEMENTATION
#define H264E_MAX_THREADS 7
#ifdef __amd64__
#define memcpy memcpyf
#endif
#include "minih264e.h"
#include <thread.h>
#include <bio.h>
#include <draw.h>
#include <memdraw.h>
#include <tos.h>

/*
 * Sleep for ns nanoseconds. Defined outside this file
 * (NOTE(review): presumably by the npe library — confirm);
 * used by main() to pace the capture loop.
 */
void npe_nsleep(uvlong ns);

/*
 * Small helpers. All of them may evaluate their arguments more than
 * once, so never pass expressions with side effects.
 */
#define max(a,b) ((a)>(b)?(a):(b))
#define min(a,b) ((a)<(b)?(a):(b))
/* clamp v to the range [a, b] */
#define clp(v,a,b) min((b), max((v),(a)))
/*
 * Round pointer p up to the next multiple of a; a must be a power of two.
 * Note that p is not parenthesized: the (uintptr) cast binds only to the
 * first operand of p, so align(ptr+n, a) advances by n *bytes* whatever
 * the pointee type of ptr is. hj264new appears to rely on this when it
 * offsets from h->persist, so parenthesize with care.
 */
#define align(p,a) (void*)((((uintptr)p - 1) | (a-1)) + 1)

enum {
	Align = 64,
	Maxquality = 10,
	Gop = 20,
};

/* Type names for the structures defined below. */
typedef struct Hjob Hjob;
typedef struct Hjthread Hjthread;
typedef struct Hj264 Hj264;
typedef struct Img Img;

/*
 * One unit of work handed to a worker: filled in by hjobsrun and
 * executed by threadf as run(arg).
 */
struct Hjob {
	void (*run)(void *);	/* function to call */
	void *arg;	/* its single argument */
};

/*
 * Per-worker state. Both channels are created in hj264new, closed in
 * hj264free and freed by the worker itself (threadf) when it exits.
 */
struct Hjthread {
	int id;	/* worker index, used for the thread name */
	Channel *job;	/* Hjob pointers sent to the worker */
	Channel *done;	/* worker sends here once at startup and after each job */
};

/*
 * Encoder instance. hj264new allocates it as a single block with extra
 * room after the structure; the YUV planes and the encoder's persist and
 * scratch areas are carved out of that trailing space starting at buf.
 */
struct Hj264 {
	H264E_persist_t *persist;	/* passed to H264E_init/H264E_encode; points into the trailing storage */
	H264E_scratch_t *scratch;	/* passed to H264E_encode; points into the trailing storage */
	H264E_run_param_t rp;	/* per-run parameters, filled in by main() */
	H264E_io_yuv_t yuv;	/* input planes and strides, written by xrgb2yuv */
	Biobuf out;	/* buffered output for the encoded stream */
	Channel *frame;	/* Img pointers from main() to encthread */
	Hjthread threads[H264E_MAX_THREADS];	/* worker state; first nthreads entries are live */
	Hjob jobs[H264E_MAX_THREADS];	/* job slot for each worker, reused by hjobsrun */
	int nthreads;	/* number of workers started */
	u8int buf[1];	/* start of the trailing storage; must stay the last member */
};

/*
 * One captured frame, allocated by imgread with w*h*4 bytes of pixel
 * data following the header.
 */
struct Img {
	int w;	/* width in pixels */
	int h;	/* height in pixels */
	u8int bgrx[];	/* 4 bytes per pixel; xrgb2yuv reads bytes 0,1,2 as B,G,R and ignores byte 3 */
};

/*
 * Convert a packed image with 4 bytes per pixel (byte order B, G, R,
 * unused) into the planar 4:2:0 buffers described by io.
 *
 * Luma is computed for every pixel. Chroma is taken from the top-left
 * pixel of each 2x2 block (no averaging). Rows are w pixels wide in the
 * source and io->stride[n] bytes apart in the destination planes.
 *
 * Odd widths and heights are handled: the unpaired last column/row
 * still gets luma (and chroma, when it starts a 2x2 block) without
 * reading past the w*h*4 bytes of source or writing past column w-1.
 */
static void
xrgb2yuv(u8int *bgrx, int w, int h, H264E_io_yuv_t *io)
{
	u8int *py, *pu, *pv;
	int x, y, r, g, b;

	py = io->yuv[0];
	pu = io->yuv[1];
	pv = io->yuv[2];

/* fetch the next source pixel into r, g, b */
#define RGB do{ b = bgrx[0]; g = bgrx[1]; r = bgrx[2]; bgrx += 4; }while(0)
#define YY ((( 77*r + 150*g +  29*b + 128) >> 8) +   0)
#define UU (((-43*r -  84*g + 127*b + 128) >> 8) + 128)
#define VV (((127*r - 106*g -  21*b + 128) >> 8) + 128)

	for(y = 0; y < h; y += 2){
		/* even row: luma for both pixels of a pair, chroma from the first */
		for(x = 0; x+1 < w; x += 2){
			RGB;
			py[x] = YY;
			pu[x/2] = UU;
			pv[x/2] = VV;

			RGB;
			py[x+1] = YY;
		}
		/* odd width: the last column has no partner */
		if(x < w){
			RGB;
			py[x] = YY;
			pu[x/2] = UU;
			pv[x/2] = VV;
		}
		py += io->stride[0];
		pu += io->stride[1];
		pv += io->stride[2];

		/* odd height: the last row has no partner, do not read past the image */
		if(y+1 >= h)
			break;

		/* odd row: luma only */
		for(x = 0; x < w; x++){
			RGB;
			py[x] = YY;
		}
		py += io->stride[0];
	}

#undef RGB
#undef YY
#undef UU
#undef VV
}

#pragma varargck type "ℏ" int
/*
 * Format verb handler (installed for 'ℏ' in main): prints the textual
 * name of an H264E_STATUS_* code taken from the argument list, or
 * "error N" for a code it does not know.
 */
static int
hjerror(Fmt *f)
{
	char *msg;
	int status;

	status = va_arg(f->args, int);
	switch(status){
	case H264E_STATUS_SUCCESS:
		msg = "success";
		break;
	case H264E_STATUS_BAD_ARGUMENT:
		msg = "bad argument";
		break;
	case H264E_STATUS_BAD_PARAMETER:
		msg = "bad parameter";
		break;
	case H264E_STATUS_BAD_FRAME_TYPE:
		msg = "bad frame type";
		break;
	case H264E_STATUS_SIZE_NOT_MULTIPLE_16:
		msg = "size not multiple of 16";
		break;
	case H264E_STATUS_SIZE_NOT_MULTIPLE_2:
		msg = "size not multiple of 2";
		break;
	case H264E_STATUS_BAD_LUMA_ALIGN:
		msg = "bad luma alignment";
		break;
	case H264E_STATUS_BAD_LUMA_STRIDE:
		msg = "bad luma stride";
		break;
	case H264E_STATUS_BAD_CHROMA_ALIGN:
		msg = "bad chroma alignment";
		break;
	case H264E_STATUS_BAD_CHROMA_STRIDE:
		msg = "bad chroma stride";
		break;
	default:
		msg = nil;
		break;
	}

	if(msg != nil)
		return fmtprint(f, "%s", msg);
	return fmtprint(f, "error %d", status);
}

/*
 * Worker body; p is the worker's Hjthread.
 *
 * Announces itself with one nil send on "done", then runs every Hjob
 * received on "job", acknowledging each on "done". A nil receive ends
 * the loop, after which the worker frees both channels. The channel
 * pointers are copied up front so the Hjthread is not touched again
 * after startup.
 */
static void
threadf(void *p)
{
	Hjthread *self;
	Channel *in, *out;
	Hjob *work;

	self = p;
	threadsetname("hj264/%d", self->id);

	in = self->job;
	out = self->done;

	/* tell the creator we are up and waiting */
	sendp(out, nil);
	while((work = recvp(in)) != nil){
		work->run(work->arg);
		sendp(out, work);
	}

	chanfree(out);
	chanfree(in);

	threadexits(nil);
}

/*
 * Run run(arg[i]) for i in [0, njob) on the worker threads and return
 * once all of them are done. Installed as the encoder's
 * run_func_in_thread callback; p is the Hj264 passed as its token.
 *
 * Jobs are issued in batches of at most nthreads: one per worker, then
 * every worker used in the batch is waited for before the next batch.
 */
static void
hjobsrun(void *p, void (*run)(void *), void **arg, int njob)
{
	Hj264 *enc;
	Hjob *slot;
	int next, used, i;

	enc = p;
	next = 0;
	while(next < njob){
		/* hand out one job per worker */
		used = 0;
		while(used < enc->nthreads && next < njob){
			slot = &enc->jobs[used];
			slot->run = run;
			slot->arg = arg[next];
			sendp(enc->threads[used].job, slot);
			used++;
			next++;
		}

		/* collect completions, last worker first */
		for(i = used-1; i >= 0; i--)
			recvp(enc->threads[i].done);
	}
}

/*
 * Encode the frame currently held in h->yuv using h->rp.
 * On success returns 0 with *data and *sz set by H264E_encode; on
 * failure sets the error string and returns -1.
 */
static int
hj264_encode(Hj264 *h, u8int **data, int *sz)
{
	int status;

	status = H264E_encode(h->persist, h->scratch, &h->rp, &h->yuv, data, sz);
	if(status == 0)
		return 0;

	werrstr("H264E_encode: %ℏ", status);
	return -1;
}

static Hj264 *
hj264new(int nthreads, int denoise, int kbps, int ww, int hh)
{
	int i, e, szscratch, szpersist, szyuv;
	H264E_create_param_t cp;
	Hjthread *t;
	u8int *p;
	Hj264 *h;

	nthreads = clp(nthreads, 1, H264E_MAX_THREADS);
	/* YUV logic requires alignment, allow height to be different (pad it) */
	hh = ((hh-1) | 15) + 1;

	memset(&cp, 0, sizeof(cp));
	cp.num_layers = 1;
	cp.gop = Gop;
	cp.max_threads = nthreads;
	cp.temporal_denoise_flag = denoise;
	cp.max_long_term_reference_frames = MAX_LONG_TERM_FRAMES;
	cp.vbv_size_bytes = kbps/1000*8/2; /* 2 seconds */
	cp.width = ww;
	cp.height = hh;

	if((e = H264E_sizeof(&cp, &szpersist, &szscratch)) != 0){
		werrstr("H264E_sizeof: %ℏ", e);
		return nil;
	}

	/* FIXME not padding width yet, so it still has to be multiple of 16 */
	/* once we do that, put this line to where "hh" is aligned */
	ww = ((ww-1) | 15) + 1;

	szyuv = ww*hh*3/2;
	if((h = calloc(1, sizeof(*h) + Align+szyuv + Align+szpersist + Align+szscratch)) == nil)
		return nil;

	p = align(h->buf, Align);
	h->yuv.yuv[0] = p;
	h->yuv.stride[0] = ww;
	h->yuv.yuv[1] = p + ww*hh;
	h->yuv.stride[1] = ww/2;
	h->yuv.yuv[2] = p + ww*hh*5/4;
	h->yuv.stride[2] = ww/2;
	h->persist = align(p+szyuv, Align);
	h->scratch = align(h->persist+szpersist, Align);

	cp.token = h;
	cp.run_func_in_thread = hjobsrun;
	if((e = H264E_init(h->persist, &cp)) != 0){
		werrstr("H264E_init: %ℏ", e);
		return nil;
	}

	h->nthreads = nthreads;
	for(i = 0; i < nthreads; i++){
		t = &h->threads[i];
		t->id = i;
		t->job = chancreate(sizeof(void*), 0);
		t->done = chancreate(sizeof(void*), 0);
		proccreate(threadf, t, mainstacksize);
		recvp(t->done);
	}

	return h;
}

/*
 * Shut down the workers and release the encoder.
 * Closing a worker's channels makes its receive return nil, which ends
 * threadf; the worker frees the channels itself. h must not be used
 * after this call.
 */
static void
hj264free(Hj264 *h)
{
	Hjthread *t, *end;

	end = h->threads + h->nthreads;
	for(t = h->threads; t < end; t++){
		chanclose(t->done);
		chanclose(t->job);
	}

	free(h);
}

/*
 * Nanoseconds elapsed since the first call.
 *
 * Uses the cycle counter when _tos->cyclefreq is non-zero, otherwise
 * falls back to nsec() (the first call then returns 0). The state is
 * kept in function-local statics: fasthz is 0 before the first call,
 * ~0 in fallback mode, and the cycle frequency otherwise.
 */
static uvlong
nanosec(void)
{
	static uvlong fasthz, xstart;
	uvlong ticks, scale;

	if(fasthz == ~0ULL)
		return nsec() - xstart;

	if(fasthz == 0){
		if(_tos->cyclefreq == 0){
			xstart = nsec();
			fasthz = ~0ULL;
			fprint(2, "cyclefreq not available, falling back to nsec()\n");
			fprint(2, "you might want to disable aux/timesync\n");
			return 0;
		}
		cycles(&xstart);
		fasthz = _tos->cyclefreq;
	}

	cycles(&ticks);
	ticks -= xstart;

	/*
	 * ticks*10^9/fasthz without overflowing: move as many factors of
	 * ten as fit from the divisor into ticks. 0x1999999999999999 is
	 * the largest bound below which ticks*10 still fits in 64 bits.
	 */
	scale = 1000000000ULL;
	while(ticks < 0x1999999999999999ULL && scale > 1){
		scale /= 10ULL;
		ticks *= 10ULL;
	}

	return ticks / (fasthz / scale);
}

/*
 * Encoder proc; p is the Hj264.
 *
 * Receives frames from h->frame, converts each to YUV, encodes it and
 * writes the result to h->out. Takes ownership of every Img received
 * and frees it. Stops on a nil receive or on a short write; an encoder
 * error is fatal. On the way out it releases the encoder, so h must
 * not be touched by this proc afterwards.
 */
static void
encthread(void *p)
{
	u8int *data;
	Img *img;
	Hj264 *h;
	int sz;

	h = p;
	for(;;){
		if((img = recvp(h->frame)) == nil)
			break;
		xrgb2yuv(img->bgrx, img->w, img->h, &h->yuv);
		free(img);

		if(hj264_encode(h, &data, &sz) != 0)
			sysfatal("hj264_encode: %r");
		if(Bwrite(&h->out, data, sz) != sz)
			break;
	}

	/*
	 * h->out lives inside h and was set up with Binit, which registers
	 * it for flushing at exit. Bterm flushes it and removes that
	 * registration, so nothing refers to the buffer once h is freed.
	 */
	Bterm(&h->out);
	hj264free(h);

	threadexits(nil);
}

/*
 * Read one w x h frame of 4-byte pixels from f into a newly allocated
 * Img. The pixel data is read with pread starting at byte offset 5*12,
 * i.e. past the header of an uncompressed Plan 9 image file (five
 * 12-byte fields), so f's own offset is neither used nor changed.
 *
 * Returns nil if the allocation fails or the data cannot be read in
 * full. The caller owns the result and releases it with free().
 */
static Img *
imgread(int f, int w, int h)
{
	int r, n, e;
	Img *i;

	e = w*h*4;
	if((i = malloc(sizeof(*i) + e)) == nil)
		return nil;
	i->w = w;
	i->h = h;
	/* pread may return short counts; keep going until all e bytes are in */
	for(n = 0; n < e; n += r){
		if((r = pread(f, i->bgrx+n, e-n, n+5*12)) <= 0){
			free(i);
			return nil;
		}
	}

	return i;
}

/* Print the command-line synopsis to standard error and terminate every proc with status "usage". */
static void
usage(void)
{
	fprint(2, "usage: %s [-d] [-f FPS] [-n THREADS] [-k KBPS] [-q 0…10] [-Q QP]\n", argv0);
	threadexitsall("usage");
}

/*
 * hj264 [-d] [-f FPS] [-n THREADS] [-k KBPS] [-q 0…10] [-Q QP] file
 *
 * Reads the image in file once to learn its size, then repeatedly
 * re-reads the pixel data from the same file at FPS frames per second,
 * hands each frame to the encoder proc and lets it write the H.264
 * stream to file descriptor 1. Capture stops after ten seconds or when
 * a frame cannot be read; the achieved frame rate is then printed on
 * standard error.
 *
 *	-d	enable temporal denoise
 *	-f	frames per second (default 30, must be at least 1)
 *	-n	worker threads (default $NPROC-1, or 1 if NPROC is unset)
 *	-k	target bitrate in kbps; 0 (default) means fixed QP
 *	-q	quality 0…10 (default 10)
 *	-Q	quantizer used when no bitrate is given (default 33)
 */
int
main(int argc, char **argv)
{
	int nthreads, fps, kbps, denoise, quality, qp;
	uvlong start, end, fstart, fend, period;
	int ww, hh, in, nframes;
	Memimage *im;
	Img *img;
	Hj264 *h;
	char *s;

	/* use NPROC-1 threads by default */
	nthreads = ((s = getenv("NPROC")) != nil) ? atoi(s)-1 : 1;
	denoise = 0;
	quality = 10;
	kbps = 0;
	fps = 30;
	qp = 33;
	ARGBEGIN{
	case 'd':
		denoise++;
		break;
	case 'f':
		fps = atoi(EARGF(usage()));
		break;
	case 'k':
		kbps = atoi(EARGF(usage()));
		break;
	case 'n':
		nthreads = atoi(EARGF(usage()));
		break;
	case 'q':
		quality = atoi(EARGF(usage()));
		break;
	case 'Q':
		qp = atoi(EARGF(usage()));
		break;
	default:
		usage();
	}ARGEND

	/* keep encode_speed (Maxquality - quality) within 0…Maxquality */
	quality = clp(quality, 0, Maxquality);
	if(kbps < 0)
		kbps = 0;
	/* fps is used as a divisor below */
	if(fps < 1)
		sysfatal("invalid FPS: %d", fps);

	if(argc < 1)
		usage();
	if((in = open(*argv, OREAD)) < 0)
		sysfatal("input: %r");

	fmtinstall(L'ℏ', hjerror);

	/* read the image once just to learn the frame size */
	memimageinit();
	if((im = readmemimage(in)) == nil)
		sysfatal("image: %r");
	ww = Dx(im->r);
	hh = Dy(im->r);
	freememimage(im);

	if((h = hj264new(nthreads, denoise, kbps, ww, hh)) == nil)
		sysfatal("hj264new: %r");
	if(Binit(&h->out, 1, OWRITE) < 0)
		sysfatal("Binit failed: %r");
	/* FIXME this is wrong as the encoder might be too late */
	if((h->frame = chancreate(sizeof(void*), 1)) == nil)
		sysfatal("chancreate: %r");

	/* FIXME how about changing these on the fly? */
	h->rp.encode_speed = Maxquality - quality;
	h->rp.qp_min = h->rp.qp_max = qp;
	if(kbps > 0){
		h->rp.qp_min = 10;
		h->rp.qp_max = 50;
		h->rp.desired_frame_bytes = kbps*1000/8/fps;
	}
	proccreate(encthread, h, mainstacksize);

	period = 1000000000ULL/fps;	/* nanoseconds per frame */
	start = nanosec();
	for(nframes = 0;; nframes++){
		fstart = nanosec();
		if((img = imgread(in, ww, hh)) == nil)
			break;
		/* ownership of img passes to encthread */
		if(sendp(h->frame, img) != 1)
			break;
		fend = nanosec();

		/* sleep away whatever is left of this frame's time slot */
		if(period > (fend - fstart))
			npe_nsleep(period - (fend - fstart));

		/* FIXME make a graceful shutdown on a note */
		if(nanosec() - start > 10000000000ULL)
			break;
	}
	end = nanosec();
	/* nanosecond precision; also avoids dividing by zero on runs shorter than a second */
	fprint(2, "%d fps\n", end > start ? (int)(nframes * 1000000000ULL / (end - start)) : 0);

	chanclose(h->frame);

	/*
	 * NOTE(review): this ends every proc right after closing the frame
	 * channel; it looks like the encoder proc may not have finished
	 * writing the last frames by then — confirm.
	 */
	threadexitsall(nil);

	return 0;
}