shithub: riscv

Download patch

ref: 5de71b116a9daf647952a84006cc481a6ce259bd
parent: 02ea56dbdad9e2b3145ac57ee1765aa74d5dd4f8
author: aiju <devnull@localhost>
date: Fri Oct 3 12:52:56 EDT 2014

games/gba: new faster ppu code, audio support

--- /dev/null
+++ b/sys/src/games/gba/apu.c
@@ -1,0 +1,370 @@
+#include <u.h>
+#include <libc.h>
+#include <thread.h>
+#include "dat.h"
+#include "fns.h"
+
+Event evsamp;
+int srate, sratediv;
+s16int sbuf[2*4000], *sbufp, bias;
+enum {
+	Freq = 44100,
+};
+static int stime;
+s8int snddma[2];
+static int fd;
+
+u16int envctr, envrel, envmod;
+u8int sweepen, sweepctr;
+u16int sweepfreq;
+typedef struct chan chan;
+struct chan {	/* per-channel state; one entry of sndch[] */
+	u8int n, ectr;	/* n: index of this entry in sndch[]; ectr: envelope step countdown, reloaded from *env bits 8-10 */
+	u16int len;	/* remaining length; the channel is muted once this reaches 0 */
+	u16int *env, *freq;	/* this channel's envelope and frequency/control registers inside reg[] */
+	u16int fctr, fthr;	/* fctr: phase accumulator; fthr: threshold fctr is compared against for channels 0 and 1 */
+	u32int finc;	/* amount added to fctr per sample tick, computed by rate() */
+	u8int vol;	/* current volume, 0..15 */
+};
+u8int wave[64], wpos, wbank;
+u16int lfsr;
+
+chan sndch[4] = {
+	{
+		.n = 0,
+		.env = reg + 0x62/2,
+		.freq = reg + 0x64/2,
+	},
+	{
+		.n = 1,
+		.env = reg + 0x68/2,
+		.freq = reg + 0x6c/2,
+	},
+	{
+		.n = 2,
+	},
+	{
+		.n = 3,
+		.env = reg + 0x78/2,
+		.freq = reg + 0x7c/2,
+	}
+};
+
+void
+rate(int i, u16int v)
+{
+	/* recompute sndch[i].finc from register value v; srate is 0 until SOUNDBIAS is written, which redoes all four */
+	if(srate == 0)
+		return;
+	switch(i){
+	case 0: case 1:
+		sndch[i].finc = 131072ULL * 65536 / (srate * (2048 - (v & 0x7ff)));
+		break;
+	case 2:
+		sndch[2].finc = 2097152ULL * 65536 / (srate * (2048 - (v & 0x7ff)));
+		break;
+	case 3:
+		sndch[3].finc = 524288ULL * 65536 / srate;
+		sndch[3].finc = (v & 7) != 0 ? sndch[3].finc / (v & 7) : sndch[3].finc << 1;	/* divider 0 doubles the rate */
+		sndch[3].finc >>= (v >> 4 & 15) + 1;
+	}
+}
+
+void
+env(chan *c)	/* one length/envelope step for channel c; called from sampletick() each time envctr runs out */
+{
+	if((envmod & 1) == 0 && c->len > 0 && (*c->freq & 1<<14) != 0)	/* length counts on even steps, only while bit 14 of the frequency register is set */
+		--c->len;
+	if(c->len == 0){	/* length used up: mute */
+		c->vol = 0;
+		return;
+	}
+	if((envmod & 7) != 7 || c->ectr == 0 || --c->ectr != 0)	/* envelope advances on every 8th step; ectr == 0 leaves the volume alone */
+		return;
+	c->ectr = *c->env >> 8 & 7;	/* reload the step countdown */
+	if((*c->env & 1<<11) != 0){	/* bit 11 set: ramp up towards 15, otherwise down towards 0 */
+		if(c->vol < 15)
+			c->vol++;
+	}else
+		if(c->vol > 0)
+			c->vol--;
+}
+
+s8int
+wavesamp(void)	/* advance sndch[2] by one sample tick and return its scaled wave[] entry */
+{
+	s8int x;
+	u16int vol, cnt;
+	int v;
+
+	sndch[2].fctr = v = sndch[2].fctr + sndch[2].finc;	/* v keeps the untruncated sum; fctr keeps the low 16 bits */
+	if(sndch[2].len == 0 || (reg[0x70/2] & 1<<7) == 0)	/* length expired or bit 7 of register 0x70 clear: silent */
+		return 0;
+	vol = reg[0x72/2];
+	cnt = reg[0x70/2];
+	for(;;){	/* step wpos once for every 0x10000 of accumulated phase */
+		x = wave[wbank ^ wpos];
+		v -= 0x10000;
+		if(v < 0)
+			break;
+		wpos++;
+		if((cnt & 1<<5) != 0)	/* bit 5 set: wrap over all 64 entries, else over 32 */
+			wpos &= 63;
+		else
+			wpos &= 31;
+	}
+	if((vol & 1<<15) != 0)	/* bit 15 of register 0x72: x/4 + x/8 */
+		x = (x >> 2) + (x >> 3);
+	else if((vol & 3<<14) == 0)	/* bits 14-15 both clear: mute */
+		x = 0;
+	else
+		x = x >> (vol >> 14 & 3);
+	return x;
+}
+
+s8int
+lfsrsamp(void)	/* advance sndch[3] by one sample tick and return +vol or -vol depending on the LFSR output bit */
+{
+	int v;
+	u16int l;
+
+	sndch[3].fctr = v = sndch[3].fctr + sndch[3].finc;	/* v keeps the untruncated sum */
+	for(;;){	/* shift the LFSR once for every 0x10000 of accumulated phase */
+		l = lfsr;
+		v -= 0x10000;
+		if(v < 0)
+			break;
+		lfsr >>= 1;
+		if(((l ^ lfsr) & 1) != 0)	/* feedback is old bit 0 xor old bit 1 */
+			if((reg[0x7c/2] & 1<<3) != 0)
+				lfsr |= 0x40;	/* bit 3 of register 0x7c set: feed back into bit 6 */
+			else
+				lfsr |= 0x4000;	/* otherwise feed back into bit 14 */
+	}
+	if((l & 1) != 0)
+		return -sndch[3].vol;
+	else
+		return sndch[3].vol;
+}
+
+void
+sweep(int wb)	/* one frequency sweep step for channel 0; wb != 0 writes the new frequency back */
+{
+	u16int fr;
+	int d;
+	u16int cnt;
+	
+	cnt = reg[0x60/2];
+	d = sweepfreq >> (cnt & 7);	/* step size: current frequency shifted by bits 0-2 */
+	if((cnt & 1<<3) != 0)	/* bit 3 set: sweep downwards */
+		d = -d;
+	fr = sweepfreq + d;
+	/* a result beyond the 11-bit frequency range stops the channel and the sweep */
+	if(fr > 2047){
+		sndch[0].len = 0;
+		sndch[0].vol = 0;
+		sweepen = 0;
+	}else if(wb){
+		sweepfreq = fr;
+		reg[0x64/2] = reg[0x64/2] & 0xfc00 | fr;
+		rate(0, fr);
+		sweep(0);	/* range-check the following step as well, without writing back */
+	}
+}
+
+void
+sndstart(chan *c, u16int v)	/* (re)start channel c; sndwrite() passes the value just written to its frequency/control register */
+{
+	u16int cnt;
+
+	c->vol = *c->env >> 12;	/* initial volume from envelope register bits 12-15 */
+	c->ectr = *c->env >> 8 & 7;
+	if(c->len == 0)
+		c->len = 64;	/* an expired length restarts at 64 */
+	if(c == sndch){	/* sndch[0] only: set up the sweep from register 0x60 */
+		cnt = reg[0x60/2];
+		sweepen = (cnt & 0x07) != 0 && (cnt & 0x70) != 0;
+		sweepctr = cnt >> 4 & 7;
+		sweepfreq = v & 0x7ff;
+		if((cnt & 0x07) != 0)
+			sweep(0);	/* range check only, nothing is written back */
+	}
+}
+
+void
+sampletick(void *)	/* evsamp handler: produce one mixed sample and resample it into sbuf */
+{
+	u16int cntl, cnth;
+	s16int ch[6];	/* 0-3: sndch[] outputs, 4-5: latest FIFO A/B bytes from snddma[] */
+	s16int s[2];	/* the two output channels, stored interleaved in sbuf */
+	int i;
+	
+	addevent(&evsamp, sratediv + evsamp.time);	/* schedule the next sample tick */
+	
+	if(--envctr == 0){	/* every envrel ticks: length, envelope and sweep steps */
+		envctr = envrel;
+		env(&sndch[0]);
+		env(&sndch[1]);
+		if((envmod & 1) == 0 && sndch[2].len > 0 && (reg[0x74/2] & 1<<14) != 0)	/* sndch[2] has a length but no envelope */
+			sndch[2].len--;
+		env(&sndch[3]);
+		if((envmod & 3) == 2 && sweepen && --sweepctr == 0){
+			sweepctr = reg[0x60/2] >> 4 & 7;
+			sweep(1);
+		}
+		envmod++;
+	}
+	
+	sndch[0].fctr += sndch[0].finc;	/* square channels: sign depends on phase versus the duty threshold */
+	if(sndch[0].fctr >= sndch[0].fthr)
+		ch[0] = sndch[0].vol;
+	else
+		ch[0] = -sndch[0].vol;
+	sndch[1].fctr += sndch[1].finc;
+	if(sndch[1].fctr >= sndch[1].fthr)
+		ch[1] = sndch[1].vol;
+	else
+		ch[1] = -sndch[1].vol;
+	ch[2] = wavesamp();
+	ch[3] = lfsrsamp();
+	
+	cntl = reg[SOUNDCNTL];
+	cnth = reg[SOUNDCNTH];
+	for(i = 0; i < 4; i++)
+		ch[i] = ch[i] >> (cnth & 3);
+	ch[4] = snddma[0] << 1 + (cnth >> 2 & 1);	/* parses as << (1 + bit): FIFO A scaled by SOUNDCNTH bit 2 */
+	ch[5] = snddma[1] << 1 + (cnth >> 3 & 1);	/* FIFO B scaled by SOUNDCNTH bit 3 */
+	
+	s[0] = 0;
+	s[1] = 0;
+	for(i = 0; i < 4; i++){	/* SOUNDCNTL bits 8-11 route to s[1], bits 12-15 to s[0], each with a 3-bit master volume */
+		if((cntl & 1<<8<<i) != 0)
+			s[1] += ch[i] * (1 + (cntl & 7));
+		if((cntl & 1<<12<<i) != 0)
+			s[0] += ch[i] * (1 + (cntl >> 4 & 7));
+	}
+	for(i = 0; i < 2; i++){	/* FIFO A enables are SOUNDCNTH bits 8/9, FIFO B bits 12/13 */
+		if((cnth & 1<<8+4*i) != 0)
+			s[1] += ch[4+i];
+		if((cnth & 1<<9+4*i) != 0)
+			s[0] += ch[4+i];
+	}
+	s[0] += bias;
+	s[1] += bias;
+	if(s[0] < -0x200) s[0] = -0x200;	/* clamp to 10 bits signed */
+	else if(s[0] > 0x1ff) s[0] = 0x1ff;
+	if(s[1] < -0x200) s[1] = -0x200;
+	else if(s[1] > 0x1ff) s[1] = 0x1ff;
+	
+	stime -= Freq;	/* resample from srate to Freq: emit while the accumulator is negative */
+	while(stime < 0){
+		if(sbufp < sbuf + nelem(sbuf)){	/* samples are dropped while the buffer is full */
+			sbufp[0] = s[0] << 6;
+			sbufp[1] = s[1] << 6;
+			sbufp += 2;
+		}
+		stime += srate;
+	}
+}
+
+
+void
+sndwrite(u16int a, u16int v)	/* update derived sound state for a write of v to I/O offset a */
+{
+	int sh, p, i;
+	static u16int thr[4] = {0x2000, 0x4000, 0x8000, 0xC000};	/* fthr values indexed by bits 6-7 */
+	
+	switch(a){
+	case 0x62:
+		sndch[0].fthr = thr[v >> 6 & 3];
+		sndch[0].len = 64 - (v & 63);
+		break;
+	case 0x64:
+		rate(0, v);
+		if((v & 1<<15) != 0)	/* bit 15: restart the channel */
+			sndstart(&sndch[0], v);
+		break;
+	case 0x68:
+		sndch[1].fthr = thr[v >> 6 & 3];
+		sndch[1].len = 64 - (v & 63);	/* same field as in 0x62 for channel 0 */
+		break;
+	case 0x6c:
+		rate(1, v);
+		if((v & 1<<15) != 0)
+			sndstart(&sndch[1], v);
+		break;
+	case 0x70:
+		wbank = v >> 1 & 32;	/* bit 6 selects which half of wave[] is played */
+		break;
+	case 0x72:
+		sndch[2].len = 256 - (v & 0xff);
+		break;
+	case 0x74:
+		rate(2, v);
+		if((v & 1<<15) != 0 && sndch[2].len == 0)
+			sndch[2].len = 256;
+		break;
+	case 0x7c:
+		rate(3, v);
+		if((v & 1<<15) != 0){
+			if((v & 1<<3) != 0)	/* reseed the LFSR with all ones, 7 or 15 bits wide */
+				lfsr = 0x7f;
+			else
+				lfsr = 0x7fff;
+			sndstart(&sndch[3], v);
+		}
+		break;
+	case SOUNDBIAS*2:
+		sh = 9 - (v >> 14 & 3);
+		if(sratediv != 1<<sh){	/* sampling rate changed: recompute everything derived from it */
+			srate = 1 << 24 - sh;
+			sratediv = 1 << sh;
+			envrel = srate / 512;
+			rate(0, reg[0x64/2]);
+			rate(1, reg[0x6c/2]);
+			rate(2, reg[0x74/2]);
+			rate(3, reg[0x7c/2]);
+		}
+		bias = (v & 0x3ff) - 0x200;
+		break;
+	case 0x90: case 0x92: case 0x94: case 0x96: case 0x98: case 0x9a: case 0x9c: case 0x9e:
+		p = ~reg[0x70/2] >> 1 & 32;	/* writes go to the half of wave[] that is not selected for playback */
+		for(i = a - 0x90; i < a - 0x90 + 2; i++){	/* two bytes per write, two 4-bit entries per byte */
+			wave[((wpos + 2 * i) & 31) + p] = v >> 4 & 0xf;
+			wave[((wpos + 2 * i + 1) & 31) + p] = v & 0xf;
+			v >>= 8;
+		}
+		break;
+	}
+}
+
+void
+audioinit(void)	/* open the audio device and start the sample event; exits via sysfatal if the open fails */
+{
+	fd = open("/dev/audio", OWRITE);
+	if(fd < 0)
+		sysfatal("open: %r");
+	sbufp = sbuf;	/* a non-nil sbufp is what audioout() uses to tell that audio is enabled */
+	sndwrite(SOUNDBIAS*2, 0x200);	/* sets srate, sratediv and envrel */
+	evsamp.f = sampletick;
+	addevent(&evsamp, sratediv);
+}
+
+int
+audioout(void)
+{
+	int rc, n;
+	/* write buffered samples to the audio device; returns -1 if audio is disabled, else 0 */
+
+	if(sbufp == nil)
+		return -1;
+	if(sbufp == sbuf)
+		return 0;
+	rc = write(fd, sbuf, (sbufp - sbuf) * 2);
+	if(rc > 0){
+		n = (rc+1)/2;	/* samples consumed; never more than are buffered */
+		memmove(sbuf, sbuf + n, (sbufp - sbuf - n) * 2);	/* keep any unwritten tail at the front */
+		sbufp -= n;
+	}
+	return 0;
+}
--- a/sys/src/games/gba/dat.h
+++ b/sys/src/games/gba/dat.h
@@ -15,11 +15,20 @@
 extern uchar *rom, *back;
 extern int nrom, nback, backup;
 
-extern int ppux, ppuy;
-extern u8int bldy, blda, bldb;
+extern int hblank, ppuy;
 
+extern int clock;
 extern int scale;
 
+typedef struct Event Event;
+struct Event {
+	int time;
+	void (*f)(void *);
+	Event *next;
+	void *aux;
+};
+extern Event *elist;
+
 enum {
 	DISPCNT = 0x0/2,
 	DISPSTAT = 0x4/2,
@@ -35,6 +44,10 @@
 	BG2XH = 0x2a/2,
 	BG2YL = 0x2c/2,
 	BG2YH = 0x2e/2,
+	BG3XL = 0x38/2,
+	BG3XH = 0x3a/2,
+	BG3YL = 0x3c/2,
+	BG3YH = 0x3e/2,
 	
 	WIN0H = 0x40/2,
 	WIN1H = 0x42/2,
@@ -42,15 +55,24 @@
 	WIN1V = 0x46/2,
 	WININ = 0x48/2,
 	WINOUT = 0x4a/2,
+	MOSAIC = 0x4c/2,
 	BLDCNT = 0x50/2,
 	BLDALPHA = 0x52/2,
 	BLDY = 0x54/2,
 	
+	SOUNDCNTL = 0x80/2,
+	SOUNDCNTH = 0x82/2,
+	SOUNDBIAS = 0x88/2,
+	
+	FIFOAH = 0xa2/2,
+	FIFOBH = 0xa6/2,
+	
 	DMA0CNTH = 0xba/2,
 	DMA1CNTH = 0xc6/2,
 	DMA2CNTH = 0xd2/2,
 	DMA3CNTH = 0xde/2,
 	
+	TM0CNTH = 0x102/2,
 	KEYCNT = 0x132/2,
 
 	IE = 0x200/2,
@@ -73,9 +95,16 @@
 	IRQVCTREN = 1<<5,
 
 	/* BGnCNT */
+	BGMOSAIC = 1<<6,
 	BG8 = 1<<7,
 	DISPWRAP = 1<<13,
 	
+	/* TIMERnCNTH */
+	PRESC = 3,
+	COUNTUP = 1<<2,
+	TIMERIRQ = 1<<6,
+	TIMERON = 1<<7,
+	
 	/* DMAnCNTH */
 	DMADCNT = 5,
 	DMASCNT = 7,
@@ -111,4 +140,5 @@
 	
 	KB = 1024,
 	BACKTYPELEN = 64,
+	HZ = 16777216,
 };
--- /dev/null
+++ b/sys/src/games/gba/ev.c
@@ -1,0 +1,211 @@
+#include <u.h>
+#include <libc.h>
+#include <thread.h>
+#include "dat.h"
+#include "fns.h"
+
+typedef struct {	/* one hardware timer; embeds its overflow Event */
+	u16int *cnt;	/* control register in reg[]; cnt[-1] is the reload value */
+	Event;
+	u16int val;	/* counter value as of `clock` below */
+	int clock;	/* global clock when val was last set, rounded down to the prescaler period */
+	u8int i, sh, snd;	/* i: timer index; sh: prescaler shift; snd: bit mask of sound FIFOs this timer drives */
+} Timer;
+
+typedef struct fifo fifo;
+struct fifo {	/* ring of 32-bit words, read out one byte at a time by fifotimer() */
+	u32int d[8];
+	u8int head, level, headpos;	/* head: index of the oldest word; level: words queued; headpos: bytes already taken from the head word */
+};
+fifo sndfifo[2];
+
+Event *elist;
+Timer timers[4];
+Event evhblank;
+
+void
+addevent(Event *ev, int time)	/* insert ev into elist, `time` after the list's reference point */
+{
+	Event **p, *e;
+	int t;
+	
+	t = time;
+	for(p = &elist; (e = *p) != nil; p = &e->next){	/* elist is a delta list: each time is relative to the entry before it */
+		if(t < e->time){
+			e->time -= t;	/* ev goes in front of e, so e becomes relative to ev */
+			break;
+		}
+		t -= e->time;
+	}
+	ev->next = e;
+	ev->time = t;
+	*p = ev;
+}
+
+void
+delevent(Event *ev)	/* unlink ev from elist; does nothing if it is not queued */
+{
+	Event **p, *e;
+	
+	for(p = &elist; (e = *p) != nil; p = &e->next)
+		if(e == ev){
+			*p = e->next;
+			if(e->next != nil)
+				e->next->time += e->time;	/* the successor absorbs ev's delta so later events keep their time */
+			return;
+		}
+}
+
+void
+popevent(void)	/* run the head event and every following event that is already due */
+{
+	Event *e;
+	int t;
+	
+	do{
+		e = elist;
+		t = e->time;	/* the caller invokes this when the head's time is <= 0; t is that remainder */
+		elist = e->next;
+		e->f(e->aux);	/* the handler may call addevent() to requeue itself */
+	}while((elist->time += t) <= 0);	/* NOTE(review): dereferences elist without a nil check; presumably some event is always queued */
+}
+
+
+void
+fifoput(int i, u32int s)
+{
+	fifo *q = &sndfifo[i];	/* append word s to sound FIFO i; dropped when all 8 slots are in use */
+	if(q->level >= 8)
+		return;
+	q->d[(q->head + q->level) & 7] = s;
+	q->level++;
+}
+
+void
+fifotimer(int b, int n)	/* take up to n bytes from each sound FIFO selected by bit mask b; called from timertick() */
+{
+	fifo *f;
+	int i, j;
+	extern s8int snddma[2];
+	
+	for(i = 0; i < 2; i++){
+		if((b & 1<<i) == 0)
+			continue;
+		f = &sndfifo[i];
+		for(j = 0; j < n && f->level > 0; j++){
+			snddma[i] = f->d[f->head] & 0xff;	/* the low byte becomes the current output sample */
+			f->d[f->head] >>= 8;
+			if(++f->headpos == 4){	/* head word used up: move on to the next one */
+				f->head = (f->head + 1) & 7;
+				f->level--;
+				f->headpos = 0;
+			}
+		}
+		if(f->level <= 4)	/* half empty or less: trigger DMASOUND-conditioned DMA */
+			dmastart(DMASOUND);
+	}
+}
+
+void
+soundcnth(u16int v)
+{
+	int i, sh;
+	fifo *f;
+
+	timers[0].snd = 0;
+	timers[1].snd = 0;
+	for(i = 0; i < 2; i++){
+		sh = 8 + 4*i;	/* FIFO i has its four control bits at sh..sh+3 */
+		if((v >> sh & 3) != 0)	/* enabled on either output: attach to the timer chosen by bit sh+2 */
+			timers[v >> sh+2 & 1].snd |= 1<<i;
+		if((v >> sh+3 & 1) == 0)
+			continue;
+		f = &sndfifo[i];	/* bit sh+3 set: empty the FIFO */
+		f->level = 0;
+		f->head = 0;
+		f->headpos = 0;
+	}
+}
+
+u16int
+timerget(int i)	/* current counter value of timer i */
+{
+	Timer *t;
+	
+	t = &timers[i];
+	if((*t->cnt & (COUNTUP|TIMERON)) != TIMERON)	/* stopped or in count-up mode: val is already current */
+		return t->val;
+	return t->val + (clock - t->clock >> t->sh);	/* free-running: add the prescaled clock cycles since val was set */
+}
+
+void
+timerset(int i, u16int nc)	/* apply new control value nc to timer i; the old value is still read from reg[] */
+{
+	u32int v;
+	u16int oc;
+	Timer *t;
+	
+	t = &timers[i];
+	oc = *t->cnt;
+	if((oc & (PRESC|COUNTUP|TIMERON)) == (nc & (PRESC|COUNTUP|TIMERON)))	/* nothing that affects counting has changed */
+		return;
+	if((oc & (COUNTUP|TIMERON)) == TIMERON){	/* was free-running: bring the value up to date and cancel the pending overflow */
+		v = t->val + (clock - t->clock >> t->sh);
+		delevent(t);
+	}else
+		v = t->val;
+	if((oc & TIMERON) == 0 && (nc & TIMERON) != 0)	/* being switched on: start from the reload value */
+		v = t->cnt[-1];
+	if((nc & 3) != 0)	/* prescaler shift: 0, 6, 8 or 10 */
+		t->sh = 4 + (nc & 3) * 2;
+	else
+		t->sh = 0;
+	t->val = v;
+	t->clock = clock & -(1 << t->sh);
+	if((nc & (COUNTUP|TIMERON)) == TIMERON)	/* free-running: queue the overflow event */
+		addevent(t, (0x10000 - t->val << t->sh) + (-clock & (1 << t->sh) - 1));
+}
+
+void
+timertick(void *aux)	/* overflow event of the timer passed in aux; also cascades into following count-up timers */
+{
+	Timer *t;
+	u32int v;
+	int to;
+	
+	t = aux;
+	t->clock = clock + t->time & -(1 << t->sh);	/* parses as (clock + t->time) & -(1 << sh) */
+	t->val = -t->time >> t->sh;
+	do{
+		to = 0;	/* number of overflows handled for this timer */
+		do{
+			t->val = v = t->val + t->cnt[-1];	/* add the reload value; v keeps the untruncated sum */
+			to++;
+		}while(v >= 0x10000);
+		if(t == aux)	/* only the timer that fired is rescheduled; cascaded ones have no event of their own */
+			addevent(t, (0x10000 - t->val << t->sh) + (-clock & (1 << t->sh) - 1));
+		if((*t->cnt & TIMERIRQ) != 0)
+			setif(IRQTIM0 << t->i);
+		if(t->snd)
+			fifotimer(t->snd, to);
+		if(++t >= timers + 4 || (*t->cnt & (COUNTUP | TIMERON)) != (COUNTUP|TIMERON))	/* stop unless the next timer is on and in count-up mode */
+			break;
+		t->val = v = t->val + to;	/* the next timer counts the overflows of this one */
+	}while(v >= 0x10000);
+}
+
+void
+eventinit(void)	/* set up the four timer events and queue the first hblank event */
+{
+	int i;
+	extern void hblanktick(void *);
+
+	for(i = 0; i < 4; i++){
+		timers[i].f = timertick;
+		timers[i].aux = &timers[i];
+		timers[i].i = i;
+		timers[i].cnt = &reg[TM0CNTH + i * 2];	/* control registers are two reg[] entries apart */
+	}
+	evhblank.f = hblanktick;
+	addevent(&evhblank, 240*4);
+}
--- a/sys/src/games/gba/fns.h
+++ b/sys/src/games/gba/fns.h
@@ -11,3 +11,15 @@
 void dmastart(int);
 void flushback(void);
 void writeback(void);
+void eventinit(void);
+void popevent(void);
+u16int timerget(int);
+void timerset(int, u16int);
+void addevent(Event *, int);
+void ppuwrite(u16int, u16int);
+void fifoput(int, u32int);
+void soundcnth(u16int);
+void soundbias(u16int);
+void audioinit(void);
+int audioout(void);
+void sndwrite(u16int, u16int);
--- a/sys/src/games/gba/gba.c
+++ b/sys/src/games/gba/gba.c
@@ -15,11 +15,10 @@
 int keys, paused, framestep, backup;
 QLock pauselock;
 int savefd, saveframes;
+int clock;
 
 char *biosfile = "/sys/games/lib/gbabios.bin";
 
-int ppuclock;
-
 void *
 emalloc(ulong sz)
 {
@@ -347,6 +346,7 @@
 	flushimage(display, 1);
 	if(profile)
 		timing();
+	audioout();
 	if(framestep){
 		paused = 1;
 		qlock(&pauselock);
@@ -370,7 +370,7 @@
 void
 usage(void)
 {
-	fprint(2, "usage: %s [-23T] [-s savetype] [-b biosfile] rom\n", argv0);
+	fprint(2, "usage: %s [-23aT] [-s savetype] [-b biosfile] rom\n", argv0);
 	exits("usage");
 }
 
@@ -388,6 +388,9 @@
 	case '3':
 		scale = 3;
 		break;
+	case 'a':
+		audioinit();
+		break;
 	case 's':
 		s = EARGF(usage());
 		backup = parsetype(s, &nback);
@@ -416,7 +419,8 @@
 		sysfatal("initmouse: %r");
 	proccreate(keyproc, nil, mainstacksize);
 	screeninit();
-	
+
+	eventinit();
 	memreset();
 	reset();
 	for(;;){
@@ -430,11 +434,8 @@
 			t = 8;
 		else
 			t = step();
-		ppuclock += t;
-		while(ppuclock >= 4){
-			ppustep();
-			ppuclock -= 4;
-		}
-		timerstep(t);
+		clock += t;
+		if((elist->time -= t) <= 0)
+			popevent();
 	}
 }
--- a/sys/src/games/gba/mem.c
+++ b/sys/src/games/gba/mem.c
@@ -10,8 +10,6 @@
 uchar *rom, *back;
 int nrom, nback;
 u16int reg[512];
-u16int tim[4];
-int timerclock;
 int dmaact;
 enum {
 	DMASRC,
@@ -100,7 +98,7 @@
 		
 		if(ppuy >= 160 && ppuy != 227)
 			v |= 1;
-		if(ppux >= 240)
+		if(hblank)
 			v |= 2;
 		if(ppuy == v >> 8)
 			v |= 4;
@@ -108,7 +106,7 @@
 	case 0x006:
 		return ppuy;
 	case 0x100: case 0x104: case 0x108: case 0x10c:
-		return tim[(a - 0x100) / 4];
+		return timerget((a - 0x100) / 4);
 	case 0x130:
 		return keys ^ 0x3ff;
 	default:
@@ -122,7 +120,11 @@
 	u16int *p;
 	int i;
 	static u8int ws0[4] = {5,4,3,9};
-	
+
+	if(a < 0x56)
+		ppuwrite(a, v);
+	else if(a < 0xa0)
+		sndwrite(a, v);
 	p = &reg[a/2];
 	switch(a){
 	case IF*2:
@@ -133,19 +135,6 @@
 		*p = v;
 		setif(0);
 		return;
-	case BLDALPHA*2:
-		blda = v & 0x1f;
-		if(blda > 16)
-			blda = 16;
-		bldb = v >> 8 & 0x1f;
-		if(bldb > 16)
-			bldb = 16;
-		break;
-	case BLDY*2:
-		bldy = v & 0x1f;
-		if(bldy > 16)
-			bldy = 16;
-		break;
 	case DMA0CNTH*2: case DMA1CNTH*2: case DMA2CNTH*2: case DMA3CNTH*2:
 		i = (a - DMA0CNTH*2) / 12;
 		if((v & DMAEN) != 0){
@@ -159,9 +148,14 @@
 		}else
 			dmaact &= ~1<<i;
 		break;
+	case SOUNDCNTH*2:
+		soundcnth(v);
+		break;
+	case FIFOAH*2: case FIFOBH*2:
+		fifoput(a >> 2 & 1, p[-1] | v << 16);
+		break;
 	case 0x102: case 0x106: case 0x10a: case 0x10e:
-		if((*p & 1<<7) == 0 && (v & 1<<7) != 0)
-			tim[(a-0x102)/4] = p[-1];
+		timerset((a - 0x102) / 4, v);
 		break;
 	case WAITCNT*2:
 		waitst[3] = waitst[7] = ws0[v & 3];
@@ -196,7 +190,7 @@
 			w = w & 0xff00 | (u8int)v;
 		else
 			w = w & 0xff | v << 8;
-		regwrite16(a, w);
+		regwrite16(a & ~1, w);
 		break;
 	default:
 		regwrite16(a, v);
@@ -242,6 +236,11 @@
 		cyc++;
 		if(n == 4)
 			return regread(b) | regread(b+2) << 16;
+		else if(n == 1)
+			if((b & 1) != 0)
+				return regread(b) >> 8;
+			else
+				return regread(b) & 0xff;
 		return regread(b);
 	case 5:
 		b = a & sizeof(pram) - 1;
@@ -353,6 +352,7 @@
 memreset(void)
 {
 	reg[0x88/2] = 0x200;
+	reg[BG2PA] = reg[BG2PD] = 0x100;
 	if(backup == EEPROM)
 		if(nrom <= 16*KB*KB)
 			eepstart = 0x1000000;
@@ -362,50 +362,6 @@
 		eepstart = -1;
 }
 
-void
-timerstep(int t)
-{
-	int i, carry;
-	u16int c;
-	u16int nt;
-
-	nt = -t;
-	carry = 0;
-	timerclock += t;
-	for(i = 0; i < 4; i++){
-		c = reg[0x102/2 + i*2];
-		if((c & 1<<7) == 0)
-			goto next;
-		if((c & 1<<2) == 0)
-			switch(c & 3){
-			case 1:
-				if((timerclock & 63) != 0)
-					goto next;
-				break;
-			case 2:
-				if((timerclock & 255) != 0)
-					goto next;
-				break;
-			case 3:
-				if((timerclock & 1023) != 0)
-					goto next;
-				break;
-			}
-		else
-			if(!carry)
-				goto next;
-		if(carry = tim[i] >= nt){
-			tim[i] += reg[0x100/2 + i*2];
-			if((c & 1<<6) != 0)
-				setif(IRQTIM0 << i);
-		}
-		tim[i] += t;
-		continue;
-	next:
-		carry = 0;
-	}
-}
-
 int
 dmastep(void)
 {
@@ -413,7 +369,7 @@
 	u16int *cntp, cnt;
 	u32int *dr;
 	u32int v;
-	int sz;
+	int sz, snd;
 	
 	cyc = 0;
 	for(i = 0; i < 4; i++)
@@ -425,6 +381,9 @@
 	cntp = reg + DMA0CNTH + i * 6;
 	cnt = *cntp;
 	dr = dmar + 4 * i;
+	snd = (cnt >> DMAWHEN & 3) == 3 && (i == 1 || i == 2);
+	if(snd)
+		cnt = cnt & ~(3 << DMADCNT) | DMAFIX << DMADCNT | DMAWIDE;
 
 	sz = (cnt & DMAWIDE) != 0 ? 4 : 2;
 	if(i == 0)
@@ -468,7 +427,7 @@
 	u16int *cntp, cnt, c;
 	
 	cntp = reg + DMA0CNTH;
-	for(i = 0; i < 3; i++, cntp += 6){
+	for(i = 0; i < 4; i++, cntp += 6){
 		cnt = *cntp;
 		if((cnt & DMAEN) == 0)
 			continue;
@@ -475,8 +434,11 @@
 		c = cnt >> DMAWHEN & 3;
 		if(c == 3)
 			c += (i + 1) / 2;
-		if(c == cond)
+		if(c == cond){
 			dmaact |= 1<<i;
+			if(c == DMASOUND)
+				dmar[i * 4 + DMACNT] = 4;
+		}
 	}
 }
 
--- a/sys/src/games/gba/mkfile
+++ b/sys/src/games/gba/mkfile
@@ -7,6 +7,8 @@
 	mem.$O\
 	gba.$O\
 	ppu.$O\
+	ev.$O\
+	apu.$O\
 
 HFILES=dat.h fns.h
 
--- a/sys/src/games/gba/ppu.c
+++ b/sys/src/games/gba/ppu.c
@@ -4,29 +4,28 @@
 #include "dat.h"
 #include "fns.h"
 
-int ppux, ppuy;
-uchar pic[240*160*2*3*3];
+int hblank, ppuy;
 u8int bldy, blda, bldb;
+u32int hblclock;
+int ppux0;
+u32int pixcol[480];
+u8int pixpri[480];
+u8int pixwin[240];
+uchar pic[240*160*3*2];
+int objalpha;
 
 typedef struct bg bg;
 struct bg {
 	uchar n;
-	uchar depth;
-
-	s32int rpx0, rpy0, rpx, rpy;
-	s32int sx, sy;
-	
+	s32int rpx0, rpy0, rpx1, rpy1, rpx, rpy;
 	u16int tx, ty;
 	u8int tnx, tny;
-	u16int t;
-	u8int *chr;
-	u16int *pal;
+	
+	u8int mosaic, mctr, lasti;
+	u32int curc;
+	u8int curpri;
 };
-static u8int mode=-1;
 static bg bgst[4] = {{.n = 0}, {.n = 1}, {.n = 2}, {.n = 3}};
-static u32int pixeldat[2], pixelpri[2];
-static u16int bgmask;
-static u8int objwin, objtrans;
 
 typedef struct sprite sprite;
 struct sprite {
@@ -44,6 +43,8 @@
 	
 	s32int rx, ry;
 	s16int dx, dy;
+	
+	u8int mctr, mcol;
 };
 static sprite sprt[128], *sp = sprt;
 enum {
@@ -50,6 +51,7 @@
 	SPRROT = 1<<8,
 	SPRDIS = 1<<9,
 	SPRDOUB = 1<<9,
+	SPRMOSA = 1<<12,
 	SPR8 = 1<<13,
 	SPRWIDE = 1<<14,
 	SPRTALL = 1<<15,
@@ -56,277 +58,23 @@
 	SPRHFLIP = 1<<28,
 	SPRVFLIP = 1<<29,
 	SPRSIZE0 = 1<<30,
-	SPRSIZE1 = 1<<31
-};
+	SPRSIZE1 = 1<<31,
 
-void
-pixeldraw(int x, int y, u16int v)
-{
-	uchar *p;
-	u16int *q;
-	union { u16int w; u8int b[2]; } u;
-
-	if(scale == 1){
-		p = pic + (x + y * 240) * 2;
-		p[0] = v;
-		p[1] = v >> 8;
-		return;
-	}
-	u.b[0] = v;
-	u.b[1] = v >> 8;
-	if(scale == 2){
-		q = (u16int*)pic + (x + y * 240) * 2;
-		q[0] = u.w;
-		q[1] = u.w;
-	}else{
-		q = (u16int*)pic + (x + y * 240) * 3;
-		q[0] = u.w;
-		q[1] = u.w;
-		q[2] = u.w;
-	}
-}
-
-void
-pixel(u16int c, int n, int p)
-{
-	if(p < pixelpri[0]){
-		pixeldat[1] = pixeldat[0];
-		pixelpri[1] = pixelpri[0];
-		pixelpri[0] = p;
-		pixeldat[0] = c | n << 16;
-	}else if(p < pixelpri[1]){
-		pixelpri[1] = p;
-		pixeldat[1] = c | n << 16;
-	}
-}
-
-void
-tile(bg *b)
-{
-	u16int bgcnt, ta, tx, ty, y, t;
-	u8int d;
-	u8int *chr;
+	NOWIN = 0,
+	OBJWIN = 1,
+	WIN2 = 2,
+	WIN1 = 4,
 	
-	bgcnt = reg[BG0CNT + b->n];
-	d = bgcnt >> 7 & 1;
-	tx = b->tx;
-	ty = b->ty;
-	ta = (bgcnt << 3 & 0xf800) + ((tx & 0x1f) << 1) + ((ty & 0x1f) << 6);
-	switch(bgcnt >> 14){
-	case 1: ta += tx << 6 & 0x800; break;
-	case 2: ta += ty << 6 & 0x800; break;
-	case 3: ta += tx << 6 & 0x800 | ty << 7 & 0x1000; break;
-	}
-	t = vram[ta] | vram[ta+1] << 8;
-	b->t = t;
-	chr = vram + (bgcnt << 12 & 0xc000) + ((t & 0x3ff) << 5+d);
-	y = b->tny;
-	if((t & 1<<11) != 0)
-		y ^= 7;
-	chr = chr + (y << 2+d);
-	b->chr = chr;
-	if(d != 0)
-		b->pal = pram;
-	else
-		b->pal = pram + (t >> 8 & 0xf0);
-}
-
-void
-bginit(bg *b, int scal, int)
-{
-	u16int cnt, x, y;
-	u16int *rr;
+	OBJALPHA = 1<<16,
+	SRCOBJ = 4<<17,
+	SRCBACK = 5<<17,
 	
-	cnt = reg[DISPCNT];
-	if(scal){
-		rr = reg + (b->n - 2 << 3);
-		if(ppuy == 0){
-			b->rpx0 = (s32int)(rr[BG2XL] | rr[BG2XH] << 16) << 4 >> 4;
-			b->rpy0 = (s32int)(rr[BG2YL] | rr[BG2YH] << 16) << 4 >> 4;
-		}
-		b->rpx = b->rpx0;
-		b->rpy = b->rpy0;
-		b->rpx0 += (s16int)rr[BG2PB];
-		b->rpy0 += (s16int)rr[BG2PD];
-		switch(cnt & 7){
-		case 3:
-		case 4:
-			b->sx = 240 << 8;
-			b->sy = 160 << 8;
-			b->depth = (cnt & 7) == 3;
-			break;
-		case 5:
-			b->sx = 160 << 8;
-			b->sy = 128 << 8;
-			b->depth = 1;
-			break;
-		}
-	}else{
-		rr = reg + (b->n << 1);
-		x = rr[BG0HOFS] & 0x1ff;
-		y = (rr[BG0VOFS] & 0x1ff) + ppuy;
-		b->tx = x >> 3;
-		b->ty = y >> 3;
-		b->tnx = x & 7;
-		b->tny = y & 7;
-		tile(b);
-	}
-}
+	VACANT = 0x10,
+	BACKDROP = 8,
+};
+#define SRCBG(n) ((n)<<17)
 
 void
-bgsinit(void)
-{
-	mode = reg[DISPCNT] & 7;
-	switch(mode){
-	case 0:
-		bginit(&bgst[0], 0, 0);
-		bginit(&bgst[1], 0, 0);
-		bginit(&bgst[2], 0, 0);
-		bginit(&bgst[3], 0, 0);
-		break;
-	case 1:
-		bginit(&bgst[0], 0, 0);
-		bginit(&bgst[1], 0, 0);
-		bginit(&bgst[2], 1, 0);
-		break;
-	case 2:
-		bginit(&bgst[2], 1, 0);
-		bginit(&bgst[3], 1, 0);
-		break;
-	case 3:
-	case 4:
-	case 5:
-		bginit(&bgst[2], 1, 1);
-		break;
-	}	
-}
-
-void
-bitbg(bg *b)
-{
-	u16int cnt;
-	int v;
-	uchar *p;
-	u16int *rr;
-	uchar *base;
-	
-	cnt = reg[DISPCNT];
-	rr = reg - 8 + (b->n << 3);
-	if((bgmask & 1<<b->n) == 0)
-		goto next;
-	if(b->rpx >= 0 && b->rpy >= 0 && b->rpx <= b->sx && b->rpy <= b->sy){
-		base = vram;
-		if((cnt & FRAME) != 0 && (cnt & 7) != 3)
-			base += 0xa000;
-		if(b->depth){
-			p = base + 2 * (b->rpx >> 8) + 480 * (b->rpy >> 8);
-			v = p[0] | p[1] << 8;
-		}else{
-			v = base[(b->rpx >> 8) + 240 * (b->rpy >> 8)];
-			if(v != 0)
-				v = pram[v];
-			else
-				v = -1;
-		}
-	}else
-		v = -1;
-	if(v >= 0)
-		pixel(v, b->n, reg[BG0CNT + b->n] & 3);
-next:
-	b->rpx += (s16int) rr[BG2PA];
-	b->rpy += (s16int) rr[BG2PC];
-}
-
-void
-rotbg(bg *b)
-{
-	u16int *rr, ta;
-	u16int bgcnt;
-	int row, sz, x, y;
-	uchar *p, v;
-
-	rr = reg - 8 + (b->n << 3);
-	if((bgmask & 1<<b->n) == 0)
-		goto next;
-	bgcnt = reg[BG0CNT + b->n];
-	row = (bgcnt >> 14) + 4;
-	sz = 1 << 3 + row;
-	x = b->rpx >> 8;
-	y = b->rpy >> 8;
-	if((bgcnt & DISPWRAP) != 0){
-		x &= sz - 1;
-		y &= sz - 1;
-	}else if((uint)x >= sz || (uint)y >= sz)
-		goto next;
-	ta = (bgcnt << 3 & 0xf800) + ((y >> 3) << row) + (x >> 3);
-	p = vram + (bgcnt << 12 & 0xc000) + (vram[ta] << 6);
-	p += (x & 7) + ((y & 7) << 3);
-	if((v = *p) != 0)
-		pixel(pram[v], b->n, bgcnt & 3);
-next:
-	b->rpx += (s16int) rr[BG2PA];
-	b->rpy += (s16int) rr[BG2PC];
-}
-
-void
-txtbg(bg *b)
-{
-	u16int bgcnt;
-	u8int x, v;
-
-	bgcnt = reg[BG0CNT + b->n];
-	if((bgmask & 1<<b->n) == 0)
-		goto next;
-	x = b->tnx;
-	if((b->t & 1<<10) != 0)
-		x ^= 7;
-	if((bgcnt & BG8) != 0)
-		v = b->chr[x];
-	else{
-		v = b->chr[x>>1];
-		if((x & 1) != 0)
-			v >>= 4;
-		else
-			v &= 0xf;
-	}
-	if(v != 0)
-		pixel(b->pal[v], b->n, bgcnt & 3);
-next:
-	if(++b->tnx == 8){
-		b->tnx = 0;
-		b->tx++;
-		tile(b);
-	}
-}
-
-void
-bgs(void)
-{
-	switch(mode){
-	case 0:
-		txtbg(&bgst[0]);
-		txtbg(&bgst[1]);
-		txtbg(&bgst[2]);
-		txtbg(&bgst[3]);
-		break;
-	case 1:
-		txtbg(&bgst[0]);
-		txtbg(&bgst[1]);
-		rotbg(&bgst[2]);
-		break;
-	case 2:
-		rotbg(&bgst[2]);
-		rotbg(&bgst[3]);
-		break;
-	case 3:
-	case 4:
-	case 5:
-		bitbg(&bgst[2]);
-		break;
-	}
-}
-
-void
 sprinit(void)
 {
 	u16int *p, *pp;
@@ -352,6 +100,10 @@
 			hb <<= 1;
 		if(dy >= hb || (u8int)t0 + hb > 256 && ppuy + 256 - (u8int)t0 >= hb)
 			continue;
+		if((t0 & SPRMOSA) != 0){
+			dy = dy - dy % ((reg[MOSAIC] >> 12 & 15) + 1);
+			sp->mctr = 0;
+		}
 		sp->x = (s32int)(t0 << 7) >> 23;
 		sp->t0 = t0;
 		ws = wss[s];
@@ -402,91 +154,405 @@
 }
 
 void
-spr(void)
+spr(int x1)
 {
+	int x0, i, dx, sx0, sx1;
+	u8int pri, v, d, *b;
+	u16int x, y;
+	u32int c, t0;
 	sprite *s;
-	ushort dx;
-	u32int t0;
-	uchar v;
-	ushort x, y;
-	u16int c;
-	int pv, ppri, pri;
-	uchar d;
-	uchar *b;
 	
-	pv = -1;
-	ppri = 6;;
+	x0 = ppux0;
 	for(s = sprt; s < sp; s++){
-		dx = ppux - s->x;
-		if(dx >= s->wb)
+		if(s->x >= x1 || s->x + s->wb <= x0)
 			continue;
 		t0 = s->t0;
-		if((t0 & SPRROT) != 0){
-			x = s->rx >> 8;
-			y = s->ry >> 8;
-			if(x < s->w && y < s->h){
-				b = s->base;
+		pri = s->t1 >> 10 & 3;
+		sx0 = s->x >= x0 ? s->x : x0;
+		sx1 = s->x + s->wb;
+		if(x1 < sx1)
+			sx1 = x1;
+		dx = sx0 - s->x;
+		for(i = sx0; i < sx1; i++, dx++){
+			if((t0 & SPRROT) != 0){
 				d = (t0 & SPR8) != 0;
-				b += (y & 7) << 2 + d;
-				b += y >> 3 << s->ysh;
-				b += (x & 7) >> 1 - d;
-				b += x >> 3 << 5 + d;
-				v = *b;
-				if(!d)
-					if((x & 1) != 0)
-						v >>= 4;
-					else
-						v &= 0xf;
-			}else
-				v = 0;
-			s->rx += s->dx;
-			s->ry += s->dy;
-		}else if((t0 & SPRHFLIP) != 0){
-			if((t0 & SPR8) != 0)
-				v = *--s->base;
-			else if((dx & 1) != 0)
-				v = *s->base & 0x0f;
-			else
-				v = *--s->base >> 4;
-			if((dx & 7) == 7)
-				s->base -= s->inc;
-		}else{
-			v = *s->base;
-			if((t0 & SPR8) != 0)
-				s->base++;
-			else if((dx & 1) != 0){
-				v >>= 4;
-				s->base++;
-			}else
-				v &= 0xf;
-			if((dx & 7) == 7)
-				s->base += s->inc;
-		}
-		if(v != 0){
-			pri = s->t1 >> 10 & 3;
-			c = s->pal[v];
-			switch(s->t0 >> 10 & 3){
-			case 1:
-				c |= 1<<16;
-			case 0:
-				if(ppri > pri){
-					pv = c;
-					ppri = pri;
+				x = s->rx >> 8;
+				y = s->ry >> 8;
+				s->rx += s->dx;
+				s->ry += s->dy;
+				if(x < s->w && y < s->h){
+					b = s->base;
+					b += (y & 7) << 2 + d;
+					b += y >> 3 << s->ysh;
+					b += (x & 7) >> 1 - d;
+					b += x >> 3 << 5 + d;
+					v = *b;
+					if(!d)
+						if((x & 1) != 0)
+							v >>= 4;
+						else
+							v &= 0xf;
+				}else
+					v = 0;
+			}else if((t0 & SPRHFLIP) != 0){
+				if((t0 & SPR8) != 0)
+					v = *--s->base;
+				else if((dx & 1) != 0)
+					v = *s->base & 0x0f;
+				else
+					v = *--s->base >> 4;
+				if((dx & 7) == 7)
+					s->base -= s->inc;
+			}else{
+				v = *s->base;
+				if((t0 & SPR8) != 0)
+					s->base++;
+				else if((dx & 1) != 0){
+					v >>= 4;
+					s->base++;
+				}else
+					v &= 0xf;
+				if((dx & 7) == 7)
+					s->base += s->inc;
+			}
+			if((t0 & SPRMOSA) != 0)
+				if(s->mctr == 0){
+					s->mctr = reg[MOSAIC] >> 8 & 15;
+					s->mcol = v;
+				}else{
+					--s->mctr;
+					v = s->mcol;
 				}
-				break;
-			case 2:
-				objwin = 1;
-				break;
+			if(v != 0){
+				c = s->pal[v] | SRCOBJ;
+				switch(t0 >> 10 & 3){
+				case 1:
+					c |= OBJALPHA;
+					objalpha++;
+				case 0:
+					if(pri < pixpri[i]){
+						pixcol[i] = c;
+						pixpri[i] = pri;
+					}
+					break;
+				case 2:
+					if((reg[DISPCNT] & 1<<15) != 0)
+						pixwin[i] |= OBJWIN;
+					break;
+				}
 			}
 		}
 	}
-	if(pv >= 0){
-		pixel(pv, 4, ppri);
-		if(pv >> 16 != 0)
-			objtrans = 1;
+}
+
+void
+bgpixel(bg *b, int i, u32int c, int pri)	/* offer color c with priority pri for column i; b == nil skips source tagging and mosaic */
+{
+	u8int *p;
+	u32int *q;
+	int j;
+
+	if(b != nil){
+		c |= SRCBG(b->n);	/* tag the color with the layer it came from */
+		if(b->mosaic){
+			for(j = (u8int)(b->lasti+1); j <= i; j++){	/* catch up on every column since the previous call */
+				if(b->mctr == 0){	/* a new mosaic cell starts at column j */
+					if(j == i){
+						b->curc = c;
+						b->curpri = pri;
+					}else
+						b->curpri = VACANT;	/* the cell starts on a column with no pixel: it stays empty */
+					b->mctr = reg[MOSAIC] & 15;
+				}else
+					b->mctr--;
+				if(b->curpri != VACANT && (pixwin[j] & 1<<b->n) == 0)	/* window masking is applied per output column */
+					bgpixel(nil, j, b->curc, b->curpri);
+			}
+			b->lasti = i;
+			return;
+		}
+	}
+	p = pixpri + i;
+	q = pixcol + i;
+	if(pri < p[0]){	/* new top pixel: the old top moves to the second layer at offset 240 */
+		p[240] = p[0];
+		p[0] = pri;
+		q[240] = q[0];
+		q[0] = c;
+	}else if(pri < p[240]){	/* otherwise it may still replace the second layer */
+		p[240] = pri;
+		q[240] = c;
+	}
+}
 
+
+
+void
+bginit(bg *b, int scal, int)	/* set up background b for the current line ppuy; scal selects the affine path; third argument unused */
+{
+	u16int x, y;
+	u16int *rr;
+	int msz;
+
+	b->mosaic = (reg[BG0CNT + b->n] & BGMOSAIC) != 0;
+	if(b->mosaic){
+		b->mctr = 0;
+		b->lasti = -1;	/* stored in a u8int; bgpixel() casts lasti+1 back to u8int so the scan starts at column 0 */
+	}
+	if(scal){
+		rr = reg + (b->n - 2 << 3);	/* offset so that the BG2* register names also address BG3 */
+		if(ppuy == 0){	/* first line: load the reference point, sign-extended from 28 bits */
+			b->rpx0 = (s32int)(rr[BG2XL] | rr[BG2XH] << 16) << 4 >> 4;
+			b->rpy0 = (s32int)(rr[BG2YL] | rr[BG2YH] << 16) << 4 >> 4;
+		}
+		if(!b->mosaic || ppuy % ((reg[MOSAIC] >> 4 & 15) + 1) == 0){	/* with mosaic, the line start is only refreshed on the first line of a cell */
+			b->rpx1 = b->rpx0;
+			b->rpy1 = b->rpy0;
+		}
+		b->rpx = b->rpx1;
+		b->rpy = b->rpy1;
+		b->rpx0 += (s16int)rr[BG2PB];	/* advance the reference point to the next line */
+		b->rpy0 += (s16int)rr[BG2PD];
+	}else{
+		rr = reg + (b->n << 1);
+		x = rr[BG0HOFS] & 0x1ff;
+		y = ppuy;
+		if(b->mosaic){	/* snap the line to the top of its mosaic cell */
+			msz = (reg[MOSAIC] >> 4 & 15) + 1;
+			y = y - y % msz;
+		}
+		y += (rr[BG0VOFS] & 0x1ff);
+		b->tx = x >> 3;	/* tile coordinates and pixel offsets within the tile */
+		b->ty = y >> 3;
+		b->tnx = x & 7;
+		b->tny = y & 7;
+	}
+}
+
+void
+bgsinit(void)	/* call bginit() for each background used by the mode in DISPCNT bits 0-2 */
+{
+	switch(reg[DISPCNT] & 7){
+	case 0:	/* four backgrounds on the non-affine path */
+		bginit(&bgst[0], 0, 0);
+		bginit(&bgst[1], 0, 0);
+		bginit(&bgst[2], 0, 0);
+		bginit(&bgst[3], 0, 0);
+		break;
+	case 1:	/* two non-affine, one affine */
+		bginit(&bgst[0], 0, 0);
+		bginit(&bgst[1], 0, 0);
+		bginit(&bgst[2], 1, 0);
+		break;
+	case 2:	/* two affine */
+		bginit(&bgst[2], 1, 0);
+		bginit(&bgst[3], 1, 0);
+		break;
+	case 3:
+	case 4:
+	case 5:	/* modes 3-5: background 2 only, on the affine path */
+		bginit(&bgst[2], 1, 1);
+		break;
+	}	
+}
+
+void
+bitbg(bg *b, int x1)	/* render columns ppux0..x1-1 of bitmap background b (modes 3-5) */
+{
+	u8int *base, *p, pri, d;
+	u16int cnt, *rr, sx, sy;
+	int i, v;
+	
+	cnt = reg[DISPCNT];
+	if((cnt & 1<<8 + b->n) == 0)	/* background not enabled in DISPCNT */
+		return;
+	rr = reg + (b->n - 2 << 3);
+	if((cnt & 7) != 5){	/* sx, sy: bitmap size in 8.8 fixed point; d: 16-bit pixels instead of palette indices */
+		sx = 240 << 8;
+		sy = 160 << 8;
+		d = (cnt & 7) == 3;
+	}else{
+		sx = 160 << 8;
+		sy = 128 << 8;
+		d = 1;
+	}
+	base = vram;
+	if((cnt & FRAME) != 0 && (cnt & 7) != 3)	/* second frame buffer; mode 3 has only one */
+		base += 0xa000;
+	pri = reg[BG0CNT + b->n] & 3;
+	for(i = ppux0; i < x1; i++){
+		if(((pixwin[i] & 1<<b->n) == 0 || b->mosaic) && (u32int)b->rpx < sx && (u32int)b->rpy < sy){
+			if(d){
+				p = base + 2 * (b->rpx >> 8) + 2 * (sx >> 8) * (b->rpy >> 8);	/* row stride follows the bitmap width: 480 bytes, or 320 in mode 5 */
+				v = p[0] | p[1] << 8;
+			}else{
+				v = base[(b->rpx >> 8) + 240 * (b->rpy >> 8)];
+				if(v != 0)
+					v = pram[v];
+				else
+					v = -1;	/* palette index 0 is transparent */
+			}
+			if(v >= 0)
+				bgpixel(b, i, v, pri);
+	
+		}
+		b->rpx += (s16int) rr[BG2PA];
+		b->rpy += (s16int) rr[BG2PC];
+	}
+}
+
+void
+txtbg(bg *b, int x1)	/* render columns ppux0..x1-1 of tiled background b, resuming at tile b->tx, pixel b->tnx */
+{
+	u8int y, v, d, *cp;
+	u16int bgcnt, ta0, ta, tx, ty, t, *pal;
+	u32int ca;
+	int i, x, mx;
+	
+	if((reg[DISPCNT] & 1<<8 + b->n) == 0)	/* background not enabled in DISPCNT */
+		return;
+	bgcnt = reg[BG0CNT + b->n];
+	d = bgcnt >> 7 & 1;	/* 1: one byte per pixel, 0: one nibble per pixel */
+	tx = b->tx;
+	ty = b->ty;
+	ta0 = (bgcnt << 3 & 0xf800) + ((ty & 0x1f) << 6);	/* map address of the tile row */
+	switch(bgcnt >> 14){	/* taller map sizes: select the lower screen block */
+	case 2: ta0 += ty << 6 & 0x800; break;
+	case 3: ta0 += ty << 7 & 0x1000; break;
+	}
+	x = ppux0;
+	i = b->tnx;
+	for(; x < x1; tx++, i = 0){
+		ta = ta0 + ((tx & 0x1f) << 1);
+		if((bgcnt & 1<<14) != 0)	/* wider map sizes: select the right-hand screen block */
+			ta += tx << 6 & 0x800;
+		t = vram[ta] | vram[ta+1] << 8;
+		if(d)
+			pal = pram;
+		else
+			pal = pram + (t >> 8 & 0xf0);
+		ca = (bgcnt << 12 & 0xc000) + ((t & 0x3ff) << 5+d);
+		/* a tile whose data lies at or above 0x10000 draws nothing, but it must still use up its */
+		/* columns: skipping it without advancing x shifts later tiles and may never terminate */
+		y = b->tny;
+		if((t & 1<<11) != 0)	/* vertical flip */
+			y ^= 7;
+		ca += y << 2+d;
+		cp = ca < 0x10000 ? vram + ca : nil;
+		for(; i < 8; i++, x++){
+			if(x >= x1)
+				goto out;
+			if(cp == nil || (pixwin[x] & 1<<b->n) != 0 && !b->mosaic)
+				continue;
+			mx = i;
+			if((t & 1<<10) != 0)	/* horizontal flip */
+				mx ^= 7;
+			v = cp[mx >> 1-d];
+			if(!d)
+				if((mx & 1) != 0)
+					v >>= 4;
+				else
+					v &= 0xf;
+			if(v != 0)	/* color index 0 is transparent */
+				bgpixel(b, x, pal[v], bgcnt & 3);
+		}
+	}
+out:
+	b->tx = tx;	/* remember where to resume on the next call */
+	b->tnx = i;
+}
+
+void
+rotbg(bg *b, int x1)	/* render columns ppux0..x1-1 of affine tiled background b */
+{
+	uchar *p, v;
+	u16int bgcnt, *rr, ta;
+	int i, row, sz, x, y;
+
+	rr = reg + (b->n - 2 << 3);
+	if((reg[DISPCNT] & 1<<8 + b->n) == 0)	/* background not enabled in DISPCNT */
+		return;
+	bgcnt = reg[BG0CNT + b->n];
+	row = (bgcnt >> 14) + 4;	/* log2 of the number of tiles per map row */
+	sz = 1 << 3 + row;	/* map width and height in pixels */
+	for(i = ppux0; i < x1; i++){
+		x = b->rpx >> 8;
+		y = b->rpy >> 8;
+		b->rpx += (s16int) rr[BG2PA];	/* step the texture position even when this column ends up skipped */
+		b->rpy += (s16int) rr[BG2PC];
+		if((pixwin[i] & 1<<b->n) != 0 && !b->mosaic)
+			continue;
+		if((bgcnt & DISPWRAP) != 0){
+			x &= sz - 1;
+			y &= sz - 1;
+		}else if((uint)x >= sz || (uint)y >= sz)	/* outside the map and not wrapping: transparent */
+			 continue;
+		ta = (bgcnt << 3 & 0xf800) + ((y >> 3) << row) + (x >> 3);	/* one byte per map entry */
+		p = vram + (bgcnt << 12 & 0xc000) + (vram[ta] << 6);	/* 64 bytes of pixel data per tile */
+		p += (x & 7) + ((y & 7) << 3);
+		if((v = *p) != 0)
+			bgpixel(b, i, pram[v], bgcnt & 3);
+		
+	}
+}
+
+void
+windows(int x1)	/* turn the window flags in pixwin[ppux0..x1-1] into per-column layer-disable masks and place the backdrop */
+{
+	static u8int wintab[8] = {2, 3, 1, 1, 0, 0, 0, 0};	/* indexed by WIN1|WIN2|OBJWIN flags; bit 1: WINOUT instead of WININ, bit 0: high byte */
+	int i, sx0, sx1;
+	u16int v, h;
+	u16int cnt;
+	
+	cnt = reg[DISPCNT];
+	if((cnt >> 13) != 0){	/* at least one of DISPCNT bits 13-15 is set */
+		if((cnt & 1<<13) != 0){	/* bit 13: rectangle from WIN0V/WIN0H, flagged WIN1 */
+			v = reg[WIN0V];
+			h = reg[WIN0H];
+			if(ppuy < (u8int)v && ppuy >= v >> 8){	/* high byte is the first line/column, low byte the end (exclusive) */
+				sx1 = (u8int)h;
+				sx0 = h >> 8;
+				if(sx0 < ppux0)
+					sx0 = ppux0;
+				if(sx1 > x1)
+					sx1 = x1;
+				for(i = sx0; i < sx1; i++)
+					pixwin[i] |= WIN1;
+			}
+		}
+		if((cnt & 1<<14) != 0){	/* bit 14: rectangle from WIN1V/WIN1H, flagged WIN2 */
+			v = reg[WIN1V];
+			h = reg[WIN1H];
+			if(ppuy < (u8int)v && ppuy >= v >> 8){
+				sx1 = (u8int)h;
+				sx0 = h >> 8;
+				if(sx0 < ppux0)
+					sx0 = ppux0;
+				if(sx1 > x1)
+					sx1 = x1;
+				for(i = sx0; i < sx1; i++)
+					pixwin[i] |= WIN2;
+			}
+		}
+		for(i = ppux0; i < x1; i++){
+			v = wintab[pixwin[i]];
+			h = reg[WININ + (v & 2) / 2];	/* index 0 is WININ, index 1 is WINOUT */
+			if((v & 1) != 0)
+				h >>= 8;
+			pixwin[i] = ~h;	/* inverted: a set bit now means the layer is disabled in this column */
+		}
+	}
+	for(i = ppux0; i < x1; i++)
+		if(pixpri[i] == VACANT || (pixwin[i] & 1<<4) != 0){	/* nothing in the top slot, or bit 4 masked: the top slot becomes the backdrop */
+			pixcol[i] = pram[0] | SRCBACK;
+			pixpri[i] = BACKDROP;
+		}else{	/* otherwise the backdrop goes into the second layer */
+			pixcol[i+240] = pram[0] | SRCBACK;
+			pixpri[i+240] = BACKDROP;
+		}
+	objalpha = 0;
+}
+
 u16int
 mix(u16int c1, u16int c2)
 {
@@ -537,117 +603,185 @@
 }
 
 void
-windows(void)
+colormath(int x1)	/* apply the effect selected in BLDCNT to columns ppux0..x1-1 of pixcol, in place */
 {
-	u16int dispcnt;
-	u16int v, h;
+	u16int bldcnt;
+	u32int *p;
+	int i;
+	
+	bldcnt = reg[BLDCNT];
+	if((bldcnt & 3<<6) == 0 && objalpha == 0)	/* no effect selected and objalpha clear: nothing to do */
+		return;
+	p = pixcol + ppux0;
+	for(i = ppux0; i < x1; i++, p++){
+		if((*p & OBJALPHA) != 0)	/* OBJALPHA-flagged pixels go straight to the mix path, bypassing the window and first-target checks */
+			goto alpha;
+		if((pixwin[i] & 1<<5) != 0 || (bldcnt & 1<<(*p >> 17)) == 0)	/* skip if pixwin bit 5 is set or the BLDCNT bit indexed by *p >> 17 is clear */
+			continue;
+		switch(bldcnt >> 6 & 3){	/* BLDCNT bits 6-7 choose the effect; value 0 matches no case */
+		case 1:
+		alpha:
+			if((bldcnt & 1<<8+(p[240] >> 17)) == 0)	/* the second-slot pixel (p[240]) must be enabled in BLDCNT bits 8 and up */
+				continue;
+			*p = mix(*p, p[240]);
+			break;
+		case 2:
+			*p = brighten(*p);
+			break;
+		case 3:
+			*p = darken(*p);
+			break;
+		}
+	}
+}
 
-	dispcnt = reg[DISPCNT];
-	bgmask = dispcnt >> 8 | 1<<5;
-	if((dispcnt >> 13) != 0){
-		if((dispcnt & 1<<13) != 0){
-			v = reg[WIN0V];
-			h = reg[WIN0H];
-			if(ppuy < (u8int)v && ppuy >= v >> 8 &&
-				ppux < (u8int)h && ppux >= h >> 8){
-				bgmask &= reg[WININ];
-				goto windone;
-			}
+void
+linecopy(void)	/* copy the 240 entries of pixcol into row ppuy of pic as 16-bit pixels, repeating each one horizontally according to scale */
+{
+	u32int *p;
+	uchar *q;
+	u16int *r;
+	u16int v;
+	union { u16int w; u8int b[2]; } u;
+	int n;
+	
+	p = pixcol;
+	q = pic + ppuy * 240 * 2 * scale;	/* a row occupies 240 * 2 * scale bytes */
+	r = (u16int*)q;	/* q is advanced only when scale == 1, r otherwise */
+	n = 240;
+	while(n--){
+		v = *p++;	/* keeps only the low 16 bits of the 32-bit pixcol entry */
+		if(scale == 1){
+			*q++ = v;	/* low byte first */
+			*q++ = v >> 8;
+			continue;
 		}
-		if((dispcnt & 1<<14) != 0){
-			v = reg[WIN1V];
-			h = reg[WIN1H];
-			if(ppuy < (u8int)v && ppuy >= v >> 8 &&
-				ppux < (u8int)h && ppux >= h >> 8){
-				bgmask &= reg[WININ] >> 8;
-				goto windone;
-			}
+		u.b[0] = v;	/* build the 16-bit value with the low byte at the lower address, whatever the host byte order */
+		u.b[1] = v >> 8;
+		if(scale == 2){
+			*r++ = u.w;
+			*r++ = u.w;
+		}else{	/* any scale other than 1 or 2 writes three copies */
+			*r++ = u.w;
+			*r++ = u.w;
+			*r++ = u.w;
 		}
-		if((dispcnt & 1<<15) != 0 && objwin != 0){
-			bgmask &= reg[WINOUT] >> 8;
-			goto windone;
-		}
-		bgmask &= reg[WINOUT];
 	}
-windone:
-	if(pixelpri[0] != 8 && (bgmask & 1<<4) == 0){
-		pixelpri[0] = 8;
-		pixeldat[0] = pram[0] | 5 << 16;
-	}
 }
 
 void
-colormath(void)
+syncppu(int x1)	/* render columns ppux0..x1-1 of line ppuy into pixcol, then advance ppux0 to x1 */
 {
-	u8int src0;
-	u16int bldcnt;
-	
-	if((bgmask & 1<<5) == 0)
+	int i;
+	u16int cnt;
+
+	if(hblank || ppuy >= 160)	/* nothing is rendered while hblank is set or on lines 160 and above */
 		return;
-	bldcnt = reg[BLDCNT];
-	src0 = pixeldat[0] >> 16;
-	if(objtrans && src0 == 4)
-		goto alpha;
-	if((bldcnt & 3<<6) == 0 || (bldcnt & 1<<src0) == 0)
+	if(x1 >= 240)	/* clamp the target column to the 240-pixel line */
+		x1 = 240;
+	else if(x1 <= ppux0)	/* already rendered at least this far */
 		return;
-	switch(bldcnt >> 6 & 3){
+	cnt = reg[DISPCNT];
+	if((cnt & FBLANK) != 0){	/* FBLANK set: fill the span with 0xffff and skip all layer rendering */
+		for(i = ppux0; i < x1; i++)
+			pixcol[i] = 0xffff;
+		ppux0 = x1;
+		return;
+	}
+
+	if((cnt & 1<<12) != 0)	/* DISPCNT bit 12 gates the spr() pass */
+		spr(x1);
+	windows(x1);	/* runs after spr() and before the background passes */
+	switch(cnt & 7){	/* low three bits of DISPCNT choose which background renderers run */
+	case 0:
+		txtbg(&bgst[0], x1);
+		txtbg(&bgst[1], x1);
+		txtbg(&bgst[2], x1);
+		txtbg(&bgst[3], x1);
+		break;
 	case 1:
-	alpha:
-		if((bldcnt & 1<<8+(pixeldat[1]>>16)) == 0)
-			return;
-		pixeldat[0] = mix(pixeldat[0], pixeldat[1]);
+		txtbg(&bgst[0], x1);
+		txtbg(&bgst[1], x1);
+		rotbg(&bgst[2], x1);
 		break;
 	case 2:
-		pixeldat[0] = brighten(pixeldat[0]);
+		rotbg(&bgst[2], x1);
+		rotbg(&bgst[3], x1);
 		break;
 	case 3:
-		pixeldat[0] = darken(pixeldat[0]);
-		break;
+	case 4:
+	case 5:
+		bitbg(&bgst[2], x1);	/* values 3, 4 and 5 all use bitbg on bgst[2] only */
 	}
+	colormath(x1);	/* blending runs last, over the composed span */
+	ppux0 = x1;
 }
 
 void
-ppustep(void)
+hblanktick(void *)	/* toggles hblank on every call and re-arms evhblank: one branch starts the next line, the other finishes the current one */
 {
+	extern Event evhblank;
 	u16int stat;
-	u16int cnt;
-	
+
 	stat = reg[DISPSTAT];
-	cnt = reg[DISPCNT];
-	if(ppuy < 160 && ppux < 240)
-		if((cnt & FBLANK) == 0){
-			objwin = 0;
-			objtrans = 0;
-			pixelpri[0] = 8;
-			pixeldat[0] = pram[0] | 5 << 16;
-			if((cnt & 1<<12) != 0)
-				spr();
-			windows();
-			bgs();
-			colormath();
-			pixeldraw(ppux, ppuy, pixeldat[0]);
-		}else
-			pixeldraw(ppux, ppuy, 0xffff);
-	if(ppux == 240 && ppuy < 160){
-		if((stat & IRQHBLEN) != 0)
-			setif(IRQHBL);
-		dmastart(DMAHBL);
-	}
-	if(++ppux >= 308){
-		ppux = 0;
+	if(hblank){	/* hblank is over: move on to the next line */
+		hblclock = clock + evhblank.time;	/* reference point for ppuwrite's (clock - hblclock) / 4 column estimate */
+		addevent(&evhblank, 240*4 + evhblank.time);	/* re-arm for the drawing period: 240*4 plus evhblank.time */
+		hblank = 0;
+		ppux0 = 0;	/* nothing rendered yet on the new line */
+		memset(pixpri, VACANT, sizeof(pixpri));	/* every pixel slot starts out empty */
+		memset(pixwin, 0, 240);
 		if(++ppuy >= 228){
 			ppuy = 0;
 			flush();
 		}
-		if((stat & IRQVCTREN) != 0 && ppuy == stat >> 8)
-			setif(IRQVCTR);
 		if(ppuy < 160){
-			bgsinit();
 			sprinit();
+			bgsinit();
 		}else if(ppuy == 160){
+			dmastart(DMAVBL);	/* on reaching line 160 the DMAVBL start now comes before the IRQVBL check */
 			if((stat & IRQVBLEN) != 0)
 				setif(IRQVBL);
-			dmastart(DMAVBL);
 		}
+		if((stat & IRQVCTREN) != 0 && ppuy == stat >> 8)	/* the line to compare against is the high byte of DISPSTAT */
+			setif(IRQVCTR);
+	}else{	/* drawing period is over: finish the line and enter hblank */
+		syncppu(240);	/* render whatever columns are still pending */
+		linecopy();	/* NOTE(review): runs for every ppuy, including >= 160 where syncppu draws nothing -- confirm pic has room for those rows */
+		addevent(&evhblank, 68*4 + evhblank.time);	/* re-arm for the hblank period: 68*4 plus evhblank.time */
+		hblank = 1;
+		if((stat & IRQHBLEN) != 0)	/* not limited to ppuy < 160, unlike the DMAHBL start below */
+			setif(IRQHBL);
+		if(ppuy < 160)
+			dmastart(DMAHBL);
+	}
+}
+
+void
+ppuwrite(u16int a, u16int v)	/* bring rendering up to the current column, then latch v into the blend or reference-point state selected by a */
+{
+	syncppu((clock - hblclock) / 4);	/* columns already passed are drawn with the old state; 4 clock units per column */
+	switch(a){	/* the case labels are register indices times 2; other values of a only trigger the sync above */
+	case BLDALPHA*2:
+		blda = v & 0x1f;	/* bits 0-4, clamped to 16 */
+		if(blda > 16)
+			blda = 16;
+		bldb = v >> 8 & 0x1f;	/* bits 8-12, clamped to 16 */
+		if(bldb > 16)
+			bldb = 16;
+		break;
+	case BLDY*2:
+		bldy = v & 0x1f;	/* bits 0-4, clamped to 16 */
+		if(bldy > 16)
+			bldy = 16;
+		break;
+	case BG2XL*2: bgst[2].rpx0 = bgst[2].rpx0 & 0xffff0000 | v; break;	/* XL/YL cases: replace the low 16 bits, keep the high half */
+	case BG2XH*2: bgst[2].rpx0 = bgst[2].rpx0 & 0xffff | (s32int)(v << 20) >> 4; break;	/* XH/YH cases: low 12 bits of v become bits 16-27, sign-extended above; low half kept */
+	case BG2YL*2: bgst[2].rpy0 = bgst[2].rpy0 & 0xffff0000 | v; break;
+	case BG2YH*2: bgst[2].rpy0 = bgst[2].rpy0 & 0xffff | (s32int)(v << 20) >> 4; break;
+	case BG3XL*2: bgst[3].rpx0 = bgst[3].rpx0 & 0xffff0000 | v; break;
+	case BG3XH*2: bgst[3].rpx0 = bgst[3].rpx0 & 0xffff | (s32int)(v << 20) >> 4; break;
+	case BG3YL*2: bgst[3].rpy0 = bgst[3].rpy0 & 0xffff0000 | v; break;
+	case BG3YH*2: bgst[3].rpy0 = bgst[3].rpy0 & 0xffff | (s32int)(v << 20) >> 4; break;
 	}
 }