shithub: zelda3

Download patch

ref: cd5da7dfc211da8dc604a576c2e732c063f3a3a9
parent: 2f012a3c3d26fa0a62b1608d6dd9ac6acf8d18cb
author: Snesrev <snesrev@protonmail.com>
date: Mon Sep 5 21:16:13 EDT 2022

New 10-15x faster PPU implementation (toggle with R)

--- a/README.md
+++ b/README.md
@@ -90,6 +90,8 @@
 | O   | Set dungeon key to 1  |
 | K   | Clear all input history from the joypad log  |
 | L   | Stop replaying a shapshot  |
+| R   | Toggle between fast and slow renderer |
+| F   | Display renderer performance |
 | F1-F10 | Load snapshot      |
 | Alt+Enter | Toggle Fullscreen     |
 | Shift+F1-F10 | Save snapshot |
--- a/config.c
+++ b/config.c
@@ -31,8 +31,8 @@
   _(SDLK_1), _(SDLK_2), _(SDLK_3), _(SDLK_4), _(SDLK_5), _(SDLK_6), _(SDLK_7), _(SDLK_8), _(SDLK_9), _(SDLK_0), _(SDLK_MINUS), _(SDLK_EQUALS), _(SDLK_BACKSPACE), N, N, N, N, N, N, N,
   // Replay Ref State
   C(SDLK_1), C(SDLK_2), C(SDLK_3), C(SDLK_4), C(SDLK_5), C(SDLK_6), C(SDLK_7), C(SDLK_8), C(SDLK_9), C(SDLK_0), C(SDLK_MINUS), C(SDLK_EQUALS), C(SDLK_BACKSPACE), N, N, N, N, N, N, N,
-  // CheatLife, CheatKeys, ClearKeyLog, StopReplay, Fullscreen, Reset, Pause, PauseDimmed, Turbo, ZoomIn, ZoomOut, DisplayPerf
-  _(SDLK_w), _(SDLK_o), _(SDLK_k), _(SDLK_l), A(SDLK_RETURN), _(SDLK_e), S(SDLK_p), _(SDLK_p), _(SDLK_t), N, N, _(SDLK_f)
+  // CheatLife, CheatKeys, ClearKeyLog, StopReplay, Fullscreen, Reset, Pause, PauseDimmed, Turbo, ZoomIn, ZoomOut, DisplayPerf, ToggleRenderer
+  _(SDLK_w), _(SDLK_o), _(SDLK_k), _(SDLK_l), A(SDLK_RETURN), _(SDLK_e), S(SDLK_p), _(SDLK_p), _(SDLK_t), N, N, _(SDLK_f), _(SDLK_r),
 };
 #undef _
 #undef A
@@ -50,7 +50,7 @@
 static const KeyNameId kKeyNameId[] = {
   M(Controls), M(Load), M(Save), M(Replay), M(LoadRef), M(ReplayRef),
   S(CheatLife), S(CheatKeys), S(ClearKeyLog), S(StopReplay), S(Fullscreen), S(Reset),
-  S(Pause), S(PauseDimmed), S(Turbo), S(ZoomIn), S(ZoomOut), S(DisplayPerf),
+  S(Pause), S(PauseDimmed), S(Turbo), S(ZoomIn), S(ZoomOut), S(DisplayPerf), S(ToggleRenderer),
 };
 #undef S
 #undef M
--- a/config.h
+++ b/config.h
@@ -27,6 +27,7 @@
   kKeys_ZoomIn,
   kKeys_ZoomOut,
   kKeys_DisplayPerf,
+  kKeys_ToggleRenderer,
   kKeys_Total,
 };
 
--- a/main.c
+++ b/main.c
@@ -424,6 +424,7 @@
     case kKeys_ZoomIn: DoZoom(1); break;
     case kKeys_ZoomOut: DoZoom(-1); break;
     case kKeys_DisplayPerf: g_display_perf ^= 1; break;
+    case kKeys_ToggleRenderer: g_zenv.ppu->newRenderer ^= 1; break;
     default: assert(0);
     }
   }
--- a/snes/ppu.c
+++ b/snes/ppu.c
@@ -1,4 +1,3 @@
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -10,7 +9,7 @@
 #include "snes.h"
 #include "../types.h"
 
-static const int spriteSizes[8][2] = {
+static const uint8 kSpriteSizes[8][2] = {
   {8, 16}, {8, 32}, {8, 64}, {16, 32},
   {16, 64}, {32, 64}, {16, 32}, {16, 32}
 };
@@ -20,13 +19,14 @@
 static int ppu_getPixelForBgLayer(Ppu *ppu, int x, int y, int layer, bool priority);
 static void ppu_calculateMode7Starts(Ppu* ppu, int y);
 static int ppu_getPixelForMode7(Ppu* ppu, int x, int layer, bool priority);
-static void ppu_evaluateSprites(Ppu* ppu, int line);
 static bool ppu_getWindowState(Ppu* ppu, int layer, int x);
+static bool ppu_evaluateSprites(Ppu* ppu, int line);
 static uint16_t ppu_getVramRemap(Ppu* ppu);
 
 Ppu* ppu_init(Snes* snes) {
   Ppu* ppu = (Ppu * )malloc(sizeof(Ppu));
   ppu->snes = snes;
+  ppu->newRenderer = true;
   return ppu;
 }
 
@@ -36,6 +36,8 @@
 
 void ppu_reset(Ppu* ppu) {
   memset(ppu->vram, 0, sizeof(ppu->vram));
+  ppu->lastBrightnessMult = 0xff;
+  ppu->lastMosaicModulo = 0xff;
   ppu->vramPointer = 0;
   ppu->vramIncrementOnHigh = false;
   ppu->vramIncrement = 1;
@@ -57,8 +59,7 @@
   ppu->objTileAdr1 = 0;
   ppu->objTileAdr2 = 0;
   ppu->objSize = 0;
-  memset(ppu->objPixelBuffer, 0, sizeof(ppu->objPixelBuffer));
-  memset(ppu->objPriorityBuffer, 0, sizeof(ppu->objPriorityBuffer));
+  memset(&ppu->objBuffer, 0, sizeof(ppu->objBuffer));
   ppu->timeOver = false;
   ppu->rangeOver = false;
   ppu->objInterlace_always_zero = false;
@@ -133,7 +134,7 @@
 }
 
 void ppu_saveload(Ppu *ppu, SaveLoadFunc *func, void *ctx) {
-  func(ctx, &ppu->vram, offsetof(Ppu, pixelBuffer) - offsetof(Ppu, vram));
+  func(ctx, &ppu->vram, offsetof(Ppu, mosaicModulo) - offsetof(Ppu, vram));
 }
 
 void ppu_handleVblank(Ppu* ppu) {
@@ -146,6 +147,8 @@
   ppu->frameInterlace_always_zero = ppu->interlace_always_zero; // set if we have a interlaced frame
 }
 
+static void PpuDrawWholeLine(Ppu *ppu, uint y);
+
 void ppu_runLine(Ppu* ppu, int line) {
   if(line == 0) {
 
@@ -158,24 +161,683 @@
       wl->window2inversed = (wl->window2inversed != 0);
     }
 
-    ppu->mosaicStartLine = 1;
     ppu->rangeOver = false;
     ppu->timeOver = false;
     ppu->evenFrame = !ppu->evenFrame;
   } else {
+    if (ppu->mosaicSize != ppu->lastMosaicModulo) {
+      int mod = ppu->mosaicSize;
+      ppu->lastMosaicModulo = mod;
+      for (int i = 0, j = 0; i < 256; i++) {
+        ppu->mosaicModulo[i] = i - j;
+        j = (j + 1 == mod ? 0 : j + 1);
+      }
+    }
     // evaluate sprites
-    memset(ppu->objPixelBuffer, 0, sizeof(ppu->objPixelBuffer));
-    memset(ppu->objPriorityBuffer, 0xff, sizeof(ppu->objPriorityBuffer));
+    memset(&ppu->objBuffer.pixel, 0, sizeof(ppu->objBuffer.pixel));
+    memset(&ppu->objBuffer.prio, 0x05, sizeof(ppu->objBuffer.prio));
+    ppu->lineHasSprites = !ppu->forcedBlank && ppu_evaluateSprites(ppu, line - 1);
 
-    if(!ppu->forcedBlank) ppu_evaluateSprites(ppu, line - 1);
     // actual line
-    if(ppu->mode == 7) ppu_calculateMode7Starts(ppu, line);
-    for(int x = 0; x < 256; x++) {
-      ppu_handlePixel(ppu, x, line);
+    if (ppu->newRenderer) {
+      PpuDrawWholeLine(ppu, line);
+    } else {
+      if (ppu->mode == 7)
+        ppu_calculateMode7Starts(ppu, line);
+      for (int x = 0; x < 256; x++) {
+        ppu_handlePixel(ppu, x, line);
+      }
     }
   }
 }
 
+typedef struct PpuWindows {  // Horizontal spans of one scanline, split at the window boundaries
+  uint16 edges[6];  // ascending x boundaries; span i covers [edges[i], edges[i + 1]), edges[0] is 0 and edges[nr] is 256
+  uint8 nr;  // number of spans in use
+  uint8 bits;  // bit i set => span i lies inside the (OR-combined) windows; the layer renderers skip such spans
+} PpuWindows;
+
+static void PpuWindows_Clear(PpuWindows *win) {  // Reset to a single unmasked span covering x = 0..255
+  win->edges[0] = 0;
+  win->edges[1] = 256;  // exclusive right edge
+  win->nr = 1;
+  win->bits = 0;  // no span is masked
+}
+
+static void PpuWindows_Calc(PpuWindows *win, Ppu *ppu, uint layer) {  // Fill *win with the spans and mask bits for windowLayer[layer]
+  WindowLayer *wl = &ppu->windowLayer[layer];
+  // Split the scanline [0, 256) into spans at the edges of window 1 and window 2 for this layer.
+  // Two windows add at most 4 interior edges, so there are at most 5 spans (6 edges).
+  // Algorithm from Snes9x
+  uint nr = 1;
+  win->edges[0] = 0;
+  win->edges[1] = 256;
+  uint8 window_bits = 0;  // NOTE(review): never read in this function
+  uint i, j, t;
+  bool w1_ena = wl->window1enabled && ppu->window1left <= ppu->window1right;  // left > right is treated as an empty window
+  if (w1_ena) {
+    if (ppu->window1left) {  // a left edge at 0 coincides with edges[0]
+      win->edges[nr] = ppu->window1left;
+      win->edges[++nr] = 256;
+    }
+    if (ppu->window1right < 255) {  // a right edge at 255 coincides with the final edge (256)
+      win->edges[nr] = ppu->window1right + 1;
+      win->edges[++nr] = 256;
+    }
+  }
+  bool w2_ena = wl->window2enabled && ppu->window2left <= ppu->window2right;
+  if (w2_ena) {
+    for (i = 0; i <= nr && (t = ppu->window2left) != win->edges[i]; i++) {  // sorted insert of window 2's left edge, unless that edge already exists
+      if (t < win->edges[i]) {
+        for (j = nr++; j >= i; j--)  // shift the tail up by one slot (i is never 0 here because edges[0] == 0)
+          win->edges[j + 1] = win->edges[j];
+        win->edges[i] = t;
+        break;
+      }
+    }
+    for (; i <= nr && (t = ppu->window2right + 1) != win->edges[i]; i++) {  // same for its exclusive right edge, continuing from i
+      if (t < win->edges[i]) {
+        for (j = nr++; j >= i; j--)
+          win->edges[j + 1] = win->edges[j];
+        win->edges[i] = t;
+        break;
+      }
+    }
+  }
+  win->nr = nr;
+  // get a bitmap of how regions map to windows
+  uint8 w1_bits = 0, w2_bits = 0;
+  if (w1_ena) {
+    for (i = 0; win->edges[i] != ppu->window1left; i++);  // i = first span inside window 1
+    for (j = i; win->edges[j] != ppu->window1right + 1; j++);  // j = first span past window 1
+    w1_bits = ((1 << (j - i)) - 1) << i;  // set bits i..j-1
+  }
+  if (wl->window1enabled & wl->window1inversed)
+    w1_bits = ~w1_bits;  // an enabled but empty window (left > right) therefore inverts to "all spans"
+  if (w2_ena) {
+    for (i = 0; win->edges[i] != ppu->window2left; i++);
+    for (j = i; win->edges[j] != ppu->window2right + 1; j++);
+    w2_bits = ((1 << (j - i)) - 1) << i;
+  }
+  if (wl->window2enabled & wl->window2inversed)
+    w2_bits = ~w2_bits;
+  win->bits = w1_bits | w2_bits;  // the two windows are always OR-combined here
+}
+
+// Draw one scanline of a 4bpp background layer into bgBuffers[sub]. zhi/zlo are the z values used for tiles with/without the 0x2000 (priority) bit; a pixel is stored only where its z is greater than the one already in the buffer.
+static void PpuDrawBackground_4bpp(Ppu *ppu, uint y, bool sub, uint layer, uint8 zhi, uint8 zlo) {
+#define DO_PIXEL(i) do { \
+  pixel = (bits >> i) & 1 | (bits >> (7 + i)) & 2 | (bits >> (14 + i)) & 4 | (bits >> (21 + i)) & 8; \
+  if (pixel && z > dst[256 + i]) dst[i] = paletteBase + pixel, dst[256 + i] = z; } while (0)
+#define DO_PIXEL_HFLIP(i) do { \
+  pixel = (bits >> (7 - i)) & 1 | (bits >> (14 - i)) & 2 | (bits >> (21 - i)) & 4 | (bits >> (28 - i)) & 8; \
+  if (pixel && z > dst[256 + i]) dst[i] = paletteBase + pixel, dst[256 + i] = z; } while (0)
+#define READ_BITS(ta, tile) (addr = &ppu->vram[((ta) + (tile) * 16) & 0x7fff], addr[0] | (uint32)addr[8] << 16)  // cast first: a promoted uint16 >= 0x8000 shifted left by 16 would overflow int
+  enum { kPaletteShift = 6 };  // 0x1c00 >> 6 gives palette bases in steps of 16 colors
+  Layer *layerp = &ppu->layer[layer];
+  if (!layerp->screenEnabled[sub])
+    return;  // layer is completely hidden
+  PpuWindows win;
+  layerp->screenWindowed[sub] ? PpuWindows_Calc(&win, ppu, layer) : PpuWindows_Clear(&win);
+  BgLayer *bglayer = &ppu->bgLayer[layer];
+  y += bglayer->vScroll;
+  int sc_offs = bglayer->tilemapAdr + (((y >> 3) & 0x1f) << 5);  // tilemap row: 32 entries per row of 8-pixel tiles
+  if ((y & 0x100) && bglayer->tilemapHigher)
+    sc_offs += bglayer->tilemapWider ? 0x800 : 0x400;  // lower half of a double-height tilemap
+  const uint16 *tps[2] = {  // tilemap rows for the left and right 256-pixel halves (identical when not wider)
+    &ppu->vram[sc_offs & 0x7fff],
+    &ppu->vram[sc_offs + (bglayer->tilemapWider ? 0x400 : 0) & 0x7fff]
+  };
+  int tileadr = ppu->bgLayer[layer].tileAdr, pixel;
+  int tileadr1 = tileadr + 7 - (y & 0x7), tileadr0 = tileadr + (y & 0x7);  // row address with (0x8000 bit) and without vertical flip
+  const uint16 *addr;  // NOTE(review): READ_BITS reads addr[8] without wrapping; confirm vram has room past index 0x7fff
+  for (size_t windex = 0; windex < win.nr; windex++) {
+    if (win.bits & (1 << windex))
+      continue;  // layer is disabled for this window part
+    uint x = win.edges[windex] + bglayer->hScroll;
+    uint w = win.edges[windex + 1] - win.edges[windex];
+    uint8 *dst = ppu->bgBuffers[sub].pixel + win.edges[windex];  // dst[256 + i] holds the z of dst[i] (presumably the prio[] array following pixel[])
+    const uint16 *tp = tps[x >> 8 & 1] + ((x >> 3) & 0x1f);
+    const uint16 *tp_last = tps[x >> 8 & 1] + 31;
+    const uint16 *tp_next = tps[(x >> 8 & 1) ^ 1];  // row to continue in after the 32nd tile
+    // Handle clipped pixels on left side
+    if (x & 7) {
+      int curw = IntMin(8 - (x & 7), w);
+      w -= curw;
+      uint32 tile = *tp;
+      tp = (tp != tp_last) ? tp + 1 : tp_next;
+      int ta = (tile & 0x8000) ? tileadr1 : tileadr0;
+      uint8 z = (tile & 0x2000) ? zhi : zlo;
+      uint32 bits = READ_BITS(ta, tile & 0x3ff);
+      if (bits) {  // all-zero rows are fully transparent and skipped
+        int paletteBase = (tile & 0x1c00) >> kPaletteShift;
+        if (tile & 0x4000) {
+          bits >>= (x & 7), x += curw;
+          do DO_PIXEL(0); while (bits >>= 1, dst++, --curw);
+        } else {
+          bits <<= (x & 7), x += curw;
+          do DO_PIXEL_HFLIP(0); while (bits <<= 1, dst++, --curw);
+        }
+      } else {
+        dst += curw;
+      }
+    }
+    // Handle full tiles in the middle
+    while (w >= 8) {
+      uint32 tile = *tp;
+      tp = (tp != tp_last) ? tp + 1 : tp_next;
+      int ta = (tile & 0x8000) ? tileadr1 : tileadr0;
+      uint8 z = (tile & 0x2000) ? zhi : zlo;
+      uint32 bits = READ_BITS(ta, tile & 0x3ff);
+      if (bits) {
+        int paletteBase = (tile & 0x1c00) >> kPaletteShift;
+        if (tile & 0x4000) {
+          DO_PIXEL(0); DO_PIXEL(1); DO_PIXEL(2); DO_PIXEL(3);
+          DO_PIXEL(4); DO_PIXEL(5); DO_PIXEL(6); DO_PIXEL(7);
+        } else {
+          DO_PIXEL_HFLIP(0); DO_PIXEL_HFLIP(1); DO_PIXEL_HFLIP(2); DO_PIXEL_HFLIP(3);
+          DO_PIXEL_HFLIP(4); DO_PIXEL_HFLIP(5); DO_PIXEL_HFLIP(6); DO_PIXEL_HFLIP(7);
+        }
+      }
+      dst += 8, w -= 8;
+    }
+    // Handle remaining clipped part
+    if (w) {
+      uint32 tile = *tp;
+      int ta = (tile & 0x8000) ? tileadr1 : tileadr0;
+      uint8 z = (tile & 0x2000) ? zhi : zlo;
+      uint32 bits = READ_BITS(ta, tile & 0x3ff);
+      if (bits) {
+        int paletteBase = (tile & 0x1c00) >> kPaletteShift;
+        if (tile & 0x4000) {
+          do DO_PIXEL(0); while (bits >>= 1, dst++, --w);
+        } else {
+          do DO_PIXEL_HFLIP(0); while (bits <<= 1, dst++, --w);
+        }
+      }
+    }
+  }
+#undef READ_BITS
+#undef DO_PIXEL
+#undef DO_PIXEL_HFLIP
+}
+
+// Draw one scanline of a 2bpp background layer into bgBuffers[sub]. zhi/zlo are the z values used for tiles with/without the 0x2000 (priority) bit; a pixel is stored only where its z is greater than the one already in the buffer.
+static void PpuDrawBackground_2bpp(Ppu *ppu, uint y, bool sub, uint layer, uint8 zhi, uint8 zlo) {
+#define DO_PIXEL(i) do { \
+  pixel = (bits >> i) & 1 | (bits >> (7 + i)) & 2; \
+  if (pixel && z > dst[256 + i]) dst[i] = paletteBase + pixel, dst[256 + i] = z; } while (0)
+#define DO_PIXEL_HFLIP(i) do { \
+  pixel = (bits >> (7 - i)) & 1 | (bits >> (14 - i)) & 2; \
+  if (pixel && z > dst[256 + i]) dst[i] = paletteBase + pixel, dst[256 + i] = z; } while (0)
+#define READ_BITS(ta, tile) (addr = &ppu->vram[(ta) + (tile) * 8 & 0x7fff], addr[0])  // one 16-bit word holds both bitplanes of a tile row
+  enum { kPaletteShift = 8 };  // 0x1c00 >> 8 gives palette bases in steps of 4 colors
+  Layer *layerp = &ppu->layer[layer];
+  if (!layerp->screenEnabled[sub])
+    return;  // layer is completely hidden
+  PpuWindows win;
+  layerp->screenWindowed[sub] ? PpuWindows_Calc(&win, ppu, layer) : PpuWindows_Clear(&win);
+  BgLayer *bglayer = &ppu->bgLayer[layer];
+  y += bglayer->vScroll;
+  int sc_offs = bglayer->tilemapAdr + (((y >> 3) & 0x1f) << 5);  // tilemap row: 32 entries per row of 8-pixel tiles
+  if ((y & 0x100) && bglayer->tilemapHigher)
+    sc_offs += bglayer->tilemapWider ? 0x800 : 0x400;  // lower half of a double-height tilemap
+  const uint16 *tps[2] = {  // tilemap rows for the left and right 256-pixel halves (identical when not wider)
+    &ppu->vram[sc_offs & 0x7fff],
+    &ppu->vram[sc_offs + (bglayer->tilemapWider ? 0x400 : 0) & 0x7fff]
+  };
+  int tileadr = ppu->bgLayer[layer].tileAdr, pixel;
+  int tileadr1 = tileadr + 7 - (y & 0x7), tileadr0 = tileadr + (y & 0x7);  // row address with (0x8000 bit) and without vertical flip
+  const uint16 *addr;
+  for (size_t windex = 0; windex < win.nr; windex++) {
+    if (win.bits & (1 << windex))
+      continue;  // layer is disabled for this window part
+    uint x = win.edges[windex] + bglayer->hScroll;
+    uint w = win.edges[windex + 1] - win.edges[windex];
+    uint8 *dst = ppu->bgBuffers[sub].pixel + win.edges[windex];  // dst[256 + i] holds the z of dst[i] (presumably the prio[] array following pixel[])
+    const uint16 *tp = tps[x >> 8 & 1] + ((x >> 3) & 0x1f);
+    const uint16 *tp_last = tps[x >> 8 & 1] + 31;
+    const uint16 *tp_next = tps[(x >> 8 & 1) ^ 1];  // row to continue in after the 32nd tile
+    // Handle clipped pixels on left side
+    if (x & 7) {
+      int curw = IntMin(8 - (x & 7), w);
+      w -= curw;
+      uint32 tile = *tp;
+      tp = (tp != tp_last) ? tp + 1 : tp_next;
+      int ta = (tile & 0x8000) ? tileadr1 : tileadr0;
+      uint8 z = (tile & 0x2000) ? zhi : zlo;
+      uint32 bits = READ_BITS(ta, tile & 0x3ff);
+      if (bits) {  // all-zero rows are fully transparent and skipped
+        int paletteBase = (tile & 0x1c00) >> kPaletteShift;
+        if (tile & 0x4000) {
+          bits >>= (x & 7), x += curw;
+          do DO_PIXEL(0); while (bits >>= 1, dst++, --curw);
+        } else {
+          bits <<= (x & 7), x += curw;
+          do DO_PIXEL_HFLIP(0); while (bits <<= 1, dst++, --curw);
+        }
+      } else {
+        dst += curw;
+      }
+    }
+    // Handle full tiles in the middle
+    while (w >= 8) {
+      uint32 tile = *tp;
+      tp = (tp != tp_last) ? tp + 1 : tp_next;
+      int ta = (tile & 0x8000) ? tileadr1 : tileadr0;
+      uint8 z = (tile & 0x2000) ? zhi : zlo;
+      uint32 bits = READ_BITS(ta, tile & 0x3ff);
+      if (bits) {
+        int paletteBase = (tile & 0x1c00) >> kPaletteShift;
+        if (tile & 0x4000) {
+          DO_PIXEL(0); DO_PIXEL(1); DO_PIXEL(2); DO_PIXEL(3);
+          DO_PIXEL(4); DO_PIXEL(5); DO_PIXEL(6); DO_PIXEL(7);
+        } else {
+          DO_PIXEL_HFLIP(0); DO_PIXEL_HFLIP(1); DO_PIXEL_HFLIP(2); DO_PIXEL_HFLIP(3);
+          DO_PIXEL_HFLIP(4); DO_PIXEL_HFLIP(5); DO_PIXEL_HFLIP(6); DO_PIXEL_HFLIP(7);
+        }
+      }
+      dst += 8, w -= 8;
+    }
+    // Handle remaining clipped part
+    if (w) {
+      uint32 tile = *tp;
+      int ta = (tile & 0x8000) ? tileadr1 : tileadr0;
+      uint8 z = (tile & 0x2000) ? zhi : zlo;
+      uint32 bits = READ_BITS(ta, tile & 0x3ff);
+      if (bits) {
+        int paletteBase = (tile & 0x1c00) >> kPaletteShift;
+        if (tile & 0x4000) {
+          do DO_PIXEL(0); while (bits >>= 1, dst++, --w);
+        } else {
+          do DO_PIXEL_HFLIP(0); while (bits <<= 1, dst++, --w);
+        }
+      }
+    }
+  }
+#undef READ_BITS
+#undef DO_PIXEL
+#undef DO_PIXEL_HFLIP
+}
+
+// Draw one scanline of a 4bpp background layer into bgBuffers[sub] with mosaic applied: one pixel is sampled per mosaic run and repeated across it. zhi/zlo are the z values for tiles with/without the 0x2000 (priority) bit.
+static void PpuDrawBackground_4bpp_mosaic(Ppu *ppu, uint y, bool sub, uint layer, uint8 zhi, uint8 zlo) {
+#define GET_PIXEL(i) pixel = (bits) & 1 | (bits >> 7) & 2 | (bits >> 14) & 4 | (bits >> 21) & 8  // the i argument is unused
+#define GET_PIXEL_HFLIP(i) pixel = (bits >> 7) & 1 | (bits >> 14) & 2 | (bits >> 21) & 4 | (bits >> 28) & 8  // the i argument is unused
+#define READ_BITS(ta, tile) (addr = &ppu->vram[((ta) + (tile) * 16) & 0x7fff], addr[0] | (uint32)addr[8] << 16)  // cast first: a promoted uint16 >= 0x8000 shifted left by 16 would overflow int
+  enum { kPaletteShift = 6 };  // 0x1c00 >> 6 gives palette bases in steps of 16 colors
+  Layer *layerp = &ppu->layer[layer];
+  if (!layerp->screenEnabled[sub])
+    return;  // layer is completely hidden
+  PpuWindows win;
+  layerp->screenWindowed[sub] ? PpuWindows_Calc(&win, ppu, layer) : PpuWindows_Clear(&win);
+  BgLayer *bglayer = &ppu->bgLayer[layer];
+  y = ppu->mosaicModulo[y] + bglayer->vScroll;  // mosaicModulo[] snaps y down to the first line of its mosaic block
+  int sc_offs = bglayer->tilemapAdr + (((y >> 3) & 0x1f) << 5);  // tilemap row: 32 entries per row of 8-pixel tiles
+  if ((y & 0x100) && bglayer->tilemapHigher)
+    sc_offs += bglayer->tilemapWider ? 0x800 : 0x400;  // lower half of a double-height tilemap
+  const uint16 *tps[2] = {  // tilemap rows for the left and right 256-pixel halves (identical when not wider)
+    &ppu->vram[sc_offs & 0x7fff],
+    &ppu->vram[sc_offs + (bglayer->tilemapWider ? 0x400 : 0) & 0x7fff]
+  };
+  int tileadr = ppu->bgLayer[layer].tileAdr, pixel;
+  int tileadr1 = tileadr + 7 - (y & 0x7), tileadr0 = tileadr + (y & 0x7);  // row address with (0x8000 bit) and without vertical flip
+  const uint16 *addr;  // NOTE(review): READ_BITS reads addr[8] without wrapping; confirm vram has room past index 0x7fff
+  for (size_t windex = 0; windex < win.nr; windex++) {
+    if (win.bits & (1 << windex))
+      continue;  // layer is disabled for this window part
+    int sx = win.edges[windex];
+    uint8 *dst = ppu->bgBuffers[sub].pixel + sx;  // dst[i + 256] holds the z of dst[i] (presumably the prio[] array following pixel[])
+    uint8 *dst_end = ppu->bgBuffers[sub].pixel + win.edges[windex + 1];
+    uint x = sx + bglayer->hScroll;
+    const uint16 *tp = tps[x >> 8 & 1] + ((x >> 3) & 0x1f);
+    const uint16 *tp_last = tps[x >> 8 & 1] + 31, *tp_next = tps[(x >> 8 & 1) ^ 1];
+    x &= 7;  // from here on x is the pixel offset inside the current tile
+    int w = ppu->mosaicSize - (sx - ppu->mosaicModulo[sx]);  // pixels left in the mosaic block that contains sx
+    do {
+      w = IntMin(w, dst_end - dst);  // never run past the end of the span
+      uint32 tile = *tp;
+      int ta = (tile & 0x8000) ? tileadr1 : tileadr0;
+      uint8 z = (tile & 0x2000) ? zhi : zlo;
+      uint32 bits = READ_BITS(ta, tile & 0x3ff);
+      if (tile & 0x4000) bits >>= x, GET_PIXEL(0); else bits <<= x, GET_PIXEL_HFLIP(0);
+      if (pixel) {
+        pixel += (tile & 0x1c00) >> kPaletteShift;
+        int i = 0;
+        do {  // repeat the sampled pixel across the run
+          if (z > dst[i + 256])
+            dst[i] = pixel, dst[i + 256] = z;
+        } while (++i != w);
+      }
+      dst += w, x += w;
+      for (; x >= 8; x -= 8)  // step over every tile the run crossed
+        tp = (tp != tp_last) ? tp + 1 : tp_next;
+      w = ppu->mosaicSize;
+    } while (dst_end - dst != 0);
+  }
+#undef READ_BITS
+#undef GET_PIXEL
+#undef GET_PIXEL_HFLIP
+}
+
+// Draw one scanline of a 2bpp background layer into bgBuffers[sub] with mosaic applied: one pixel is sampled per mosaic run and repeated across it. zhi/zlo are the z values for tiles with/without the 0x2000 (priority) bit.
+static void PpuDrawBackground_2bpp_mosaic(Ppu *ppu, int y, bool sub, uint layer, uint8 zhi, uint8 zlo) {
+#define GET_PIXEL(i) pixel = (bits) & 1 | (bits >> 7) & 2  // the i argument is unused
+#define GET_PIXEL_HFLIP(i) pixel = (bits >> 7) & 1 | (bits >> 14) & 2  // the i argument is unused
+#define READ_BITS(ta, tile) (addr = &ppu->vram[((ta) + (tile) * 8) & 0x7fff], addr[0])  // one 16-bit word holds both bitplanes of a tile row
+  enum { kPaletteShift = 8 };  // 0x1c00 >> 8 gives palette bases in steps of 4 colors
+  Layer *layerp = &ppu->layer[layer];
+  if (!layerp->screenEnabled[sub])
+    return;  // layer is completely hidden
+  PpuWindows win;
+  layerp->screenWindowed[sub] ? PpuWindows_Calc(&win, ppu, layer) : PpuWindows_Clear(&win);
+  BgLayer *bglayer = &ppu->bgLayer[layer];
+  y = ppu->mosaicModulo[y] + bglayer->vScroll;  // mosaicModulo[] snaps y down to the first line of its mosaic block
+  int sc_offs = bglayer->tilemapAdr + (((y >> 3) & 0x1f) << 5);  // tilemap row: 32 entries per row of 8-pixel tiles
+  if ((y & 0x100) && bglayer->tilemapHigher)
+    sc_offs += bglayer->tilemapWider ? 0x800 : 0x400;  // lower half of a double-height tilemap
+  const uint16 *tps[2] = {  // tilemap rows for the left and right 256-pixel halves (identical when not wider)
+    &ppu->vram[sc_offs & 0x7fff],
+    &ppu->vram[sc_offs + (bglayer->tilemapWider ? 0x400 : 0) & 0x7fff]
+  };
+  int tileadr = ppu->bgLayer[layer].tileAdr, pixel;
+  int tileadr1 = tileadr + 7 - (y & 0x7), tileadr0 = tileadr + (y & 0x7);  // row address with (0x8000 bit) and without vertical flip
+  const uint16 *addr;
+  for (size_t windex = 0; windex < win.nr; windex++) {
+    if (win.bits & (1 << windex))
+      continue;  // layer is disabled for this window part
+    int sx = win.edges[windex];
+    uint8 *dst = ppu->bgBuffers[sub].pixel + sx;  // dst[i + 256] holds the z of dst[i] (presumably the prio[] array following pixel[])
+    uint8 *dst_end = ppu->bgBuffers[sub].pixel + win.edges[windex + 1];
+    uint x = sx + bglayer->hScroll;
+    const uint16 *tp = tps[x >> 8 & 1] + ((x >> 3) & 0x1f);
+    const uint16 *tp_last = tps[x >> 8 & 1] + 31, *tp_next = tps[(x >> 8 & 1) ^ 1];
+    x &= 7;  // from here on x is the pixel offset inside the current tile
+    int w = ppu->mosaicSize - (sx - ppu->mosaicModulo[sx]);  // pixels left in the mosaic block that contains sx
+    do {
+      w = IntMin(w, dst_end - dst);  // never run past the end of the span
+      uint32 tile = *tp;
+      int ta = (tile & 0x8000) ? tileadr1 : tileadr0;
+      uint8 z = (tile & 0x2000) ? zhi : zlo;
+      uint32 bits = READ_BITS(ta, tile & 0x3ff);
+      if (tile & 0x4000) bits >>= x, GET_PIXEL(0); else bits <<= x, GET_PIXEL_HFLIP(0);
+      if (pixel) {
+        pixel += (tile & 0x1c00) >> kPaletteShift;
+        uint i = 0;
+        do {  // repeat the sampled pixel across the run
+          if (z > dst[i + 256])
+            dst[i] = pixel, dst[i + 256] = z;
+        } while (++i != w);
+      }
+      dst += w, x += w;
+      for (; x >= 8; x -= 8)  // step over every tile the run crossed
+        tp = (tp != tp_last) ? tp + 1 : tp_next;
+      w = ppu->mosaicSize;
+    } while (dst_end - dst != 0);
+  }
+#undef READ_BITS
+#undef GET_PIXEL
+#undef GET_PIXEL_HFLIP
+}
+
+
+// Builds the z value of a sprite pixel: high nibble = priority level (4 * prio + 2), low nibble = 4, or 6 when level6 is set. level6 should be set if it's from palette 0xc0 which means color math is not applied
+#define SPRITE_PRIO_TO_PRIO(prio, level6) (((prio) * 4 + 2) * 16 + 4 + (level6 ? 2 : 0))
+#define SPRITE_PRIO_TO_PRIO_HI(prio) ((prio) * 4 + 2)  // only the high-nibble priority level
+
+static void PpuDrawSprites(Ppu *ppu, uint y, uint sub, bool clear_backdrop) {  // Merge the line's objBuffer into bgBuffers[sub]; y is not used here
+  Layer *layerp = &ppu->layer[4];  // layer 4 is the sprite layer
+  if (!layerp->screenEnabled[sub])
+    return;  // layer is completely hidden
+  PpuWindows win;
+  layerp->screenWindowed[sub] ? PpuWindows_Calc(&win, ppu, 4) : PpuWindows_Clear(&win);
+  for (size_t windex = 0; windex < win.nr; windex++) {
+    if (win.bits & (1 << windex))
+      continue;  // layer is disabled for this window part
+    int left = win.edges[windex];
+    int width = win.edges[windex + 1] - left;
+    uint8 *src = ppu->objBuffer.pixel + left;
+    uint8 *dst = ppu->bgBuffers[sub].pixel + left;
+    if (clear_backdrop) {  // caller promises nothing else has been drawn yet, so overwrite unconditionally
+      memcpy(dst, src, width);
+      memcpy(dst + 256, src + 256, width);  // the bytes 256 past the pixels carry their z values
+    } else {
+      do {  // keep whichever pixel has the greater z
+        if (src[256] > dst[256])
+          dst[0] = src[0], dst[256] = src[256];
+      } while (src++, dst++, --width);
+    }
+  }
+}
+
+// Draw one scanline of the mode 7 background (layer 0) into bgBuffers[sub] using z value z. Assumes it's drawn on an empty backdrop: pixels are written without comparing against the z already in the buffer.
+static void PpuDrawBackground_mode7(Ppu *ppu, uint y, bool sub, uint8 z) {
+  int layer = 0;
+  Layer *layerp = &ppu->layer[layer];
+  if (!layerp->screenEnabled[sub])
+    return;  // layer is completely hidden
+  PpuWindows win;
+  layerp->screenWindowed[sub] ? PpuWindows_Calc(&win, ppu, layer) : PpuWindows_Clear(&win);
+
+  // expand 13-bit values to signed values
+  int hScroll = ((int16_t)(ppu->m7matrix[6] << 3)) >> 3;
+  int vScroll = ((int16_t)(ppu->m7matrix[7] << 3)) >> 3;
+  int xCenter = ((int16_t)(ppu->m7matrix[4] << 3)) >> 3;
+  int yCenter = ((int16_t)(ppu->m7matrix[5] << 3)) >> 3;
+  int clippedH = hScroll - xCenter;
+  int clippedV = vScroll - yCenter;
+  clippedH = (clippedH & 0x2000) ? (clippedH | ~1023) : (clippedH & 1023);  // keep the low 10 bits, extended by bit 13
+  clippedV = (clippedV & 0x2000) ? (clippedV | ~1023) : (clippedV & 1023);
+  bool mosaic_enabled = ppu->bgLayer[0].mosaicEnabled && ppu->mosaicSize > 1;
+  if (mosaic_enabled)
+    y = ppu->mosaicModulo[y];  // snap y down to the first line of its mosaic block
+  uint32 ry = ppu->m7yFlip ? 255 - y : y;
+  uint32 m7startX = (ppu->m7matrix[0] * clippedH & ~63) + (ppu->m7matrix[1] * ry & ~63) +
+    (ppu->m7matrix[1] * clippedV & ~63) + (xCenter << 8);
+  uint32 m7startY = (ppu->m7matrix[2] * clippedH & ~63) + (ppu->m7matrix[3] * ry & ~63) +
+    (ppu->m7matrix[3] * clippedV & ~63) + (yCenter << 8);
+  for (size_t windex = 0; windex < win.nr; windex++) {
+    if (win.bits & (1 << windex))
+      continue;  // layer is disabled for this window part
+    int x = win.edges[windex], x2 = win.edges[windex + 1], tile;
+    uint8 *dst = ppu->bgBuffers[sub].pixel + x, *dst_end = ppu->bgBuffers[sub].pixel + x2;
+    uint32 rx = ppu->m7xFlip ? 255 - x : x;
+    uint32 xpos = m7startX + ppu->m7matrix[0] * rx;  // texture position with 8 fractional bits (pixel = pos >> 8, tile = pos >> 11)
+    uint32 ypos = m7startY + ppu->m7matrix[2] * rx;
+    uint32 dx = ppu->m7xFlip ? -ppu->m7matrix[0] : ppu->m7matrix[0];  // per-pixel step, negated when flipped horizontally
+    uint32 dy = ppu->m7xFlip ? -ppu->m7matrix[2] : ppu->m7matrix[2];
+    uint32 outside_value = ppu->m7largeField ? 0x3ffff : 0xffffffff;  // 0xffffffff makes the "outside" test below never fire
+    bool char_fill = ppu->m7charFill;
+    if (mosaic_enabled) {
+      int w = ppu->mosaicSize - (x - ppu->mosaicModulo[x]);  // pixels left in the mosaic block that contains x
+      do {
+        w = IntMin(w, dst_end - dst);
+        if ((uint32)(xpos | ypos) > outside_value) {
+          if (!char_fill)
+            continue;  // leaves the run untouched; the loop condition below still advances the position
+          tile = 0;
+        } else {
+          tile = ppu->vram[(ypos >> 11 & 0x7f) * 128 + (xpos >> 11 & 0x7f)] & 0xff;  // low byte of a vram word = tile number
+        }
+        uint8 pixel = ppu->vram[tile * 64 + (ypos >> 8 & 7) * 8 + (xpos >> 8 & 7)] >> 8;  // high byte of a vram word = pixel data
+        if (pixel) {
+          int i = 0;
+          do dst[i] = pixel, dst[i + 256] = z; while (++i != w);
+        }
+      } while (xpos += dx * w, ypos += dy * w, dst += w, w = ppu->mosaicSize, dst_end - dst != 0);
+    } else {
+      do {
+        if ((uint32)(xpos | ypos) > outside_value) {
+          if (!char_fill)
+            continue;  // leaves the pixel untouched; the loop condition below still advances the position
+          tile = 0;
+        } else {
+          tile = ppu->vram[(ypos >> 11 & 0x7f) * 128 + (xpos >> 11 & 0x7f)] & 0xff;
+        }
+        uint8 pixel = ppu->vram[tile * 64 + (ypos >> 8 & 7) * 8 + (xpos >> 8 & 7)] >> 8;
+        if (pixel)
+          dst[0] = pixel, dst[256] = z;
+      } while (xpos += dx, ypos += dy, ++dst != dst_end);
+    }
+  }
+}
+
+static void PpuDrawBackgrounds(Ppu *ppu, int y, bool sub) {  // Draw sprites and backgrounds of line y into bgBuffers[sub] for the current mode
+// Top 4 bits contain the prio level, and bottom 4 bits the layer type.
+// SPRITE_PRIO_TO_PRIO can be used to convert from obj prio to this prio.
+//  15: BG3 tiles with priority 1 if bit 3 of $2105 is set
+//  14: Sprites with priority 3 (4 * sprite_prio + 2)
+//  12: BG1 tiles with priority 1
+//  11: BG2 tiles with priority 1
+//  10: Sprites with priority 2 (4 * sprite_prio + 2)
+//  8: BG1 tiles with priority 0
+//  7: BG2 tiles with priority 0
+//  6: Sprites with priority 1 (4 * sprite_prio + 2)
+//  3: BG3 tiles with priority 1 if bit 3 of $2105 is clear
+//  2: Sprites with priority 0 (4 * sprite_prio + 2)
+//  1: BG3 tiles with priority 0
+//  0: backdrop
+
+  if (ppu->mode == 1) {
+    if (ppu->lineHasSprites)
+      PpuDrawSprites(ppu, y, sub, true);  // drawn first, straight over the cleared buffer
+
+    if (ppu->bgLayer[0].mosaicEnabled && ppu->mosaicSize > 1)
+      PpuDrawBackground_4bpp_mosaic(ppu, y, sub, 0, 0xc0, 0x80);
+    else
+      PpuDrawBackground_4bpp(ppu, y, sub, 0, 0xc0, 0x80);  // BG1: levels 12 / 8, layer 0
+
+    if (ppu->bgLayer[1].mosaicEnabled && ppu->mosaicSize > 1)
+      PpuDrawBackground_4bpp_mosaic(ppu, y, sub, 1, 0xb1, 0x71);
+    else
+      PpuDrawBackground_4bpp(ppu, y, sub, 1, 0xb1, 0x71);  // BG2: levels 11 / 7, layer 1
+
+    if (ppu->bgLayer[2].mosaicEnabled && ppu->mosaicSize > 1)
+      PpuDrawBackground_2bpp_mosaic(ppu, y, sub, 2, 0xf2, 0x12);
+    else
+      PpuDrawBackground_2bpp(ppu, y, sub, 2, 0xf2, 0x12);  // BG3: levels 15 / 1, layer 2; level 3 from the table above is never used here
+  } else {
+    // mode 7 (every mode other than 1 takes this path)
+    PpuDrawBackground_mode7(ppu, y, sub, 0xc0);
+    if (ppu->lineHasSprites)
+      PpuDrawSprites(ppu, y, sub, false);  // merged by z on top of the mode 7 pixels
+  }
+}
+
+static NOINLINE void PpuDrawWholeLine(Ppu *ppu, uint y) {  // Render line y: compose main/sub screens, apply color math and brightness, write two output pixels per source pixel
+  if (ppu->forcedBlank) {
+    int row = (y - 1) + (ppu->evenFrame ? 0 : 239);
+    uint8_t *dst = &ppu->pixelBuffer[row * 2048];
+    for (int i = 0; i < 256; i++, dst += 8) {  // 8 bytes per source pixel; bytes 0 and 4 are left untouched
+      dst[1] = dst[5] = 0;
+      dst[2] = dst[6] = 0;
+      dst[3] = dst[7] = 0;
+    }
+    return;
+  }
+
+  // Cache the brightness computation
+  if (ppu->brightness != ppu->lastBrightnessMult) {
+    uint8_t ppu_brightness = ppu->brightness;
+    ppu->lastBrightnessMult = ppu_brightness;
+    for (int i = 0; i < 32; i++)  // brightnessMultHalf[v] ends up equal to brightnessMult[v / 2]
+      ppu->brightnessMultHalf[i * 2] = ppu->brightnessMultHalf[i * 2 + 1] = ppu->brightnessMult[i] =
+          ((i << 3) | (i >> 2)) * ppu_brightness / 15;
+    // Store 31 extra entries to remove the need for clamping to 31.
+    memset(&ppu->brightnessMult[32], ppu->brightnessMult[31], 31);
+  }
+
+  // Default background is backdrop
+  memset(&ppu->bgBuffers[0].pixel, 0, sizeof(ppu->bgBuffers[0].pixel));
+  memset(&ppu->bgBuffers[0].prio, 0x05, sizeof(ppu->bgBuffers[0].prio));
+
+  // Render main screen
+  PpuDrawBackgrounds(ppu, y, false);
+
+  // Bit 6 is always zero here: pixels tagged with layer type 6 (sprites from the first half of the palettes) never get color math.
+  uint32 math_enabled = ppu->mathEnabled[0] << 0 | ppu->mathEnabled[1] << 1 | ppu->mathEnabled[2] << 2 |
+                        ppu->mathEnabled[3] << 3 | ppu->mathEnabled[4] << 4 | ppu->mathEnabled[5] << 5;
+
+  // Render also the subscreen?
+  bool rendered_subscreen = false;
+  if (ppu->preventMathMode != 3 && ppu->addSubscreen && math_enabled) {
+    memset(&ppu->bgBuffers[1].pixel, 0, sizeof(ppu->bgBuffers[1].pixel));
+    if (ppu->layer[0].screenEnabled[1] | ppu->layer[1].screenEnabled[1] | ppu->layer[2].screenEnabled[1] |
+        ppu->layer[3].screenEnabled[1] | ppu->layer[4].screenEnabled[1]) {
+      memset(&ppu->bgBuffers[1].prio, 0x05, sizeof(ppu->bgBuffers[1].prio));
+      PpuDrawBackgrounds(ppu, y, true);
+      rendered_subscreen = true;
+    }
+  }
+
+  // Color window affects the drawing mode in each region
+  PpuWindows cwin;
+  PpuWindows_Calc(&cwin, ppu, 5);
+  static const uint8 kCwBitsMod[8] = {  // entries 0..3: AND mask per mode, entries 4..7: XOR mask per mode
+    0x00, 0xff, 0xff, 0x00,
+    0xff, 0x00, 0xff, 0x00,
+  };
+  uint32 cw_clip_math = ((cwin.bits & kCwBitsMod[ppu->clipMode]) ^ kCwBitsMod[ppu->clipMode + 4]) |
+                        ((cwin.bits & kCwBitsMod[ppu->preventMathMode]) ^ kCwBitsMod[ppu->preventMathMode + 4]) << 8;  // bit n: keep main color in span n, bit 8 + n: allow math in span n
+
+  int row = (y - 1) + (ppu->evenFrame ? 0 : 239);
+  uint32 *dst = (uint32*)&ppu->pixelBuffer[row * 2048];
+
+  uint32 windex = 0;
+  do {
+    uint32 left = cwin.edges[windex], right = cwin.edges[windex + 1];
+    // If clip is set, then zero out the rgb values from the main screen.
+    uint32 clip_color_mask = (cw_clip_math & 1) ? 0x1f : 0;
+    uint32 math_enabled_cur = (cw_clip_math & 0x100) ? math_enabled : 0;
+    uint32 fixed_color = ppu->fixedColorR | ppu->fixedColorG << 5 | ppu->fixedColorB << 10;
+    if (math_enabled_cur == 0 || fixed_color == 0 && !ppu->halfColor && !rendered_subscreen) {
+      // Math is disabled (or has no effect), so can avoid the per-pixel maths check
+      uint32 i = left;
+      do {
+        uint32 color = ppu->cgram[ppu->bgBuffers[0].pixel[i]];
+        dst[1] = dst[0] = (uint32)ppu->brightnessMult[color & clip_color_mask] << 24 |  // cast: a promoted uint8 >= 128 shifted by 24 would overflow int
+                          ppu->brightnessMult[(color >> 5) & clip_color_mask] << 16 |
+                          ppu->brightnessMult[(color >> 10) & clip_color_mask] << 8;
+      } while (dst += 2, ++i < right);
+    } else {
+      uint8 *half_color_map = ppu->halfColor ? ppu->brightnessMultHalf : ppu->brightnessMult;
+      // Store this in locals
+      math_enabled_cur |= ppu->addSubscreen << 8 | ppu->subtractColor << 9;
+      // Need to check for each pixel whether to use math or not based on the main screen layer.
+      uint32 i = left;
+      do {
+        uint32 color = ppu->cgram[ppu->bgBuffers[0].pixel[i]], color2;
+        uint8 main_layer = ppu->bgBuffers[0].prio[i] & 0xf;  // low nibble of the z value = layer type
+        uint32 r = color & clip_color_mask;
+        uint32 g = (color >> 5) & clip_color_mask;
+        uint32 b = (color >> 10) & clip_color_mask;
+        uint8 *color_map = ppu->brightnessMult;
+        if (math_enabled_cur & (1 << main_layer)) {
+          if (math_enabled_cur & 0x100) {  // addSubscreen ?
+            if (ppu->bgBuffers[1].pixel[i] != 0)
+              color2 = ppu->cgram[ppu->bgBuffers[1].pixel[i]], color_map = half_color_map;
+            else  // Don't halve if ppu->addSubscreen && backdrop
+              color2 = fixed_color;
+          } else {
+            color2 = fixed_color, color_map = half_color_map;
+          }
+          uint32 r2 = (color2 & 0x1f), g2 = ((color2 >> 5) & 0x1f), b2 = ((color2 >> 10) & 0x1f);
+          if (math_enabled_cur & 0x200) {  // subtractColor?
+            r = (r >= r2) ? r - r2 : 0;
+            g = (g >= g2) ? g - g2 : 0;
+            b = (b >= b2) ? b - b2 : 0;
+          } else {
+            r += r2;  // sums reach at most 62; the lookup tables cover that range, so no clamp is needed
+            g += g2;
+            b += b2;
+          }
+        }
+        dst[0] = dst[1] = (uint32)color_map[r] << 24 | color_map[g] << 16 | color_map[b] << 8;  // cast: see the same shift in the branch above
+      } while (dst += 2, ++i < right);
+    }
+  } while (cw_clip_math >>= 1, ++windex < cwin.nr);  // shifting moves the next span's clip and math bits into bit 0 and bit 8
+}
+
 static void ppu_handlePixel(Ppu* ppu, int x, int y) {
   int r = 0, r2 = 0;
   int g = 0, g2 = 0;
@@ -335,7 +997,7 @@
       } else {
         // get a pixel from the sprite buffer
         pixel = 0;
-        if (ppu->objPriorityBuffer[x] == curPriority) pixel = ppu->objPixelBuffer[x];
+        if ((ppu->objBuffer.prio[x] >> 4) == SPRITE_PRIO_TO_PRIO_HI(curPriority)) pixel = ppu->objBuffer.pixel[x];
       }
     }
     if (pixel > 0) {
@@ -488,74 +1150,70 @@
   return false;
 }
 
-static void ppu_evaluateSprites(Ppu* ppu, int line) {
+static bool ppu_evaluateSprites(Ppu* ppu, int line) {  // returns true if at least one 8x1 sprite sliver was counted on |line|
   // TODO: iterate over oam normally to determine in-range sprites,
   //   then iterate those in-range sprites in reverse for tile-fetching
   // TODO: rectangular sprites, wierdness with sprites at -256
-  uint8_t index = ppu->objPriority ? (ppu->oamAdr & 0xfe) : 0;
-  int spritesFound = 0;
-  int tilesFound = 0;
-  for(int i = 0; i < 128; i++) {
-    uint8_t y = ppu->oam[index] >> 8;
+  int index = ppu->objPriority ? (ppu->oamAdr & 0xfe) : 0, index_end = index;  // index_end: where the circular walk over oam stops
+  int spritesFound = 0, tilesFound = 0;
+  uint8 spriteSizes[2] = { kSpriteSizes[ppu->objSize][0], kSpriteSizes[ppu->objSize][1] };
+  do {
+    int yy = ppu->oam[index] >> 8;
+    if (yy == 0xf0)
+      continue;  // early-out for y == 0xf0; this works for zelda because sprites are always 8 or 16.
     // check if the sprite is on this line and get the sprite size
-    uint8_t row = line - y;
-    int spriteSize = spriteSizes[ppu->objSize][(ppu->highOam[index >> 3] >> ((index & 7) + 1)) & 1];
-    int spriteHeight = spriteSize;
-    if(row < spriteHeight) {
-      // in y-range, get the x location, using the high bit as well
-      int x = ppu->oam[index] & 0xff;
-      x |= ((ppu->highOam[index >> 3] >> (index & 7)) & 1) << 8;
-      if(x > 255) x -= 512;
-      // if in x-range
-      if(x > -spriteSize) {
-        // break if we found 32 sprites already
-        spritesFound++;
-        if(spritesFound > 32) {
-          ppu->rangeOver = true;
-          break;
+    int row = (line - yy) & 0xff;
+    int highOam = ppu->highOam[index >> 3] >> (index & 7);  // bit 0: x bit 8, bit 1: selects spriteSizes[0] or [1]
+    int spriteSize = spriteSizes[(highOam >> 1) & 1];
+    if (row >= spriteSize)
+      continue;
+    // in y-range, get the x location, using the high bit as well
+    int x = (ppu->oam[index] & 0xff) - (highOam & 1) * 256;
+    // if in x-range
+    if (x <= -spriteSize)
+      continue;
+    // break if we found 32 sprites already
+    if (++spritesFound > 32) {
+      ppu->rangeOver = true;
+      break;
+    }
+    // get some data for the sprite and y-flip row if needed
+    int oam1 = ppu->oam[index + 1];
+    int objAdr = (oam1 & 0x100) ? ppu->objTileAdr2 : ppu->objTileAdr1;
+    if (oam1 & 0x8000)
+      row = spriteSize - 1 - row;
+    // fetch all tiles in x-range
+    uint8 paletteBase = 0x80 + 16 * ((oam1 & 0xe00) >> 9);
+    uint8 prio = SPRITE_PRIO_TO_PRIO((oam1 & 0x3000) >> 12, (oam1 & 0x800) == 0);
+    for (int col = 0; col < spriteSize; col += 8) {
+      if (col + x > -8 && col + x < 256) {
+        // break if we found 34 8*1 slivers already
+        if (++tilesFound > 34) {
+          ppu->timeOver = true;
+          return true;
         }
-        // get some data for the sprite and y-flip row if needed
-        int tile = ppu->oam[index + 1] & 0xff;
-        int palette = (ppu->oam[index + 1] & 0xe00) >> 9;
-        bool hFlipped = ppu->oam[index + 1] & 0x4000;
-        if(ppu->oam[index + 1] & 0x8000) row = spriteSize - 1 - row;
-        // fetch all tiles in x-range
-        for(int col = 0; col < spriteSize; col += 8) {
-          if(col + x > -8 && col + x < 256) {
-            // break if we found 34 8*1 slivers already
-            tilesFound++;
-            if(tilesFound > 34) {
-              ppu->timeOver = true;
-              break;
-            }
-            // figure out which tile this uses, looping within 16x16 pages, and get it's data
-            int usedCol = hFlipped ? spriteSize - 1 - col : col;
-            uint8_t usedTile = (((tile >> 4) + (row / 8)) << 4) | (((tile & 0xf) + (usedCol / 8)) & 0xf);
-            uint16_t objAdr = (ppu->oam[index + 1] & 0x100) ? ppu->objTileAdr2 : ppu->objTileAdr1;
-            uint16_t plane1 = ppu->vram[(objAdr + usedTile * 16 + (row & 0x7)) & 0x7fff];
-            uint16_t plane2 = ppu->vram[(objAdr + usedTile * 16 + 8 + (row & 0x7)) & 0x7fff];
-            // go over each pixel
-            for(int px = 0; px < 8; px++) {
-              int shift = hFlipped ? px : 7 - px;
-              int pixel = (plane1 >> shift) & 1;
-              pixel |= ((plane1 >> (8 + shift)) & 1) << 1;
-              pixel |= ((plane2 >> shift) & 1) << 2;
-              pixel |= ((plane2 >> (8 + shift)) & 1) << 3;
-              // draw it in the buffer if there is a pixel here, and the buffer there is still empty
-              int screenCol = col + x + px;
-              if(pixel != 0 && screenCol >= 0 && screenCol < 256 && ppu->objPixelBuffer[screenCol] == 0) {
-                ppu->objPixelBuffer[screenCol] = 0x80 + 16 * palette + pixel;
-                ppu->objPriorityBuffer[screenCol] = (ppu->oam[index + 1] & 0x3000) >> 12;
-              }
-            }
-          }
+        // figure out which tile this uses, looping within 16x16 pages (tile row/col and both plane addresses wrap), and get its data
+        int usedCol = oam1 & 0x4000 ? spriteSize - 1 - col : col;
+        int usedTile = ((((oam1 >> 4) + (row >> 3)) & 0xf) << 4) | (((oam1 & 0xf) + (usedCol >> 3)) & 0xf);
+        int planeAdr = objAdr + usedTile * 16 + (row & 0x7);
+        uint32 plane = ppu->vram[planeAdr & 0x7fff] | (uint32)ppu->vram[(planeAdr + 8) & 0x7fff] << 16;
+        // go over each pixel, clipped to screen columns 0..255
+        int px_left = IntMax(-(col + x), 0);
+        int px_right = IntMin(256 - (col + x), 8);
+        uint8 *dst = ppu->objBuffer.pixel + col + x + px_left;
+        // dst[256] below lands in objBuffer.prio: prio[] directly follows pixel[256] in PpuPixelPrioBufs
+        for (int px = px_left; px < px_right; px++, dst++) {
+          int shift = oam1 & 0x4000 ? px : 7 - px;
+          uint32 bits = plane >> shift;
+          int pixel = ((bits >> 0) & 1) | ((bits >> 7) & 2) | ((bits >> 14) & 4) | ((bits >> 21) & 8);
+          // draw it in the buffer if there is a pixel here, and the buffer there is still empty
+          if (pixel != 0 && dst[0] == 0)
+            dst[0] = paletteBase + pixel, dst[256] = prio;
         }
-        if(tilesFound > 34)
-          break; // break out of sprite-loop if max tiles found
       }
     }
-    index += 2;
-  }
+  } while ((index = (index + 2) & 0xff) != index_end);
+  return (tilesFound != 0);
 }
 
 static uint16_t ppu_getVramRemap(Ppu* ppu) {
@@ -751,7 +1409,6 @@
       ppu->bgLayer[2].mosaicEnabled = val & 0x4;
       ppu->bgLayer[3].mosaicEnabled = val & 0x8;
       ppu->mosaicSize = (val >> 4) + 1;
-      ppu->mosaicStartLine = 0;
       break;
     }
     case 0x07:
--- a/snes/ppu.h
+++ b/snes/ppu.h
@@ -36,8 +36,21 @@
   uint8_t maskLogic_always_zero;
 } WindowLayer;
 
+typedef struct PpuPixelPrioBufs {  // one scanline of pixel values plus a parallel priority byte per column
+  uint8_t pixel[256];  // per-column pixel value; the renderer uses it as an index into cgram
+  uint8_t prio[256];   // per-column priority byte; must stay directly after pixel[256] (sprite code writes it as dst[256])
+} PpuPixelPrioBufs;
+
 struct Ppu {
+  bool newRenderer;
+  bool lineHasSprites;
+  uint8_t lastBrightnessMult;
+  uint8_t lastMosaicModulo;
   Snes* snes;
+  // store 31 extra entries to remove the need for clamp
+  uint8_t brightnessMult[32 + 31]; 
+  uint8_t brightnessMultHalf[32 * 2];
+  PpuPixelPrioBufs bgBuffers[2];
   // vram access
   uint16_t vram[0x8000];
   uint16_t vramPointer;
@@ -64,8 +77,7 @@
   uint16_t objTileAdr1;
   uint16_t objTileAdr2;
   uint8_t objSize;
-  uint8_t objPixelBuffer[256]; // line buffers
-  uint8_t objPriorityBuffer[256];
+  PpuPixelPrioBufs objBuffer;
   bool timeOver;
   bool rangeOver;
   bool objInterlace_always_zero;
@@ -124,6 +136,9 @@
   bool countersLatched;
   uint8_t ppu1openBus;
   uint8_t ppu2openBus;
+
+  uint8_t mosaicModulo[256];
+
   // pixel buffer (xbgr)
   // times 2 for even and odd frame
   uint8_t pixelBuffer[512 * 4 * 239 * 2];
--- a/zelda_cpu_infra.c
+++ b/zelda_cpu_infra.c
@@ -375,7 +375,7 @@
 void CopyStateAfterSnapshotRestore(bool is_reset) {
   memcpy(g_zenv.ram, g_snes->ram, 0x20000);
   memcpy(g_zenv.sram, g_snes->cart->ram, g_snes->cart->ramSize);
-  memcpy(g_zenv.ppu->vram, &g_snes->ppu->vram, offsetof(Ppu, pixelBuffer) - offsetof(Ppu, vram));
+  memcpy(g_zenv.ppu->vram, &g_snes->ppu->vram, offsetof(Ppu, ppu2openBus) + 1 - offsetof(Ppu, vram));
   memcpy(g_zenv.player->ram, g_snes->apu->ram, sizeof(g_snes->apu->ram));
 
   if (!is_reset) {
@@ -392,7 +392,7 @@
   MakeSnapshot(&g_snapshot_before);
 
   // Copy from my state into the emulator
-  memcpy(&g_snes->ppu->vram, g_zenv.ppu->vram, offsetof(Ppu, pixelBuffer) - offsetof(Ppu, vram));
+  memcpy(&g_snes->ppu->vram, g_zenv.ppu->vram, offsetof(Ppu, ppu2openBus) + 1 - offsetof(Ppu, vram));
   memcpy(g_snes->ram, g_zenv.ram, 0x20000);
   memcpy(g_snes->cart->ram, g_zenv.sram, 0x2000);
   SpcPlayer_CopyVariablesToRam(g_zenv.player);