shithub: MicroHs

Download patch

ref: f03d44497b82a28ce1daa833fdc670f17e461507
parent: efdc17331aba84555fe9931748138aced8a45a63
author: Lennart Augustsson <lennart.augustsson@epicgames.com>
date: Sun Sep 29 14:05:58 EDT 2024

Some new compression stuff.

--- /dev/null
+++ b/bw.c
@@ -1,0 +1,194 @@
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+typedef uint32_t index_t;
+
+/*
+ * |.......................................|
+ *         ^           ^
+ *         a           b
+ *                     <-      n         ->
+ *         <-  m     ->
+ * <-  o ->
+ */
+static uint8_t *compar_arg;
+static size_t compar_len;
+int compar(const void *pa, const void *pb)
+{
+  index_t a = *(index_t*)pa;
+  index_t b = *(index_t*)pb;
+  int r;
+  if (a == b)
+    return 0;
+  if (a < b) {
+    size_t n = compar_len - b; /* bytes until end of buffer */
+    r = memcmp(compar_arg + a, compar_arg + b, n);
+    if (r)
+      return r;
+    size_t m = b - a;
+    r = memcmp(compar_arg + a + n, compar_arg, m);
+    if (r)
+      return r;
+    size_t o = a;
+    return memcmp(compar_arg, compar_arg + m, o);
+  } else {
+    size_t n = compar_len - a; /* bytes until end of buffer */
+    r = memcmp(compar_arg + a, compar_arg + b, n);
+    if (r)
+      return r;
+    size_t m = a - b;
+    r = memcmp(compar_arg, compar_arg + b + n, m);
+    if (r)
+      return r;
+    size_t o = a;
+    return memcmp(compar_arg + m, compar_arg, o);
+    
+  }
+  return 0;
+}
+
+/* Sort all rotations of buf, and the indices of the sorted strings in res. */
+void
+sort_buffer(uint8_t *buf, size_t buflen, index_t *res)
+{
+  for(size_t i = 0; i < buflen; i++)
+    res[i] = i;
+  compar_arg = buf;
+  compar_len = buflen;
+  qsort(res, buflen, sizeof(index_t), compar);
+}
+
+void
+put_rep(FILE *out, size_t n)
+{
+  if (n > 127)
+    put_rep(out, n / 128);
+  fputc(n % 128 + 128, out);
+}
+
+#define MINRLE 3
+
+void
+rle(FILE *out, uint8_t *data, size_t len)
+{
+  for(size_t i = 0; i < len; ) {
+    uint8_t c = data[i++];
+    size_t n;
+    for(n = 1; i < len && data[i] == c; i++, n++)
+      ;
+    if (n >= MINRLE) {
+      put_rep(out, n - 1);
+      fputc(c, out);
+    } else {
+      while(n-- > 0)
+        fputc(c, out);
+    }
+  }
+}
+
+size_t
+get_rep(FILE *in)
+{
+  size_t n = 0;
+  for(;;) {
+    int c = fgetc(in);
+    //fprintf(stderr,"get_rep %02x\n", c);
+    if (c < 0) abort();
+    if (c < 128) {
+      ungetc(c, in);
+      return n;
+    }
+    n = n * 128 + c - 128;
+  }
+}
+
+void
+unrle(FILE *in, uint8_t *data, size_t len)
+{
+  for(size_t i = 0; i < len; ) {
+    size_t n = get_rep(in);
+    int c = fgetc(stdin);
+    //fprintf(stderr,"unrle %02x\n", c);
+    if (c < 0) abort();
+    //fprintf(stderr, "n=%d c='%c'\n", (int)n, c);
+    n += 1;
+    for(size_t j = 0; j < n; j++) {
+      data[i++] = c;
+    }
+    //fprintf(stderr, "i=%d\n", (int)i);
+  }
+}
+
+#define MAXSIZE 10000000
+
+int
+main(int argc, char **argv)
+{
+  int encode = argc == 1;
+
+  if (encode) {
+    fprintf(stderr, "encode\n");
+    uint8_t *data = malloc(MAXSIZE);
+    size_t len = fread(data, 1, MAXSIZE, stdin);
+    index_t *res = malloc(len * sizeof(index_t));
+
+    sort_buffer((uint8_t*)data, len, res);
+#if 0
+    for(size_t i = 0; i < len; i++) {
+      index_t offs = res[i];
+      //fprintf(stderr, "offs=%d\n", (int)offs);
+      fwrite(data + offs, 1, len - offs, stdout);
+      fwrite(data,1, offs, stdout);
+      fputs("\n", stdout);
+    }
+#endif
+    uint8_t *last = malloc(len + 1);
+    index_t zero;
+    for(size_t i = 0; i < len; i++) {
+      index_t offs = res[i];
+      last[i] = data[(offs + len - 1) % len];
+      if (offs == 0)
+        zero = i;
+    }
+    last[len] = 0;
+    fprintf(stderr, "len=%d, zero=%d\n", (int)len, (int)zero);
+    // fprintf(stderr, "%s\n", last);
+    put_rep(stdout, len); fputc(0, stdout);
+    put_rep(stdout, zero); fputc(0, stdout);
+    rle(stdout, last, len);
+    // fwrite(last, 1, len, stdout);
+  } else {
+    fprintf(stderr, "decode\n");
+    size_t len = get_rep(stdin); if (fgetc(stdin) != 0) abort();
+    size_t zero = get_rep(stdin); if (fgetc(stdin) != 0) abort();
+    fprintf(stderr, "len=%d, zero=%d\n", (int)len, (int)zero);
+    uint8_t *data = malloc(len);
+    unrle(stdin, data, len);
+    // fwrite(data, 1, len, stdout);
+#define MAXCHAR 128
+    size_t count[MAXCHAR];
+    index_t *pred = malloc(len * sizeof(index_t));
+    uint8_t *odata = malloc(len);
+    for(size_t i = 0; i < MAXCHAR; i++) {
+      count[i] = 0;
+    }
+    for(size_t i = 0; i < len; i++) {
+      pred[i] = count[data[i]]++;
+    }
+    size_t sum = 0;
+    for(size_t i = 0; i < MAXCHAR; i++) {
+      size_t s = count[i];
+      count[i] = sum;
+      sum += s;
+    }
+    size_t i = zero;
+    for(size_t j = len; j > 0; j--) {
+      odata[j - 1] = data[i];
+      i = pred[i] + count[data[i]];
+    }
+    fwrite(odata, 1, len, stdout);
+ }
+  exit(0);
+}
--- a/src/runtime/bfile.c
+++ b/src/runtime/bfile.c
@@ -353,6 +353,160 @@
 }
 
 #endif  /* WANT_LZ77 */
+
+#if WANT_RLE
+/***************** BFILE via RLE decompression *******************/
+/*
+ * Run Length Encoding for ASCII
+ * Format
+ * c                -  c       one ASCII character
+ * 0x80+n c         -  n       repetitions of ASCII character c
+ * 0x80+n 0x80+m c  -  n*128+m repetitions of ASCII character c
+ * ... for longer run lengths
+ */
+
+struct BFILE_rle {
+  BFILE    mets;
+  BFILE    *bfile;              /* underlying BFILE */
+  size_t   count;
+  int      byte;
+  int      unget;
+};
+
+int
+get_rep(BFILE *in)
+{
+  size_t n = 0;
+  for(;;) {
+    int c = getb(in);
+    //fprintf(stderr,"get_rep %02x\n", c);
+    if (c < 0)
+      return -1;
+    if (c < 128) {
+      ungetb(c, in);
+      return n;
+    }
+    n = n * 128 + c - 128;
+  }
+}
+
+int
+getb_rle(BFILE *bp)
+{
+  struct BFILE_rle *p = (struct BFILE_rle*)bp;
+  CHECKBFILE(bp, getb_rle);
+  if (p->unget >= 0) {
+    int c = p->unget;
+    p->unget = -1;
+    return c;
+  }
+  if (p->count) {
+    p->count--;
+    return p->byte;
+  } else {
+    int n = get_rep(p->bfile);
+    if (n < 0)
+      return -1;
+    p->count = n;
+    p->byte = getb(p->bfile);
+    return p->byte;
+  }
+}
+
+void
+ungetb_rle(int c, BFILE *bp)
+{
+  struct BFILE_rle *p = (struct BFILE_rle*)bp;
+  CHECKBFILE(bp, getb_rle);
+  p->unget = c;
+}
+
+void
+put_rep(BFILE *out, size_t n)
+{
+  if (n > 127)
+    put_rep(out, n / 128);
+  putb(n % 128 + 128, out);
+}
+
+void
+putb_rle(int b, BFILE *bp)
+{
+  struct BFILE_rle *p = (struct BFILE_rle*)bp;
+  CHECKBFILE(bp, getb_rle);
+
+  if (b == p->byte) {
+    p->count++;
+  } else {
+    if (p->count > 2) {
+      /* More than 2 repeating chars, it's worth compressing */
+      put_rep(p->bfile, p->count - 1);
+      putb(p->byte, p->bfile);
+    } else {
+      while(p->count-- > 0)
+        putb(p->byte, p->bfile);
+    }
+    p->count = 1;
+    p->byte = b;
+  }
+}
+
+void
+closeb_rle(BFILE *bp)
+{
+  struct BFILE_rle *p = (struct BFILE_rle*)bp;
+  CHECKBFILE(bp, getb_rle);
+
+  closeb(p->bfile);
+}
+
+void
+flushb_rle(BFILE *bp)
+{
+  /* There is nothing we can do */
+}
+
+BFILE *
+add_rle_decompressor(BFILE *file)
+{
+  struct BFILE_rle *p = MALLOC(sizeof(struct BFILE_rle));
+
+  if (!p)
+    memerr();
+  p->mets.getb = getb_rle;
+  p->mets.ungetb = ungetb_rle;
+  p->mets.putb = 0;
+  p->mets.flushb = 0;
+  p->mets.closeb = closeb_rle;
+  p->count = 0;
+  p->unget = -1;
+  p->bfile = file;
+
+  return (BFILE*)p;
+}
+
+BFILE *
+add_rle_compressor(BFILE *file)
+{
+  struct BFILE_rle *p = MALLOC(sizeof(struct BFILE_rle));
+
+  if (!p)
+    memerr();
+  p->mets.getb = getb_rle;
+  p->mets.ungetb = 0;
+  p->mets.putb = putb_rle;
+  p->mets.flushb = flushb_rle;
+  p->mets.closeb = closeb_rle;
+  p->count = 0;
+  p->byte = -1;
+  p->bfile = file;
+
+  return (BFILE*)p;
+}
+
+#endif  /* WANT_RLE */
+
+
 /***************** BFILE with UTF8 encode/decode *******************/
 
 struct BFILE_utf8 {
--- a/src/runtime/eval.c
+++ b/src/runtime/eval.c
@@ -28,6 +28,10 @@
 #define WANT_LZ77 1
 #endif
 
+#if !defined(WANT_RLE)
+#define WANT_RLE 1
+#endif
+
 #if WANT_LZ77
 size_t lz77d(uint8_t *src, size_t srclen, uint8_t **bufp);
 size_t lz77c(uint8_t *src, size_t srclen, uint8_t **bufp);