shithub: MicroHs

ref: 7ae7d07f0b97bce384239c1f7c1ae4d407e7524c
dir: /src/runtime/lz77.c/

View raw version
#include <inttypes.h>
#include <stdlib.h>
#if 0
#include <stdio.h>
#define INLINE
#endif

#define MAX(x,y) ((x) > (y) ? (x) : (y))
#define MIN(x,y) ((x) < (y) ? (x) : (y))

#define MAXWIN 8192
#define MAXLEN (9 + 255)
#define MINMATCH 3
#define MINOFFS 1
#define MAXLIT 32

/*
 * Encoding inspired by FastLZ
 *  000nnnnn                      - literal run follows, with n+1 bytes
 *  111ooooo pppppppp nnnnnnnn    - at offset o*256+p, copy n+9 bytes (maximum offset is 8191)
 *  nnnooooo pppppppp             - at offset o*256+p, copy n+2 bytes (n != 0, n != 7)
 * Match length is >= 3.
 */

static void
put(uint8_t **bufp, size_t *sizep, size_t *offsp, uint8_t byte)
{
  if (*offsp >= *sizep) {
    *sizep *= 2;
    *bufp = realloc(*bufp, *sizep);
  }
  (*bufp)[(*offsp)++] = byte;
}
#define PUT(x) put(&outbuf, &outsize, &outoffs, (x))

size_t
lz77d(uint8_t *src, size_t srclen, uint8_t **bufp)
{
  uint8_t *end = src + srclen;
  size_t outsize = 100000;
  uint8_t *outbuf = malloc(outsize);
  size_t outoffs = 0;

  while (src < end) {
    int op = *src++;
    int opx = op & 0x1f;        /* part of offset, or literal length */
    op >>= 5;
    if (op == 0) {
      /* 000nnnnn */
      do {
        PUT(*src++);
      } while (--opx >= 0);
    } else {
      size_t len;
      size_t offs = opx * 256 + *src++ + MINOFFS;
      if (op == 7) {
        /* 111ooooo pppppppp nnnnnnnn */
        len = 9 + *src++;
      } else {
        /* nnnooooo pppppppp */
        len = 2 + op;
      }
      //printf("match cur=%d offs=%d len=%d\n", (int)(dst - adst), (int)offs, (int)len);
      offs = outoffs - offs;
      for (size_t i = 0; i < len; i++) {
        uint8_t b = outbuf[offs + i];
        PUT(b);
      }
    }
  }
  *bufp = outbuf;
  return outoffs;
}

#define HASHBITS 11
#define HASHSIZE (1 << HASHBITS)
typedef size_t lzhash_t;
static INLINE lzhash_t
lz77hash(uint8_t *b, size_t n)
{
#if 0
  n = MIN(n, MINMATCH);
  size_t h = 0;
  for (size_t i = 0; i < n; i++)
    h = h * 256 + b[i];
  return ((h * 99999989) >> 15) & (HASHSIZE - 1);
#else
  n = MIN(n, 4);
  size_t h = 5381;
  for (size_t i = 0; i < n; i++)
    h = h * 33 + b[i];
  return h & (HASHSIZE - 1);
#endif
}

static INLINE size_t
match(uint8_t *src, uint8_t *win, uint8_t *end)
{
  size_t n;

  for(n = 0; *src == *win && src < end && n < MAXLEN; src++, win++, n++)
    ;
  return n;
}

typedef size_t lzoffs_t;
#define NUMPREV 16

/* Find the longest match within the match window */
static INLINE size_t
find_longest_match(uint8_t *src, uint8_t *cur, uint8_t *end, size_t *match_offs_p, lzoffs_t lzhashes[][NUMPREV])
{
  size_t win_end = cur - src + 1;
  size_t win_len = MIN(win_end, MAXWIN);

  lzhash_t h = lz77hash(cur, end - cur);
  lzoffs_t *offsets = lzhashes[h];

  size_t m_len = 0;
  size_t m_offs = 0;
#if 0
  for (size_t offs = MINOFFS; offs < win_len; offs++) {
    size_t n = match(cur, cur - offs, end);
    if (n > m_len) {
      m_len = n;
      m_offs = offs;
    }
  }
#else
  for (size_t i = 0; i < NUMPREV; i++) {
    lzoffs_t o = offsets[i];
    if (o == ~0)
      break;                    /* unused slot */
    size_t offs = (cur - src) - o;
    if (offs >= win_len || offs < MINOFFS)
      break;                  /* offset is not in range */
    size_t n = match(cur, cur - offs, end);
    if (n > m_len) {
      m_len = n;
      m_offs = offs;
    }
  }
#endif
  *match_offs_p = m_offs;
  return m_len;
}

static INLINE void
lzupdatehash(uint8_t *src, uint8_t *cur, uint8_t *end, lzoffs_t lzhashes[][NUMPREV])
{
  lzhash_t h = lz77hash(cur, end - cur);
  lzoffs_t offs = cur - src;

  for(size_t k = NUMPREV-1; k > 0; k--)
    lzhashes[h][k] = lzhashes[h][k-1];
  lzhashes[h][0] = offs;
}


size_t
lz77c(uint8_t *src, size_t srclen, uint8_t **bufp)
{
  uint8_t *cur;
  uint8_t *end = src + srclen;
  size_t outsize = 25000;
  uint8_t *outbuf = malloc(outsize);
  size_t outoffs = 0;

  lzoffs_t lzhashes[HASHSIZE][NUMPREV];

  for(size_t i = 0; i < HASHSIZE; i++) {
    for(size_t k = 0; k < NUMPREV; k++)
      lzhashes[i][k] = ~0;
  }

  for (cur = src; cur < end; ) {
    size_t match_offs = 0;
    size_t match_len = 0;
    size_t len;
    /* Start from the current position and look for a match in the window. */
    /* If the is no match, try the next position, and so on. */
    for (len = 0; len < end - cur; len++) {
      match_len = find_longest_match(src, cur + len, end, &match_offs, lzhashes);
      if (match_len >= MINMATCH) /* Stop when we find a match. */
        break;
      lzupdatehash(src, cur + len, end, lzhashes);
    }
    /* As we exit the loop, we have len bytes that did not match anywhere
     * in the window, so they need to be emitted as a literal. */
    while (len) {
      /* Chunk up the literal into maximum sized pieces. */
      size_t n = MIN(len, MAXLIT);
      PUT(n - 1);               /* Chunk length - 1 */
      for (size_t i = 0; i < n; i++) {
        PUT(*cur++);            /* and spit out the chunk. */
      }
      len -= n;
    }
    if (match_len >= MINMATCH) {
      /* If we actually had a match, output it. */
      //cur += match_len;         /* skip over the matched positions */
      for(size_t i = 0; i < match_len; i++) {
        lzupdatehash(src, cur++, end, lzhashes);
      }
      match_offs -= MINOFFS;
      match_len -= 2;
      int hi = match_offs >> 8;
      int lo = match_offs & 0xff;
      if (match_len < 7) {      /* encode as 2 or 3 bytes */
        PUT((match_len << 5) + hi);
        PUT(lo);
      } else {
        PUT((7 << 5) + hi);
        PUT(lo);
        PUT(match_len - 7);
      }
    }
  }
  *bufp = outbuf;
  return outoffs;
}

#if 0
int
main(int argc, char **argv)
{
  FILE *fi = fopen(argv[1], "r");
  FILE *fo = fopen(argv[2], "w");
  int dec = argc > 3;
  fseek(fi, 0, SEEK_END);
  size_t ilen = ftell(fi);
  size_t olen;
  fseek(fi, 0, SEEK_SET);
  uint8_t *ibuf = malloc(ilen);
  uint8_t *obuf;
  size_t n = fread(ibuf, 1, ilen, fi);
  if (n != ilen) exit(1);
  if (dec)
    olen = lz77d(ibuf, ilen, &obuf);
  else
    olen = lz77c(ibuf, ilen, &obuf);
  fwrite(obuf, olen, 1, fo);
  exit(0);
}
#endif