ref: e1e9167cc8b847dfb3b37df4dc041d20ac271010
dir: /lib/flate/flate.myr/
use std use bio use "types" /* DEFLATE https://www.ietf.org/rfc/rfc1951.txt Terminology: - A 'Huffman code' is one mechanism to map codes to symbols. - A 'code' is the sequence of bits used to encode a symbol in a Huffman code. Codes are what appears in the DEFLATE stream. - A 'symbol' is what a code decodes to. - A 'length' is the length of an LZ77 reference (the number of bytes to repeat). - A 'distance' is the distance at which the bits referred to by an LZ77 reference are from the current point. */ pkg flate = /* state maintained during decompression */ type flatedec = struct rdf : bio.file# wrf : bio.file# /* one-byte buffer */ bval : int /* in [0, 255] current input byte */ bpos : int /* in [0, 8] */ /* ring buffer window for the bytes output */ wpos : std.size wlen : std.size wbuf : byte[WinSz] /* Huffman codes */ lenhc : hufc /* for lengths */ dsthc : hufc /* for distances */ ;; /* one Huffman code */ type hufc = struct /* # of codes of length [1, MaxBits] */ count : int[MaxBits + 1] /* symbols sorted lexicographically by <code length, symbol value>; c.f., mkhufc() and readcode() for usage */ symbol : int[MaxSyms] ;; /* decode a DEFLATE stream */ const decode : (outf : bio.file#, inf : bio.file# -> std.result(void, err)) pkglocal const MaxSyms : std.size pkglocal const MaxBits : std.size pkglocal const WinSz : std.size pkglocal const bits : (st : flatedec#, n : int -> std.result(int, err)) pkglocal const nocomp : (st : flatedec# -> std.result(void, err)) pkglocal const mkhufc : (lengths : int[:], hc : hufc# -> void) pkglocal const readcode : (st : flatedec#, hc : hufc# -> std.result(int, err)) pkglocal const outb : (st : flatedec#, b : byte -> std.result(void, err)) ;; const MaxLenSyms : std.size = 288 /* max # of length symbols */ const MaxDstSyms : std.size = 32 /* max # of distance symbols */ const MaxSyms = MaxLenSyms /* max # of symbols in a Huffman code */ const MaxBits = 15 /* max length of a code (in bits) */ const WinSz = 32 * std.KiB /* max distance for a reference */ var fixedlenhc : hufc var fixeddsthc : hufc const __init__ = { /* the fixed Huffman codes; c.f., RFC Section 3.2.6 */ var length : int[MaxSyms] std.slfill(length[0:144], 8) std.slfill(length[144:256], 9) std.slfill(length[256:280], 7) std.slfill(length[280:288], 8) mkhufc(length[:], &fixedlenhc) std.slfill(length[0:32], 5) mkhufc(length[0:32], &fixeddsthc) } const outb = {st, b /* outputs a single byte, to the output file; additionally record the byte emitted in the output window */ var pos match bio.putb(st.wrf, b) | `std.Ok 0: std.die("no bytes written by bio.putb()") | `std.Ok _: | `std.Err e: -> `std.Err (`Io e) ;; pos = st.wpos if pos == WinSz std.assert(st.wlen >= WinSz, "window should be full") pos = 0 ;; st.wbuf[pos] = b st.wpos = pos + 1 st.wlen++ /* don't care about it being > WinSz */ -> `std.Ok void } const bits = {st, n var pos, len, bits, bval std.assert(n >= 0 && n <= 31, "invalid argument") pos = st.bpos bits = st.bval >> pos for len = 8 - pos; len < n; len += 8 match bio.getb(st.rdf) | `std.Ok b: bval = (b : int) st.bval = bval bits += bval << len | `std.Err e: -> `std.Err (`Io e) ;; ;; st.bpos = 8 - (len - n) -> `std.Ok (bits & ((1 << n) - 1)) } const nocomp = {st /* process a non-compressed block; c.f., RFC Section 3.2.4 */ var len st.bpos = 8 /* skip to a byte boundary */ match bits(st, 16) | `std.Ok n: len = (n : std.size) | `std.Err e: -> `std.Err e ;; match bits(st, 16) | `std.Ok nlen: if len != (nlen : std.size) ^ 0xffff -> `std.Err (`Format \ "non-compressed block length could " \ "not be verified") ;; | `std.Err e: -> `std.Err e ;; for var n = 0; n < len; n++ match bio.getb(st.rdf) | `std.Ok b: match outb(st, b) | `std.Ok _: | err: -> err ;; | `std.Err e: -> `std.Err (`Io e) ;; ;; -> `std.Ok void } const mkhufc = {length, hc /* note: it is possible to have 0s in the length array; this means that the symbol at this index will never be part of the input stream */ var index : int[MaxBits + 1] std.assert(length.len <= hc.symbol.len, "invalid argument") std.slfill(hc.count[:], 0) for var symb = 0; symb < length.len; symb++ hc.count[length[symb]]++ ;; std.slfill(index[:], 0) hc.count[0] = 0 for var l = 1; l <= MaxBits; l++ index[l] = index[l - 1] + hc.count[l - 1] ;; for var symb = 0; symb < length.len; symb++ if length[symb] != 0 hc.symbol[index[length[symb]]++] = symb ;; ;; /* TODO: validate the Huffman code */ } const readcode = {st, hc /* if you want to know more about the algorithm here, you must look for "canonical Huffman codes"; see, for example, Managing Gigabytes: Compressing and Indexing Documents and Images by I. H. Witten, A. Moffat, and T. C. Bell. */ var code, first, index, count code = 0 /* code seen so far */ first = 0 /* first code of length len */ index = 0 /* index of the first code of length len in hc.symbol[:] */ for var len = 1; len <= MaxBits; len++ match bits(st, 1) | `std.Ok n: code += n | err: -> err ;; count = hc.count[len] if code < first + count /* the code is exactly len bits long; return the symbol from the table */ -> `std.Ok (hc.symbol[index + (code - first)]) ;; index += count first = (first + count) << 1 /* c.f. RFC Section 3.2.2 */ code <<= 1 /* make room for the next bit */ ;; -> `std.Err (`Format "invalid code") } const decomp = {st /* process data of a compressed block; c.f., RFC Section 3.2.5 */ const basel = [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258 ] const extral = [ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0 ] const based = [ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577 ] const extrad = [ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13 ] var symb, len, dst, index symb = 0 while symb != 256 match readcode(st, &st.lenhc) | `std.Ok s: symb = s | `std.Err e: -> `std.Err e ;; if symb < 256 match outb(st, (symb : byte)) | `std.Ok _: | err: -> err ;; elif symb > 256 symb -= 257 match bits(st, extral[symb]) | `std.Ok x: len = basel[symb] + x | `std.Err e: -> `std.Err e ;; match readcode(st, &st.dsthc) | `std.Ok s: symb = s | `std.Err e: -> `std.Err e ;; match bits(st, extrad[symb]) | `std.Ok x: dst = (based[symb] + x : std.size) | `std.Err e: -> `std.Err e ;; if dst > st.wlen -> `std.Err (`Format \ "reference to a byte before file start") ;; index = (WinSz + st.wpos - dst) % WinSz for ; len > 0; len-- match outb(st, st.wbuf[index]) | `std.Ok _: | err: -> err ;; index = (index + 1) % WinSz ;; ;; ;; -> `std.Ok void } const dynamic = {st /* process a block compressed with dynamic Huffman codes; c.f., RFC Section 3.2.7 */ const coden = [ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 ] const extra = [2, 3, 7] var length : int[MaxLenSyms + MaxDstSyms] var nlen, ndst, ncode, symb, nbits, hufc match bits(st, 5) | `std.Ok n: nlen = n + 257 | `std.Err e: -> `std.Err e ;; if nlen > 286 -> `std.Err (`Format "invalid number of length codes") ;; match bits(st, 5) | `std.Ok n: ndst = n + 1 | `std.Err e: -> `std.Err e ;; if ndst > 30 -> `std.Err (`Format "invalid number of distance codes") ;; match bits(st, 4) | `std.Ok n: ncode = n + 4 | `std.Err e: -> `std.Err e ;; std.slfill(length[:19], 0) for var c = 0; c < ncode; c++ match bits(st, 3) | `std.Ok n: length[coden[c]] = n | `std.Err e: -> `std.Err e ;; ;; mkhufc(length[:19], &hufc) for var c = 0; c < nlen + ndst; /* according to the RFC: "the code length repeat codes can cross from HLIT + 257 to the HDIST + 1 code lengths"; this is why we have a single loop here */ match readcode(st, &hufc) | `std.Ok n: symb = n | `std.Err e: -> `std.Err e ;; if symb > 18 -> `std.Err (`Format "invalid code code") ;; if symb < 16 length[c++] = symb else /* repeat some code length */ match bits(st, extra[symb - 16]) | `std.Ok n: nbits = n | `std.Err e: -> `std.Err e ;; if symb == 16 /* copy the previous code length 3-6 times */ nbits += 3 if c == 0 || c + nbits > nlen + ndst -> `std.Err (`Format \ "invalid code repetition") ;; std.slfill(length[c:c+nbits], length[c-1]) else /* repeat a 0 length */ if symb == 17 nbits += 3 else nbits += 11 ;; if c + nbits > nlen + ndst -> `std.Err (`Format \ "invalid zero repetition") ;; std.slfill(length[c:c+nbits], 0) ;; c += nbits ;; ;; if length[256] == 0 -> `std.Err (`Format "no code length of end-of-block symbol") ;; mkhufc(length[:nlen], &st.lenhc) mkhufc(length[nlen:nlen + ndst], &st.dsthc) -> decomp(st) } const decode = {outf, inf var st, err, last st = [ .rdf = inf, .wrf = outf, .bpos = 8, ] last = 0 while last != 1 match bits(&st, 1) | `std.Ok l: last = l | `std.Err e: -> `std.Err e ;; match bits(&st, 2) | `std.Ok 0: err = nocomp(&st) | `std.Ok 1: st.lenhc = fixedlenhc st.dsthc = fixeddsthc err = decomp(&st) | `std.Ok 2: err = dynamic(&st) | `std.Ok _: -> `std.Err (`Format "invalid block type") | `std.Err e: -> `std.Err e ;; match err | `std.Ok _: | `std.Err e: -> `std.Err e ;; ;; -> `std.Ok void }