shithub: MicroHs

Download patch

ref: 51287c55a40d83f1e4e01175fe1c14c267b89821
parent: 9d7eeb6acc56dd1068d994e77796b12f6043dcf7
author: Lennart Augustsson <lennart@augustsson.net>
date: Fri Oct 4 05:18:17 EDT 2024

Implement BW compression.

Disappointing results.

--- a/lib/System/Compress.hs
+++ b/lib/System/Compress.hs
@@ -1,4 +1,4 @@
-module System.Compress(compress) where
+module System.Compress(compress, decompress) where
 import Prelude(); import MiniPrelude
 import Data.Function
 import Foreign.Ptr
@@ -12,34 +12,23 @@
 import System.IO.Internal
 import System.IO.Unsafe
 
-foreign import ccall "lz77c" c_lz77c :: CString -> CSize -> Ptr CString -> IO CSize
-
-{-
--- This really ought to be [Word8] -> [Word8]
-compress :: String -> String
-compress file = unsafePerformIO $ do
-  (iptr, ilen) <- newCAStringLen file
-  pptr <- new nullPtr
-  olen <- c_lz77c iptr (intToCSize ilen) pptr
-  optr <- peek pptr
-  res <- peekCAStringLen (optr, cSizeToInt olen)
-  free iptr
-  free optr
-  return res
--}
-
 type PBFILE = Ptr BFILE
-foreign import ccall "openb_wr_buf" c_openb_wr_buf :: IO PBFILE
-foreign import ccall "openb_rd_buf" c_openb_rd_buf :: Ptr Char -> Int -> IO PBFILE
-foreign import ccall "add_lz77_compressor" c_add_lz77_compressor :: PBFILE -> IO PBFILE
-foreign import ccall "add_lz77_decompressor" c_add_lz77_decompressor :: PBFILE -> IO PBFILE
-foreign import ccall "putb" c_putb :: Int -> PBFILE -> IO ()
-foreign import ccall "getb" c_getb :: PBFILE -> IO Int
-foreign import ccall "get_buf" c_get_buf :: PBFILE -> Ptr (Ptr Char) -> Ptr Int -> IO ()
-foreign import ccall "closeb" c_close :: PBFILE -> IO ()
-foreign import ccall "flushb" c_flush :: PBFILE -> IO ()
+type Transducer = PBFILE -> IO PBFILE
+foreign import ccall "openb_wr_buf"          c_openb_wr_buf          :: IO PBFILE
+foreign import ccall "openb_rd_buf"          c_openb_rd_buf          :: Ptr Char -> Int -> IO PBFILE
+foreign import ccall "add_lz77_compressor"   c_add_lz77_compressor   :: Transducer
+foreign import ccall "add_lz77_decompressor" c_add_lz77_decompressor :: Transducer
+foreign import ccall "add_rle_compressor"    c_add_rle_compressor    :: Transducer
+foreign import ccall "add_rle_decompressor"  c_add_rle_decompressor  :: Transducer
+foreign import ccall "add_bwt_compressor"    c_add_bwt_compressor    :: Transducer
+foreign import ccall "add_bwt_decompressor"  c_add_bwt_decompressor  :: Transducer
+foreign import ccall "putb"                  c_putb                  :: Int -> PBFILE -> IO ()
+foreign import ccall "getb"                  c_getb                  :: PBFILE -> IO Int
+foreign import ccall "get_buf"               c_get_buf               :: PBFILE -> Ptr (Ptr Char) -> Ptr Int -> IO ()
+foreign import ccall "closeb"                c_close                 :: PBFILE -> IO ()
+foreign import ccall "flushb"                c_flush                 :: PBFILE -> IO ()
 
-withPutTransducer :: (PBFILE -> IO PBFILE) -> [Char] -> [Char]
+withPutTransducer :: Transducer -> [Char] -> [Char]
 withPutTransducer trans file = unsafePerformIO $ do
   bf <- c_openb_wr_buf          -- create a buffer
   cbf <- trans bf               -- and add transducer (e.g., a compressor)
@@ -55,7 +44,7 @@
       c_close cbf               -- and close everything
       return res
 
-withGetTransducer :: (PBFILE -> IO PBFILE) -> [Char] -> [Char]
+withGetTransducer :: Transducer -> [Char] -> [Char]
 withGetTransducer trans file = unsafePerformIO $ do
   (ptr, len) <- newCAStringLen file            -- make memory buffer
   bf <- c_openb_rd_buf ptr len                 -- open it for reading
@@ -72,6 +61,24 @@
 decompress :: [Char] -> [Char]
 decompress = withGetTransducer c_add_lz77_decompressor
 
+compressRLE :: [Char] -> [Char]
+compressRLE = withPutTransducer c_add_rle_compressor
+
+decompressRLE :: [Char] -> [Char]
+decompressRLE = withGetTransducer c_add_rle_decompressor
+
+compressBWT :: [Char] -> [Char]
+compressBWT = withPutTransducer c_add_bwt_compressor
+
+decompressBWT :: [Char] -> [Char]
+decompressBWT = withGetTransducer c_add_bwt_decompressor
+
+compressBWTRLE :: [Char] -> [Char]
+compressBWTRLE = withPutTransducer (c_add_bwt_compressor <=< c_add_rle_compressor <=< c_add_lz77_compressor)
+
+decompressBWTRLE :: [Char] -> [Char]
+decompressBWTRLE = withGetTransducer (c_add_bwt_decompressor <=< c_add_rle_decompressor <=< c_add_lz77_decompressor)
+
 {-
 main :: IO ()
 main = do
@@ -78,7 +85,7 @@
   putStrLn "compress"
   haa <- openBinaryFile "aa" ReadMode
   aa <- hGetContents haa
-  let bb = compress' aa
+  let bb = compressBWTRLE aa
   hbb <- openBinaryFile "bb" WriteMode
   hPutStr hbb bb
   hClose hbb
@@ -86,7 +93,7 @@
   putStrLn "decompress"
   hbb' <- openBinaryFile "bb" ReadMode
   bb' <- hGetContents hbb'
-  let aa' = decompress bb'
+  let aa' = decompressBWTRLE bb'
   hcc <- openBinaryFile "cc" WriteMode
   hPutStr hcc aa'
   hClose hbb'
--- a/src/MicroHs/FFI.hs
+++ b/src/MicroHs/FFI.hs
@@ -178,6 +178,7 @@
   "peekPtr", "pokePtr", "pokeWord", "peekWord",
   "add_lz77_compressor", "add_lz77_decompressor",
   "add_rle_compressor", "add_rle_decompressor",
+  "add_bwt_compressor", "add_bwt_decompressor",
   "peek_uint8", "poke_uint8", "peek_uint16", "poke_uint16", "peek_uint32", "poke_uint32", "peek_uint64", "poke_uint64",
   "peek_int8", "poke_int8", "peek_int16", "poke_int16", "peek_int32", "poke_int32", "peek_int64", "poke_int64",
   "peek_ushort", "poke_ushort", "peek_short", "poke_short",
--- a/src/runtime/bfile.c
+++ b/src/runtime/bfile.c
@@ -408,6 +408,7 @@
   }
   free(obuf);
   p->bf.pos = 0;
+  flushb(p->bfile);
 }
 
 void
@@ -621,6 +622,8 @@
 
   /* output last byte(s) */
   out_rle(p);
+  p->count = 0;
+  flushb(p->bfile);
 }
 
 BFILE *
@@ -664,6 +667,239 @@
 }
 
 #endif  /* WANT_RLE */
+
+#if WANT_BWT
+/***************** BFILE via Burrows-Wheeler Transform *******************/
+/*
+ */
+
+struct BFILE_bwt {
+  BFILE    mets;
+  BFILE    *bfile;              /* underlying BFILE */
+  size_t   count;
+  struct bfbuffer bf;
+  int      read;
+  int      numflush;
+};
+
+int
+getb_bwt(BFILE *bp)
+{
+  struct BFILE_bwt *p = (struct BFILE_bwt*)bp;
+  CHECKBFILE(bp, getb_bwt);
+  if (p->bf.pos >= p->bf.size)
+    return -1;
+  return p->bf.buf[p->bf.pos++];
+}
+
+void
+ungetb_bwt(int c, BFILE *bp)
+{
+  struct BFILE_bwt *p = (struct BFILE_bwt*)bp;
+  CHECKBFILE(bp, getb_bwt);
+  p->bf.pos--;
+}
+
+void
+putb_bwt(int b, BFILE *bp)
+{
+  struct BFILE_bwt *p = (struct BFILE_bwt*)bp;
+  CHECKBFILE(bp, getb_bwt);
+
+  bfbuffer_snoc(&p->bf, b);
+}
+
+void
+closeb_bwt(BFILE *bp)
+{
+  struct BFILE_bwt *p = (struct BFILE_bwt*)bp;
+  CHECKBFILE(bp, getb_bwt);
+
+  if (!p->read)
+    flushb(bp);
+
+  closeb(p->bfile);
+}
+
+/* Sort all rotations of buf, and the indices of the sorted strings in res. */
+/*
+ * |.......................................|
+ *         ^           ^
+ *         a           b
+ *                     <-      n         ->
+ *         <-  m     ->
+ * <-  o ->
+ */
+static uint8_t *compar_arg;
+static size_t compar_len;
+int compar(const void *pa, const void *pb)
+{
+  uint32_t a = *(uint32_t*)pa;
+  uint32_t b = *(uint32_t*)pb;
+  int r;
+  if (a == b)
+    return 0;
+  if (a < b) {
+    size_t n = compar_len - b; /* bytes until end of buffer */
+    r = memcmp(compar_arg + a, compar_arg + b, n);
+    if (r)
+      return r;
+    size_t m = b - a;
+    r = memcmp(compar_arg + a + n, compar_arg, m);
+    if (r)
+      return r;
+    size_t o = a;
+    return memcmp(compar_arg, compar_arg + m, o);
+  } else {
+    size_t n = compar_len - a; /* bytes until end of buffer */
+    r = memcmp(compar_arg + a, compar_arg + b, n);
+    if (r)
+      return r;
+    size_t m = a - b;
+    r = memcmp(compar_arg, compar_arg + b + n, m);
+    if (r)
+      return r;
+    size_t o = a;
+    return memcmp(compar_arg + m, compar_arg, o);
+    
+  }
+  return 0;
+}
+
+void
+sort_buffer(uint8_t *buf, size_t buflen, uint32_t *res)
+{
+  for(size_t i = 0; i < buflen; i++)
+    res[i] = i;
+  compar_arg = buf;
+  compar_len = buflen;
+  qsort(res, buflen, sizeof(uint32_t), compar);
+}
+
+uint32_t
+encode_bwt(uint8_t *data, size_t len, uint8_t *last)
+{
+  uint32_t *res = malloc(len * sizeof(uint32_t));
+  if (!res)
+    ERR("encode_bwt");
+  sort_buffer(data, len, res);
+  uint32_t zero = 0;
+  for(size_t i = 0; i < len; i++) {
+    uint32_t offs = res[i];
+    last[i] = data[(offs + len - 1) % len];
+    if (offs == 0)
+      zero = i;
+  }
+  return zero;
+}
+
+void
+flushb_bwt(BFILE *bp)
+{
+  struct BFILE_bwt *p = (struct BFILE_bwt*)bp;
+  CHECKBFILE(bp, getb_bwt);
+
+  /* If we have had a flush, and there is no new data then do nothing */
+  if (p->numflush++ && !p->bf.pos)
+    return;
+  putsb("BW1", p->bfile);             /* version no */
+  putint32(p->bf.pos, p->bfile);      /* 32 bit length */
+  uint8_t *last = malloc(p->bf.pos);
+  if (!last)
+    ERR("flushb_bwt");
+  size_t zero = encode_bwt(p->bf.buf, p->bf.pos, last);
+  putint32(zero, p->bfile);
+  for(size_t i = 0; i < p->bf.pos; i++)
+    putb(last[i], p->bfile);
+  FREE(last);
+  p->bf.pos = 0;
+  flushb(p->bfile);
+}
+
+#define MAXBYTE 256
+
+void
+decode_bwt(uint8_t *data, size_t len, uint8_t *odata, size_t zero)
+{
+  size_t count[MAXBYTE];
+  uint32_t *pred = malloc(len * sizeof(uint32_t));
+  for(size_t i = 0; i < MAXBYTE; i++) {
+    count[i] = 0;
+  }
+  for(size_t i = 0; i < len; i++) {
+    pred[i] = count[data[i]]++;
+  }
+  size_t sum = 0;
+  for(size_t i = 0; i < MAXBYTE; i++) {
+    size_t s = count[i];
+    count[i] = sum;
+    sum += s;
+  }
+  size_t i = zero;
+  for(size_t j = len; j > 0; j--) {
+    odata[j - 1] = data[i];
+    i = pred[i] + count[data[i]];
+  }
+}
+
+BFILE *
+add_bwt_decompressor(BFILE *file)
+{
+  struct BFILE_bwt *p = MALLOC(sizeof(struct BFILE_bwt));
+
+  if (!p)
+    memerr();
+  memset(p, 0, sizeof(struct BFILE_bwt));
+  p->mets.getb = getb_bwt;
+  p->mets.ungetb = ungetb_bwt;
+  p->mets.putb = 0;
+  p->mets.flushb = 0;
+  p->mets.closeb = closeb_bwt;
+  p->read = 1;
+  p->bfile = file;
+  p->numflush = 0;
+
+  /* First check version */
+  if (getb(file) != 'B' || getb(file) != 'W' || getb(file) != '1')
+    ERR("Bad BWT signature");
+
+  size_t size = getint32(file); /* then read size */
+  uint32_t zero = getint32(file);
+
+  uint8_t *buf = MALLOC(size);  /* temporary buffer for input */
+  if (!buf)
+    memerr();
+  for(size_t i = 0; i < size; i++) {
+    buf[i] = getb(file);        /* and read data */
+  }
+  bfbuffer_init(&p->bf, size);
+  decode_bwt(buf, size, p->bf.buf, zero); /* decode */
+  FREE(buf);
+
+  return (BFILE*)p;
+}
+
+BFILE *
+add_bwt_compressor(BFILE *file)
+{
+  struct BFILE_bwt *p = MALLOC(sizeof(struct BFILE_bwt));
+
+  if (!p)
+    memerr();
+  p->mets.getb = getb_bwt;
+  p->mets.ungetb = 0;
+  p->mets.putb = putb_bwt;
+  p->mets.flushb = flushb_bwt;
+  p->mets.closeb = closeb_bwt;
+  p->read = 0;
+  p->bfile = file;
+  p->numflush = 0;
+
+  bfbuffer_init(&p->bf, 25000);
+  return (BFILE*)p;
+}
+
+#endif  /* WANT_BWT */
 
 /***************** BFILE with UTF8 encode/decode *******************/
 
--- a/src/runtime/eval.c
+++ b/src/runtime/eval.c
@@ -32,6 +32,10 @@
 #define WANT_RLE 1
 #endif
 
+#if !defined(WANT_BWT)
+#define WANT_BWT 1
+#endif
+
 #if WANT_LZ77
 size_t lz77d(uint8_t *src, size_t srclen, uint8_t **bufp);
 size_t lz77c(uint8_t *src, size_t srclen, uint8_t **bufp);
@@ -4345,6 +4349,11 @@
 void mhs_add_rle_decompressor(int s) { mhs_from_Ptr(s, 1, add_rle_decompressor(mhs_to_Ptr(s, 0))); }
 #endif  /* WANT_RLE */
 
+#if WANT_BWT
+void mhs_add_bwt_compressor(int s) { mhs_from_Ptr(s, 1, add_bwt_compressor(mhs_to_Ptr(s, 0))); }
+void mhs_add_bwt_decompressor(int s) { mhs_from_Ptr(s, 1, add_bwt_decompressor(mhs_to_Ptr(s, 0))); }
+#endif  /* WANT_BWT */
+
 void mhs_calloc(int s) { mhs_from_Ptr(s, 2, calloc(mhs_to_CSize(s, 0), mhs_to_CSize(s, 1))); }
 void mhs_free(int s) { free(mhs_to_Ptr(s, 0)); mhs_from_Unit(s, 1); }
 void mhs_addr_free(int s) { mhs_from_FunPtr(s, 0, (HsFunPtr)&FREE); }
@@ -4431,7 +4440,6 @@
 #if WANT_STDIO
 { "add_FILE", mhs_add_FILE},
 { "add_utf8", mhs_add_utf8},
-//{ "add_rle", mhs_add_rle},
 { "closeb", mhs_closeb},
 { "&closeb", mhs_addr_closeb},
 { "flushb", mhs_flushb},
@@ -4462,6 +4470,11 @@
 #if WANT_RLE
 { "add_rle_compressor", mhs_add_rle_compressor},
 { "add_rle_decompressor", mhs_add_rle_decompressor},
+#endif  /* WANT_RLE */
+
+#if WANT_BWT
+{ "add_bwt_compressor", mhs_add_bwt_compressor},
+{ "add_bwt_decompressor", mhs_add_bwt_decompressor},
 #endif  /* WANT_RLE */
 
 { "calloc", mhs_calloc},