ref: 51287c55a40d83f1e4e01175fe1c14c267b89821
parent: 9d7eeb6acc56dd1068d994e77796b12f6043dcf7
author: Lennart Augustsson <lennart@augustsson.net>
date: Fri Oct 4 05:18:17 EDT 2024
Implement BW compression. Disappointing results.
--- a/lib/System/Compress.hs
+++ b/lib/System/Compress.hs
@@ -1,4 +1,4 @@
-module System.Compress(compress) where
+module System.Compress(compress, decompress) where
import Prelude(); import MiniPrelude
import Data.Function
import Foreign.Ptr
@@ -12,34 +12,23 @@
import System.IO.Internal
import System.IO.Unsafe
-foreign import ccall "lz77c" c_lz77c :: CString -> CSize -> Ptr CString -> IO CSize
-
-{-
--- This really ought to be [Word8] -> [Word8]
-compress :: String -> String
-compress file = unsafePerformIO $ do
- (iptr, ilen) <- newCAStringLen file
- pptr <- new nullPtr
- olen <- c_lz77c iptr (intToCSize ilen) pptr
- optr <- peek pptr
- res <- peekCAStringLen (optr, cSizeToInt olen)
- free iptr
- free optr
- return res
--}
-
type PBFILE = Ptr BFILE
-foreign import ccall "openb_wr_buf" c_openb_wr_buf :: IO PBFILE
-foreign import ccall "openb_rd_buf" c_openb_rd_buf :: Ptr Char -> Int -> IO PBFILE
-foreign import ccall "add_lz77_compressor" c_add_lz77_compressor :: PBFILE -> IO PBFILE
-foreign import ccall "add_lz77_decompressor" c_add_lz77_decompressor :: PBFILE -> IO PBFILE
-foreign import ccall "putb" c_putb :: Int -> PBFILE -> IO ()
-foreign import ccall "getb" c_getb :: PBFILE -> IO Int
-foreign import ccall "get_buf" c_get_buf :: PBFILE -> Ptr (Ptr Char) -> Ptr Int -> IO ()
-foreign import ccall "closeb" c_close :: PBFILE -> IO ()
-foreign import ccall "flushb" c_flush :: PBFILE -> IO ()
+type Transducer = PBFILE -> IO PBFILE
+foreign import ccall "openb_wr_buf" c_openb_wr_buf :: IO PBFILE
+foreign import ccall "openb_rd_buf" c_openb_rd_buf :: Ptr Char -> Int -> IO PBFILE
+foreign import ccall "add_lz77_compressor" c_add_lz77_compressor :: Transducer
+foreign import ccall "add_lz77_decompressor" c_add_lz77_decompressor :: Transducer
+foreign import ccall "add_rle_compressor" c_add_rle_compressor :: Transducer
+foreign import ccall "add_rle_decompressor" c_add_rle_decompressor :: Transducer
+foreign import ccall "add_bwt_compressor" c_add_bwt_compressor :: Transducer
+foreign import ccall "add_bwt_decompressor" c_add_bwt_decompressor :: Transducer
+foreign import ccall "putb" c_putb :: Int -> PBFILE -> IO ()
+foreign import ccall "getb" c_getb :: PBFILE -> IO Int
+foreign import ccall "get_buf" c_get_buf :: PBFILE -> Ptr (Ptr Char) -> Ptr Int -> IO ()
+foreign import ccall "closeb" c_close :: PBFILE -> IO ()
+foreign import ccall "flushb" c_flush :: PBFILE -> IO ()
-withPutTransducer :: (PBFILE -> IO PBFILE) -> [Char] -> [Char]
+withPutTransducer :: Transducer -> [Char] -> [Char]
withPutTransducer trans file = unsafePerformIO $ do
bf <- c_openb_wr_buf -- create a buffer
cbf <- trans bf -- and add transducer (e.g., a compressor)
@@ -55,7 +44,7 @@
c_close cbf -- and close everything
return res
-withGetTransducer :: (PBFILE -> IO PBFILE) -> [Char] -> [Char]
+withGetTransducer :: Transducer -> [Char] -> [Char]
withGetTransducer trans file = unsafePerformIO $ do
(ptr, len) <- newCAStringLen file -- make memory buffer
bf <- c_openb_rd_buf ptr len -- open it for reading
@@ -72,6 +61,24 @@
decompress :: [Char] -> [Char]
decompress = withGetTransducer c_add_lz77_decompressor
+compressRLE :: [Char] -> [Char]
+compressRLE = withPutTransducer c_add_rle_compressor
+
+decompressRLE :: [Char] -> [Char]
+decompressRLE = withGetTransducer c_add_rle_decompressor
+
+compressBWT :: [Char] -> [Char]
+compressBWT = withPutTransducer c_add_bwt_compressor
+
+decompressBWT :: [Char] -> [Char]
+decompressBWT = withGetTransducer c_add_bwt_decompressor
+
+compressBWTRLE :: [Char] -> [Char]
+compressBWTRLE = withPutTransducer (c_add_bwt_compressor <=< c_add_rle_compressor <=< c_add_lz77_compressor)
+
+decompressBWTRLE :: [Char] -> [Char]
+decompressBWTRLE = withGetTransducer (c_add_bwt_decompressor <=< c_add_rle_decompressor <=< c_add_lz77_decompressor)
+
{-
main :: IO ()
main = do
@@ -78,7 +85,7 @@
putStrLn "compress"
haa <- openBinaryFile "aa" ReadMode
aa <- hGetContents haa
- let bb = compress' aa
+ let bb = compressBWTRLE aa
hbb <- openBinaryFile "bb" WriteMode
hPutStr hbb bb
hClose hbb
@@ -86,7 +93,7 @@
putStrLn "decompress"
hbb' <- openBinaryFile "bb" ReadMode
bb' <- hGetContents hbb'
- let aa' = decompress bb'
+ let aa' = decompressBWTRLE bb'
hcc <- openBinaryFile "cc" WriteMode
hPutStr hcc aa'
hClose hbb'
--- a/src/MicroHs/FFI.hs
+++ b/src/MicroHs/FFI.hs
@@ -178,6 +178,7 @@
"peekPtr", "pokePtr", "pokeWord", "peekWord",
"add_lz77_compressor", "add_lz77_decompressor",
"add_rle_compressor", "add_rle_decompressor",
+ "add_bwt_compressor", "add_bwt_decompressor",
"peek_uint8", "poke_uint8", "peek_uint16", "poke_uint16", "peek_uint32", "poke_uint32", "peek_uint64", "poke_uint64",
"peek_int8", "poke_int8", "peek_int16", "poke_int16", "peek_int32", "poke_int32", "peek_int64", "poke_int64",
"peek_ushort", "poke_ushort", "peek_short", "poke_short",
--- a/src/runtime/bfile.c
+++ b/src/runtime/bfile.c
@@ -408,6 +408,7 @@
}
free(obuf);
p->bf.pos = 0;
+ flushb(p->bfile);
}
void
@@ -621,6 +622,8 @@
/* output last byte(s) */
out_rle(p);
+ p->count = 0;
+ flushb(p->bfile);
}
BFILE *
@@ -664,6 +667,239 @@
}
#endif /* WANT_RLE */
+
+#if WANT_BWT
+/***************** BFILE via Burrows-Wheeler Transform *******************/
+/*
+ */
+
+struct BFILE_bwt {
+ BFILE mets;
+ BFILE *bfile; /* underlying BFILE */
+ size_t count;
+ struct bfbuffer bf;
+ int read;
+ int numflush;
+};
+
+int
+getb_bwt(BFILE *bp)
+{
+ struct BFILE_bwt *p = (struct BFILE_bwt*)bp;
+ CHECKBFILE(bp, getb_bwt);
+ if (p->bf.pos >= p->bf.size)
+ return -1;
+ return p->bf.buf[p->bf.pos++];
+}
+
+void
+ungetb_bwt(int c, BFILE *bp)
+{
+ struct BFILE_bwt *p = (struct BFILE_bwt*)bp;
+ CHECKBFILE(bp, getb_bwt);
+ p->bf.pos--;
+}
+
+void
+putb_bwt(int b, BFILE *bp)
+{
+ struct BFILE_bwt *p = (struct BFILE_bwt*)bp;
+ CHECKBFILE(bp, getb_bwt);
+
+ bfbuffer_snoc(&p->bf, b);
+}
+
+void
+closeb_bwt(BFILE *bp)
+{
+ struct BFILE_bwt *p = (struct BFILE_bwt*)bp;
+ CHECKBFILE(bp, getb_bwt);
+
+ if (!p->read)
+ flushb(bp);
+
+ closeb(p->bfile);
+}
+
+/* Sort all rotations of buf, and the indices of the sorted strings in res. */
+/*
+ * |.......................................|
+ * ^ ^
+ * a b
+ * <- n ->
+ * <- m ->
+ * <- o ->
+ */
+static uint8_t *compar_arg;
+static size_t compar_len;
+int compar(const void *pa, const void *pb)
+{
+ uint32_t a = *(uint32_t*)pa;
+ uint32_t b = *(uint32_t*)pb;
+ int r;
+ if (a == b)
+ return 0;
+ if (a < b) {
+ size_t n = compar_len - b; /* bytes until end of buffer */
+ r = memcmp(compar_arg + a, compar_arg + b, n);
+ if (r)
+ return r;
+ size_t m = b - a;
+ r = memcmp(compar_arg + a + n, compar_arg, m);
+ if (r)
+ return r;
+ size_t o = a;
+ return memcmp(compar_arg, compar_arg + m, o);
+ } else {
+ size_t n = compar_len - a; /* bytes until end of buffer */
+ r = memcmp(compar_arg + a, compar_arg + b, n);
+ if (r)
+ return r;
+ size_t m = a - b;
+ r = memcmp(compar_arg, compar_arg + b + n, m);
+ if (r)
+ return r;
+ size_t o = a;
+ return memcmp(compar_arg + m, compar_arg, o);
+
+ }
+ return 0;
+}
+
+void
+sort_buffer(uint8_t *buf, size_t buflen, uint32_t *res)
+{
+ for(size_t i = 0; i < buflen; i++)
+ res[i] = i;
+ compar_arg = buf;
+ compar_len = buflen;
+ qsort(res, buflen, sizeof(uint32_t), compar);
+}
+
+uint32_t
+encode_bwt(uint8_t *data, size_t len, uint8_t *last)
+{
+ uint32_t *res = malloc(len * sizeof(uint32_t));
+ if (!res)
+ ERR("encode_bwt");
+ sort_buffer(data, len, res);
+ uint32_t zero = 0;
+ for(size_t i = 0; i < len; i++) {
+ uint32_t offs = res[i];
+ last[i] = data[(offs + len - 1) % len];
+ if (offs == 0)
+ zero = i;
+ }
+ return zero;
+}
+
+void
+flushb_bwt(BFILE *bp)
+{
+ struct BFILE_bwt *p = (struct BFILE_bwt*)bp;
+ CHECKBFILE(bp, getb_bwt);
+
+ /* If we have had a flush, and there is no new data then do nothing */
+ if (p->numflush++ && !p->bf.pos)
+ return;
+ putsb("BW1", p->bfile); /* version no */
+ putint32(p->bf.pos, p->bfile); /* 32 bit length */
+ uint8_t *last = malloc(p->bf.pos);
+ if (!last)
+ ERR("flushb_bwt");
+ size_t zero = encode_bwt(p->bf.buf, p->bf.pos, last);
+ putint32(zero, p->bfile);
+ for(size_t i = 0; i < p->bf.pos; i++)
+ putb(last[i], p->bfile);
+ FREE(last);
+ p->bf.pos = 0;
+ flushb(p->bfile);
+}
+
+#define MAXBYTE 256
+
+void
+decode_bwt(uint8_t *data, size_t len, uint8_t *odata, size_t zero)
+{
+ size_t count[MAXBYTE];
+ uint32_t *pred = malloc(len * sizeof(uint32_t));
+ for(size_t i = 0; i < MAXBYTE; i++) {
+ count[i] = 0;
+ }
+ for(size_t i = 0; i < len; i++) {
+ pred[i] = count[data[i]]++;
+ }
+ size_t sum = 0;
+ for(size_t i = 0; i < MAXBYTE; i++) {
+ size_t s = count[i];
+ count[i] = sum;
+ sum += s;
+ }
+ size_t i = zero;
+ for(size_t j = len; j > 0; j--) {
+ odata[j - 1] = data[i];
+ i = pred[i] + count[data[i]];
+ }
+}
+
+BFILE *
+add_bwt_decompressor(BFILE *file)
+{
+ struct BFILE_bwt *p = MALLOC(sizeof(struct BFILE_bwt));
+
+ if (!p)
+ memerr();
+ memset(p, 0, sizeof(struct BFILE_bwt));
+ p->mets.getb = getb_bwt;
+ p->mets.ungetb = ungetb_bwt;
+ p->mets.putb = 0;
+ p->mets.flushb = 0;
+ p->mets.closeb = closeb_bwt;
+ p->read = 1;
+ p->bfile = file;
+ p->numflush = 0;
+
+ /* First check version */
+ if (getb(file) != 'B' || getb(file) != 'W' || getb(file) != '1')
+ ERR("Bad BWT signature");
+
+ size_t size = getint32(file); /* then read size */
+ uint32_t zero = getint32(file);
+
+ uint8_t *buf = MALLOC(size); /* temporary buffer for input */
+ if (!buf)
+ memerr();
+ for(size_t i = 0; i < size; i++) {
+ buf[i] = getb(file); /* and read data */
+ }
+ bfbuffer_init(&p->bf, size);
+ decode_bwt(buf, size, p->bf.buf, zero); /* decode */
+ FREE(buf);
+
+ return (BFILE*)p;
+}
+
+BFILE *
+add_bwt_compressor(BFILE *file)
+{
+ struct BFILE_bwt *p = MALLOC(sizeof(struct BFILE_bwt));
+
+ if (!p)
+ memerr();
+ p->mets.getb = getb_bwt;
+ p->mets.ungetb = 0;
+ p->mets.putb = putb_bwt;
+ p->mets.flushb = flushb_bwt;
+ p->mets.closeb = closeb_bwt;
+ p->read = 0;
+ p->bfile = file;
+ p->numflush = 0;
+
+ bfbuffer_init(&p->bf, 25000);
+ return (BFILE*)p;
+}
+
+#endif /* WANT_BWT */
/***************** BFILE with UTF8 encode/decode *******************/
--- a/src/runtime/eval.c
+++ b/src/runtime/eval.c
@@ -32,6 +32,10 @@
#define WANT_RLE 1
#endif
+#if !defined(WANT_BWT)
+#define WANT_BWT 1
+#endif
+
#if WANT_LZ77
size_t lz77d(uint8_t *src, size_t srclen, uint8_t **bufp);
size_t lz77c(uint8_t *src, size_t srclen, uint8_t **bufp);
@@ -4345,6 +4349,11 @@
void mhs_add_rle_decompressor(int s) { mhs_from_Ptr(s, 1, add_rle_decompressor(mhs_to_Ptr(s, 0))); }
#endif /* WANT_RLE */
+#if WANT_BWT
+void mhs_add_bwt_compressor(int s) { mhs_from_Ptr(s, 1, add_bwt_compressor(mhs_to_Ptr(s, 0))); }
+void mhs_add_bwt_decompressor(int s) { mhs_from_Ptr(s, 1, add_bwt_decompressor(mhs_to_Ptr(s, 0))); }
+#endif /* WANT_BWT */
+
void mhs_calloc(int s) { mhs_from_Ptr(s, 2, calloc(mhs_to_CSize(s, 0), mhs_to_CSize(s, 1))); }
void mhs_free(int s) { free(mhs_to_Ptr(s, 0)); mhs_from_Unit(s, 1); }
void mhs_addr_free(int s) { mhs_from_FunPtr(s, 0, (HsFunPtr)&FREE); }
@@ -4431,7 +4440,6 @@
#if WANT_STDIO
{ "add_FILE", mhs_add_FILE},
{ "add_utf8", mhs_add_utf8},
-//{ "add_rle", mhs_add_rle},
{ "closeb", mhs_closeb},
{ "&closeb", mhs_addr_closeb},
{ "flushb", mhs_flushb},
@@ -4462,6 +4470,11 @@
#if WANT_RLE
{ "add_rle_compressor", mhs_add_rle_compressor},
{ "add_rle_decompressor", mhs_add_rle_decompressor},
+#endif /* WANT_RLE */
+
+#if WANT_BWT
+{ "add_bwt_compressor", mhs_add_bwt_compressor},
+{ "add_bwt_decompressor", mhs_add_bwt_decompressor},
#endif /* WANT_RLE */
{ "calloc", mhs_calloc},