ref: 91b74e4777120478cf658bad19323e1c147b0449
author: McKay Marston <mckay.marston@greymanlabs.com>
date: Fri Feb 5 16:44:10 EST 2021
The first commit to shithub.
--- /dev/null
+++ b/desereter.ml
@@ -1,0 +1,172 @@
+let entry_rgx = Str.regexp "\\([.a-z'-]+\\)(?[0-9]?)? \\(.*\\)"
+
+type wordset = {prefix: string; word: string; suffix: string}
+
+let is_uppercase = function
+ | 'A'
+ |'B'
+ |'C'
+ |'D'
+ |'E'
+ |'F'
+ |'G'
+ |'H'
+ |'I'
+ |'J'
+ |'K'
+ |'L'
+ |'M'
+ |'N'
+ |'O'
+ |'P'
+ |'Q'
+ |'R'
+ |'S'
+ |'T'
+ |'U'
+ |'V'
+ |'W'
+ |'X'
+ |'Y'
+ |'Z' ->
+ true
+ | _ -> false
+
+let get_vowel vowel =
+ if String.length vowel != 3 then ("", "")
+ else
+ let trimmed = String.sub vowel 0 2 in
+ match trimmed with
+ | "IY" -> ("𐐀", "𐐨")
+ | "EY" -> ("𐐁", "𐐩")
+ | "AA" -> ("𐐂", "𐐪")
+ | "AO" -> ("𐐉", "𐐱")
+ | "OW" -> ("𐐄", "𐐬")
+ | "UW" -> ("𐐅", "𐐭")
+ | "IH"
+ |"IX" ->
+ ("𐐆", "𐐮")
+ | "EH" -> ("𐐇", "𐐯")
+ | "AE" -> ("𐐈", "𐐰")
+ | "AX" -> ("𐐉", "𐐱")
+ | "AH" -> ("𐐊", "𐐲")
+ | "UH" -> ("𐐋", "𐐳")
+ | "AY" -> ("𐐌", "𐐴")
+ | "AW" -> ("𐐍", "𐐵")
+ | "ER" -> ("𐐊𐐡", "𐐊𐑉")
+ | _ -> ("", "")
+
+let get_char = function
+ | "W" -> ("𐐎", "𐐶")
+ | "Y" -> ("𐐏", "𐐷")
+ | "H"
+ |"HH" ->
+ ("𐐐", "𐐸")
+ | "P" -> ("𐐑", "𐐹")
+ | "B" -> ("𐐒", "𐐺")
+ | "T" -> ("𐐓", "𐐻")
+ | "D" -> ("𐐔", "𐐼")
+ | "CH" -> ("𐐕", "𐐽")
+ | "JH" -> ("𐐖", "𐐾")
+ | "K" -> ("𐐗", "𐐿")
+ | "G" -> ("𐐘", "𐑀")
+ | "F" -> ("𐐙", "𐑁")
+ | "V" -> ("𐐚", "𐑂")
+ | "TH" -> ("𐐛", "𐑃")
+ | "DH" -> ("𐐜", "𐑄")
+ | "S" -> ("𐐝", "𐑅")
+ | "Z" -> ("𐐞", "𐑆")
+ | "SH" -> ("𐐟", "𐑇")
+ | "ZH" -> ("𐐠", "𐑈")
+ | "R" -> ("𐐡", "𐑉")
+ | "L" -> ("𐐢", "𐑊")
+ | "M" -> ("𐐣", "𐑋")
+ | "N" -> ("𐐤", "𐑌")
+ | "NX"
+ |"NG" ->
+ ("𐐥", "𐑍")
+ | v -> get_vowel v
+
+let rec parse_arpabet line des uppercase =
+ match line with
+ | hd :: tl ->
+ let u, l = get_char hd in
+ (if uppercase then u else l) ^ parse_arpabet tl des false
+ | [] -> des
+
+let unquoted word = String.sub word 1 (String.length word - 2)
+
+let consider word =
+ let wrd = ref (String.lowercase_ascii word) in
+ let prefix = ref "" in
+ let suffix = ref "" in
+ ( try
+ let pos = Str.search_forward (Str.regexp "[({\"]") !wrd 0 + 1 in
+ wrd := String.sub word pos (String.length !wrd - pos) ;
+ prefix := String.sub word 0 pos
+ with Not_found -> () ) ;
+ ( try
+ let pos = Str.search_backward (Str.regexp "[})\"\\.,!;:]") !wrd (String.length !wrd) in
+ suffix := String.sub !wrd pos (String.length !wrd - pos) ;
+ wrd := String.sub !wrd 0 pos;
+ with Not_found -> () ) ;
+ {prefix= !prefix; word= !wrd; suffix= !suffix}
+
+let parse word dictionary =
+ let uppercase = is_uppercase word.[0] in
+ let wordparts = consider word in
+ try
+ let des = parse_arpabet (String.split_on_char ' ' (Hashtbl.find dictionary wordparts.word)) "" uppercase in
+ wordparts.prefix ^ des ^ wordparts.suffix
+ with Not_found -> word
+
+let sanitize line = Str.global_replace (Str.regexp "\\.\\.\\.") " ... " line
+
+let load_dictionary extra =
+ let default =
+ try
+ let prefix = Unix.getenv "OPAM_SWITCH_PREFIX" in
+ prefix ^ "/share/desereter/cmudict.dict"
+ with Not_found -> "/lib/cmudict.dict"
+ in
+ let dictionaries = [default] @ String.split_on_char ';' extra in
+ let dictionary = Hashtbl.create 150000 in
+ let load file =
+ if String.length file > 0 then
+ let ic = open_in file in
+ try
+ while true do
+ let entry = input_line ic in
+ if Str.string_match entry_rgx entry 0 then
+ let word = Str.matched_group 1 entry in
+ let pronunciation = Str.matched_group 2 entry in
+ Hashtbl.add dictionary word pronunciation
+ done
+ with End_of_file -> close_in ic in
+ List.iter load dictionaries ; dictionary
+
+let translate dictionary line =
+ let words = String.split_on_char ' ' (sanitize line) in
+ let words =
+ List.filter
+ (fun x ->
+ let w = String.trim x in
+ String.length w > 0 )
+ words in
+ print_endline (List.fold_left (fun acc word -> acc ^ parse word dictionary ^ " ") "" words)
+
+let () =
+ let line = ref "" in
+ let extra = ref "" in
+ Arg.parse
+ [(* ("-i", Arg.Set_string line, "input"); *) ("-d", Arg.Set_string extra, "dictionary")]
+ (fun x -> line := x)
+ (Sys.argv.(0) ^ " [-d dictionary] <-i input>") ;
+ let dictionary = load_dictionary !extra in
+ if String.length !line > 0 then translate dictionary !line
+ else
+ try
+ while true do
+ read_line () |> translate dictionary
+ done
+ with End_of_file -> ()
--- /dev/null
+++ b/desereter.opam
@@ -1,0 +1,26 @@
+# This file is generated by dune, edit dune-project instead
+opam-version: "2.0"
+version: "19"
+synopsis: "Tool to convert English text to the Deseret Alphabet"
+maintainer: ["smazga@greymanlabs.com"]
+authors: ["smazga"]
+homepage: "https://shithub.us/git/zgasma/desereter/HEAD/info.html"
+bug-reports: "no"
+depends: [
+ "dune" {>= "2.7"}
+ "odoc" {with-doc}
+]
+build: [
+ ["dune" "subst"] {dev}
+ [
+ "dune"
+ "build"
+ "-p"
+ name
+ "-j"
+ jobs
+ "@install"
+ "@runtest" {with-test}
+ "@doc" {with-doc}
+ ]
+]
--- /dev/null
+++ b/dune
@@ -1,0 +1,11 @@
+(executable
+ (name desereter)
+ (libraries unix str))
+
+(install
+ (section bin)
+ (files (desereter.exe as desereter)))
+
+(install
+ (section share)
+ (files cmudict.dict))
--- /dev/null
+++ b/dune-project
@@ -1,0 +1,13 @@
+(lang dune 2.7)
+(version 19)
+(name desereter)
+(maintainers smazga@greymanlabs.com)
+(authors smazga)
+(homepage https://shithub.us/git/zgasma/desereter/HEAD/info.html)
+
+(bug_reports no)
+(generate_opam_files true)
+
+(package
+ (name desereter)
+ (synopsis "Tool to convert English text to the Deseret Alphabet"))
--- /dev/null
+++ b/mkfile
@@ -1,0 +1,14 @@
+TARG=desereter
+
+$TARG: desereter.ml
+ ocamlc str.cma unix.cma desereter.ml -o $target
+
+install:V: $TARG
+ cp $TARG $home/bin/
+ cp cmudict.dict /lib
+
+uninstall:V:
+ rm -f $home/bin/$TARG /lib/cmudict.dict
+
+clean:V:
+ rm -f $TARG *cmi *cmo