shithub: desereter

--- /dev/null

+++ b/desereter.ml

@@ -1,0 +1,172 @@

+let entry_rgx = Str.regexp "\\([.a-z'-]+\\)(?[0-9]?)? \\(.*\\)"

+type wordset = {prefix: string; word: string; suffix: string}

+let is_uppercase = function

+  | 'A'

+   |'B'

+   |'C'

+   |'D'

+   |'E'

+   |'F'

+   |'G'

+   |'H'

+   |'I'

+   |'J'

+   |'K'

+   |'L'

+   |'M'

+   |'N'

+   |'O'

+   |'P'

+   |'Q'

+   |'R'

+   |'S'

+   |'T'

+   |'U'

+   |'V'

+   |'W'

+   |'X'

+   |'Y'

+   |'Z' ->

+      true

+  | _ -> false

+let get_vowel vowel =

+  if String.length vowel != 3 then ("", "")

+  else

+    let trimmed = String.sub vowel 0 2 in

+    match trimmed with

+    | "IY" -> ("𐐀", "𐐨")

+    | "EY" -> ("𐐁", "𐐩")

+    | "AA" -> ("𐐂", "𐐪")

+    | "AO" -> ("𐐉", "𐐱")

+    | "OW" -> ("𐐄", "𐐬")

+    | "UW" -> ("𐐅", "𐐭")

+    | "IH"

+     |"IX" ->

+        ("𐐆", "𐐮")

+    | "EH" -> ("𐐇", "𐐯")

+    | "AE" -> ("𐐈", "𐐰")

+    | "AX" -> ("𐐉", "𐐱")

+    | "AH" -> ("𐐊", "𐐲")

+    | "UH" -> ("𐐋", "𐐳")

+    | "AY" -> ("𐐌", "𐐴")

+    | "AW" -> ("𐐍", "𐐵")

+    | "ER" -> ("𐐊𐐡", "𐐊𐑉")

+    | _ -> ("", "")

+let get_char = function

+  | "W" -> ("𐐎", "𐐶")

+  | "Y" -> ("𐐏", "𐐷")

+  | "H"

+   |"HH" ->

+      ("𐐐", "𐐸")

+  | "P" -> ("𐐑", "𐐹")

+  | "B" -> ("𐐒", "𐐺")

+  | "T" -> ("𐐓", "𐐻")

+  | "D" -> ("𐐔", "𐐼")

+  | "CH" -> ("𐐕", "𐐽")

+  | "JH" -> ("𐐖", "𐐾")

+  | "K" -> ("𐐗", "𐐿")

+  | "G" -> ("𐐘", "𐑀")

+  | "F" -> ("𐐙", "𐑁")

+  | "V" -> ("𐐚", "𐑂")

+  | "TH" -> ("𐐛", "𐑃")

+  | "DH" -> ("𐐜", "𐑄")

+  | "S" -> ("𐐝", "𐑅")

+  | "Z" -> ("𐐞", "𐑆")

+  | "SH" -> ("𐐟", "𐑇")

+  | "ZH" -> ("𐐠", "𐑈")

+  | "R" -> ("𐐡", "𐑉")

+  | "L" -> ("𐐢", "𐑊")

+  | "M" -> ("𐐣", "𐑋")

+  | "N" -> ("𐐤", "𐑌")

+  | "NX"

+   |"NG" ->

+      ("𐐥", "𐑍")

+  | v -> get_vowel v

+let rec parse_arpabet line des uppercase =

+  match line with

+  | hd :: tl ->

+      let u, l = get_char hd in

+      (if uppercase then u else l) ^ parse_arpabet tl des false

+  | [] -> des

+let unquoted word = String.sub word 1 (String.length word - 2)

+let consider word =

+  let wrd = ref (String.lowercase_ascii word) in

+  let prefix = ref "" in

+  let suffix = ref "" in

+  ( try

+      let pos = Str.search_forward (Str.regexp "[({\"]") !wrd 0 + 1 in

+      wrd := String.sub word pos (String.length !wrd - pos) ;

+      prefix := String.sub word 0 pos

+    with Not_found -> () ) ;

+  ( try

+      let pos = Str.search_backward (Str.regexp "[})\"\\.,!;:]") !wrd (String.length !wrd) in

+      suffix := String.sub !wrd pos (String.length !wrd - pos) ;

+      wrd := String.sub !wrd 0 pos;

+    with Not_found -> () ) ;

+  {prefix= !prefix; word= !wrd; suffix= !suffix}

+let parse word dictionary =

+  let uppercase = is_uppercase word.[0] in

+  let wordparts = consider word in

+  try

+    let des = parse_arpabet (String.split_on_char ' ' (Hashtbl.find dictionary wordparts.word)) "" uppercase in

+    wordparts.prefix ^ des ^ wordparts.suffix

+  with Not_found -> word

+let sanitize line = Str.global_replace (Str.regexp "\\.\\.\\.") " ... " line

+let load_dictionary extra =

+  let default =

+		try

+			let prefix = Unix.getenv "OPAM_SWITCH_PREFIX" in

+			prefix ^ "/share/desereter/cmudict.dict"

+		with Not_found -> "/lib/cmudict.dict"

+  in

+  let dictionaries = [default] @ String.split_on_char ';' extra in

+  let dictionary = Hashtbl.create 150000 in

+  let load file =

+    if String.length file > 0 then

+      let ic = open_in file in

+      try

+        while true do

+          let entry = input_line ic in

+          if Str.string_match entry_rgx entry 0 then

+            let word = Str.matched_group 1 entry in

+            let pronunciation = Str.matched_group 2 entry in

+            Hashtbl.add dictionary word pronunciation

+        done

+      with End_of_file -> close_in ic in

+  List.iter load dictionaries ; dictionary

+let translate dictionary line =

+  let words = String.split_on_char ' ' (sanitize line) in

+  let words =

+    List.filter

+      (fun x ->

+        let w = String.trim x in

+        String.length w > 0 )

+      words in

+  print_endline (List.fold_left (fun acc word -> acc ^ parse word dictionary ^ " ") "" words)

+let () =

+  let line = ref "" in

+  let extra = ref "" in

+  Arg.parse

+    [(* ("-i", Arg.Set_string line, "input"); *) ("-d", Arg.Set_string extra, "dictionary")]

+    (fun x -> line := x)

+    (Sys.argv.(0) ^ " [-d dictionary] <-i input>") ;

+  let dictionary = load_dictionary !extra in

+  if String.length !line > 0 then translate dictionary !line

+  else

+    try

+      while true do

+        read_line () |> translate dictionary

+      done

+    with End_of_file -> ()

--- /dev/null

+++ b/desereter.opam

@@ -1,0 +1,26 @@

+# This file is generated by dune, edit dune-project instead

+opam-version: "2.0"

+version: "19"

+synopsis: "Tool to convert English text to the Deseret Alphabet"

+maintainer: ["smazga@greymanlabs.com"]

+authors: ["smazga"]

+homepage: "https://shithub.us/git/zgasma/desereter/HEAD/info.html"

+bug-reports: "no"

+depends: [

+  "dune" {>= "2.7"}

+  "odoc" {with-doc}

+]

+build: [

+  ["dune" "subst"] {dev}

+  [

+    "dune"

+    "build"

+    "-p"

+    name

+    "-j"

+    jobs

+    "@install"

+    "@runtest" {with-test}

+    "@doc" {with-doc}

+  ]

+]

--- /dev/null

+++ b/dune

@@ -1,0 +1,11 @@

+(executable

+ (name desereter)

+ (libraries unix str))

+(install

+ (section bin)

+  (files (desereter.exe as desereter)))

+(install

+ (section share)

+ (files cmudict.dict))

--- /dev/null

+++ b/dune-project

@@ -1,0 +1,13 @@

+(lang dune 2.7)

+(version 19)

+(name desereter)

+(maintainers smazga@greymanlabs.com)

+(authors smazga)

+(homepage https://shithub.us/git/zgasma/desereter/HEAD/info.html)

+(bug_reports no)

+(generate_opam_files true)

+(package

+  (name desereter)

+  (synopsis "Tool to convert English text to the Deseret Alphabet"))

--- /dev/null

+++ b/mkfile

@@ -1,0 +1,14 @@

+TARG=desereter

+$TARG: desereter.ml

+	ocamlc str.cma unix.cma desereter.ml -o $target

+install:V: $TARG

+	cp $TARG $home/bin/

+	cp cmudict.dict /lib

+uninstall:V:

+	rm -f $home/bin/$TARG /lib/cmudict.dict

+clean:V:

+	rm -f $TARG *cmi *cmo