ref: 7c1994bd809aea4c535997c09f6703675986f98e
parent: e98804740dd0d36ee04d710dc0859d31aa0126a5
parent: b0154e4d08ef5ddbb9e791e4e999eadadc23b614
author: Sai Krishna <srallaba@cs.cmu.edu>
date: Mon Jun 24 12:18:25 EDT 2019
Merge pull request #10 from krishnshyam/hear2read-update Update for Indic: Hear2Read
--- /dev/null
+++ b/cmu_indic_lang.c
@@ -1,0 +1,729 @@
+/*************************************************************************/
+/* */
+/* Language Technologies Institute */
+/* Carnegie Mellon University */
+/* Copyright (c) 2013 */
+/* All Rights Reserved. */
+/* */
+/* Permission is hereby granted, free of charge, to use and distribute */
+/* this software and its documentation without restriction, including */
+/* without limitation the rights to use, copy, modify, merge, publish, */
+/* distribute, sublicense, and/or sell copies of this work, and to */
+/* permit persons to whom this work is furnished to do so, subject to */
+/* the following conditions: */
+/* 1. The code must retain the above copyright notice, this list of */
+/* conditions and the following disclaimer. */
+/* 2. Any modifications must be clearly marked as such. */
+/* 3. Original authors' names are not deleted. */
+/* 4. The authors' names are not used to endorse or promote products */
+/* derived from this software without specific prior written */
+/* permission. */
+/* */
+/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
+/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
+/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
+/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
+/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
+/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
+/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
+/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
+/* THIS SOFTWARE. */
+/* */
+/*************************************************************************/
+/* */
+/* indic language support */
+/* */
+/*************************************************************************/
+#include "flite.h"
+#include "cst_val.h"
+#include "cst_voice.h"
+#include "cst_lexicon.h"
+#include "cst_ffeatures.h"
+#include "cmu_indic_lang.h"
+#include "cst_tokenstream.h"
+
+/* Precompiled regular expression matching a token made up entirely of */
+/* ASCII digits and commas.  The 31-byte program below was produced by  */
+/* the command in the next comment; regenerate it with that tool rather */
+/* than editing the bytes by hand. */
+/* ./bin/compile_regexes cst_rx_eng_digits_only "^[0-9,]+$" */
+static const unsigned char cst_rx_eng_digits_only_rxprog[] = {
+ 156, 6, 0, 27, 1, 0, 3, 11, 0, 18, 4, 0, 0, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 44, 0, 2, 0, 3, 0, 0, 0,
+};
+static const cst_regex cst_rx_eng_digits_only_rx = {
+ 0, 1, NULL, 0, 31,
+ (char *)cst_rx_eng_digits_only_rxprog
+};
+const cst_regex * const cst_rx_eng_digits_only = &cst_rx_eng_digits_only_rx;
+
+/* Precompiled regular expression used to recognise tokens written only */
+/* with ASCII letters, digits and a few punctuation marks (i.e. tokens   */
+/* that hold no Indic-script characters).                                */
+/* NOTE(review): inside the bracket expression "'-," was compiled as a   */
+/* range, so the program below lists bytes 39-44 (' ( ) * + ,) and does  */
+/* not contain '-' (45) itself -- confirm whether a literal hyphen was   */
+/* intended before regenerating.                                         */
+/* ./bin/compile_regexes cst_rx_not_indic "^[0-9a-zA-Z/:_'-,]+$" */
+static const unsigned char cst_rx_not_indic_rxprog[] = {
+ 156, 6, 0, 87, 1, 0, 3, 11, 0, 78, 4, 0, 0, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 97, 98, 99, 100, 101, 102, 103, 104, 105,
+ 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
+ 122, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 47, 58, 95, 39, 40,
+ 41, 42, 43, 44, 0, 2, 0, 3, 0, 0, 0,
+};
+static const cst_regex cst_rx_not_indic_rx = {
+ 0, 1, NULL, 0, 91,
+ (char *)cst_rx_not_indic_rxprog
+};
+const cst_regex * const cst_rx_not_indic = &cst_rx_not_indic_rx;
+
+/* Precompiled regular expression for ASCII numbers written with Indian  */
+/* digit grouping: two leading digits (the first one non-zero) and a     */
+/* comma, any number of further two-digit groups each followed by a      */
+/* comma, and a final group of exactly three digits, e.g. 10,34,123.     */
+/* ./bin/compile_regexes cst_rx_indic_eng_number "^[1-9][0-9],\\([0-9][0-9],\\)*[0-9][0-9][0-9]$" */
+static const unsigned char cst_rx_indic_eng_number_rxprog[] = {
+ 156, 6, 0, 137, 1, 0, 3, 4, 0, 13, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 0, 4, 0, 14, 48, 49, 50, 51, 52, 53, 54, 55, 56,
+ 57, 0, 8, 0, 5, 44, 0, 6, 0, 48, 21, 0, 3, 6, 0, 36,
+ 4, 0, 14, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 0, 4, 0,
+ 14, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 0, 8, 0, 5, 44,
+ 0, 31, 0, 3, 7, 0, 45, 6, 0, 3, 9, 0, 3, 4, 0, 14,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 0, 4, 0, 14, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 0, 4, 0, 14, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 0, 2, 0, 3, 0, 0, 0,
+};
+static const cst_regex cst_rx_indic_eng_number_rx = {
+ 0, 1, NULL, 0, 141,
+ (char *)cst_rx_indic_eng_number_rxprog
+};
+const cst_regex * const cst_rx_indic_eng_number = &cst_rx_indic_eng_number_rx;
+
+cst_val *us_tokentowords(cst_item *token);
+
+/* Note that's an ASCII '|', not the Devanagari one */
+const cst_string * const indic_postpunctuationsymbols = "\"'`.,:;!?(){}[]|";
+
+static cst_val *cmu_indic_tokentowords_one(cst_item *token, const char *name);
+cst_val *cmu_indic_tokentowords(cst_item *token) {
+ return cmu_indic_tokentowords_one(token, item_feat_string(token, "name"));
+}
+
+/* Indic numbers. This deals with all (quantity) numbers found in any Indic */
+/* language no matter what script they are written in. We use the Indic_Nums */
+/* table to convert the strings of digits (points and commas) into lists of */
+/* words for those scripts' language. Thus Telugu digits get converted to */
+/* Telugu words (even if the voice is a Hindi voice). */
+/* We assume lakh and crore grouping when commas are present in the number; */
+/* thus 10,34,123 (in English digits) will be expanded to ten lakh, thirty */
+/* four thousand one hundred (and) twenty three */
+
+/* We do English too, so I can debug it, and so lakh and crore are right */
+#include "indic_eng_num_table.h"
+#include "indic_hin_num_table.h"
+#include "indic_guj_num_table.h"
+#include "indic_kan_num_table.h"
+#include "indic_mar_num_table.h"
+#include "indic_san_num_table.h"
+#include "indic_tel_num_table.h"
+#include "indic_tam_num_table.h"
+#include "indic_pan_num_table.h"
+
+
+/* Expected byte length of a UTF-8 sequence, given its first (most         */
+/* significant) byte.  Only declared here; the definition lives outside    */
+/* this file (presumably with the tokenstream code -- TODO confirm).       */
+/* An earlier inline version in this file computed it as:                  */
+/*     (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1                    */
+int ts_utf8_sequence_length(char c0);
+
+
+int indic_digit_to_offset(const char *ind_digit)
+{
+    /* Map a single digit character, written in ASCII or in one of the     */
+    /* Indic scripts listed below, to its numeric value 0-9.  Returns -1   */
+    /* when the character is not one of those digits.                      */
+
+    /* Code point of the digit zero in each script; the ten digits of a    */
+    /* script occupy the ten consecutive code points starting there.       */
+    static const int zero_code_point[] = {
+        0x0030, /* ASCII */
+        0x0966, /* Devanagari */
+        0x09E6, /* Bengali */
+        0x0A66, /* Gurmukhi */
+        0x0AE6, /* Gujarati */
+        0x0B66, /* Oriya */
+        0x0BE6, /* Tamil */
+        0x0C66, /* Telugu */
+        0x0CE6, /* Kannada */
+        0x0D66  /* Malayalam */
+    };
+    const int n_scripts =
+        (int)(sizeof(zero_code_point) / sizeof(zero_code_point[0]));
+    const int code_point = cst_utf8_ord_string(ind_digit);
+    int k;
+
+    for (k = 0; k < n_scripts; k++)
+    {
+        const int zero = zero_code_point[k];
+        if ((code_point >= zero) && (code_point <= zero + 9))
+            return code_point - zero;
+    }
+
+    /* Not a digit */
+    return -1;
+}
+
+static cst_val *indic_number_digit(const char *digit,const indic_num_table *t)
+{
+    /* Build a one-element list holding the word that table t gives for a  */
+    /* single digit.  Returns NULL when either argument is NULL, or (after */
+    /* printing a diagnostic) when digit is not a recognised digit.        */
+    int value;
+
+    if (!digit || !t)
+        return NULL;
+
+    value = indic_digit_to_offset(digit);
+    if (value != -1)
+        /* The digit's own value is its row in the single-digit table */
+        return cons_val(string_val(num_table_digit(t,value,1)),NULL);
+
+    printf("Error in getting int from digit %s\n", digit);
+    return NULL;
+}
+
+static cst_val *indic_number_two_digit(const char *digit1,
+                                       const char *digit2,
+                                       const indic_num_table *t)
+{
+    /* Build the list of words that table t gives for the two-digit number */
+    /* digit1 digit2 (10-99).  Returns NULL when an argument is NULL or,   */
+    /* after printing a diagnostic, when either character is not a digit.  */
+    /* A leading zero is reported and the number is read as the single     */
+    /* digit digit2.                                                       */
+    int tens, units, row;
+    cst_val *words = NULL;
+
+    if ((digit1 == NULL) || (digit2 == NULL) || (t == NULL))
+        return NULL;
+
+    tens = indic_digit_to_offset(digit1);
+    units = indic_digit_to_offset(digit2);
+
+    if (tens == -1)
+    {
+        printf("Error in getting int from digit %s\n", digit1);
+        return NULL;
+    }
+    if (units == -1)
+    {
+        printf("Error in getting int from digit %s\n", digit2);
+        return NULL;
+    }
+
+    if (tens == 0)
+    {
+        /* "0x" is really the single digit x, so speak x.  (This used to   */
+        /* return the word for the leading zero and drop x altogether.)    */
+        printf("Single digit erroneously processed as double digit %s\n", digit2);
+        return cons_val(string_val(num_table_digit(t,units,1)),NULL);
+    }
+
+    /* The two-digit table starts at 10, hence row 10*(tens-1)+units */
+    row = 10*(tens-1)+units;
+
+    /* Columns 2 and 3 hold the word(s); the list is built back to front   */
+    /* so that the column 2 word comes first.                              */
+    if (num_table_two_digit(t,row,3) != NULL)
+        words = cons_val(string_val(num_table_two_digit(t,row,3)),words);
+    if (num_table_two_digit(t,row,2) != NULL)
+        words = cons_val(string_val(num_table_two_digit(t,row,2)),words);
+
+    return words;
+}
+
+/* Accessors for the fixed strings of a number table.  Each call wraps the */
+/* table's string in a new value made by string_val().                     */
+
+static cst_val *indic_number_lang(const indic_num_table *num_table)
+{   /* Language tag of the table */
+    const char *lang = num_table->lang;
+    return string_val(lang);
+}
+
+static cst_val *indic_number_hundred(const indic_num_table *num_table)
+{   /* Word for "hundred" */
+    const char *word = num_table->hundred;
+    return string_val(word);
+}
+
+static cst_val *indic_number_thousand(const indic_num_table *num_table)
+{   /* Word for "thousand" */
+    const char *word = num_table->thousand;
+    return string_val(word);
+}
+
+static cst_val *indic_number_lakh(const indic_num_table *num_table)
+{   /* Word for "lakh" */
+    const char *word = num_table->lakh;
+    return string_val(word);
+}
+
+static cst_val *indic_number_crore(const indic_num_table *num_table)
+{   /* Word for "crore" */
+    const char *word = num_table->crore;
+    return string_val(word);
+}
+
+cst_val *indic_number(const cst_val *number,
+                      const indic_num_table *num_table)
+{
+    /* Expand a list of single-digit strings (most significant digit       */
+    /* first, in any supported script -- we don't care which) into the     */
+    /* list of words for that quantity, grouping as hundred / thousand /   */
+    /* lakh / crore.  Leading zeros are skipped and a trailing "00" adds   */
+    /* no words.  Returns NULL for an empty list and for lists of more     */
+    /* than nine digits (the caller reads those out digit by digit).       */
+    int ndigits, first;
+    const char *d1, *d2 = NULL;
+    const cst_val *rest;
+    cst_val *head, *scale;
+
+    if (number == NULL)
+        return NULL;
+
+    /* Walk the list once here instead of once per branch */
+    ndigits = val_length(number);
+    d1 = val_string(val_car(number));
+    if (ndigits >= 2)
+        d2 = val_string(val_car(val_cdr(number)));
+    first = indic_digit_to_offset(d1);
+
+    if ((first == 0) && (ndigits == 2))
+    {
+        /* "0x": say only the last digit, and nothing at all for "00" so   */
+        /* that a round number doesn't end in "zero"                       */
+        if (indic_digit_to_offset(d2) != 0)
+            return indic_number_digit(d2,num_table);
+        return NULL;
+    }
+    if ((first == 0) && (ndigits > 1))
+        /* Leading zero in front of more digits: drop it */
+        return indic_number(val_cdr(number),num_table);
+
+    if (ndigits == 1)
+        return indic_number_digit(d1,num_table);
+    if (ndigits == 2)
+        return indic_number_two_digit(d1,d2,num_table);
+
+    if (ndigits == 3)
+    {
+        const char *d3 = val_string(val_car(val_cdr(val_cdr(number))));
+        /* The tag comes back as a freshly made value, so it has to be     */
+        /* deleted again once compared (it used to be leaked).             */
+        cst_val *lang = indic_number_lang(num_table);
+        /* Marathi gets a different word for an exact hundred (x00) */
+        const int marathi_exact = cst_streq(val_string(lang),"mar") &&
+            (indic_digit_to_offset(d2) == 0) &&
+            (indic_digit_to_offset(d3) == 0);
+        delete_val(lang);
+
+        if (marathi_exact)
+            scale = string_val("शंभर");
+        else
+            scale = indic_number_hundred(num_table);
+        return val_append(indic_number_digit(d1,num_table),
+                          cons_val(scale,
+                                   indic_number(val_cdr(number),num_table)));
+    }
+
+    /* 4-9 digits: pick the scale word from the length ... */
+    switch (ndigits)
+    {
+    case 4:
+    case 5:
+        scale = indic_number_thousand(num_table);
+        break;
+    case 6:
+    case 7:
+        scale = indic_number_lakh(num_table);
+        break;
+    case 8:
+    case 9:
+        scale = indic_number_crore(num_table);
+        break;
+    default:
+        /* More than nine digits is not handled as a quantity */
+        return NULL;
+    }
+
+    /* ... and the size of the leading group from its parity: 5, 7 and 9   */
+    /* digits start with a two-digit group, 4, 6 and 8 with a single digit */
+    if (ndigits % 2)
+    {
+        head = indic_number_two_digit(d1,d2,num_table);
+        rest = val_cdr(val_cdr(number));
+    }
+    else
+    {
+        head = indic_number_digit(d1,num_table);
+        rest = val_cdr(number);
+    }
+
+    return val_append(head,
+                      cons_val(scale,indic_number(rest,num_table)));
+}
+
+cst_val *indic_number_indiv(const cst_val *number,
+                            const indic_num_table *num_table)
+{
+    /* Expand this as a string of digits (not an actual quantity): the     */
+    /* word for the first digit followed by the expansion of the rest.     */
+    cst_val *first_word;
+    cst_val *other_words;
+
+    if (number == NULL)
+        return NULL;
+
+    first_word = indic_number_digit(val_string(val_car(number)),num_table);
+    other_words = indic_number_indiv(val_cdr(number),num_table);
+
+    return val_append(first_word,other_words);
+}
+
+#if 0
+static int indic_nump_old(const char *number)
+{
+ /* True if all (unicode) characters are in num_table's digit table */
+ /* or is a comma or dot */
+ cst_val *p;
+ const cst_val *q;
+ int i;
+ int flag = TRUE;
+ int fflag;
+
+ p = cst_utf8_explode(number);
+ for (q=p; q && (flag==TRUE); q=val_cdr(q))
+ {
+ fflag = FALSE;
+ for (i=0; i<10; i++)
+ {
+ if (indic_digit_to_offset(val_string(val_car(q))) != -1)
+ {
+ fflag = TRUE;
+ break;
+ }
+ }
+ if ((cst_streq(val_string(val_car(q)),",")) ||
+ /* English zeros sometimes occur */
+ (cst_streq(val_string(val_car(q)),"0")))
+ fflag = TRUE;
+ flag = fflag;
+ }
+ delete_val(p); p = NULL;
+
+ return flag;
+
+}
+#endif
+
+
+static int indic_nump(const char *number)
+{
+    /* Rate how numeric a string is, ignoring any leading commas:          */
+    /*   2  every character is a digit (any supported script) or a comma   */
+    /*   1  it starts with a digit but some later character is neither     */
+    /*   0  it is empty, only commas, or does not start with a digit       */
+    cst_val *chars;
+    const cst_val *c;
+    int all_numeric = TRUE;
+    int seen_digit = FALSE;   /* Switches to TRUE at first digit found */
+
+    /* Catch lone commas */
+    while (number[0] == ',')
+        number++;
+
+    /* Check if non-empty string */
+    if (!number[0])
+        return FALSE;
+
+    chars = cst_utf8_explode(number);
+    for (c=chars; c && (all_numeric==TRUE); c=val_cdr(c))
+    {
+        const char *ch = val_string(val_car(c));
+
+        if (indic_digit_to_offset(ch) != -1)
+            seen_digit = TRUE;
+        else if (!cst_streq(ch,","))
+            all_numeric = FALSE;    /* stops the scan at this character */
+    }
+    delete_val(chars);
+
+    return all_numeric+seen_digit;
+}
+
+static int indic_hyphenated(const char *number)
+{
+    /* Non-zero when the first character is '-', '/' or '.' and the rest   */
+    /* of the string is numeric; the value is whatever indic_nump() says   */
+    /* about that rest.  Zero for any other first character.               */
+    switch (number[0])
+    {
+    case '-':
+    case '/':
+    case '.':
+        return indic_nump(&number[1]);
+    default:
+        return 0;
+    }
+}
+
+static int indic_text_splitable(const char *s,int i,int len1)
+{
+    /* Returns true only if exactly one of this character and the next is  */
+    /* a digit, i.e. they are not both digits and not both non-digits.     */
+    /*   s     the UTF-8 string being scanned                              */
+    /*   i     byte offset in s of this character                          */
+    /*   len1  byte length of this character                               */
+    /* Reaching the end of s counts as "next is not a digit".              */
+
+    char *ccc, *ddd;  /* Store this character and the next character */
+    int len2;         /* Length of next character */
+    int avail;        /* Bytes really present before the terminator */
+    int this_is_digit, next_is_digit;
+
+    /* Don't step over the terminator if s ends inside this character */
+    avail = (int)cst_strlen(&s[i]);
+    if (len1 > avail)
+        len1 = avail;
+
+    ccc = cst_strdup(&s[i]);
+    ddd = cst_strdup(&s[i+len1]);
+
+    /* The copy of the tail may be shorter than the length promised by its */
+    /* first byte (it is empty at the end of s), so clamp before cutting:  */
+    /* the unclamped write used to land outside the allocation.            */
+    len2 = ts_utf8_sequence_length(ddd[0]);
+    avail = (int)cst_strlen(ddd);
+    if (len2 > avail)
+        len2 = avail;
+
+    ccc[len1] = '\0';
+    ddd[len2] = '\0';
+
+    this_is_digit = (indic_digit_to_offset(ccc) != -1);
+    next_is_digit = (indic_digit_to_offset(ddd) != -1);
+
+    cst_free(ccc);
+    cst_free(ddd);
+
+    return this_is_digit != next_is_digit;
+}
+
+
+
+
+static cst_val *indic_num_normalize(const char *number,
+                                    const indic_num_table *num_table)
+{
+    /* Explode number into a list of one-character strings and leave out   */
+    /* every ",".  num_table is accepted but not consulted.  The result is */
+    /* a new list; the callers in this file delete it when done.           */
+    cst_val *exploded = cst_utf8_explode(number);
+    cst_val *kept = NULL;   /* collected back to front, reversed at the end */
+    const cst_val *c = exploded;
+
+    while (c != NULL)
+    {
+        const char *ch = val_string(val_car(c));
+
+        if (!cst_streq(ch,","))
+            kept = cons_val(string_val(ch),kept);
+        c = val_cdr(c);
+    }
+
+    delete_val(exploded);
+    return val_reverse(kept);
+}
+
+/* Expand a string made only of digits and commas: drop the commas, then   */
+/* read up to nine digits as a quantity and anything longer digit by digit */
+static cst_val *indic_expand_numeric(const char *digits,
+                                     const indic_num_table *num_table)
+{
+    cst_val *words;
+    cst_val *p = indic_num_normalize(digits,num_table);
+
+    if (val_length(p) <= 9)
+        words = indic_number(p, num_table);
+    else
+        /* Long strings of digits are read as strings of digits */
+        words = indic_number_indiv(p,num_table);
+    delete_val(p);
+
+    return words;
+}
+
+static cst_val *cmu_indic_tokentowords_one(cst_item *token, const char *name)
+{
+    /* Return list of words that expand token/name.                        */
+    /*   token  the token item; supplies the utterance (and through it the */
+    /*          "variant" that selects the number table)                   */
+    /*   name   the text to expand: the token's name on the first call, a  */
+    /*          piece of it on recursive calls                             */
+    /* Tokens carrying a "phones" feature are passed through untouched.    */
+    /* Numbers are expanded with the variant's number table, mixed digit / */
+    /* non-digit text is split and each piece expanded on its own, a       */
+    /* leading '-', '/' or '.' before a number is dropped, ASCII-only text */
+    /* goes to the English front end, and anything else becomes one word.  */
+    cst_val *r = NULL;
+    const indic_num_table *num_table;
+    const char *variant;
+    cst_utterance *utt;
+    int nump;
+
+    if (item_feat_present(token,"phones"))
+        return cons_val(string_val(name),NULL);
+
+    utt = item_utt(token);
+    variant = get_param_string(utt->features, "variant", "none");
+    if (cst_streq(variant,"hin"))
+        num_table = &hin_num_table;
+    else if (cst_streq(variant,"guj"))
+        num_table = &guj_num_table;
+    else if (cst_streq(variant,"kan"))
+        num_table = &kan_num_table;
+    else if (cst_streq(variant,"mar"))
+        num_table = &mar_num_table;
+    else if (cst_streq(variant,"nep"))
+        num_table = &hin_num_table;    /* Nepali shares the Hindi table */
+    else if (cst_streq(variant, "pan"))
+        num_table = &pan_num_table;
+    else if (cst_streq(variant, "san"))
+        num_table = &san_num_table;
+    else if (cst_streq(variant,"tam"))
+        num_table = &tam_num_table;
+    else if (cst_streq(variant,"tel"))
+        num_table = &tel_num_table;
+    else
+        num_table = &eng_num_table;
+
+    /* This matches *English* numbers of the form 99,99,999 that require   */
+    /* lakh or crore expansion -- otherwise they'll be dropped back to the */
+    /* English front end                                                   */
+    if (cst_regex_match(cst_rx_indic_eng_number,name))
+        r = indic_expand_numeric(name,num_table);
+    else if ((nump = indic_nump(name)) != 0)
+    {   /* It starts with script specific digits (and commas) */
+        if (nump == 2)
+            /* All characters are digits or commas */
+            r = indic_expand_numeric(name,num_table);
+        else
+        {
+            /* A run of digits/commas followed by something else: cut the  */
+            /* name where the run ends and expand both pieces.             */
+            const int n = (int)cst_strlen(name);
+            int split = 0;   /* byte offset of the cut, 0 while not found */
+            int i = 0;
+
+            while (i < n)
+            {
+                /* Iterate over UTF-8 string */
+                const int len = ts_utf8_sequence_length(name[i]);
+
+                if ((len < 1) || (i + len >= n))
+                    break;    /* no character follows this one */
+
+                /* A comma next keeps us inside the number.  Otherwise the */
+                /* run ends here if the next character is not a digit:     */
+                /* after a digit that is a change of type, after a comma   */
+                /* (itself not a digit) it is the absence of one.          */
+                if (name[i+len] != ',')
+                {
+                    const int differs = indic_text_splitable(name, i, len);
+
+                    if ((name[i] == ',') ? !differs : differs)
+                    {
+                        split = i + len;
+                        break;
+                    }
+                }
+                i += len;
+            }
+
+            if (split > 0)
+            {
+                /* 0 < split < n, so both pieces are non-empty and shorter */
+                /* than name: the recursion always makes progress.         */
+                char *aaa = cst_strdup(name);
+                char *bbb = cst_strdup(&name[split]);
+
+                aaa[split] = '\0';
+                r = val_append(cmu_indic_tokentowords_one(token, aaa),
+                               cmu_indic_tokentowords_one(token, bbb));
+                cst_free(aaa);
+                cst_free(bbb);
+            }
+            else
+                /* No cut found (not expected for text rated 1): keep the  */
+                /* text as one word instead of cutting past its end and    */
+                /* recursing on the same string again.                     */
+                r = cons_val(string_val(name),NULL);
+        }
+    }
+    else if (indic_hyphenated(name))
+        /* For numbers separated by - / . : skip the separator */
+        r = cmu_indic_tokentowords_one(token, &name[1]);
+    else if (cst_regex_match(cst_rx_not_indic,name))
+        /* Do English analysis on non-unicode tokens.                      */
+        /* NOTE(review): this is handed the token, not name, so for a      */
+        /* piece of a split token it presumably analyses the whole token   */
+        /* -- confirm against us_tokentowords.                             */
+        r = us_tokentowords(token);
+    else if (cst_strlen(name) > 0)
+        r = cons_val(string_val(name),NULL);
+
+    return r;
+}
+
+int indic_utt_break(cst_tokenstream *ts,
+                    const char *token,
+                    cst_relation *tokens)
+{
+    /* Decide whether the utterance ends at the last token collected so    */
+    /* far (the tail of tokens).  Returns TRUE when the tokenstream's      */
+    /* current whitespace holds two or more newlines, when that last token */
+    /* ends in the Devanagari sentence-end mark, or when its trailing      */
+    /* punctuation contains ':', '?', '|', '!' or '.'; FALSE otherwise.    */
+    /* The token argument is not consulted.                                */
+    const char *postpunct = item_feat_string(relation_tail(tokens), "punc");
+    const char *ltoken = item_name(relation_tail(tokens));
+
+    if (cst_strchr(ts->whitespace,'\n') != cst_strrchr(ts->whitespace,'\n'))
+        /* contains two new lines (first and last '\n' differ) */
+        return TRUE;
+    else if ((cst_strlen(ltoken) >= 3) &&
+             /* the mark takes three bytes, so compare the last three */
+             (cst_streq(&ltoken[cst_strlen(ltoken)-3],"।"))) /* devanagari '|' */
+        return TRUE;
+    else if (strchr(postpunct,':') ||
+             strchr(postpunct,'?') ||
+             strchr(postpunct,'|') || /* if ascii '|' gets used as dvngr '|' */
+             strchr(postpunct,'!'))
+        return TRUE;
+    else if (strchr(postpunct,'.'))
+        return TRUE;
+    else
+        return FALSE;
+}
+
+DEF_STATIC_CONST_VAL_STRING(val_string_zero,"0");
+DEF_STATIC_CONST_VAL_STRING(val_string_one,"1");
+
+/* Feature function: the constant "1" when p is non-NULL and its "name"    */
+/* feature matches cst_rx_not_indic, the constant "0" in every other case. */
+const cst_val *is_english(const cst_item *p)
+{
+    if (!p)
+        return (cst_val *)&val_string_zero;
+
+    if (!cst_regex_match(cst_rx_not_indic,
+                         flite_ffeature_string(p,"name")))
+        return (cst_val *)&val_string_zero;
+
+    return (cst_val *)&val_string_one;
+}
+
+/* Install the Indic language front end on voice v: sets the language      */
+/* name, utterance-break function, phoneset and silence phone, the         */
+/* tokenizer character classes, the token-to-words function and the        */
+/* phrasing CART in v->features, then registers the basic feature          */
+/* functions plus "lisp_is_english" in v->ffunctions.                      */
+void cmu_indic_lang_init(cst_voice *v)
+{
+ /* Set indic language stuff */
+ feat_set_string(v->features,"language","cmu_indic_lang");
+
+ /* utterance break function */
+ feat_set(v->features,"utt_break",breakfunc_val(&indic_utt_break));
+
+ /* Phoneset -- need to get this from voice */
+ feat_set(v->features,"phoneset",phoneset_val(&cmu_indic_phoneset));
+ feat_set_string(v->features,"silence",cmu_indic_phoneset.silence);
+
+ /* Get information from voice and add to lexicon */
+
+ /* Text analyser -- whitespace defaults */
+ feat_set_string(v->features,"text_whitespace",
+ cst_ts_default_whitespacesymbols);
+ feat_set_string(v->features,"text_prepunctuation",
+ cst_ts_default_prepunctuationsymbols);
+ /* We can't put multi-byte characters in these classes so we can't */
+ /* add devanagari end of sentence '|' here, but would like to -- */
+ /* But we do add ascii '|' to it as it sometimes gets used the same way */
+ feat_set_string(v->features,"text_postpunctuation",
+ indic_postpunctuationsymbols);
+ feat_set_string(v->features,"text_singlecharsymbols",
+ cst_ts_default_singlecharsymbols);
+
+ /* Tokenization tokenization function */
+ feat_set(v->features,"tokentowords_func",itemfunc_val(&cmu_indic_tokentowords));
+ /* Pos tagger (gpos)/induced pos */
+
+ /* Phrasing */
+ feat_set(v->features,"phrasing_cart",cart_val(&cmu_indic_phrasing_cart));
+
+ /* Intonation, Duration and F0 -- part of cg */
+ feat_set_string(v->features,"no_intonation_accent_model","1");
+
+ /* Default ffunctions (required) */
+ basic_ff_register(v->ffunctions);
+
+ /* Indic specific features */
+ /* Exposes is_english() under the feature name "lisp_is_english" */
+ ff_register(v->ffunctions, "lisp_is_english", is_english);
+
+ return;
+}
--- a/lang/cmu_indic_lang/Makefile
+++ b/lang/cmu_indic_lang/Makefile
@@ -46,6 +46,7 @@
indic_guj_num_table.h \
indic_tam_num_table.h \
indic_tel_num_table.h \
+ indic_san_num_table.h \
indic_pan_num_table.h
SRCS = cmu_indic_lang.c cmu_indic_phoneset.c cmu_indic_phrasing_cart.c
SCRIPTS =
--- a/lang/cmu_indic_lang/cmu_indic_lang.c
+++ b/lang/cmu_indic_lang/cmu_indic_lang.c
@@ -111,6 +111,7 @@
#include "indic_guj_num_table.h"
#include "indic_kan_num_table.h"
#include "indic_mar_num_table.h"
+#include "indic_san_num_table.h"
#include "indic_tel_num_table.h"
#include "indic_tam_num_table.h"
#include "indic_pan_num_table.h"
@@ -133,6 +134,7 @@
// return (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1;
// }
+
int indic_digit_to_offset(const char *ind_digit)
{
/* This functions returns int value of a single digit in Indic/English scripts.
@@ -237,6 +239,10 @@
return r;
}
+static cst_val *indic_number_lang(const indic_num_table *num_table)
+{
+ return string_val(num_table->lang);
+}
static cst_val *indic_number_hundred(const indic_num_table *num_table)
{
return string_val(num_table->hundred);
@@ -301,9 +307,15 @@
}
else if (val_length(number) == 3)
{
- r = val_append(indic_number_digit(val_string(val_car(number)),num_table),
- cons_val(indic_number_hundred(num_table),
- indic_number(val_cdr(number),num_table)));
+ if ((!cst_streq(val_string(indic_number_lang(num_table)),"mar")) ||
+ indic_digit_to_offset(val_string(val_car(val_cdr(number)))) ||
+ indic_digit_to_offset(val_string(val_car(val_cdr(val_cdr(number))))))
+ r = val_append(indic_number_digit(val_string(val_car(number)),num_table),
+ cons_val(indic_number_hundred(num_table),
+ indic_number(val_cdr(number),num_table)));
+ else
+ r = val_append(indic_number_digit(val_string(val_car(number)),num_table),
+ cons_val(string_val("शंभर"), indic_number(val_cdr(number),num_table)));
}
else if (val_length(number) == 4)
{
@@ -409,6 +421,7 @@
}
#endif
+
static int indic_nump(const char *number)
{
/* Check if non-empty string */
@@ -540,14 +553,14 @@
num_table = &mar_num_table;
else if (cst_streq(variant,"nep"))
num_table = &hin_num_table;
- else if (cst_streq(variant,"san"))
- num_table = &hin_num_table;
- else if (cst_streq(variant,"tel"))
- num_table = &tel_num_table;
- else if (cst_streq(variant,"tam"))
- num_table = &tam_num_table;
else if (cst_streq(variant, "pan"))
num_table = &pan_num_table;
+ else if (cst_streq(variant, "san"))
+ num_table = &san_num_table;
+ else if (cst_streq(variant,"tam"))
+ num_table = &tam_num_table;
+ else if (cst_streq(variant,"tel"))
+ num_table = &tel_num_table;
else
num_table = &eng_num_table;
@@ -560,7 +573,7 @@
p = indic_num_normalize(name,num_table);
if (val_length(p) <= 9)
/* Long strings of digits are read as strings of digits */
- r = indic_number(p,num_table);
+ r = indic_number(p, num_table);
else
r = indic_number_indiv(p,num_table);
delete_val(p);
@@ -568,58 +581,58 @@
else if (indic_nump(name))
{ /* Its script specific digits (commas/dots) */
- if (indic_nump(name) == 2)
- { /* All characters are digits */
- // printf("nump is 2\n");
- p = indic_num_normalize(name,num_table);
- if (val_length(p) <= 9)
- r = indic_number(p,num_table);
- else
- r = indic_number_indiv(p,num_table);
- delete_val(p);
- }
- else if (indic_nump(name) == 1)
- { /* Some characters are digits */
- int len = 1;
- int i = 0;
- char c0;
- char *aaa;
- char *bbb;
- while(name[i] != '\0')
- {
- /* Iterate over UTF-8 string */
- c0 = name[i];
- len = ts_utf8_sequence_length(c0);
- /* Check if char after this is comma */
- if (name[i+len] == ',')
- {
- /* Skip commas */
- i += len;
- c0 = name[i];
- len = ts_utf8_sequence_length(c0);
- i += len;
- continue;
- }
- /* Find where character type switches to or from digits */
- if(indic_text_splitable(name, i, len))
- break;
- i +=len;
- }
- aaa = cst_strdup(name);
- aaa[i+len] = '\0';
- bbb = cst_strdup(&name[i+len]);
- r = val_append(cmu_indic_tokentowords_one(token, aaa),
- cmu_indic_tokentowords_one(token, bbb));
- cst_free(aaa);
- cst_free(bbb);
- }
+ if (indic_nump(name) == 2)
+ { /* All characters are digits */
+ // printf("nump is 2\n");
+ p = indic_num_normalize(name,num_table);
+ if (val_length(p) <= 9)
+ r = indic_number(p, num_table);
+ else
+ r = indic_number_indiv(p,num_table);
+ delete_val(p);
+ }
+ else if (indic_nump(name) == 1)
+ { /* Some characters are digits */
+ int len = 1;
+ int i = 0;
+ char c0;
+ char *aaa;
+ char *bbb;
+ while(name[i] != '\0')
+ {
+ /* Iterate over UTF-8 string */
+ c0 = name[i];
+ len = ts_utf8_sequence_length(c0);
+ /* Check if char after this is comma */
+ if (name[i+len] == ',')
+ {
+ /* Skip commas */
+ i += len;
+ c0 = name[i];
+ len = ts_utf8_sequence_length(c0);
+ i += len;
+ continue;
+ }
+ /* Find where character type switches to or from digits */
+ if(indic_text_splitable(name, i, len))
+ break;
+ i +=len;
+ }
+ aaa = cst_strdup(name);
+ aaa[i+len] = '\0';
+ bbb = cst_strdup(&name[i+len]);
+ r = val_append(cmu_indic_tokentowords_one(token, aaa),
+ cmu_indic_tokentowords_one(token, bbb));
+ cst_free(aaa);
+ cst_free(bbb);
+ }
}
else if (indic_hyphenated(name))
{ /* For numbers seeparated by - / , */
- char *aaa;
- aaa = cst_strdup(&name[1]);
- r = cmu_indic_tokentowords_one(token, aaa);
- cst_free(aaa);
+ char *aaa;
+ aaa = cst_strdup(&name[1]);
+ r = cmu_indic_tokentowords_one(token, aaa);
+ cst_free(aaa);
}
else if (cst_regex_match(cst_rx_not_indic,name))
--- /dev/null
+++ b/lang/cmu_indic_lang/indic_san_num_table.h
@@ -1,0 +1,172 @@
+/*************************************************************************/
+/* */
+/* Language Technologies Institute */
+/* Carnegie Mellon University */
+/* Copyright (c) 2015 */
+/* All Rights Reserved. */
+/* */
+/* Permission is hereby granted, free of charge, to use and distribute */
+/* this software and its documentation without restriction, including */
+/* without limitation the rights to use, copy, modify, merge, publish, */
+/* distribute, sublicense, and/or sell copies of this work, and to */
+/* permit persons to whom this work is furnished to do so, subject to */
+/* the following conditions: */
+/* 1. The code must retain the above copyright notice, this list of */
+/* conditions and the following disclaimer. */
+/* 2. Any modifications must be clearly marked as such. */
+/* 3. Original authors' names are not deleted. */
+/* 4. The authors' names are not used to endorse or promote products */
+/* derived from this software without specific prior written */
+/* permission. */
+/* */
+/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
+/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
+/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
+/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
+/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
+/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
+/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
+/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
+/* THIS SOFTWARE. */
+/* */
+/*************************************************************************/
+/* Number pronunciation for (Sanskrit) Indic */
+/*************************************************************************/
+
+#ifndef _indic_san_num_table_h_
+#define _indic_san_num_table_h_
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include "indic_num_table.h"
+
+/* Single digits 0-9, one row per digit in numeric order: column 0 is the  */
+/* Devanagari digit, column 1 the word for it.  A row of NULLs ends the    */
+/* table.                                                                  */
+static const char * const indic_san_digit[11][2] =
+{
+ { "०", "शून्य" },
+ { "१", "एकं" },
+ { "२", "द्वि" },
+ { "३", "त्रि" },
+ { "४", "चतुर्" },
+ { "५", "पञ्च" },
+ { "६", "षट्" },
+ { "७", "सप्त" },
+ { "८", "अष्ट" },
+ { "९", "नव" },
+ { NULL, NULL },
+};
+
+/* Two-digit numbers 10-99 in numeric order, so the row for a number with  */
+/* tens digit t and units digit u is 10*(t-1)+u.  Columns 0 and 1 are the  */
+/* two Devanagari digits, column 2 the word, column 3 an optional second   */
+/* word (unused here, always NULL).  A row of NULLs ends the table.        */
+static const char * const indic_san_two_digit[101][4] =
+{
+ { "१", "०", "दश", NULL },
+ { "१", "१", "एकादश", NULL },
+ { "१", "२", "द्वादश", NULL },
+ { "१", "३", "त्रयोदश", NULL },
+ { "१", "४", "चतुर्दश", NULL },
+ { "१", "५", "पञ्चदश", NULL },
+ { "१", "६", "षोडश", NULL },
+ { "१", "७", "सप्तदश", NULL },
+ { "१", "८", "अष्टादश", NULL },
+ { "१", "९", "एकोनविंशतिः", NULL },
+ { "२", "०", "विंशतिः", NULL },
+ { "२", "१", "एकाविंशतिः", NULL },
+ { "२", "२", "द्वाविंशतिः", NULL },
+ { "२", "३", "त्रयोविंशतिः", NULL },
+ { "२", "४", "चतुर्विंशतिः", NULL },
+ { "२", "५", "पञ्चविंशतिः", NULL },
+ { "२", "६", "षड्विंशतिः", NULL },
+ { "२", "७", "सप्तविंशतिः", NULL },
+ { "२", "८", "अष्टाविंशतिः", NULL },
+ { "२", "९", "एकोनत्रिंशत्", NULL },
+ { "३", "०", "त्रिंशत्", NULL },
+ { "३", "१", "एकत्रिंशत्", NULL },
+ { "३", "२", "द्वात्रिंशत्", NULL },
+ { "३", "३", "त्रयस्त्रिंशत्", NULL },
+ { "३", "४", "चतुस्त्रिंशत्", NULL },
+ { "३", "५", "पञ्चत्रिंशत्", NULL },
+ { "३", "६", "षट्त्रिंशत्", NULL },
+ { "३", "७", "सप्तत्रिंशत्", NULL },
+ { "३", "८", "अष्टात्रिंशत्", NULL },
+ { "३", "९", "एकोनचत्वारिंशत्", NULL },
+ { "४", "०", "चत्वारिंशत्", NULL },
+ { "४", "१", "एकचत्वारिंशत्", NULL },
+ { "४", "२", "द्विचत्वारिंशत्", NULL },
+ { "४", "३", "त्रिचत्वारिंशत्", NULL },
+ { "४", "४", "चतुश्चत्वारिंशत्", NULL },
+ { "४", "५", "पञ्चचत्वारिंशत्", NULL },
+ { "४", "६", "षट्चत्वारिंशत्", NULL },
+ { "४", "७", "सप्तचत्वारिंशत्", NULL },
+ { "४", "८", "अष्टचत्वारिंशत्", NULL },
+ { "४", "९", "एकोनपञ्चाशत्", NULL },
+ { "५", "०", "पञ्चाशत्", NULL },
+ { "५", "१", "एकपञ्चाशत्", NULL },
+ { "५", "२", "द्विपञ्चाशत्", NULL },
+ { "५", "३", "त्रिपञ्चाशत्", NULL },
+ { "५", "४", "चतुःपञ्चाशत्", NULL },
+ { "५", "५", "पञ्चपञ्चाशत्", NULL },
+ { "५", "६", "षट्पञ्चाशत्", NULL },
+ { "५", "७", "सप्तपञ्चाशत्", NULL },
+ { "५", "८", "अष्टपञ्चाशत्", NULL },
+ { "५", "९", "एकोनषष्टिः", NULL },
+ { "६", "०", "षष्टिः", NULL },
+ { "६", "१", "एकषष्टिः", NULL },
+ { "६", "२", "द्विषष्टिः", NULL },
+ { "६", "३", "त्रिषष्टिः", NULL },
+ { "६", "४", "चतुष्षष्टिः", NULL },
+ { "६", "५", "पञ्चषष्टिः", NULL },
+ { "६", "६", "षट्षष्टिः", NULL },
+ { "६", "७", "सप्तषष्टिः", NULL },
+ { "६", "८", "अष्टषष्टिः", NULL },
+ { "६", "९", "एकोनसप्ततिः", NULL },
+ { "७", "०", "सप्ततिः", NULL },
+ { "७", "१", "एकसप्ततिः", NULL },
+ { "७", "२", "द्विसप्ततिः", NULL },
+ { "७", "३", "त्रिसप्ततिः", NULL },
+ { "७", "४", "चतुस्सप्ततिः", NULL },
+ { "७", "५", "पञ्चसप्ततिः", NULL },
+ { "७", "६", "षट्सप्ततिः", NULL },
+ { "७", "७", "सप्तसप्ततिः", NULL },
+ { "७", "८", "अष्टसप्ततिः", NULL },
+ { "७", "९", "एकोनाशीतिः", NULL },
+ { "८", "०", "अशीतिः", NULL },
+ { "८", "१", "एकाशीतिः", NULL },
+ { "८", "२", "द्वशीतिः", NULL },
+ { "८", "३", "त्र्यशीतिः", NULL },
+ { "८", "४", "चतुरशीतिः", NULL },
+ { "८", "५", "पञ्चाशीतिः", NULL },
+ { "८", "६", "षडशीतिः", NULL },
+ { "८", "७", "सप्ताशीतिः", NULL },
+ { "८", "८", "अष्टाशीतिः", NULL },
+ { "८", "९", "एकोननवतिः", NULL },
+ { "९", "०", "नवतिः", NULL },
+ { "९", "१", "एकनवतिः", NULL },
+ { "९", "२", "द्विनवतिः", NULL },
+ { "९", "३", "त्रिनवतिः", NULL },
+ { "९", "४", "चतुर्नवतिः", NULL },
+ { "९", "५", "पञ्चनवतिः", NULL },
+ { "९", "६", "षण्णवतिः", NULL },
+ { "९", "७", "सप्तनवतिः", NULL },
+ { "९", "८", "अष्टनवतिः", NULL },
+ { "९", "९", "एकोनशतम्", NULL },
+ { NULL, NULL },
+};
+
+/* Number table for the "san" (Sanskrit) variant: the language tag, the    */
+/* single-digit and two-digit tables defined above, and the words for the  */
+/* larger units.                                                           */
+const static indic_num_table san_num_table = {
+ "san",
+ &indic_san_digit,
+ &indic_san_two_digit,
+ "शतम्", /* hundred */
+ "सहस्र", /* thousand */
+ "लक्ष", /* lakh */
+ "कोटि", /* crore */
+};
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif
+
+
--- a/lang/cmu_indic_lex/cmu_indic_lex.c
+++ b/lang/cmu_indic_lex/cmu_indic_lex.c
@@ -283,34 +283,6 @@
return cmu_indic_offset_char[c].type;
}
-static int indic_text_splitable(const char *s,int i,int len1)
-{
- /* Returns true only if this and next chars are not both digits */
- /* or both non-digits */
-
- char *ccc, *ddd; /* Store this character and the next character */
- int len2; /* Length of next character */
-
- int flag;
-
- ccc = cst_strdup(&s[i]);
- ddd = cst_strdup(&s[i+len1]);
-
- len2 = utf8_sequence_length(ddd[0]);
-
- ccc[len1] = '\0';
- ddd[len2] = '\0';
-
- /* Makeshift NOR */
- flag = (indic_digit_to_offset(ccc) == -1)? !(indic_digit_to_offset(ddd) == -1):
- (indic_digit_to_offset(ddd) == -1);
-
- cst_free(ccc);
- cst_free(ddd);
-
- return flag;
-}
-
static const char *cmu_indic_get_char_phoneme(const cst_val *indic_char)
{
int c;
@@ -625,7 +597,8 @@
return in_phones;
}
-cst_val *cmu_indic_lex_nasal_postfixes(cst_val *in_phones)
+cst_val *cmu_indic_lex_nasal_postfixes(cst_val *in_phones,
+ const cst_features *feats)
{
/* Given a phone sequence containing a special character nX */
/* (contextual nasal), replace it with the appropriate nasal phone */
@@ -632,6 +605,10 @@
/* based on its context */
char *tmpstr;
const cst_val *p;
+
+ const char *indic_variant = 0;
+
+ indic_variant = get_param_string(feats, "variant", "none");
/* printf("awb_debug: pre "); val_print(stdout,in_phones); printf("\n"); */
for( p=in_phones; p && val_cdr(p); p=val_cdr(p))
@@ -642,7 +619,9 @@
((!val_cdr(val_cdr(p))) ||
(!val_car(val_cdr(val_cdr(p))))))
{
- if (cst_streq("A", val_string(val_car(p))))
+ if (cst_streq(indic_variant,"kan") ||
+ cst_streq(indic_variant,"tel") || /* Dravidian languages don't nasalize */
+ cst_streq("A", val_string(val_car(p))))
{ /* If it's a schwa, it's not nasalized. nX becomes m */
replace_car(val_cdr(p),string_val("m"));
} else {
@@ -961,25 +940,23 @@
return phones;
}
-/* For English derived pronunciation (latin scripted tokens) we map them */
-/* to (hindi) phones -- this has to modified for other indic languages */
-static const char * const eng_to_indic_orig[99][3] =
+static const char * const eng_to_indic[99][3] =
{
{"aa", "A:", NULL },
- {"ae", "A", NULL }, /* changed this to A rather than e */
+ {"ae", "aI", NULL },
{"ah", "A", NULL },
- {"ao", "o", NULL },
- {"aw", "aU", NULL },
+ {"ao", "aU", NULL },
+ {"aw", "A:", "u" },
{"ax", "A", NULL },
- {"axr", "A", NULL },
- {"ay", "aI", NULL },
+ {"axr", "A", "9r" },
+ {"ay", "A:", "i" },
{"b", "b", NULL },
{"ch", "c", NULL },
- {"d", "dB", NULL },
+ {"d", "dr", NULL },
{"dh", "dB", NULL },
- {"eh", "e", NULL },
- {"er", "9r", NULL },
- {"ey", "ay", NULL },
+ {"eh", "E", NULL },
+ {"er", "A", "9r" },
+ {"ey", "e", NULL },
{"f", "ph", NULL },
{"g", "g", NULL },
{"hh", "hv", NULL },
@@ -991,9 +968,9 @@
{"m", "m", NULL },
{"n", "nB", NULL },
{"nx", "nB", NULL },
- {"ng", "nB", NULL },
+ {"ng", "N", NULL },
{"ow", "o", NULL },
- {"oy", "o", "j" },
+ {"oy", "aU", "i" },
{"p", "p", NULL },
{"r", "9r", NULL },
{"s", "s", NULL },
@@ -1005,88 +982,112 @@
{"v", "v", NULL },
{"w", "v", NULL },
{"y", "j", NULL },
- {"z", "s", NULL },
+ {"z", "z", NULL },
{"zh", "c}", NULL },
{NULL, NULL, NULL }
};
-
-
-/* For English derived pronunciation (latin scripted tokens) we map them */
-/* to (hindi) phones -- this has to modified for other indic languages */
-/* Sai Krishna */
-/* 07 July 2017 */
-/* Making this v1 as Shyam's mapping looks a bit different */
-
-static const char * const eng_to_indic_v1[99][3] =
+
+/* English-to-Tamil phone mapping keyed on stress-marked English phones (e.g. "aa0" vs "aa1"); keys are matched verbatim and unlisted phones are dropped */
+/* Shyam Krishna, 2018/03/06 */
+static const char * const eng_to_tam_stress[99][3] =
{
- {"aa", "aa", NULL },
- {"ae", "ae", NULL }, /* changed this to A rather than e */
- {"ah", "ah", NULL },
- {"ao", "ao", NULL },
- {"aw", "aw", NULL },
- {"ax", "ax", NULL },
- {"axr", "axr", NULL },
- {"ay", "ay", NULL },
+ {"aa0", "A", NULL },
+ {"aa1", "A:", NULL },
+ {"ae0", "A", NULL },
+ {"ae1", "e", NULL },
+ {"ah1", "A", NULL },
+ {"ao0", "A", NULL },
+ {"ao1", "o:", NULL }, /*TODO: resolve horse-hoarse merger */
+ {"aw0", "aU", NULL },
+ {"aw1", "aU", NULL },
+ {"ax", "A", NULL },
+ {"ax0", "A", NULL },
+ {"ay0", "aI", NULL },
+ {"ay1", "aI", NULL },
{"b", "b", NULL },
- {"ch", "ch", NULL },
- {"d", "d", NULL },
- {"dh", "dh", NULL },
- {"eh", "eh", NULL },
- {"er", "er", NULL },
- {"ey", "ey", NULL },
- {"f", "f", NULL },
+ {"ch", "c", NULL },
+ {"d", "dr", NULL },
+ {"dh", "dB", NULL },
+ {"eh0", "e", NULL },
+ {"eh1", "e", NULL },
+ {"er", "A", "9r" },
+ {"er0", "A", "9r" },
+ {"er1", "A", "9r" },
+ {"ey0", "e", NULL },
+ {"ey1", "e:", NULL },
+ {"f", "p", NULL },
{"g", "g", NULL },
- {"hh", "hh", NULL },
- {"ih", "ih", NULL },
- {"iy", "iy", NULL },
- {"jh", "jh", NULL },
+ {"hh", "hv", NULL },
+ {"ih", "i", NULL },
+ {"ih0", "i", NULL },
+ {"ih1", "i", NULL },
+ {"iy0", "i", NULL },
+ {"iy1", "i:", NULL },
+ {"jh", "J", NULL },
{"k", "k", NULL },
{"l", "l", NULL },
{"m", "m", NULL },
- {"n", "n", NULL },
- {"nx", "n", NULL },
- {"ng", "n", NULL },
- {"ow", "ow", NULL },
- {"oy", "oy", "j" },
+ {"n", "nB", NULL },
+ {"nx", "nB", NULL },
+ {"ng", "N", NULL },
+ {"ow0", "o", NULL },
+ {"ow1", "o:", NULL },
+ {"oy0", "o", "j" },
+ {"oy1", "o:", "j" },
{"p", "p", NULL },
- {"r", "r", NULL },
+ {"r", "9r", NULL },
{"s", "s", NULL },
- {"sh", "sh", NULL },
- {"t", "t", NULL },
- {"th", "th", NULL },
- {"uh", "uh", NULL },
- {"uw", "uw", NULL },
+ {"sh", "sr", NULL },
+ {"t", "tr", NULL },
+ {"th", "tB", NULL },
+ {"uh0", "u", NULL },
+ {"uh1", "u", NULL },
+ {"uw0", "u", NULL },
+ {"uw1", "u:", NULL },
{"v", "v", NULL },
- {"w", "w", NULL },
- {"y", "y", NULL },
- {"z", "z", NULL },
- {"zh", "zh", NULL },
+ {"w", "v", NULL },
+ {"y", "j", NULL },
+ {"z", "s", NULL },
+ {"zh", "sr", NULL },
{NULL, NULL, NULL }
};
-
-static const char * const eng_to_indic[99][3] =
+/* English-to-Kannada phone mapping keyed on stress-marked English phones (e.g. "aa0" vs "aa1"); keys are matched verbatim and unlisted phones are dropped */
+/* Shyam Krishna, 2018/04/06 */
+static const char * const eng_to_kan_stress[99][3] =
{
- {"aa", "A:", NULL },
- {"ae", "aI", NULL },
- {"ah", "A", NULL },
- {"ao", "aU", NULL },
- {"aw", "A:", "u" },
+ {"aa0", "A", NULL },
+ {"aa1", "A:", NULL },
+ {"ae0", "A", NULL },
+ {"ae1", "e", NULL },
+ {"ah1", "A", NULL },
+ {"ao0", "A", NULL },
+ {"ao1", "o:", NULL }, /*TODO: resolve horse-hoarse merger */
+ {"aw0", "aU", NULL },
+ {"aw1", "aU", NULL },
{"ax", "A", NULL },
- {"axr", "A", "9r" },
- {"ay", "A:", "i" },
+ {"ax0", "A", NULL },
+ {"ay0", "aI", NULL },
+ {"ay1", "aI", NULL },
{"b", "b", NULL },
{"ch", "c", NULL },
{"d", "dr", NULL },
{"dh", "dB", NULL },
- {"eh", "E", NULL },
+ {"eh0", "e", NULL },
+ {"eh1", "e", NULL },
{"er", "A", "9r" },
- {"ey", "e", NULL },
+ {"er0", "A", "9r" },
+ {"er1", "A", "9r" },
+ {"ey0", "e", NULL },
+ {"ey1", "e:", NULL },
{"f", "ph", NULL },
{"g", "g", NULL },
{"hh", "hv", NULL },
{"ih", "i", NULL },
- {"iy", "i:", NULL },
+ {"ih0", "i", NULL },
+ {"ih1", "i", NULL },
+ {"iy0", "i", NULL },
+ {"iy1", "i:", NULL },
{"jh", "J", NULL },
{"k", "k", NULL },
{"l", "l", NULL },
@@ -1094,25 +1095,29 @@
{"n", "nB", NULL },
{"nx", "nB", NULL },
{"ng", "N", NULL },
- {"ow", "o", NULL },
- {"oy", "aU", "i" },
+ {"ow0", "o", NULL },
+ {"ow1", "o:", NULL },
+ {"oy0", "o", "j" },
+ {"oy1", "o:", "j" },
{"p", "p", NULL },
{"r", "9r", NULL },
{"s", "s", NULL },
{"sh", "c}", NULL },
{"t", "tr", NULL },
- {"th", "tBh", NULL },
- {"uh", "u", NULL },
- {"uw", "u:", NULL },
+ {"th", "tB", NULL },
+ {"uh0", "u", NULL },
+ {"uh1", "u", NULL },
+ {"uw0", "u", NULL },
+ {"uw1", "u:", NULL },
{"v", "v", NULL },
{"w", "v", NULL },
{"y", "j", NULL },
- {"z", "z", NULL },
+ {"z", "s", NULL },
{"zh", "c}", NULL },
{NULL, NULL, NULL }
};
-
+
cst_val *map_english_to_indic_phones(const char *indic_variant,
const cst_val *english_phones)
{
@@ -1125,11 +1130,39 @@
for (v=english_phones; v; v=val_cdr(v))
{
english_phone = cst_strdup(val_string(val_car(v)));
+ /* *** mapping table should be indic variant specific */
+ if(cst_streq(indic_variant, "tam"))
+ {
+ for (i=0; eng_to_tam_stress[i][0]; i++)
+ {
+ if (cst_streq(english_phone,eng_to_tam_stress[i][0]))
+ {
+ ip = cons_val(string_val(eng_to_tam_stress[i][1]),ip);
+ if (eng_to_tam_stress[i][2])
+ ip = cons_val(string_val(eng_to_tam_stress[i][2]),ip);
+ }
+ /* if there is no mapping, we drop the phone */
+ }
+ }
+ else if(cst_streq(indic_variant, "kan"))
+ {
+ for (i=0; eng_to_kan_stress[i][0]; i++)
+ {
+ if (cst_streq(english_phone,eng_to_kan_stress[i][0]))
+ {
+ ip = cons_val(string_val(eng_to_kan_stress[i][1]),ip);
+ if (eng_to_kan_stress[i][2])
+ ip = cons_val(string_val(eng_to_kan_stress[i][2]),ip);
+ }
+ /* if there is no mapping, we drop the phone */
+ }
+ }
+ else
+ {
if ((english_phone[cst_strlen(english_phone)-1] == '0') ||
(english_phone[cst_strlen(english_phone)-1] == '1'))
/* It has a stress value on it */
english_phone[cst_strlen(english_phone)-1] = '\0';
- /* *** mapping table should be indic variant specific */
for (i=0; eng_to_indic[i][0]; i++)
{
if (cst_streq(english_phone,eng_to_indic[i][0]))
@@ -1140,6 +1173,7 @@
}
/* if there is no mapping, we drop the phone */
}
+ }
cst_free(english_phone);
}
ip = val_reverse(ip);
@@ -1182,6 +1216,8 @@
return rphones;
}
+
+/* TODO */
static cst_val *cmu_indic_hindi_schwa_fixes(cst_val *phones)
{
cst_val *dd;
@@ -1240,7 +1276,7 @@
cmu_indic_variant_deletes_word_final_schwa = 0;
} else {
cmu_indic_variant_deletes_word_final_schwa = 0;
- printf("Unknown indic variant: %s\n", indic_variant);
+ printf("Unknown indic variant: %s!\n", indic_variant);
}
if (cst_regex_match(cst_rx_not_indic,word))
@@ -1249,19 +1285,12 @@
/* printf("awb_debug cmu_indic_lex: English >%s<\n",word); */
english_phones = lex_lookup(&cmu_lex,word,pos,feats);
- eng_bilingual_flag = get_param_string(feats, "eng_shared", "0");
+ eng_bilingual_flag = get_param_string(feats, "eng_shared", "none");
- if (cst_streq(eng_bilingual_flag, "1"))
- {
- base_phones = english_phones;
- }
- else
- {
- base_phones =
- map_english_to_indic_phones(indic_variant,english_phones);
- delete_val(english_phones);
- }
-
+ if (cst_streq(eng_bilingual_flag, "1")) { base_phones = english_phones; } /* shared English phones: hand the lex_lookup list straight back */
+ else { /* otherwise map to indic phones; the lex_lookup list is not returned on this path */
+ base_phones = map_english_to_indic_phones(indic_variant,english_phones);
+ delete_val(english_phones); } /* free it, as the pre-change code did, so the list is not leaked */
return base_phones;
}
else
@@ -1289,7 +1318,7 @@
printf("Tamil doesnt have anuswara");
}
else
- cmu_indic_lex_nasal_postfixes(base_phones);
+ cmu_indic_lex_nasal_postfixes(base_phones, feats);
base_phones = cmu_indic_lex_jnyan_replacement(base_phones,feats);
/* Postfix Indic Nasals, Voicing, Medial Schwa deletion */
@@ -1314,7 +1343,10 @@
}
if (cst_streq(indic_variant,"kan"))
- cmu_indic_lex_kannada_spelling_postfixes(base_phones);
+ cmu_indic_lex_kannada_spelling_postfixes(base_phones);
+
+ //if (cst_streq(indic_variant,"san"))
+ // base_phones=val_reverse(delete_medial_schwa(val_reverse(base_phones)));
if ((cst_streq(indic_variant,"hin")) || (cst_streq(indic_variant,"mar")) ||