shithub: flite

--- /dev/null

+++ b/cmu_indic_lang.c

@@ -1,0 +1,729 @@

+/*************************************************************************/

+/*                                                                       */

+/*                  Language Technologies Institute                      */

+/*                     Carnegie Mellon University                        */

+/*                         Copyright (c) 2013                            */

+/*                        All Rights Reserved.                           */

+/*                                                                       */

+/*  Permission is hereby granted, free of charge, to use and distribute  */

+/*  this software and its documentation without restriction, including   */

+/*  without limitation the rights to use, copy, modify, merge, publish,  */

+/*  distribute, sublicense, and/or sell copies of this work, and to      */

+/*  permit persons to whom this work is furnished to do so, subject to   */

+/*  the following conditions:                                            */

+/*   1. The code must retain the above copyright notice, this list of    */

+/*      conditions and the following disclaimer.                         */

+/*   2. Any modifications must be clearly marked as such.                */

+/*   3. Original authors' names are not deleted.                         */

+/*   4. The authors' names are not used to endorse or promote products   */

+/*      derived from this software without specific prior written        */

+/*      permission.                                                      */

+/*                                                                       */

+/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */

+/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */

+/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */

+/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */

+/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */

+/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */

+/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */

+/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */

+/*  THIS SOFTWARE.                                                       */

+/*                                                                       */

+/*************************************************************************/

+/*                                                                       */

+/*  indic language support                                            */

+/*                                                                       */

+/*************************************************************************/

+#include "flite.h"

+#include "cst_val.h"

+#include "cst_voice.h"

+#include "cst_lexicon.h"

+#include "cst_ffeatures.h"

+#include "cmu_indic_lang.h"

+#include "cst_tokenstream.h"

+/* ./bin/compile_regexes cst_rx_eng_digits_only "^[0-9,]+$"

+   Precompiled program for the pattern above: a non-empty token made up

+   only of ASCII digits and commas.  The bytes are tool output and must

+   not be edited by hand; regenerate them with the command shown. */

+static const unsigned char cst_rx_eng_digits_only_rxprog[] = {

+   156, 6, 0, 27, 1, 0, 3, 11, 0, 18, 4, 0, 0, 48, 49, 50,

+   51, 52, 53, 54, 55, 56, 57, 44, 0, 2, 0, 3, 0, 0, 0,

+};

+static const cst_regex cst_rx_eng_digits_only_rx = {

+   0, 1, NULL, 0, 31, /* 31 == number of bytes in the program array above */

+   (char *)cst_rx_eng_digits_only_rxprog

+};

+const cst_regex * const cst_rx_eng_digits_only = &cst_rx_eng_digits_only_rx;

+/* ./bin/compile_regexes cst_rx_not_indic "^[0-9a-zA-Z/:_'-,]+$"

+   Precompiled program for the pattern above; used in this file to decide

+   whether a token is handed to the English front end.

+   NOTE(review): inside the character class "'-," was compiled as a range:

+   the program contains bytes 39..44 (' ( ) * + ,) and does not contain

+   45, so a literal '-' is NOT matched.  Presumably '-' was meant to be a

+   member of the class -- confirm before regenerating. */

+static const unsigned char cst_rx_not_indic_rxprog[] = {

+   156, 6, 0, 87, 1, 0, 3, 11, 0, 78, 4, 0, 0, 48, 49, 50,

+   51, 52, 53, 54, 55, 56, 57, 97, 98, 99, 100, 101, 102, 103, 104, 105,

+   106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,

+   122, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,

+   80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 47, 58, 95, 39, 40,

+   41, 42, 43, 44, 0, 2, 0, 3, 0, 0, 0,

+};

+static const cst_regex cst_rx_not_indic_rx = {

+   0, 1, NULL, 0, 91, /* 91 == number of bytes in the program array above */

+   (char *)cst_rx_not_indic_rxprog

+};

+const cst_regex * const cst_rx_not_indic = &cst_rx_not_indic_rx;

+/* ./bin/compile_regexes cst_rx_indic_eng_number "^[1-9][0-9],\\([0-9][0-9],\\)*[0-9][0-9][0-9]$"

+   Precompiled program for the pattern above: two ASCII digits (the first

+   one non-zero), a comma, any number of further "two digits + comma"

+   groups, then exactly three digits -- e.g. 10,34,123.  (Assumes \( \)

+   is the grouping syntax of this regex dialect -- TODO confirm.)

+   As written the pattern requires two leading digits, so a number such

+   as 1,23,456 is not matched by it. */

+static const unsigned char cst_rx_indic_eng_number_rxprog[] = {

+   156, 6, 0, 137, 1, 0, 3, 4, 0, 13, 49, 50, 51, 52, 53, 54,

+   55, 56, 57, 0, 4, 0, 14, 48, 49, 50, 51, 52, 53, 54, 55, 56,

+   57, 0, 8, 0, 5, 44, 0, 6, 0, 48, 21, 0, 3, 6, 0, 36,

+   4, 0, 14, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 0, 4, 0,

+   14, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 0, 8, 0, 5, 44,

+   0, 31, 0, 3, 7, 0, 45, 6, 0, 3, 9, 0, 3, 4, 0, 14,

+   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 0, 4, 0, 14, 48, 49,

+   50, 51, 52, 53, 54, 55, 56, 57, 0, 4, 0, 14, 48, 49, 50, 51,

+   52, 53, 54, 55, 56, 57, 0, 2, 0, 3, 0, 0, 0,

+};

+static const cst_regex cst_rx_indic_eng_number_rx = {

+   0, 1, NULL, 0, 141, /* 141 == number of bytes in the program array above */

+   (char *)cst_rx_indic_eng_number_rxprog

+};

+const cst_regex * const cst_rx_indic_eng_number = &cst_rx_indic_eng_number_rx;

+cst_val *us_tokentowords(cst_item *token);

+/* Note that's an ASCII | not the Devanagari one */

+const cst_string * const indic_postpunctuationsymbols = "\"'`.,:;!?(){}[]|";

+static cst_val *cmu_indic_tokentowords_one(cst_item *token, const char *name);

+cst_val *cmu_indic_tokentowords(cst_item *token)
+{
+    /* Expand a token into words, using the token's own "name" feature */
+    /* as the text to expand.                                          */
+    const char *token_name = item_feat_string(token, "name");
+
+    return cmu_indic_tokentowords_one(token, token_name);
+}

+/* Indic numbers.  This deals with all (quantity) numbers found in any Indic */

+/* language no matter what script they are written in.  We use the Indic_Nums */

+/* table to convert the strings of digits (points and commas) into lists of */

+/* words for those scripts' language.  Thus Telugu digits get converted to  */

+/* Telugu words (even if the voice is a Hindi voice).                       */

+/* We assume lakh and crore grouping when commas are present to mark it,   */

+/* thus 10,34,123 (in English digits) will be expanded to 10 lakh, thirty   */

+/* four thousand one hundred (and) twenty three                             */

+/* We do English too, so I can debug it, and so lakh and crore are right */

+#include "indic_eng_num_table.h"

+#include "indic_hin_num_table.h"

+#include "indic_guj_num_table.h"

+#include "indic_kan_num_table.h"

+#include "indic_mar_num_table.h"

+#include "indic_san_num_table.h"

+#include "indic_tel_num_table.h"

+#include "indic_tam_num_table.h"

+#include "indic_pan_num_table.h"

+int ts_utf8_sequence_length(char c0);

+// inline int utf8_sequence_length(char c0)

+// {

+    // Get the expected length of UTF8 sequence given its most

+    // significant byte

+//    return (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1;

+// }

+int ts_utf8_sequence_length(char c0);

+// inline int utf8_sequence_length(char c0)

+// {

+    // Get the expected length of UTF8 sequence given its most

+    // significant byte

+//    return (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1;

+// }

+int indic_digit_to_offset(const char *ind_digit)
+{
+    /* Returns the numeric value (0-9) of a single digit character written
+       in ASCII or in one of the Indic scripts listed below.
+       Returns -1 if the character is not a digit in any of them. */
+    static const int script_zero[] = {
+        0x0030,   /* ASCII */
+        0x0966,   /* Devanagari */
+        0x09E6,   /* Bengali */
+        0x0A66,   /* Gurmukhi */
+        0x0AE6,   /* Gujarati */
+        0x0B66,   /* Oriya */
+        0x0BE6,   /* Tamil */
+        0x0C66,   /* Telugu */
+        0x0CE6,   /* Kannada */
+        0x0D66    /* Malayalam */
+    };
+    const int n_scripts = (int)(sizeof(script_zero)/sizeof(script_zero[0]));
+    const int code_point = cst_utf8_ord_string(ind_digit);
+    int k;
+
+    /* Each script's ten digits are contiguous, starting at its zero */
+    for (k = 0; k < n_scripts; k++)
+    {
+        if ((code_point >= script_zero[k]) &&
+            (code_point <= script_zero[k] + 9))
+            return code_point - script_zero[k];
+    }
+
+    /* Not a digit */
+    return -1;
+}

+static cst_val *indic_number_digit(const char *digit,const indic_num_table *t)
+{
+    /* Returns a newly built one-element list holding entry 1 of the   */
+    /* table's row for this digit; NULL if an argument is missing or   */
+    /* the character is not a digit (a message is printed in that case) */
+    int value;
+
+    if (!digit || !t)
+        return NULL;
+
+    value = indic_digit_to_offset(digit);
+    if (value == -1)
+    {
+        printf("Error in getting int from digit %s\n", digit);
+        return NULL;
+    }
+
+    /* Row index in the digit table is the digit's own value */
+    return cons_val(string_val(num_table_digit(t,value,1)),NULL);
+}

+static cst_val *indic_number_two_digit(const char *digit1,
+                                       const char *digit2,
+                                       const indic_num_table *t)
+{
+    /* Returns the word list for the two digit number digit1 digit2,    */
+    /* built from entries 2 and 3 (when non-NULL) of the matching row   */
+    /* of the two digit table.  NULL on missing args or non-digits.     */
+    cst_val *words = NULL;
+    int tens, units, row;
+
+    if (!digit1 || !digit2 || !t)
+        return NULL;
+
+    tens = indic_digit_to_offset(digit1);
+    units = indic_digit_to_offset(digit2);
+
+    if (tens == -1)
+    {
+        printf("Error in getting int from digit %s\n", digit1);
+        return NULL;
+    }
+    if (units == -1)
+    {
+        printf("Error in getting int from digit %s\n", digit2);
+        return NULL;
+    }
+    if (tens == 0)
+    {
+        /* NOTE(review): this returns the word for the leading zero,    */
+        /* not for digit2 -- confirm that is what is wanted             */
+        printf("Single digit erroneously processed as double digit %s\n", digit2);
+        return cons_val(string_val(num_table_digit(t,tens,1)),NULL);
+    }
+
+    /* The two digit table starts at 10, so 10*(tens-1)+units is the row */
+    row = 10*(tens-1)+units;
+
+    /* Consed in reverse so that entry 2 ends up in front of entry 3 */
+    if (num_table_two_digit(t,row,3) != NULL)
+        words = cons_val(string_val(num_table_two_digit(t,row,3)),words);
+    if (num_table_two_digit(t,row,2) != NULL)
+        words = cons_val(string_val(num_table_two_digit(t,row,2)),words);
+
+    return words;
+}

+/* The accessors below each return a newly allocated string val that  */
+/* copies one field of the number table.                              */
+
+static cst_val *indic_number_lang(const indic_num_table *num_table)
+{
+    /* The table's "lang" field */
+    cst_val *v = string_val(num_table->lang);
+    return v;
+}
+
+static cst_val *indic_number_hundred(const indic_num_table *num_table)
+{
+    /* The table's "hundred" field */
+    cst_val *v = string_val(num_table->hundred);
+    return v;
+}
+
+static cst_val *indic_number_thousand(const indic_num_table *num_table)
+{
+    /* The table's "thousand" field */
+    cst_val *v = string_val(num_table->thousand);
+    return v;
+}
+
+static cst_val *indic_number_lakh(const indic_num_table *num_table)
+{
+    /* The table's "lakh" field */
+    cst_val *v = string_val(num_table->lakh);
+    return v;
+}
+
+static cst_val *indic_number_crore(const indic_num_table *num_table)
+{
+    /* The table's "crore" field */
+    cst_val *v = string_val(num_table->crore);
+    return v;
+}

+cst_val *indic_number(const cst_val *number,
+                      const indic_num_table *num_table)
+{
+    /* Expand a list of single-character digit strings (in any supported */
+    /* script, at most 9 digits) into a list of number words taken from  */
+    /* num_table, using hundred / thousand / lakh / crore grouping.      */
+    /* Returns NULL for an empty list, for lists longer than 9 digits    */
+    /* and for a two digit "00".  The input list is not modified.        */
+    const char *d1, *d2, *d3;
+    const cst_val *rest;
+    cst_val *head, *unit;
+    int len;
+
+    if (number == NULL)
+        return NULL;
+
+    len = val_length(number);
+    d1 = val_string(val_car(number));
+    d2 = (len >= 2) ? val_string(val_car(val_cdr(number))) : NULL;
+
+    if (indic_digit_to_offset(d1) == 0)
+    {
+        if (len == 2)
+        {
+            /* "0x": say only the last digit, and nothing at all for */
+            /* "00" so that a trailing zero is not spoken            */
+            if (indic_digit_to_offset(d2) != 0)
+                return indic_number_digit(d2,num_table);
+            return NULL;
+        }
+        /* A leading zero with more digits after it is skipped */
+        if (val_cdr(number) != NULL)
+            return indic_number(val_cdr(number),num_table);
+    }
+
+    /* Pick the magnitude word that follows the leading digit(s) */
+    switch (len)
+    {
+    case 1:
+        return indic_number_digit(d1,num_table);
+    case 2:
+        return indic_number_two_digit(d1,d2,num_table);
+    case 3:
+        d3 = val_string(val_car(val_cdr(val_cdr(number))));
+        /* Marathi uses a different word for exact hundreds (x00).     */
+        /* The language tag is read straight from the table: going via */
+        /* indic_number_lang() would allocate a val that nobody frees. */
+        if (cst_streq(num_table->lang,"mar") &&
+            (indic_digit_to_offset(d2) == 0) &&
+            (indic_digit_to_offset(d3) == 0))
+            unit = string_val("शंभर");
+        else
+            unit = indic_number_hundred(num_table);
+        break;
+    case 4:
+    case 5:
+        unit = indic_number_thousand(num_table);
+        break;
+    case 6:
+    case 7:
+        unit = indic_number_lakh(num_table);
+        break;
+    case 8:
+    case 9:
+        unit = indic_number_crore(num_table);
+        break;
+    default:
+        /* More than 9 digits is not handled here */
+        return NULL;
+    }
+
+    /* Lengths 3, 4, 6 and 8 have one leading digit before the magnitude */
+    /* word; lengths 5, 7 and 9 have a two digit group in front of it    */
+    if ((len == 3) || ((len % 2) == 0))
+    {
+        head = indic_number_digit(d1,num_table);
+        rest = val_cdr(number);
+    }
+    else
+    {
+        head = indic_number_two_digit(d1,d2,num_table);
+        rest = val_cdr(val_cdr(number));
+    }
+
+    return val_append(head,
+                      cons_val(unit,indic_number(rest,num_table)));
+}

+cst_val *indic_number_indiv(const cst_val *number,
+                      const indic_num_table *num_table)
+{
+    /* Expand this as a string of digits (not an actual quantity): each */
+    /* element of number is converted on its own and the results are    */
+    /* joined in order.  An empty list gives NULL.                      */
+    cst_val *words = NULL;
+    const cst_val *d;
+
+    for (d = number; d != NULL; d = val_cdr(d))
+        words = val_append(words,
+                           indic_number_digit(val_string(val_car(d)),
+                                              num_table));
+
+    return words;
+}

+#if 0

+static int indic_nump_old(const char *number)

+{

+    /* True if all (unicode) characters are in num_table's digit table */

+    /* or is a comma or dot */

+    cst_val *p;

+    const cst_val *q;

+    int i;

+    int flag = TRUE;

+    int fflag;

+    p = cst_utf8_explode(number);

+    for (q=p; q && (flag==TRUE); q=val_cdr(q))

+    {

+        fflag = FALSE;

+        for (i=0; i<10; i++)

+        {

+            if (indic_digit_to_offset(val_string(val_car(q))) != -1)

+            {

+                fflag = TRUE;

+                break;

+            }

+        }

+        if ((cst_streq(val_string(val_car(q)),",")) ||

+            /* English zeros sometimes occur */

+            (cst_streq(val_string(val_car(q)),"0")))

+            fflag = TRUE;

+        flag = fflag;

+    }

+    delete_val(p); p = NULL;

+    return flag;

+}

+#endif

+static int indic_nump(const char *number)
+{
+    /* Classify a token by its digit content (leading commas ignored):  */
+    /*   0  empty, or the first character is not a digit                */
+    /*   1  starts with digits/commas but a later character is neither  */
+    /*   2  every character is a digit or a comma                       */
+    cst_val *chars;
+    const cst_val *c;
+    int all_ok = TRUE;        /* FALSE once a non-digit, non-comma is hit */
+    int seen_digit = FALSE;   /* TRUE once any digit has been seen */
+
+    if (!number[0])
+        return FALSE;
+
+    /* Catch lone commas */
+    if (number[0] == ',')
+        return indic_nump(&number[1]);
+
+    chars = cst_utf8_explode(number);
+    for (c = chars; c && (all_ok == TRUE); c = val_cdr(c))
+    {
+        if (indic_digit_to_offset(val_string(val_car(c))) != -1)
+            seen_digit = TRUE;
+        else if (!cst_streq(val_string(val_car(c)),","))
+            all_ok = FALSE;
+    }
+    delete_val(chars);
+    chars = NULL;
+
+    return all_ok+seen_digit;
+}

+static int indic_hyphenated(const char *number)
+{
+    /* If the first character is one of - / . returns indic_nump() of   */
+    /* the rest of the string (so non-zero when a number follows);      */
+    /* returns 0 for any other first character                          */
+    switch (number[0])
+    {
+    case '-':
+    case '/':
+    case '.':
+        return indic_nump(&number[1]);
+    default:
+        return 0;
+    }
+}

+static int indic_text_splitable(const char *s,int i,int len1)
+{
+    /* Returns true only if exactly one of "the character at byte       */
+    /* offset i of s" (len1 bytes long) and "the character after it"    */
+    /* is a digit, i.e. the text can be split between the two.          */
+    /* The end of the string counts as a non-digit.                     */
+    char *ccc, *ddd;    /* Copies of this character and the next one */
+    int len2;           /* Length of next character */
+    int avail;          /* Bytes really present in the copy */
+    int this_is_digit, next_is_digit;
+
+    ccc = cst_strdup(&s[i]);
+    avail = (int)cst_strlen(ccc);
+    /* Never terminate (or step) past the end of the copied text: a    */
+    /* truncated multi-byte sequence may be shorter than its lead byte */
+    /* announces                                                       */
+    if (len1 > avail)
+        len1 = avail;
+    ccc[len1] = '\0';
+
+    ddd = cst_strdup(&s[i+len1]);
+    avail = (int)cst_strlen(ddd);
+    len2 = ts_utf8_sequence_length(ddd[0]);
+    /* When the next "character" is the end of the string ddd is only  */
+    /* one byte long, so writing ddd[1] would be out of bounds         */
+    if (len2 > avail)
+        len2 = avail;
+    ddd[len2] = '\0';
+
+    this_is_digit = (indic_digit_to_offset(ccc) != -1);
+    next_is_digit = (indic_digit_to_offset(ddd) != -1);
+
+    cst_free(ccc);
+    cst_free(ddd);
+
+    return this_is_digit != next_is_digit;
+}

+static cst_val *indic_num_normalize(const char *number,
+                                    const indic_num_table *num_table)
+{
+    /* Returns a new list of the (UTF-8) characters of number with all  */
+    /* commas removed, in their original order.  num_table is not used. */
+    cst_val *chars = cst_utf8_explode(number);
+    cst_val *kept = NULL;
+    const cst_val *c;
+    const char *ch;
+
+    for (c = chars; c != NULL; c = val_cdr(c))
+    {
+        ch = val_string(val_car(c));
+        if (cst_streq(ch,","))
+            continue;
+        /* Consing builds the list backwards; it is reversed below */
+        kept = cons_val(string_val(ch),kept);
+    }
+    delete_val(chars);
+
+    return val_reverse(kept);
+}

+static cst_val *cmu_indic_tokentowords_one(cst_item *token, const char *name)
+{
+    /* Return list of words that expand token/name.                     */
+    /*  - tokens carrying a "phones" feature are passed through as is   */
+    /*  - tokens made of digits (any supported script) and commas are   */
+    /*    expanded with the number table chosen by the utterance's      */
+    /*    "variant" parameter (English table when unknown)              */
+    /*  - tokens that start with digits but contain other characters    */
+    /*    are split at the first digit/non-digit boundary and each half */
+    /*    is expanded recursively                                       */
+    /*  - a leading - / . in front of a number is dropped               */
+    /*  - tokens matching cst_rx_not_indic go to the English front end  */
+    /*  - anything else is returned as a single word (NULL if empty)    */
+    cst_val *r, *p;
+    const indic_num_table *num_table;
+    const char *variant;
+    cst_utterance *utt;
+    int nump;
+
+    /* printf("awb_debug token_name %s name %s\n",item_name(token),name); */
+
+    if (item_feat_present(token,"phones"))
+        return cons_val(string_val(name),NULL);
+
+    utt = item_utt(token);
+    variant = get_param_string(utt->features, "variant", "none");
+    if (cst_streq(variant,"hin"))
+        num_table = &hin_num_table;
+    else if (cst_streq(variant,"guj"))
+        num_table = &guj_num_table;
+    else if (cst_streq(variant,"kan"))
+        num_table = &kan_num_table;
+    else if (cst_streq(variant,"mar"))
+        num_table = &mar_num_table;
+    else if (cst_streq(variant,"nep"))
+        num_table = &hin_num_table;    /* nep uses the hin table */
+    else if (cst_streq(variant, "pan"))
+        num_table = &pan_num_table;
+    else if (cst_streq(variant, "san"))
+        num_table = &san_num_table;
+    else if (cst_streq(variant,"tam"))
+        num_table = &tam_num_table;
+    else if (cst_streq(variant,"tel"))
+        num_table = &tel_num_table;
+    else
+        num_table = &eng_num_table;
+
+    /* *English* numbers of the form 99,99,999 need lakh or crore
+       expansion -- otherwise they'd be dropped back to the English front
+       end -- so they are handled exactly like an all-digit token.
+       indic_nump() is evaluated once only. */
+    if (cst_regex_match(cst_rx_indic_eng_number,name))
+        nump = 2;
+    else
+        nump = indic_nump(name);
+
+    if (nump == 2)
+    {   /* All characters are digits (or commas) */
+        p = indic_num_normalize(name,num_table);    /* remove commas */
+        if (val_length(p) <= 9)
+            r = indic_number(p, num_table);
+        else
+            /* Long strings of digits are read as strings of digits */
+            r = indic_number_indiv(p,num_table);
+        delete_val(p);
+        return r;
+    }
+
+    if (nump == 1)
+    {   /* Some characters are digits */
+        const int n = (int)cst_strlen(name);
+        int i = 0;
+        int len;
+        int cut = n;    /* byte offset to split at; n means "none found" */
+        char *aaa;
+
+        while (i < n)
+        {
+            /* Iterate over UTF-8 string */
+            len = ts_utf8_sequence_length(name[i]);
+            if (len < 1)
+                len = 1;
+            /* Last character: there is nothing after it to split from */
+            if (i + len >= n)
+                break;
+            /* Check if char after this is comma */
+            if (name[i+len] == ',')
+            {
+                /* Skip this character and the comma */
+                i += len;
+                i += ts_utf8_sequence_length(name[i]);
+                continue;
+            }
+            /* Find where character type switches to or from digits */
+            if (indic_text_splitable(name, i, len))
+            {
+                cut = i + len;
+                break;
+            }
+            i += len;
+        }
+
+        /* No boundary strictly inside the token (e.g. "12,abc", where  */
+        /* the comma hides the switch).  Splitting here would hand the  */
+        /* unchanged string back to ourselves forever and index past    */
+        /* the end of it, so treat the token as one plain word instead. */
+        if (cut >= n)
+            return cons_val(string_val(name),NULL);
+
+        aaa = cst_strdup(name);
+        aaa[cut] = '\0';
+        r = cmu_indic_tokentowords_one(token, aaa);
+        r = val_append(r, cmu_indic_tokentowords_one(token, &name[cut]));
+        cst_free(aaa);
+        return r;
+    }
+
+    if (indic_hyphenated(name))
+        /* For numbers preceded by - / . : drop that first character */
+        return cmu_indic_tokentowords_one(token, &name[1]);
+
+    if (cst_regex_match(cst_rx_not_indic,name))
+        /* Do English analysis on non-unicode tokens */
+        /* NOTE(review): this is given the token, not name, so for a   */
+        /* split-off part it presumably expands the token's full name  */
+        /* -- confirm                                                  */
+        return us_tokentowords(token);
+
+    if (cst_strlen(name) > 0)
+        return cons_val(string_val(name),NULL);
+
+    return NULL;
+}

+int indic_utt_break(cst_tokenstream *ts,
+                    const char *token,
+                    cst_relation *tokens)
+{
+    /* Returns TRUE when an utterance break should be placed after the  */
+    /* last token in tokens, FALSE otherwise.  The token argument is    */
+    /* not consulted.                                                   */
+    cst_item *last = relation_tail(tokens);
+    const char *postpunct = item_feat_string(last, "punc");
+    const char *ltoken = item_name(last);
+    int ltoken_len;
+
+    /* First and last newline differ: contains two new lines */
+    if (cst_strchr(ts->whitespace,'\n') != cst_strrchr(ts->whitespace,'\n'))
+        return TRUE;
+
+    /* Last token ends in the (three byte) devanagari '|' */
+    ltoken_len = cst_strlen(ltoken);
+    if ((ltoken_len >= 3) &&
+        (cst_streq(&ltoken[ltoken_len-3],"।")))
+        return TRUE;
+
+    /* Sentence final punctuation; ascii '|' is included because it */
+    /* sometimes gets used as the devanagari '|'                     */
+    if (strchr(postpunct,':') ||
+        strchr(postpunct,'?') ||
+        strchr(postpunct,'|') ||
+        strchr(postpunct,'!') ||
+        strchr(postpunct,'.'))
+        return TRUE;
+
+    return FALSE;
+}

+DEF_STATIC_CONST_VAL_STRING(val_string_zero,"0");

+DEF_STATIC_CONST_VAL_STRING(val_string_one,"1");

+const cst_val *is_english(const cst_item *p)
+{
+    /* Feature function: "1" when the item's name matches             */
+    /* cst_rx_not_indic, "0" otherwise (including when p is NULL)     */
+    if (!p)
+        return (cst_val *)&val_string_zero;
+
+    return cst_regex_match(cst_rx_not_indic,
+                           flite_ffeature_string(p,"name"))
+        ? (cst_val *)&val_string_one
+        : (cst_val *)&val_string_zero;
+}

+void cmu_indic_lang_init(cst_voice *v)
+{
+    /* Install the Indic language settings in the voice's feature set  */
+    /* and register the feature functions it needs                     */
+
+    /* Language name */
+    feat_set_string(v->features,"language","cmu_indic_lang");
+
+    /* Function that decides where utterances end */
+    feat_set(v->features,"utt_break",breakfunc_val(&indic_utt_break));
+
+    /* Phoneset (need to get this from voice) and its silence phone */
+    feat_set(v->features,"phoneset",phoneset_val(&cmu_indic_phoneset));
+    feat_set_string(v->features,"silence",cmu_indic_phoneset.silence);
+
+    /* Tokenizer character classes: defaults for everything except   */
+    /* post-punctuation.  Multi-byte characters can't go in these    */
+    /* classes, so the devanagari end of sentence '|' can't be added */
+    /* -- but ascii '|' is, as it sometimes gets used the same way   */
+    feat_set_string(v->features,"text_whitespace",
+                    cst_ts_default_whitespacesymbols);
+    feat_set_string(v->features,"text_prepunctuation",
+                    cst_ts_default_prepunctuationsymbols);
+    feat_set_string(v->features,"text_postpunctuation",
+                    indic_postpunctuationsymbols);
+    feat_set_string(v->features,"text_singlecharsymbols",
+                    cst_ts_default_singlecharsymbols);
+
+    /* Token to words expansion */
+    feat_set(v->features,"tokentowords_func",
+             itemfunc_val(&cmu_indic_tokentowords));
+
+    /* Phrasing */
+    feat_set(v->features,"phrasing_cart",cart_val(&cmu_indic_phrasing_cart));
+
+    /* Intonation, Duration and F0 -- part of cg */
+    feat_set_string(v->features,"no_intonation_accent_model","1");
+
+    /* Default ffunctions (required) */
+    basic_ff_register(v->ffunctions);
+
+    /* Indic specific features */
+    ff_register(v->ffunctions, "lisp_is_english", is_english);
+}

--- a/lang/cmu_indic_lang/Makefile

+++ b/lang/cmu_indic_lang/Makefile

@@ -46,6 +46,7 @@

     indic_guj_num_table.h \

     indic_tam_num_table.h \

     indic_tel_num_table.h \

+    indic_san_num_table.h \

     indic_pan_num_table.h

 SRCS = cmu_indic_lang.c cmu_indic_phoneset.c cmu_indic_phrasing_cart.c

 SCRIPTS =

--- a/lang/cmu_indic_lang/cmu_indic_lang.c

+++ b/lang/cmu_indic_lang/cmu_indic_lang.c

@@ -111,6 +111,7 @@

 #include "indic_guj_num_table.h"

 #include "indic_kan_num_table.h"

 #include "indic_mar_num_table.h"

+#include "indic_san_num_table.h"

 #include "indic_tel_num_table.h"

 #include "indic_tam_num_table.h"

 #include "indic_pan_num_table.h"

@@ -133,6 +134,7 @@

 //    return (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1;

 // }

 int indic_digit_to_offset(const char *ind_digit)

   /* This functions returns int value of a single digit in Indic/English scripts.

@@ -237,6 +239,10 @@

     return r;

+static cst_val *indic_number_lang(const indic_num_table *num_table)

+{

+    return string_val(num_table->lang);

+}

 static cst_val *indic_number_hundred(const indic_num_table *num_table)

     return string_val(num_table->hundred);

@@ -301,9 +307,15 @@

     else if (val_length(number) == 3)

-        r = val_append(indic_number_digit(val_string(val_car(number)),num_table),

-                 cons_val(indic_number_hundred(num_table),

-                          indic_number(val_cdr(number),num_table)));

+        if ((!cst_streq(val_string(indic_number_lang(num_table)),"mar")) ||

+            indic_digit_to_offset(val_string(val_car(val_cdr(number)))) ||

+            indic_digit_to_offset(val_string(val_car(val_cdr(val_cdr(number))))))

+            r = val_append(indic_number_digit(val_string(val_car(number)),num_table),

+                    cons_val(indic_number_hundred(num_table),

+                            indic_number(val_cdr(number),num_table)));

+        else

+            r = val_append(indic_number_digit(val_string(val_car(number)),num_table),

+                    cons_val(string_val("शंभर"), indic_number(val_cdr(number),num_table)));

     else if (val_length(number) == 4)

@@ -409,6 +421,7 @@

 #endif

 static int indic_nump(const char *number)

     /* Check if non-empty string */

@@ -540,14 +553,14 @@

         num_table = &mar_num_table;

     else if (cst_streq(variant,"nep"))

         num_table = &hin_num_table;

-    else if (cst_streq(variant,"san"))

-        num_table = &hin_num_table;

-    else if (cst_streq(variant,"tel"))

-        num_table = &tel_num_table;

-    else if (cst_streq(variant,"tam"))

-        num_table = &tam_num_table;

     else if (cst_streq(variant, "pan"))

         num_table = &pan_num_table;

+    else if (cst_streq(variant, "san"))

+        num_table = &san_num_table;

+    else if (cst_streq(variant,"tam"))

+        num_table = &tam_num_table;

+    else if (cst_streq(variant,"tel"))

+        num_table = &tel_num_table;

     else

         num_table = &eng_num_table;

@@ -560,7 +573,7 @@

         p = indic_num_normalize(name,num_table);

         if (val_length(p) <= 9)

             /* Long strings of digits are read as strings of digits */

-            r = indic_number(p,num_table);

+            r = indic_number(p, num_table);

 	else

             r = indic_number_indiv(p,num_table);

         delete_val(p);

@@ -568,58 +581,58 @@

     else if (indic_nump(name))

     {   /* Its script specific digits (commas/dots) */

-        if (indic_nump(name) == 2)

-        {   /* All characters are digits */

-            // printf("nump is 2\n");

-            p = indic_num_normalize(name,num_table);

-            if (val_length(p) <= 9)

-                r = indic_number(p,num_table);

-            else

-                r = indic_number_indiv(p,num_table);

-            delete_val(p);

-        }

-        else if (indic_nump(name) == 1)

-        {   /* Some characters are digits */

-            int len = 1;

-            int i = 0;

-            char c0;

-            char *aaa;

-            char *bbb;

-            while(name[i] != '\0')

-            {

-                /* Iterate over UTF-8 string */

-                c0 = name[i];

-                len = ts_utf8_sequence_length(c0);

-                /* Check if char after this is comma */

-                if (name[i+len] == ',')

-                {

-                    /* Skip commas */

-                    i += len;

-                    c0 = name[i];

-                    len = ts_utf8_sequence_length(c0);

-                    i += len;

-                    continue;

-                }

-                /* Find where character type switches to or from digits */

-                if(indic_text_splitable(name, i, len))

-                    break;

-                i +=len;

-            }

-            aaa = cst_strdup(name);

-            aaa[i+len] = '\0';

-            bbb = cst_strdup(&name[i+len]);

-            r = val_append(cmu_indic_tokentowords_one(token, aaa),

-                           cmu_indic_tokentowords_one(token, bbb));

-            cst_free(aaa);

-            cst_free(bbb);

-        }

+	    if (indic_nump(name) == 2)

+	    {   /* All characters are digits */

+           // printf("nump is 2\n");

+	        p = indic_num_normalize(name,num_table);

+	        if (val_length(p) <= 9)

+		    r = indic_number(p, num_table);

+	        else

+		    r = indic_number_indiv(p,num_table);

+	        delete_val(p);

+	    }

+	    else if (indic_nump(name) == 1)

+	    {   /* Some characters are digits */

+	        int len = 1;

+	        int i = 0;

+	        char c0;

+                char *aaa;

+                char *bbb;

+	        while(name[i] != '\0')

+	        {

+		        /* Iterate over UTF-8 string */

+		        c0 = name[i];

+		        len = ts_utf8_sequence_length(c0);

+                        /* Check if char after this is comma */

+                        if (name[i+len] == ',')

+                        {

+                          /* Skip commas */

+                        i += len;

+                        c0 = name[i];

+                        len = ts_utf8_sequence_length(c0);

+                        i += len;

+                        continue;

+                        }

+		        /* Find where character type switches to or from digits */

+		        if(indic_text_splitable(name, i, len))

+		            break;

+		        i +=len;

+	        }

+	        aaa = cst_strdup(name);

+	        aaa[i+len] = '\0';

+	        bbb = cst_strdup(&name[i+len]);

+	        r = val_append(cmu_indic_tokentowords_one(token, aaa),

+			        cmu_indic_tokentowords_one(token, bbb));

+	        cst_free(aaa);

+	        cst_free(bbb);

+	    }

     else if (indic_hyphenated(name))

     {	/* For numbers seeparated by - / , */

-        char *aaa;

-        aaa = cst_strdup(&name[1]);

-        r = cmu_indic_tokentowords_one(token, aaa);

-        cst_free(aaa);

+            char *aaa;

+	    aaa = cst_strdup(&name[1]);

+	    r = cmu_indic_tokentowords_one(token, aaa);

+	    cst_free(aaa);

     else if (cst_regex_match(cst_rx_not_indic,name))

--- /dev/null

+++ b/lang/cmu_indic_lang/indic_san_num_table.h

@@ -1,0 +1,172 @@

+/*************************************************************************/

+/*                                                                       */

+/*                  Language Technologies Institute                      */

+/*                     Carnegie Mellon University                        */

+/*                         Copyright (c) 2015                            */

+/*                        All Rights Reserved.                           */

+/*                                                                       */

+/*  Permission is hereby granted, free of charge, to use and distribute  */

+/*  this software and its documentation without restriction, including   */

+/*  without limitation the rights to use, copy, modify, merge, publish,  */

+/*  distribute, sublicense, and/or sell copies of this work, and to      */

+/*  permit persons to whom this work is furnished to do so, subject to   */

+/*  the following conditions:                                            */

+/*   1. The code must retain the above copyright notice, this list of    */

+/*      conditions and the following disclaimer.                         */

+/*   2. Any modifications must be clearly marked as such.                */

+/*   3. Original authors' names are not deleted.                         */

+/*   4. The authors' names are not used to endorse or promote products   */

+/*      derived from this software without specific prior written        */

+/*      permission.                                                      */

+/*                                                                       */

+/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */

+/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */

+/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */

+/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */

+/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */

+/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */

+/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */

+/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */

+/*  THIS SOFTWARE.                                                       */

+/*                                                                       */

+/*************************************************************************/

+/*  Number pronunciation for (Sanskrit) Indic                             */

+/*************************************************************************/

+#ifndef _indic_san_num_table_h_

+#define _indic_san_num_table_h_

+#ifdef __cplusplus

+extern "C" {

+#endif /* __cplusplus */

+#include "indic_num_table.h"

+static const char * const indic_san_digit[11][2] = /* Sanskrit: { Devanagari digit, word used for it } for 0-9 */

+{

+    { "०", "शून्य" },

+    { "१", "एकं" },

+    { "२", "द्वि" },

+    { "३", "त्रि" },

+    { "४", "चतुर्" },

+    { "५", "पञ्च" },

+    { "६", "षट्" },

+    { "७", "सप्त" },

+    { "८", "अष्ट" },

+    { "९", "नव" },

+    { NULL, NULL }, /* NULL pair closes the table (presumably the lookup's stop marker -- confirm in indic_num_table.h users) */

+};

+static const char * const indic_san_two_digit[101][4] = /* Sanskrit 10-99: { tens digit, units digit, word, optional 2nd word } */

+{

+    { "१", "०", "दश", NULL },

+    { "१", "१", "एकादश", NULL },

+    { "१", "२", "द्वादश", NULL },

+    { "१", "३", "त्रयोदश", NULL },

+    { "१", "४", "चतुर्दश", NULL },

+    { "१", "५", "पञ्चदश", NULL },

+    { "१", "६", "षोडश", NULL },

+    { "१", "७", "सप्तदश", NULL },

+    { "१", "८", "अष्टादश", NULL },

+    { "१", "९", "एकोनविंशतिः", NULL },

+    { "२", "०", "विंशतिः", NULL },

+    { "२", "१", "एकविंशतिः", NULL },

+    { "२", "२", "द्वाविंशतिः", NULL },

+    { "२", "३", "त्रयोविंशतिः", NULL },

+    { "२", "४", "चतुर्विंशतिः", NULL },

+    { "२", "५", "पञ्चविंशतिः", NULL },

+    { "२", "६", "षड्विंशतिः", NULL },

+    { "२", "७", "सप्तविंशतिः", NULL },

+    { "२", "८", "अष्टाविंशतिः", NULL },

+    { "२", "९", "एकोनत्रिंशत्", NULL },

+    { "३", "०", "त्रिंशत्", NULL },

+    { "३", "१", "एकत्रिंशत्", NULL },

+    { "३", "२", "द्वात्रिंशत्", NULL },

+    { "३", "३", "त्रयस्त्रिंशत्", NULL },

+    { "३", "४", "चतुस्त्रिंशत्", NULL },

+    { "३", "५", "पञ्चत्रिंशत्", NULL },

+    { "३", "६", "षट्त्रिंशत्", NULL },

+    { "३", "७", "सप्तत्रिंशत्", NULL },

+    { "३", "८", "अष्टात्रिंशत्", NULL },

+    { "३", "९", "एकोनचत्वारिंशत्", NULL },

+    { "४", "०", "चत्वारिंशत्", NULL },

+    { "४", "१", "एकचत्वारिंशत्", NULL },

+    { "४", "२", "द्विचत्वारिंशत्", NULL },

+    { "४", "३", "त्रिचत्वारिंशत्", NULL },

+    { "४", "४", "चतुश्चत्वारिंशत्", NULL },

+    { "४", "५", "पञ्चचत्वारिंशत्", NULL },

+    { "४", "६", "षट्चत्वारिंशत्", NULL },

+    { "४", "७", "सप्तचत्वारिंशत्", NULL },

+    { "४", "८", "अष्टचत्वारिंशत्", NULL },

+    { "४", "९", "एकोनपञ्चाशत्", NULL },

+    { "५", "०", "पञ्चाशत्", NULL },

+    { "५", "१", "एकपञ्चाशत्", NULL },

+    { "५", "२", "द्विपञ्चाशत्", NULL },

+    { "५", "३", "त्रिपञ्चाशत्", NULL },

+    { "५", "४", "चतुःपञ्चाशत्", NULL },

+    { "५", "५", "पञ्चपञ्चाशत्", NULL },

+    { "५", "६", "षट्पञ्चाशत्", NULL },

+    { "५", "७", "सप्तपञ्चाशत्", NULL },

+    { "५", "८", "अष्टपञ्चाशत्", NULL },

+    { "५", "९", "एकोनषष्टिः", NULL },

+    { "६", "०", "षष्टिः", NULL },

+    { "६", "१", "एकषष्टिः", NULL },

+    { "६", "२", "द्विषष्टिः", NULL },

+    { "६", "३", "त्रिषष्टिः", NULL },

+    { "६", "४", "चतुष्षष्टिः", NULL },

+    { "६", "५", "पञ्चषष्टिः", NULL },

+    { "६", "६", "षट्षष्टिः", NULL },

+    { "६", "७", "सप्तषष्टिः", NULL },

+    { "६", "८", "अष्टषष्टिः", NULL },

+    { "६", "९", "एकोनसप्ततिः", NULL },

+    { "७", "०", "सप्ततिः", NULL },

+    { "७", "१", "एकसप्ततिः", NULL },

+    { "७", "२", "द्विसप्ततिः", NULL },

+    { "७", "३", "त्रिसप्ततिः", NULL },

+    { "७", "४", "चतुस्सप्ततिः", NULL },

+    { "७", "५", "पञ्चसप्ततिः", NULL },

+    { "७", "६", "षट्सप्ततिः", NULL },

+    { "७", "७", "सप्तसप्ततिः", NULL },

+    { "७", "८", "अष्टसप्ततिः", NULL },

+    { "७", "९", "एकोनाशीतिः", NULL },

+    { "८", "०", "अशीतिः", NULL },

+    { "८", "१", "एकाशीतिः", NULL },

+    { "८", "२", "द्व्यशीतिः", NULL },

+    { "८", "३", "त्र्यशीतिः", NULL },

+    { "८", "४", "चतुरशीतिः", NULL },

+    { "८", "५", "पञ्चाशीतिः", NULL },

+    { "८", "६", "षडशीतिः", NULL },

+    { "८", "७", "सप्ताशीतिः", NULL },

+    { "८", "८", "अष्टाशीतिः", NULL },

+    { "८", "९", "एकोननवतिः", NULL },

+    { "९", "०", "नवतिः", NULL },

+    { "९", "१", "एकनवतिः", NULL },

+    { "९", "२", "द्विनवतिः", NULL },

+    { "९", "३", "त्रिनवतिः", NULL },

+    { "९", "४", "चतुर्नवतिः", NULL },

+    { "९", "५", "पञ्चनवतिः", NULL },

+    { "९", "६", "षण्णवतिः", NULL },

+    { "९", "७", "सप्तनवतिः", NULL },

+    { "९", "८", "अष्टनवतिः", NULL },

+    { "९", "९", "एकोनशतम्", NULL },

+    { NULL, NULL }, /* NULL digits close the table; remaining columns are zero-initialised */

+};

+static const indic_num_table san_num_table = { /* Sanskrit number-to-words data bundle */

+    "san", /* language tag (presumably matched against the "variant" feature -- confirm) */

+    &indic_san_digit, /* single-digit words */

+    &indic_san_two_digit, /* words for 10-99 */

+    "शतम्",   /* hundred */

+    "सहस्र", /* thousand */

+    "लक्ष",  /* lakh */

+    "कोटि", /* crore */

+};

+#ifdef __cplusplus

+} /* extern "C" */

+#endif /* __cplusplus */

+#endif

--- a/lang/cmu_indic_lex/cmu_indic_lex.c

+++ b/lang/cmu_indic_lex/cmu_indic_lex.c

@@ -283,34 +283,6 @@

     return cmu_indic_offset_char[c].type;

-static int indic_text_splitable(const char *s,int i,int len1)

-{

-    /* Returns true only if this and next chars are not both digits */

-    /* or both non-digits */

-    char *ccc, *ddd;    /* Store this character and the next character */

-    int len2;           /* Length of next character */

-    int flag;

-    ccc = cst_strdup(&s[i]);

-    ddd = cst_strdup(&s[i+len1]);

-    len2 = utf8_sequence_length(ddd[0]);

-    ccc[len1] = '\0';

-    ddd[len2] = '\0';

-    /* Makeshift NOR */

-    flag = (indic_digit_to_offset(ccc) == -1)? !(indic_digit_to_offset(ddd) == -1):

-	       (indic_digit_to_offset(ddd) == -1);

-    cst_free(ccc);

-    cst_free(ddd);

-    return flag;

-}

 static const char *cmu_indic_get_char_phoneme(const cst_val *indic_char)

     int c;

@@ -625,7 +597,8 @@

     return in_phones;

-cst_val *cmu_indic_lex_nasal_postfixes(cst_val *in_phones)

+cst_val *cmu_indic_lex_nasal_postfixes(cst_val *in_phones,

+                                                const cst_features *feats)

     /* Given a phone sequence containing a special character nX        */

     /* (contextual nasal), replace it with the appropriate nasal phone */

@@ -632,6 +605,10 @@

     /* based on its context                                            */

     char *tmpstr;

     const cst_val *p;

+    const char *indic_variant = 0;

+    indic_variant = get_param_string(feats, "variant", "none");

     /* printf("awb_debug: pre "); val_print(stdout,in_phones); printf("\n"); */

     for( p=in_phones; p && val_cdr(p); p=val_cdr(p))

@@ -642,7 +619,9 @@

             ((!val_cdr(val_cdr(p))) ||

              (!val_car(val_cdr(val_cdr(p))))))

-            if (cst_streq("A", val_string(val_car(p))))

+            if (cst_streq(indic_variant,"kan") ||

+                cst_streq(indic_variant,"tel") || /* Dravidian languages don't nasalize */

+                cst_streq("A", val_string(val_car(p))))

             {   /* If it's a schwa, it's not nasalized. nX becomes m */

                 replace_car(val_cdr(p),string_val("m"));

             } else {

@@ -961,25 +940,23 @@

     return phones;

-/* For English derived pronunciation (latin scripted tokens) we map them */

-/* to (hindi) phones -- this has to modified for other indic languages */

-static const char * const eng_to_indic_orig[99][3] =

+static const char * const eng_to_indic[99][3] =

         {"aa", "A:", NULL },

-        {"ae", "A", NULL },  /* changed this to A rather than e */

+        {"ae", "aI", NULL },

         {"ah", "A", NULL },

-        {"ao", "o", NULL },

-        {"aw", "aU", NULL },

+        {"ao", "aU", NULL },

+        {"aw", "A:", "u" },

         {"ax", "A", NULL },

-        {"axr", "A", NULL },

-        {"ay", "aI", NULL },

+        {"axr", "A", "9r" },

+        {"ay", "A:", "i" },

         {"b", "b", NULL },

         {"ch", "c", NULL },

-        {"d", "dB", NULL },

+        {"d", "dr", NULL },

         {"dh", "dB", NULL },

-        {"eh", "e", NULL },

-        {"er", "9r", NULL },

-        {"ey", "ay", NULL },

+        {"eh", "E", NULL },

+        {"er", "A", "9r" },

+        {"ey", "e", NULL },

         {"f", "ph", NULL },

         {"g", "g", NULL },

         {"hh", "hv", NULL },

@@ -991,9 +968,9 @@

         {"m", "m", NULL },

         {"n", "nB", NULL },

         {"nx", "nB", NULL },

-        {"ng", "nB", NULL },

+        {"ng", "N", NULL },

         {"ow", "o", NULL },

-        {"oy", "o", "j" },

+        {"oy", "aU", "i" },

         {"p", "p", NULL },

         {"r", "9r", NULL },

         {"s", "s", NULL },

@@ -1005,88 +982,112 @@

         {"v", "v", NULL },

         {"w", "v", NULL },

         {"y", "j", NULL },

-        {"z", "s", NULL },

+        {"z", "z", NULL },

         {"zh", "c}", NULL },

         {NULL, NULL, NULL }

};

-/* For English derived pronunciation (latin scripted tokens) we map them */

-/* to (hindi) phones -- this has to modified for other indic languages */

-/* Sai Krishna */

-/* 07 July 2017 */

-/* Making this v1 as Shyam's mapping looks a bit different */

-static const char * const eng_to_indic_v1[99][3] =

+/* Mapping for Tamil taking stress into consideration */

+/* Shyam Krishna, 2018/03/06 */

+static const char * const eng_to_tam_stress[99][3] =

-        {"aa", "aa", NULL },

-        {"ae", "ae", NULL },  /* changed this to A rather than e */

-        {"ah", "ah", NULL },

-        {"ao", "ao", NULL },

-        {"aw", "aw", NULL },

-        {"ax", "ax", NULL },

-        {"axr", "axr", NULL },

-        {"ay", "ay", NULL },

+        {"aa0", "A", NULL },

+        {"aa1", "A:", NULL },

+        {"ae0", "A", NULL },

+        {"ae1", "e", NULL },

+        {"ah1", "A", NULL },

+        {"ao0", "A", NULL },

+        {"ao1", "o:", NULL }, /*TODO: resolve horse-hoarse merger */

+        {"aw0", "aU", NULL },

+        {"aw1", "aU", NULL },

+        {"ax", "A", NULL },

+        {"ax0", "A", NULL },

+        {"ay0", "aI", NULL },

+        {"ay1", "aI", NULL },

         {"b", "b", NULL },

-        {"ch", "ch", NULL },

-        {"d", "d", NULL },

-        {"dh", "dh", NULL },

-        {"eh", "eh", NULL },

-        {"er", "er", NULL },

-        {"ey", "ey", NULL },

-        {"f", "f", NULL },

+        {"ch", "c", NULL },

+        {"d", "dr", NULL },

+        {"dh", "dB", NULL },

+        {"eh0", "e", NULL },

+        {"eh1", "e", NULL },

+        {"er", "A", "9r" },

+        {"er0", "A", "9r" },

+        {"er1", "A", "9r" },

+        {"ey0", "e", NULL },

+        {"ey1", "e:", NULL },

+        {"f", "p", NULL },

         {"g", "g", NULL },

-        {"hh", "hh", NULL },

-        {"ih", "ih", NULL },

-        {"iy", "iy", NULL },

-        {"jh", "jh", NULL },

+        {"hh", "hv", NULL },

+        {"ih", "i", NULL },

+        {"ih0", "i", NULL },

+        {"ih1", "i", NULL },

+        {"iy0", "i", NULL },

+        {"iy1", "i:", NULL },

+        {"jh", "J", NULL },

         {"k", "k", NULL },

         {"l", "l", NULL },

         {"m", "m", NULL },

-        {"n", "n", NULL },

-        {"nx", "n", NULL },

-        {"ng", "n", NULL },

-        {"ow", "ow", NULL },

-        {"oy", "oy", "j" },

+        {"n", "nB", NULL },

+        {"nx", "nB", NULL },

+        {"ng", "N", NULL },

+        {"ow0", "o", NULL },

+        {"ow1", "o:", NULL },

+        {"oy0", "o", "j" },

+        {"oy1", "o:", "j" },

         {"p", "p", NULL },

-        {"r", "r", NULL },

+        {"r", "9r", NULL },

         {"s", "s", NULL },

-        {"sh", "sh", NULL },

-        {"t", "t", NULL },

-        {"th", "th", NULL },

-        {"uh", "uh", NULL },

-        {"uw", "uw", NULL },

+        {"sh", "sr", NULL },

+        {"t", "tr", NULL },

+        {"th", "tB", NULL },

+        {"uh0", "u", NULL },

+        {"uh1", "u", NULL },

+        {"uw0", "u", NULL },

+        {"uw1", "u:", NULL },

         {"v", "v", NULL },

-        {"w", "w", NULL },

-        {"y", "y", NULL },

-        {"z", "z", NULL },

-        {"zh", "zh", NULL },

+        {"w", "v", NULL },

+        {"y", "j", NULL },

+        {"z", "s", NULL },

+        {"zh", "sr", NULL },

         {NULL, NULL, NULL }

};

-static const char * const eng_to_indic[99][3] =

+/* Mapping for Kannada taking stress into consideration */

+/* Shyam Krishna, 2018/04/06 */

+static const char * const eng_to_kan_stress[99][3] =

-        {"aa", "A:", NULL },

-        {"ae", "aI", NULL },

-        {"ah", "A", NULL },

-        {"ao", "aU", NULL },

-        {"aw", "A:", "u" },

+        {"aa0", "A", NULL },

+        {"aa1", "A:", NULL },

+        {"ae0", "A", NULL },

+        {"ae1", "e", NULL },

+        {"ah1", "A", NULL },

+        {"ao0", "A", NULL },

+        {"ao1", "o:", NULL }, /*TODO: resolve horse-hoarse merger */

+        {"aw0", "aU", NULL },

+        {"aw1", "aU", NULL },

         {"ax", "A", NULL },

-        {"axr", "A", "9r" },

-        {"ay", "A:", "i" },

+        {"ax0", "A", NULL },

+        {"ay0", "aI", NULL },

+        {"ay1", "aI", NULL },

         {"b", "b", NULL },

         {"ch", "c", NULL },

         {"d", "dr", NULL },

         {"dh", "dB", NULL },

-        {"eh", "E", NULL },

+        {"eh0", "e", NULL },

+        {"eh1", "e", NULL },

         {"er", "A", "9r" },

-        {"ey", "e", NULL },

+        {"er0", "A", "9r" },

+        {"er1", "A", "9r" },

+        {"ey0", "e", NULL },

+        {"ey1", "e:", NULL },

         {"f", "ph", NULL },

         {"g", "g", NULL },

         {"hh", "hv", NULL },

         {"ih", "i", NULL },

-        {"iy", "i:", NULL },

+        {"ih0", "i", NULL },

+        {"ih1", "i", NULL },

+        {"iy0", "i", NULL },

+        {"iy1", "i:", NULL },

         {"jh", "J", NULL },

         {"k", "k", NULL },

         {"l", "l", NULL },

@@ -1094,25 +1095,29 @@

         {"n", "nB", NULL },

         {"nx", "nB", NULL },

         {"ng", "N", NULL },

-        {"ow", "o", NULL },

-        {"oy", "aU", "i" },

+        {"ow0", "o", NULL },

+        {"ow1", "o:", NULL },

+        {"oy0", "o", "j" },

+        {"oy1", "o:", "j" },

         {"p", "p", NULL },

         {"r", "9r", NULL },

         {"s", "s", NULL },

         {"sh", "c}", NULL },

         {"t", "tr", NULL },

-        {"th", "tBh", NULL },

-        {"uh", "u", NULL },

-        {"uw", "u:", NULL },

+        {"th", "tB", NULL },

+        {"uh0", "u", NULL },

+        {"uh1", "u", NULL },

+        {"uw0", "u", NULL },

+        {"uw1", "u:", NULL },

         {"v", "v", NULL },

         {"w", "v", NULL },

         {"y", "j", NULL },

-        {"z", "z", NULL },

+        {"z", "s", NULL },

         {"zh", "c}", NULL },

         {NULL, NULL, NULL }

};

 cst_val *map_english_to_indic_phones(const char *indic_variant,

                                      const cst_val *english_phones)

@@ -1125,11 +1130,39 @@

     for (v=english_phones; v; v=val_cdr(v))

         english_phone = cst_strdup(val_string(val_car(v)));

+        /* *** mapping table should be indic variant specific */

+        if(cst_streq(indic_variant, "tam"))

+        {

+        for (i=0; eng_to_tam_stress[i][0]; i++)

+        {

+            if (cst_streq(english_phone,eng_to_tam_stress[i][0]))

+            {

+                ip = cons_val(string_val(eng_to_tam_stress[i][1]),ip);

+                if (eng_to_tam_stress[i][2])

+                    ip = cons_val(string_val(eng_to_tam_stress[i][2]),ip);

+            }

+            /* if there is no mapping, we drop the phone */

+        }

+        }

+        else if(cst_streq(indic_variant, "kan"))

+        {

+        for (i=0; eng_to_kan_stress[i][0]; i++)

+        {

+            if (cst_streq(english_phone,eng_to_kan_stress[i][0]))

+            {

+                ip = cons_val(string_val(eng_to_kan_stress[i][1]),ip);

+                if (eng_to_kan_stress[i][2])

+                    ip = cons_val(string_val(eng_to_kan_stress[i][2]),ip);

+            }

+            /* if there is no mapping, we drop the phone */

+        }

+        }

+        else

+        {

         if ((english_phone[cst_strlen(english_phone)-1] == '0') ||

             (english_phone[cst_strlen(english_phone)-1] == '1'))

             /* It has a stress value on it */

             english_phone[cst_strlen(english_phone)-1] = '\0';

-        /* *** mapping table should be indic variant specific */

         for (i=0; eng_to_indic[i][0]; i++)

             if (cst_streq(english_phone,eng_to_indic[i][0]))

@@ -1140,6 +1173,7 @@

             /* if there is no mapping, we drop the phone */

+        }

         cst_free(english_phone);

     ip = val_reverse(ip);

@@ -1182,6 +1216,8 @@

     return rphones;

+/* TODO */

 static cst_val *cmu_indic_hindi_schwa_fixes(cst_val *phones)

     cst_val *dd;

@@ -1240,7 +1276,7 @@

       cmu_indic_variant_deletes_word_final_schwa = 0;

     } else {

       cmu_indic_variant_deletes_word_final_schwa = 0;

-      printf("Unknown indic variant: %s\n", indic_variant);

+      printf("Unknown indic variant: %s!\n", indic_variant);

     if (cst_regex_match(cst_rx_not_indic,word))

@@ -1249,19 +1285,12 @@

         /* printf("awb_debug cmu_indic_lex: English >%s<\n",word); */

         english_phones = lex_lookup(&cmu_lex,word,pos,feats);

-        eng_bilingual_flag = get_param_string(feats, "eng_shared", "0");

+        eng_bilingual_flag = get_param_string(feats, "eng_shared", "none");

-        if (cst_streq(eng_bilingual_flag, "1"))

-        {

-            base_phones = english_phones;

-        }

-        else

-        {

-            base_phones =

-                map_english_to_indic_phones(indic_variant,english_phones);

-            delete_val(english_phones);

-        }

+        if (cst_streq(eng_bilingual_flag, "1"))

+            return english_phones; /* "eng_shared" is set: hand back the English phones unmapped */

+        base_phones = map_english_to_indic_phones(indic_variant,english_phones);

+        delete_val(english_phones); /* only the mapped list is returned, so free the lookup result (otherwise it leaks) */

         return base_phones;

     else

@@ -1289,7 +1318,7 @@

        printf("Tamil doesnt have anuswara");

     else

-    cmu_indic_lex_nasal_postfixes(base_phones);

+    cmu_indic_lex_nasal_postfixes(base_phones, feats);

     base_phones = cmu_indic_lex_jnyan_replacement(base_phones,feats);

     /* Postfix Indic Nasals, Voicing, Medial Schwa deletion */

@@ -1314,7 +1343,10 @@

     if (cst_streq(indic_variant,"kan"))

-      cmu_indic_lex_kannada_spelling_postfixes(base_phones);

+      cmu_indic_lex_kannada_spelling_postfixes(base_phones);

+    //if (cst_streq(indic_variant,"san"))

+    //    base_phones=val_reverse(delete_medial_schwa(val_reverse(base_phones)));

     if ((cst_streq(indic_variant,"hin")) || (cst_streq(indic_variant,"mar")) ||

--

⑨