shithub: flite

Download patch

ref: 0d060442250e7966087cc4e1678cf46ea5978671
parent: 1a65079b2b8d199e5a49564c82f396186697467d
author: Shyam Krishna <krishnshyam@gmail.com>
date: Fri Apr 20 10:52:50 EDT 2018

Add Sanskrit number support

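Add (partial) support for Sanskrit numerals via a new number table
(indic_san_num_table.h) that is selected for the "san" variant.

This commit also adds a Marathi-specific hundreds word in indic_number(),
defines ts_utf8_sequence_length() in cmu_indic_lang.c, re-enables
indic_nump_old(), drops the explicit "nep" number-table mapping, and
removes the is_english feature function and its registration.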

--- a/lang/cmu_indic_lang/Makefile
+++ b/lang/cmu_indic_lang/Makefile
@@ -46,6 +46,7 @@
     indic_guj_num_table.h \
     indic_tam_num_table.h \
     indic_tel_num_table.h \
+    indic_san_num_table.h \
     indic_pan_num_table.h
 SRCS = cmu_indic_lang.c cmu_indic_phoneset.c cmu_indic_phrasing_cart.c
 SCRIPTS = 
--- a/lang/cmu_indic_lang/cmu_indic_lang.c
+++ b/lang/cmu_indic_lang/cmu_indic_lang.c
@@ -111,21 +111,25 @@
 #include "indic_guj_num_table.h"
 #include "indic_kan_num_table.h"
 #include "indic_mar_num_table.h"
+#include "indic_san_num_table.h"
 #include "indic_tel_num_table.h"
 #include "indic_tam_num_table.h"
 #include "indic_pan_num_table.h"
 
 
-int ts_utf8_sequence_length(char c0);
-// inline int utf8_sequence_length(char c0)
-// {
-    // Get the expected length of UTF8 sequence given its most
-    // significant byte
-//    return (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1;
-// }
+#ifdef _WIN32
+__inline int ts_utf8_sequence_length(char c0)
+#else
+int ts_utf8_sequence_length(char c0)
+#endif
+{
+	/* Byte length (1-4) implied by UTF-8 lead byte c0: 0xE5000000 packs */
+	/* (length-1) as 2 bits per high nibble; the cast keeps the shift unsigned */
+	return (int)((0xE5000000u >> (((unsigned char)c0 >> 3) & 0x1E)) & 3u) + 1;
+}
 
 
-int ts_utf8_sequence_length(char c0);
+// int ts_utf8_sequence_length(char c0);
 // inline int utf8_sequence_length(char c0)
 // {
     // Get the expected length of UTF8 sequence given its most
@@ -237,6 +241,10 @@
     return r;
 }
 
+static cst_val *indic_number_lang(const indic_num_table *num_table)
+{   /* Wrap num_table->lang via string_val(); NOTE(review): result looks caller-owned -- confirm */
+    return string_val(num_table->lang);
+}
 static cst_val *indic_number_hundred(const indic_num_table *num_table)
 {
     return string_val(num_table->hundred);
@@ -301,9 +309,15 @@
     }
     else if (val_length(number) == 3)
     {
-        r = val_append(indic_number_digit(val_string(val_car(number)),num_table),
-                 cons_val(indic_number_hundred(num_table),
-                          indic_number(val_cdr(number),num_table)));
+        if ((!cst_streq(num_table->lang,"mar")) || /* compare in place: no temporary cst_val to leak */
+            indic_digit_to_offset(val_string(val_car(val_cdr(number)))) ||
+            indic_digit_to_offset(val_string(val_car(val_cdr(val_cdr(number))))))
+            r = val_append(indic_number_digit(val_string(val_car(number)),num_table),
+                    cons_val(indic_number_hundred(num_table),
+                            indic_number(val_cdr(number),num_table)));
+        else /* "mar" with both trailing digits at offset 0: use the word below instead of the table's hundred */
+            r = val_append(indic_number_digit(val_string(val_car(number)),num_table),
+                    cons_val(string_val("शंभर"), indic_number(val_cdr(number),num_table)));
     }
     else if (val_length(number) == 4)
     {
@@ -373,7 +387,6 @@
     return r;
 }
 
-#if 0
 static int indic_nump_old(const char *number)
 {
     /* True if all (unicode) characters are in num_table's digit table */
@@ -407,8 +420,8 @@
     return flag;
 
 }
-#endif
 
+
 static int indic_nump(const char *number)
 {
     /* Check if non-empty string */
@@ -516,7 +529,6 @@
     cst_utterance *utt;
 
     /* printf("awb_debug token_name %s name %s\n",item_name(token),name); */
-    r = NULL;
 
     if (item_feat_present(token,"phones"))
 	return cons_val(string_val(name),NULL);
@@ -538,10 +550,6 @@
         num_table = &kan_num_table;
     else if (cst_streq(variant,"mar"))
         num_table = &mar_num_table;
-    else if (cst_streq(variant,"nep"))
-        num_table = &hin_num_table;
-    else if (cst_streq(variant,"san"))
-        num_table = &hin_num_table;
     else if (cst_streq(variant,"tel"))
         num_table = &tel_num_table;
     else if (cst_streq(variant,"tam"))
@@ -548,6 +556,8 @@
         num_table = &tam_num_table;    
     else if (cst_streq(variant, "pan"))
         num_table = &pan_num_table;
+    else if (cst_streq(variant, "san"))
+        num_table = &san_num_table;
     else
         num_table = &eng_num_table;
 
@@ -560,7 +570,7 @@
         p = indic_num_normalize(name,num_table);
         if (val_length(p) <= 9)
             /* Long strings of digits are read as strings of digits */
-            r = indic_number(p,num_table);
+            r = indic_number(p, num_table);
 	else
             r = indic_number_indiv(p,num_table);
         delete_val(p);
@@ -568,58 +578,58 @@
     else if (indic_nump(name))
             
     {   /* Its script specific digits (commas/dots) */
-        if (indic_nump(name) == 2)
-        {   /* All characters are digits */ 
-            // printf("nump is 2\n");
-            p = indic_num_normalize(name,num_table);
-            if (val_length(p) <= 9)
-                r = indic_number(p,num_table);
-            else
-                r = indic_number_indiv(p,num_table);
-            delete_val(p);
-        }
-        else if (indic_nump(name) == 1)
-        {   /* Some characters are digits */
-            int len = 1;
-            int i = 0;
-            char c0;
-            char *aaa;
-            char *bbb;
-            while(name[i] != '\0')
-            {
-                /* Iterate over UTF-8 string */
-                c0 = name[i];
-                len = ts_utf8_sequence_length(c0);
-                /* Check if char after this is comma */
-                if (name[i+len] == ',')
-                {   
-                    /* Skip commas */
-                    i += len;
-                    c0 = name[i];
-                    len = ts_utf8_sequence_length(c0);
-                    i += len;
-                    continue;
-                }
-                /* Find where character type switches to or from digits */
-                if(indic_text_splitable(name, i, len))
-                    break;
-                i +=len;
-            }
-            aaa = cst_strdup(name);
-            aaa[i+len] = '\0';
-            bbb = cst_strdup(&name[i+len]);
-            r = val_append(cmu_indic_tokentowords_one(token, aaa),
-                           cmu_indic_tokentowords_one(token, bbb));
-            cst_free(aaa);
-            cst_free(bbb);
-        }
+	    if (indic_nump(name) == 2)
+	    {   /* All characters are digits */ 
+           // printf("nump is 2\n");
+	        p = indic_num_normalize(name,num_table);
+	        if (val_length(p) <= 9)
+		    r = indic_number(p, num_table);
+	        else
+		    r = indic_number_indiv(p,num_table);
+	        delete_val(p);
+	    }
+	    else if (indic_nump(name) == 1)
+	    {   /* Some characters are digits */
+	        int len = 1;
+	        int i = 0;
+	        char c0;
+                char *aaa;
+                char *bbb;
+	        while(name[i] != '\0')
+	        {
+		        /* Iterate over UTF-8 string */
+		        c0 = name[i];
+		        len = ts_utf8_sequence_length(c0);
+                        /* Check if char after this is comma */
+                        if (name[i+len] == ',')
+                        {   
+                          /* Skip commas */
+                        i += len;
+                        c0 = name[i];
+                        len = ts_utf8_sequence_length(c0);
+                        i += len;
+                        continue;
+                        }
+		        /* Find where character type switches to or from digits */
+		        if(indic_text_splitable(name, i, len))
+		            break;
+		        i +=len;
+	        }
+	        aaa = cst_strdup(name);
+	        aaa[i+len] = '\0';
+	        bbb = cst_strdup(&name[i+len]);
+	        r = val_append(cmu_indic_tokentowords_one(token, aaa),
+			        cmu_indic_tokentowords_one(token, bbb));
+	        cst_free(aaa);
+	        cst_free(bbb);
+	    }
     }
     else if (indic_hyphenated(name))
     {	/* For numbers seeparated by - / , */
-        char *aaa;
-        aaa = cst_strdup(&name[1]);
-        r = cmu_indic_tokentowords_one(token, aaa);
-        cst_free(aaa);
+            char *aaa;
+	    aaa = cst_strdup(&name[1]);
+	    r = cmu_indic_tokentowords_one(token, aaa);
+	    cst_free(aaa);
     }
 
     else if (cst_regex_match(cst_rx_not_indic,name))
@@ -657,18 +667,6 @@
     return FALSE;
 }
 
-DEF_STATIC_CONST_VAL_STRING(val_string_zero,"0");
-DEF_STATIC_CONST_VAL_STRING(val_string_one,"1");
-
-const cst_val *is_english(const cst_item *p)
-{
-    if (p && cst_regex_match(cst_rx_not_indic,
-                             flite_ffeature_string(p,"name")))
-        return (cst_val *)&val_string_one;
-    else
-        return (cst_val *)&val_string_zero;
-}
-
 void cmu_indic_lang_init(cst_voice *v)
 {
     /* Set indic language stuff */
@@ -708,9 +706,6 @@
 
     /* Default ffunctions (required) */
     basic_ff_register(v->ffunctions);
-
-    /* Indic specific features */
-    ff_register(v->ffunctions, "lisp_is_english", is_english);
 
     return;
 }
--- /dev/null
+++ b/lang/cmu_indic_lang/indic_san_num_table.h
@@ -1,0 +1,172 @@
+/*************************************************************************/
+/*                                                                       */
+/*                  Language Technologies Institute                      */
+/*                     Carnegie Mellon University                        */
+/*                         Copyright (c) 2015                            */
+/*                        All Rights Reserved.                           */
+/*                                                                       */
+/*  Permission is hereby granted, free of charge, to use and distribute  */
+/*  this software and its documentation without restriction, including   */
+/*  without limitation the rights to use, copy, modify, merge, publish,  */
+/*  distribute, sublicense, and/or sell copies of this work, and to      */
+/*  permit persons to whom this work is furnished to do so, subject to   */
+/*  the following conditions:                                            */
+/*   1. The code must retain the above copyright notice, this list of    */
+/*      conditions and the following disclaimer.                         */
+/*   2. Any modifications must be clearly marked as such.                */
+/*   3. Original authors' names are not deleted.                         */
+/*   4. The authors' names are not used to endorse or promote products   */
+/*      derived from this software without specific prior written        */
+/*      permission.                                                      */
+/*                                                                       */
+/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
+/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
+/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
+/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
+/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
+/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
+/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
+/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
+/*  THIS SOFTWARE.                                                       */
+/*                                                                       */
+/*************************************************************************/
+/*  Number pronunciation for (Sanskrit) Indic                             */
+/*************************************************************************/
+
+#ifndef _indic_san_num_table_h_
+#define _indic_san_num_table_h_
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include "indic_num_table.h"
+
+static const char * const indic_san_digit[11][2] =
+{   /* Each row pairs a Devanagari digit glyph with a Sanskrit word for it */
+    { "०", "शून्य" },   /* 0 */
+    { "१", "एकं" },   /* 1 */
+    { "२", "द्वि" },   /* 2 */
+    { "३", "त्रि" },   /* 3 */
+    { "४", "चतुर्" },   /* 4 */
+    { "५", "पञ्च" },   /* 5 */
+    { "६", "षट्" },   /* 6 */
+    { "७", "सप्त" },   /* 7 */
+    { "८", "अष्ट" },   /* 8 */
+    { "९", "नव" },   /* 9 */
+    { NULL, NULL },   /* all-NULL last row; presumably the end marker -- TODO confirm */
+};
+
+static const char * const indic_san_two_digit[101][4] =
+{   /* Rows: tens digit, units digit, Sanskrit word for that number, NULL (4th slot unused here) */
+    { "१", "०", "दश", NULL },
+    { "१", "१", "एकादश", NULL },
+    { "१", "२", "द्वादश", NULL },
+    { "१", "३", "त्रयोदश", NULL },
+    { "१", "४", "चतुर्दश", NULL },
+    { "१", "५", "पञ्चदश", NULL },
+    { "१", "६", "षोडश", NULL },
+    { "१", "७", "सप्तदश", NULL },
+    { "१", "८", "अष्टादश", NULL },
+    { "१", "९", "एकोनविंशतिः", NULL },
+    { "२", "०", "विंशतिः", NULL },
+    { "२", "१", "एकविंशतिः", NULL },
+    { "२", "२", "द्वाविंशतिः", NULL },
+    { "२", "३", "त्रयोविंशतिः", NULL },
+    { "२", "४", "चतुर्विंशतिः", NULL },
+    { "२", "५", "पञ्चविंशतिः", NULL },
+    { "२", "६", "षड्विंशतिः", NULL },
+    { "२", "७", "सप्तविंशतिः", NULL },
+    { "२", "८", "अष्टाविंशतिः", NULL },
+    { "२", "९", "एकोनत्रिंशत्", NULL },
+    { "३", "०", "त्रिंशत्", NULL },
+    { "३", "१", "एकत्रिंशत्", NULL },
+    { "३", "२", "द्वात्रिंशत्", NULL },
+    { "३", "३", "त्रयस्त्रिंशत्", NULL },
+    { "३", "४", "चतुस्त्रिंशत्", NULL },
+    { "३", "५", "पञ्चत्रिंशत्", NULL },
+    { "३", "६", "षट्त्रिंशत्", NULL },
+    { "३", "७", "सप्तत्रिंशत्", NULL },
+    { "३", "८", "अष्टात्रिंशत्", NULL },
+    { "३", "९", "एकोनचत्वारिंशत्", NULL },
+    { "४", "०", "चत्वारिंशत्", NULL },
+    { "४", "१", "एकचत्वारिंशत्", NULL },
+    { "४", "२", "द्विचत्वारिंशत्", NULL },
+    { "४", "३", "त्रिचत्वारिंशत्", NULL },
+    { "४", "४", "चतुश्चत्वारिंशत्", NULL },
+    { "४", "५", "पञ्चचत्वारिंशत्", NULL },
+    { "४", "६", "षट्चत्वारिंशत्", NULL },
+    { "४", "७", "सप्तचत्वारिंशत्", NULL },
+    { "४", "८", "अष्टचत्वारिंशत्", NULL },
+    { "४", "९", "एकोनपञ्चाशत्", NULL },
+    { "५", "०", "पञ्चाशत्", NULL },
+    { "५", "१", "एकपञ्चाशत्", NULL },
+    { "५", "२", "द्विपञ्चाशत्", NULL },
+    { "५", "३", "त्रिपञ्चाशत्", NULL },
+    { "५", "४", "चतुःपञ्चाशत्", NULL },
+    { "५", "५", "पञ्चपञ्चाशत्", NULL },
+    { "५", "६", "षट्पञ्चाशत्", NULL },
+    { "५", "७", "सप्तपञ्चाशत्", NULL },
+    { "५", "८", "अष्टपञ्चाशत्", NULL },
+    { "५", "९", "एकोनषष्टिः", NULL },
+    { "६", "०", "षष्टिः", NULL },
+    { "६", "१", "एकषष्टिः", NULL },
+    { "६", "२", "द्विषष्टिः", NULL },
+    { "६", "३", "त्रिषष्टिः", NULL },
+    { "६", "४", "चतुष्षष्टिः", NULL },
+    { "६", "५", "पञ्चषष्टिः", NULL },
+    { "६", "६", "षट्षष्टिः", NULL },
+    { "६", "७", "सप्तषष्टिः", NULL },
+    { "६", "८", "अष्टषष्टिः", NULL },
+    { "६", "९", "एकोनसप्ततिः", NULL },
+    { "७", "०", "सप्ततिः", NULL },
+    { "७", "१", "एकसप्ततिः", NULL },
+    { "७", "२", "द्विसप्ततिः", NULL },
+    { "७", "३", "त्रिसप्ततिः", NULL },
+    { "७", "४", "चतुस्सप्ततिः", NULL },
+    { "७", "५", "पञ्चसप्ततिः", NULL },
+    { "७", "६", "षट्सप्ततिः", NULL },
+    { "७", "७", "सप्तसप्ततिः", NULL },
+    { "७", "८", "अष्टसप्ततिः", NULL },
+    { "७", "९", "एकोनाशीतिः", NULL },
+    { "८", "०", "अशीतिः", NULL },
+    { "८", "१", "एकाशीतिः", NULL },
+    { "८", "२", "द्व्यशीतिः", NULL },
+    { "८", "३", "त्र्यशीतिः", NULL },
+    { "८", "४", "चतुरशीतिः", NULL },
+    { "८", "५", "पञ्चाशीतिः", NULL },
+    { "८", "६", "षडशीतिः", NULL },
+    { "८", "७", "सप्ताशीतिः", NULL },
+    { "८", "८", "अष्टाशीतिः", NULL },
+    { "८", "९", "एकोननवतिः", NULL },
+    { "९", "०", "नवतिः", NULL },
+    { "९", "१", "एकनवतिः", NULL },
+    { "९", "२", "द्विनवतिः", NULL },
+    { "९", "३", "त्रिनवतिः", NULL },
+    { "९", "४", "चतुर्नवतिः", NULL },
+    { "९", "५", "पञ्चनवतिः", NULL },
+    { "९", "६", "षण्णवतिः", NULL },
+    { "९", "७", "सप्तनवतिः", NULL },
+    { "९", "८", "अष्टनवतिः", NULL },
+    { "९", "९", "एकोनशतम्", NULL },
+    { NULL, NULL, NULL, NULL },   /* all-NULL row after 99; presumably the end marker -- TODO confirm */
+};
+
+static const indic_num_table san_num_table = {   /* storage class first (C11 6.11.5) */
+    "san",                /* language code; presumably the lang member -- TODO confirm field order */
+    &indic_san_digit,     /* single-digit words */
+    &indic_san_two_digit, /* words for 10 through 99 */
+    "शतम्",   /* hundred */
+    "सहस्र", /* thousand */
+    "लक्ष",  /* lakh */
+    "कोटि", /* crore */
+};
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif
+
+