ref: 0d060442250e7966087cc4e1678cf46ea5978671
parent: 1a65079b2b8d199e5a49564c82f396186697467d
author: Shyam Krishna <krishnshyam@gmail.com>
date: Fri Apr 20 10:52:50 EDT 2018
Add Sanskrit number support. Add (partial) support for Sanskrit numerals.
--- a/lang/cmu_indic_lang/Makefile
+++ b/lang/cmu_indic_lang/Makefile
@@ -46,6 +46,7 @@
indic_guj_num_table.h \
indic_tam_num_table.h \
indic_tel_num_table.h \
+ indic_san_num_table.h \
indic_pan_num_table.h
SRCS = cmu_indic_lang.c cmu_indic_phoneset.c cmu_indic_phrasing_cart.c
SCRIPTS =
--- a/lang/cmu_indic_lang/cmu_indic_lang.c
+++ b/lang/cmu_indic_lang/cmu_indic_lang.c
@@ -111,21 +111,25 @@
#include "indic_guj_num_table.h"
#include "indic_kan_num_table.h"
#include "indic_mar_num_table.h"
+#include "indic_san_num_table.h"
#include "indic_tel_num_table.h"
#include "indic_tam_num_table.h"
#include "indic_pan_num_table.h"
-int ts_utf8_sequence_length(char c0);
-// inline int utf8_sequence_length(char c0)
-// {
- // Get the expected length of UTF8 sequence given its most
- // significant byte
-// return (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1;
-// }
+#ifdef _WIN32
+__inline int ts_utf8_sequence_length(char c0)
+#else
+int ts_utf8_sequence_length(char c0)
+#endif
+{
+ /* Return the expected byte length (1-4) of the UTF-8 sequence whose first byte is c0. */
+ /* 0xE5000000 packs (length - 1) in 2-bit fields picked by c0's top four bits; the unsigned char cast keeps the shift off a negative char. */
+ return ((0xE5000000u >> ((((unsigned char)c0) >> 3) & 0x1E)) & 3) + 1;
+}
-int ts_utf8_sequence_length(char c0);
+// int ts_utf8_sequence_length(char c0);
// inline int utf8_sequence_length(char c0)
// {
// Get the expected length of UTF8 sequence given its most
@@ -237,6 +241,10 @@
return r;
}
+static cst_val *indic_number_lang(const indic_num_table *num_table)
+{ /* Returns the table's lang string wrapped by string_val(). NOTE(review): presumably a new val the caller must delete - confirm. */
+ return string_val(num_table->lang);
+}
static cst_val *indic_number_hundred(const indic_num_table *num_table)
{
return string_val(num_table->hundred);
@@ -301,9 +309,15 @@
}
else if (val_length(number) == 3)
{
- r = val_append(indic_number_digit(val_string(val_car(number)),num_table),
- cons_val(indic_number_hundred(num_table),
- indic_number(val_cdr(number),num_table)));
+ if ((!cst_streq(num_table->lang,"mar")) || /* lang read in place: no temporary val is built, so nothing is leaked */
+ indic_digit_to_offset(val_string(val_car(val_cdr(number)))) ||
+ indic_digit_to_offset(val_string(val_car(val_cdr(val_cdr(number))))))
+ r = val_append(indic_number_digit(val_string(val_car(number)),num_table),
+ cons_val(indic_number_hundred(num_table),
+ indic_number(val_cdr(number),num_table)));
+ else /* "mar" table and both trailing digit offsets are 0: say the hundred as "शंभर" */
+ r = val_append(indic_number_digit(val_string(val_car(number)),num_table),
+ cons_val(string_val("शंभर"), indic_number(val_cdr(number),num_table)));
}
else if (val_length(number) == 4)
{
@@ -373,7 +387,6 @@
return r;
}
-#if 0
static int indic_nump_old(const char *number)
{
/* True if all (unicode) characters are in num_table's digit table */
@@ -407,8 +420,8 @@
return flag;
}
-#endif
+
static int indic_nump(const char *number)
{
/* Check if non-empty string */
@@ -516,7 +529,6 @@
cst_utterance *utt;
/* printf("awb_debug token_name %s name %s\n",item_name(token),name); */
- r = NULL;
if (item_feat_present(token,"phones"))
return cons_val(string_val(name),NULL);
@@ -538,10 +550,6 @@
num_table = &kan_num_table;
else if (cst_streq(variant,"mar"))
num_table = &mar_num_table;
- else if (cst_streq(variant,"nep"))
- num_table = &hin_num_table;
- else if (cst_streq(variant,"san"))
- num_table = &hin_num_table;
else if (cst_streq(variant,"tel"))
num_table = &tel_num_table;
else if (cst_streq(variant,"tam"))
@@ -548,6 +556,8 @@
num_table = &tam_num_table;
else if (cst_streq(variant, "pan"))
num_table = &pan_num_table;
+ else if (cst_streq(variant, "san"))
+ num_table = &san_num_table;
else
num_table = &eng_num_table;
@@ -560,7 +570,7 @@
p = indic_num_normalize(name,num_table);
if (val_length(p) <= 9)
/* Long strings of digits are read as strings of digits */
- r = indic_number(p,num_table);
+ r = indic_number(p, num_table);
else
r = indic_number_indiv(p,num_table);
delete_val(p);
@@ -568,58 +578,58 @@
else if (indic_nump(name))
{ /* Its script specific digits (commas/dots) */
- if (indic_nump(name) == 2)
- { /* All characters are digits */
- // printf("nump is 2\n");
- p = indic_num_normalize(name,num_table);
- if (val_length(p) <= 9)
- r = indic_number(p,num_table);
- else
- r = indic_number_indiv(p,num_table);
- delete_val(p);
- }
- else if (indic_nump(name) == 1)
- { /* Some characters are digits */
- int len = 1;
- int i = 0;
- char c0;
- char *aaa;
- char *bbb;
- while(name[i] != '\0')
- {
- /* Iterate over UTF-8 string */
- c0 = name[i];
- len = ts_utf8_sequence_length(c0);
- /* Check if char after this is comma */
- if (name[i+len] == ',')
- {
- /* Skip commas */
- i += len;
- c0 = name[i];
- len = ts_utf8_sequence_length(c0);
- i += len;
- continue;
- }
- /* Find where character type switches to or from digits */
- if(indic_text_splitable(name, i, len))
- break;
- i +=len;
- }
- aaa = cst_strdup(name);
- aaa[i+len] = '\0';
- bbb = cst_strdup(&name[i+len]);
- r = val_append(cmu_indic_tokentowords_one(token, aaa),
- cmu_indic_tokentowords_one(token, bbb));
- cst_free(aaa);
- cst_free(bbb);
- }
+ if (indic_nump(name) == 2)
+ { /* All characters are digits */
+ // printf("nump is 2\n");
+ p = indic_num_normalize(name,num_table);
+ if (val_length(p) <= 9)
+ r = indic_number(p, num_table);
+ else
+ r = indic_number_indiv(p,num_table);
+ delete_val(p);
+ }
+ else if (indic_nump(name) == 1)
+ { /* Some characters are digits */
+ int len = 1;
+ int i = 0;
+ char c0;
+ char *aaa;
+ char *bbb;
+ while(name[i] != '\0')
+ {
+ /* Iterate over UTF-8 string */
+ c0 = name[i];
+ len = ts_utf8_sequence_length(c0);
+ /* Check if char after this is comma */
+ if (name[i+len] == ',')
+ {
+ /* Skip commas */
+ i += len;
+ c0 = name[i];
+ len = ts_utf8_sequence_length(c0);
+ i += len;
+ continue;
+ }
+ /* Find where character type switches to or from digits */
+ if(indic_text_splitable(name, i, len))
+ break;
+ i +=len;
+ }
+ aaa = cst_strdup(name);
+ aaa[i+len] = '\0';
+ bbb = cst_strdup(&name[i+len]);
+ r = val_append(cmu_indic_tokentowords_one(token, aaa),
+ cmu_indic_tokentowords_one(token, bbb));
+ cst_free(aaa);
+ cst_free(bbb);
+ }
}
else if (indic_hyphenated(name))
{ /* For numbers seeparated by - / , */
- char *aaa;
- aaa = cst_strdup(&name[1]);
- r = cmu_indic_tokentowords_one(token, aaa);
- cst_free(aaa);
+ char *aaa;
+ aaa = cst_strdup(&name[1]);
+ r = cmu_indic_tokentowords_one(token, aaa);
+ cst_free(aaa);
}
else if (cst_regex_match(cst_rx_not_indic,name))
@@ -657,18 +667,6 @@
return FALSE;
}
-DEF_STATIC_CONST_VAL_STRING(val_string_zero,"0");
-DEF_STATIC_CONST_VAL_STRING(val_string_one,"1");
-
-const cst_val *is_english(const cst_item *p)
-{
- if (p && cst_regex_match(cst_rx_not_indic,
- flite_ffeature_string(p,"name")))
- return (cst_val *)&val_string_one;
- else
- return (cst_val *)&val_string_zero;
-}
-
void cmu_indic_lang_init(cst_voice *v)
{
/* Set indic language stuff */
@@ -708,9 +706,6 @@
/* Default ffunctions (required) */
basic_ff_register(v->ffunctions);
-
- /* Indic specific features */
- ff_register(v->ffunctions, "lisp_is_english", is_english);
return;
}
--- /dev/null
+++ b/lang/cmu_indic_lang/indic_san_num_table.h
@@ -1,0 +1,172 @@
+/*************************************************************************/
+/* */
+/* Language Technologies Institute */
+/* Carnegie Mellon University */
+/* Copyright (c) 2015 */
+/* All Rights Reserved. */
+/* */
+/* Permission is hereby granted, free of charge, to use and distribute */
+/* this software and its documentation without restriction, including */
+/* without limitation the rights to use, copy, modify, merge, publish, */
+/* distribute, sublicense, and/or sell copies of this work, and to */
+/* permit persons to whom this work is furnished to do so, subject to */
+/* the following conditions: */
+/* 1. The code must retain the above copyright notice, this list of */
+/* conditions and the following disclaimer. */
+/* 2. Any modifications must be clearly marked as such. */
+/* 3. Original authors' names are not deleted. */
+/* 4. The authors' names are not used to endorse or promote products */
+/* derived from this software without specific prior written */
+/* permission. */
+/* */
+/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
+/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
+/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
+/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
+/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
+/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
+/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
+/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
+/* THIS SOFTWARE. */
+/* */
+/*************************************************************************/
+/* Number pronunciation for (Sanskrit) Indic */
+/*************************************************************************/
+
+#ifndef _indic_san_num_table_h_
+#define _indic_san_num_table_h_
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include "indic_num_table.h"
+
+static const char * const indic_san_digit[11][2] =
+{ /* each row: Devanagari digit, its Sanskrit word; a NULL row ends the list */
+ { "०", "शून्य" },
+ { "१", "एकं" },
+ { "२", "द्वि" },
+ { "३", "त्रि" },
+ { "४", "चतुर्" },
+ { "५", "पञ्च" },
+ { "६", "षट्" },
+ { "७", "सप्त" },
+ { "८", "अष्ट" },
+ { "९", "नव" },
+ { NULL, NULL },
+};
+
+static const char * const indic_san_two_digit[101][4] =
+{ /* each row: tens digit, units digit, Sanskrit word for that number, NULL; a NULL row ends the list */
+ { "१", "०", "दश", NULL },
+ { "१", "१", "एकादश", NULL },
+ { "१", "२", "द्वादश", NULL },
+ { "१", "३", "त्रयोदश", NULL },
+ { "१", "४", "चतुर्दश", NULL },
+ { "१", "५", "पञ्चदश", NULL },
+ { "१", "६", "षोडश", NULL },
+ { "१", "७", "सप्तदश", NULL },
+ { "१", "८", "अष्टादश", NULL },
+ { "१", "९", "एकोनविंशतिः", NULL },
+ { "२", "०", "विंशतिः", NULL },
+ { "२", "१", "एकविंशतिः", NULL },
+ { "२", "२", "द्वाविंशतिः", NULL },
+ { "२", "३", "त्रयोविंशतिः", NULL },
+ { "२", "४", "चतुर्विंशतिः", NULL },
+ { "२", "५", "पञ्चविंशतिः", NULL },
+ { "२", "६", "षड्विंशतिः", NULL },
+ { "२", "७", "सप्तविंशतिः", NULL },
+ { "२", "८", "अष्टाविंशतिः", NULL },
+ { "२", "९", "एकोनत्रिंशत्", NULL },
+ { "३", "०", "त्रिंशत्", NULL },
+ { "३", "१", "एकत्रिंशत्", NULL },
+ { "३", "२", "द्वात्रिंशत्", NULL },
+ { "३", "३", "त्रयस्त्रिंशत्", NULL },
+ { "३", "४", "चतुस्त्रिंशत्", NULL },
+ { "३", "५", "पञ्चत्रिंशत्", NULL },
+ { "३", "६", "षट्त्रिंशत्", NULL },
+ { "३", "७", "सप्तत्रिंशत्", NULL },
+ { "३", "८", "अष्टात्रिंशत्", NULL },
+ { "३", "९", "एकोनचत्वारिंशत्", NULL },
+ { "४", "०", "चत्वारिंशत्", NULL },
+ { "४", "१", "एकचत्वारिंशत्", NULL },
+ { "४", "२", "द्विचत्वारिंशत्", NULL },
+ { "४", "३", "त्रिचत्वारिंशत्", NULL },
+ { "४", "४", "चतुश्चत्वारिंशत्", NULL },
+ { "४", "५", "पञ्चचत्वारिंशत्", NULL },
+ { "४", "६", "षट्चत्वारिंशत्", NULL },
+ { "४", "७", "सप्तचत्वारिंशत्", NULL },
+ { "४", "८", "अष्टचत्वारिंशत्", NULL },
+ { "४", "९", "एकोनपञ्चाशत्", NULL },
+ { "५", "०", "पञ्चाशत्", NULL },
+ { "५", "१", "एकपञ्चाशत्", NULL },
+ { "५", "२", "द्विपञ्चाशत्", NULL },
+ { "५", "३", "त्रिपञ्चाशत्", NULL },
+ { "५", "४", "चतुःपञ्चाशत्", NULL },
+ { "५", "५", "पञ्चपञ्चाशत्", NULL },
+ { "५", "६", "षट्पञ्चाशत्", NULL },
+ { "५", "७", "सप्तपञ्चाशत्", NULL },
+ { "५", "८", "अष्टपञ्चाशत्", NULL },
+ { "५", "९", "एकोनषष्टिः", NULL },
+ { "६", "०", "षष्टिः", NULL },
+ { "६", "१", "एकषष्टिः", NULL },
+ { "६", "२", "द्विषष्टिः", NULL },
+ { "६", "३", "त्रिषष्टिः", NULL },
+ { "६", "४", "चतुष्षष्टिः", NULL },
+ { "६", "५", "पञ्चषष्टिः", NULL },
+ { "६", "६", "षट्षष्टिः", NULL },
+ { "६", "७", "सप्तषष्टिः", NULL },
+ { "६", "८", "अष्टषष्टिः", NULL },
+ { "६", "९", "एकोनसप्ततिः", NULL },
+ { "७", "०", "सप्ततिः", NULL },
+ { "७", "१", "एकसप्ततिः", NULL },
+ { "७", "२", "द्विसप्ततिः", NULL },
+ { "७", "३", "त्रिसप्ततिः", NULL },
+ { "७", "४", "चतुस्सप्ततिः", NULL },
+ { "७", "५", "पञ्चसप्ततिः", NULL },
+ { "७", "६", "षट्सप्ततिः", NULL },
+ { "७", "७", "सप्तसप्ततिः", NULL },
+ { "७", "८", "अष्टसप्ततिः", NULL },
+ { "७", "९", "एकोनाशीतिः", NULL },
+ { "८", "०", "अशीतिः", NULL },
+ { "८", "१", "एकाशीतिः", NULL },
+ { "८", "२", "द्व्यशीतिः", NULL },
+ { "८", "३", "त्र्यशीतिः", NULL },
+ { "८", "४", "चतुरशीतिः", NULL },
+ { "८", "५", "पञ्चाशीतिः", NULL },
+ { "८", "६", "षडशीतिः", NULL },
+ { "८", "७", "सप्ताशीतिः", NULL },
+ { "८", "८", "अष्टाशीतिः", NULL },
+ { "८", "९", "एकोननवतिः", NULL },
+ { "९", "०", "नवतिः", NULL },
+ { "९", "१", "एकनवतिः", NULL },
+ { "९", "२", "द्विनवतिः", NULL },
+ { "९", "३", "त्रिनवतिः", NULL },
+ { "९", "४", "चतुर्नवतिः", NULL },
+ { "९", "५", "पञ्चनवतिः", NULL },
+ { "९", "६", "षण्णवतिः", NULL },
+ { "९", "७", "सप्तनवतिः", NULL },
+ { "९", "८", "अष्टनवतिः", NULL },
+ { "९", "९", "एकोनशतम्", NULL },
+ { NULL, NULL },
+};
+
+static const indic_num_table san_num_table = { /* Sanskrit number-word table */
+ "san", /* language code */
+ &indic_san_digit, /* single-digit words */
+ &indic_san_two_digit, /* words for 10 through 99 */
+ "शतम्", /* hundred */
+ "सहस्र", /* thousand */
+ "लक्ष", /* lakh */
+ "कोटि", /* crore */
+};
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif
+
+