Click here to Skip to main content
15,895,813 members
Articles / Mobile Apps / iPhone

FliteEngine - Objective-C speech synthesizer

Rate me:
Please Sign up or sign in to vote.
0.00/5 (No votes)
21 Jan 2012BSD1 min read 22.6K   419   10  
FliteEngine - An Objective-C speech synthesizer.
/*************************************************************************/
/*                                                                       */
/*                  Language Technologies Institute                      */
/*                     Carnegie Mellon University                        */
/*                         Copyright (c) 2001                            */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
/*               Date:  January 2001                                     */
/*************************************************************************/
/*                                                                       */
/*  US English text analysis functions                                   */
/*                                                                       */
/*************************************************************************/

#include <ctype.h>
#include "flite.h"
#include "usenglish.h"
#include "us_text.h"
#include "cst_regex.h"

/* Forward declarations for helpers that are defined further down in this
   file but used by us_tokentowords_one() before their definitions. */
static int text_splitable(const char *s,int i);
static cst_val *state_name(const char *name,cst_item *t);

/* Character classes for US English text, exported as plain C strings.
   Each string is the set of characters belonging to that class; the
   single-character-symbol set is empty.
   NOTE(review): presumably consumed by the tokenizer set-up code elsewhere;
   nothing in the visible part of this file reads them -- confirm. */
const char *us_english_punctuation = "\"'`.,:;!?(){}[]";
const char *us_english_prepunctuation = "\"'`({[";
const char *us_english_singlecharsymbols = "";
const char *us_english_whitespace = " \t\n\r";

/* Precompiled regex program for clock times.  us_tokentowords_one() tests a
   name against it and, on a match, splits the name at ':' into an hour part
   and a minute part.  Among the operand bytes, 48-57 are ASCII '0'-'9' and
   58 is ':'.
   NOTE(review): the byte layout is presumably the compiled form used by
   cst_regex; the source pattern is not recorded here, so do not edit the
   bytes by hand -- confirm against the regex compiler. */
static const unsigned char numbertime_rxprog[] = {
	156, 
	6, 0, 67, 1, 0, 3, 6, 0, 9, 4, 
	0, 9, 48, 49, 0, 6, 0, 3, 9, 0, 
	3, 4, 0, 14, 48, 49, 50, 51, 52, 53, 
	54, 55, 56, 57, 0, 8, 0, 5, 58, 0, 
	4, 0, 10, 48, 49, 50, 51, 52, 53, 0, 
	4, 0, 14, 48, 49, 50, 51, 52, 53, 54, 
	55, 56, 57, 0, 2, 0, 3, 0, 0, 0
};
/* Regex descriptor wrapping the program above.  The fifth initializer (71)
   equals the number of bytes in numbertime_rxprog; the last is the program
   itself, cast to plain char *. */
static const cst_regex numbertime_rx = {
	0,
	1,
	NULL,
	0,
	71,
	(char *) numbertime_rxprog
};

/* Precompiled regex program used by the telephone-number context test in
   us_tokentowords_one() (the "fourdigits" half of a 3-3-4 digit grouping).
   The program contains four consecutive character sets, each listing the
   bytes 48-57 (ASCII '0'-'9').
   NOTE(review): opaque compiled form -- do not edit the bytes by hand. */
static const unsigned char fourdigits_rxprog[] = {
	156, 
	6, 0, 65, 1, 0, 3, 4, 0, 14, 48, 
	49, 50, 51, 52, 53, 54, 55, 56, 57, 0, 
	4, 0, 14, 48, 49, 50, 51, 52, 53, 54, 
	55, 56, 57, 0, 4, 0, 14, 48, 49, 50, 
	51, 52, 53, 54, 55, 56, 57, 0, 4, 0, 
	14, 48, 49, 50, 51, 52, 53, 54, 55, 56, 
	57, 0, 2, 0, 3, 0, 0, 0
};
/* Regex descriptor; the fifth initializer (69) equals the byte length of
   fourdigits_rxprog. */
static const cst_regex fourdigits_rx = {
	0,
	1,
	NULL,
	0,
	69,
	(char *) fourdigits_rxprog
};

/* Precompiled regex program used by the telephone-number context test in
   us_tokentowords_one() (the three-digit groups of a 3-3-4 grouping).  The
   program contains three consecutive character sets, each listing the bytes
   48-57 (ASCII '0'-'9').
   NOTE(review): opaque compiled form -- do not edit the bytes by hand. */
static const unsigned char threedigits_rxprog[] = {
	156, 
	6, 0, 51, 1, 0, 3, 4, 0, 14, 48, 
	49, 50, 51, 52, 53, 54, 55, 56, 57, 0, 
	4, 0, 14, 48, 49, 50, 51, 52, 53, 54, 
	55, 56, 57, 0, 4, 0, 14, 48, 49, 50, 
	51, 52, 53, 54, 55, 56, 57, 0, 2, 0, 
	3, 0, 0, 0
};
/* Regex descriptor; the fifth initializer (55) equals the byte length of
   threedigits_rxprog. */
static const cst_regex threedigits_rx = {
	0,
	1,
	NULL,
	0,
	55,
	(char *) threedigits_rxprog
};

/* Precompiled regex program that us_tokentowords_one() uses to recognise
   Roman numerals.  The only literal letters in the program are bytes 73, 86
   and 88 (ASCII 'I', 'V' and 'X').
   NOTE(review): opaque compiled form -- do not edit the bytes by hand. */
static const unsigned char romannums_rxprog[] = {
	156, 
	6, 0, 137, 1, 0, 3, 21, 0, 3, 6, 
	0, 36, 8, 0, 5, 73, 0, 6, 0, 8, 
	8, 0, 8, 73, 0, 6, 0, 3, 9, 0, 
	3, 6, 0, 8, 8, 0, 8, 73, 0, 6, 
	0, 3, 9, 0, 89, 6, 0, 9, 8, 0, 
	83, 73, 86, 0, 6, 0, 50, 8, 0, 5, 
	86, 0, 6, 0, 8, 8, 0, 8, 73, 0, 
	6, 0, 3, 9, 0, 3, 6, 0, 8, 8, 
	0, 8, 73, 0, 6, 0, 3, 9, 0, 3, 
	6, 0, 8, 8, 0, 8, 73, 0, 6, 0, 
	3, 9, 0, 30, 6, 0, 9, 8, 0, 24, 
	73, 88, 0, 6, 0, 18, 8, 0, 5, 88, 
	0, 10, 0, 10, 4, 0, 0, 86, 73, 88, 
	0, 31, 0, 3, 2, 0, 3, 0, 0, 0
};
/* Regex descriptor; the fifth initializer (141) equals the byte length of
   romannums_rxprog. */
static const cst_regex romannums_rx = {
	0,
	1,
	NULL,
	0,
	141,
	(char *) romannums_rxprog
};

/* Precompiled regex program for digits '/' digits.  us_tokentowords_one()
   uses it to pick out names that "might be a fraction" and then splits the
   name at '/'.  Byte 47 is ASCII '/'; the leading character set on each
   side lists 49-57 ('1'-'9'), the following set lists 48-57 ('0'-'9').
   NOTE(review): opaque compiled form -- do not edit the bytes by hand. */
static const unsigned char digitsslashdigits_rxprog[] = {
	156, 
	6, 0, 74, 1, 0, 3, 4, 0, 13, 49, 
	50, 51, 52, 53, 54, 55, 56, 57, 0, 10, 
	0, 17, 4, 0, 0, 48, 49, 50, 51, 52, 
	53, 54, 55, 56, 57, 0, 8, 0, 5, 47, 
	0, 4, 0, 13, 49, 50, 51, 52, 53, 54, 
	55, 56, 57, 0, 10, 0, 17, 4, 0, 0, 
	48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 
	0, 2, 0, 3, 0, 0, 0
};
/* Regex descriptor; the fifth initializer (78) equals the byte length of
   digitsslashdigits_rxprog. */
static const cst_regex digitsslashdigits_rx = {
	0,
	1,
	NULL,
	0,
	78,
	(char *) digitsslashdigits_rxprog
};

/* Precompiled regex program for dotted abbreviations.  On a match,
   us_tokentowords_one() strips every '.' from the name and spells the
   remaining characters out as letters.  Byte 46 is ASCII '.'; the two
   character sets list 97-122 ('a'-'z') followed by 65-90 ('A'-'Z').
   NOTE(review): opaque compiled form -- do not edit the bytes by hand. */
static const unsigned char dottedabbrevs_rxprog[] = {
	156, 
	6, 0, 147, 1, 0, 3, 4, 0, 56, 97, 
	98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 
	108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 
	118, 119, 120, 121, 122, 65, 66, 67, 68, 69, 
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 
	90, 0, 21, 0, 3, 6, 0, 64, 8, 0, 
	5, 46, 0, 4, 0, 56, 97, 98, 99, 100, 
	101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 
	111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 
	121, 122, 65, 66, 67, 68, 69, 70, 71, 72, 
	73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 
	83, 84, 85, 86, 87, 88, 89, 90, 0, 31, 
	0, 3, 6, 0, 6, 7, 0, 73, 6, 0, 
	3, 9, 0, 3, 2, 0, 3, 0, 0, 0
};
/* Regex descriptor; the fifth initializer (151) equals the byte length of
   dottedabbrevs_rxprog. */
static const cst_regex dottedabbrevs_rx = {
	0,
	1,
	NULL,
	0,
	151,
	(char *) dottedabbrevs_rxprog
};

/* Precompiled regex program for US money amounts.  us_tokentowords_one()
   skips the first character of a matching name and looks for a '.' to
   separate dollars from cents.  Byte 36 is ASCII '$', 44 is ',', 46 is '.'
   and 48-57 are '0'-'9'.
   NOTE(review): opaque compiled form -- do not edit the bytes by hand. */
static const unsigned char usmoney_rxprog[] = {
	156, 
	6, 0, 72, 1, 0, 3, 8, 0, 5, 36, 
	0, 11, 0, 18, 4, 0, 0, 48, 49, 50, 
	51, 52, 53, 54, 55, 56, 57, 44, 0, 6, 
	0, 34, 21, 0, 3, 6, 0, 25, 8, 0, 
	5, 46, 0, 11, 0, 17, 4, 0, 0, 48, 
	49, 50, 51, 52, 53, 54, 55, 56, 57, 0, 
	31, 0, 6, 6, 0, 3, 9, 0, 3, 2, 
	0, 3, 0, 0, 0
};
/* Regex descriptor; the fifth initializer (76) equals the byte length of
   usmoney_rxprog. */
static const cst_regex usmoney_rx = {
	0,
	1,
	NULL,
	0,
	76,
	(char *) usmoney_rxprog
};

/* Precompiled regex program for dash-separated digit groups ("999-999-999
   etc" at the call site).  On a match, us_tokentowords_one() reads each
   group digit by digit with a break after every group.  Byte 45 is ASCII
   '-' and 48-57 are '0'-'9'.
   NOTE(review): opaque compiled form -- do not edit the bytes by hand. */
static const unsigned char digits2dash_rxprog[] = {
	156, 
	6, 0, 105, 1, 0, 3, 11, 0, 17, 4, 
	0, 0, 48, 49, 50, 51, 52, 53, 54, 55, 
	56, 57, 0, 21, 0, 3, 6, 0, 39, 8, 
	0, 5, 45, 0, 4, 0, 14, 48, 49, 50, 
	51, 52, 53, 54, 55, 56, 57, 0, 10, 0, 
	17, 4, 0, 0, 48, 49, 50, 51, 52, 53, 
	54, 55, 56, 57, 0, 31, 0, 3, 6, 0, 
	6, 7, 0, 48, 6, 0, 3, 9, 0, 3, 
	8, 0, 5, 45, 0, 11, 0, 17, 4, 0, 
	0, 48, 49, 50, 51, 52, 53, 54, 55, 56, 
	57, 0, 2, 0, 3, 0, 0, 0
};
/* Regex descriptor; the fifth initializer (109) equals the byte length of
   digits2dash_rxprog. */
static const cst_regex digits2dash_rx = {
	0,
	1,
	NULL,
	0,
	109,
	(char *) digits2dash_rxprog
};

/* Precompiled regex program for seven-digit telephone numbers ("234-3434"
   at the call site).  The program holds three digit sets, a literal byte 45
   (ASCII '-'), then four more digit sets; each set lists 48-57 ('0'-'9').
   NOTE(review): opaque compiled form -- do not edit the bytes by hand. */
static const unsigned char sevenphonenumber_rxprog[] = {
	156, 
	6, 0, 112, 1, 0, 3, 4, 0, 14, 48, 
	49, 50, 51, 52, 53, 54, 55, 56, 57, 0, 
	4, 0, 14, 48, 49, 50, 51, 52, 53, 54, 
	55, 56, 57, 0, 4, 0, 14, 48, 49, 50, 
	51, 52, 53, 54, 55, 56, 57, 0, 8, 0, 
	5, 45, 0, 4, 0, 14, 48, 49, 50, 51, 
	52, 53, 54, 55, 56, 57, 0, 4, 0, 14, 
	48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 
	0, 4, 0, 14, 48, 49, 50, 51, 52, 53, 
	54, 55, 56, 57, 0, 4, 0, 14, 48, 49, 
	50, 51, 52, 53, 54, 55, 56, 57, 0, 2, 
	0, 3, 0, 0, 0
};
/* Regex descriptor; the fifth initializer (116) equals the byte length of
   sevenphonenumber_rxprog. */
static const cst_regex sevenphonenumber_rx = {
	0,
	1,
	NULL,
	0,
	116,
	(char *) sevenphonenumber_rxprog
};

/* Precompiled regex program used by the money rules in
   us_tokentowords_one() to spot a following "...illion" word.  The literal
   bytes 105,108,108,105,111,110 spell "illion" in ASCII.
   NOTE(review): opaque compiled form -- do not edit the bytes by hand. */
static const unsigned char illion_rxprog[] = {
	156, 
	6, 0, 25, 1, 0, 3, 10, 0, 6, 3, 
	0, 0, 8, 0, 10, 105, 108, 108, 105, 111, 
	110, 0, 2, 0, 3, 0, 0, 0
};
/* Regex descriptor; the fifth initializer (29) equals the byte length of
   illion_rxprog. */
static const cst_regex illion_rx = {
	0,
	1,
	NULL,
	0,
	29,
	(char *) illion_rxprog
};

/* Precompiled regex program for the abbreviations that
   us_tokentowords_one() expands as street/saint or drive/doctor ("St
   Andrew's St, Dr King Dr" at the call site).  The character sets hold the
   ASCII pairs 'd'/'D' (100,68), 'R'/'r' (82,114), 'S'/'s' (83,115) and
   't'/'T' (116,84).
   NOTE(review): opaque compiled form -- do not edit the bytes by hand. */
static const unsigned char drst_rxprog[] = {
	156, 
	6, 0, 45, 1, 0, 3, 21, 0, 3, 6, 
	0, 15, 4, 0, 6, 100, 68, 0, 4, 0, 
	21, 82, 114, 0, 6, 0, 15, 4, 0, 6, 
	83, 115, 0, 4, 0, 6, 116, 84, 0, 31, 
	0, 3, 2, 0, 3, 0, 0, 0
};
/* Regex descriptor; the fifth initializer (49) equals the byte length of
   drst_rxprog. */
static const cst_regex drst_rx = {
	0,
	1,
	NULL,
	0,
	49,
	(char *) drst_rxprog
};

/* Precompiled regex program for digits followed by 's' ("60s and 7s and
   9s" at the call site).  On a match, us_tokentowords_one() drops the last
   character and appends the word "'s" to the expanded number.  Bytes 48-57
   are ASCII '0'-'9' and 115 is 's'.
   NOTE(review): opaque compiled form -- do not edit the bytes by hand. */
static const unsigned char numess_rxprog[] = {
	156, 
	6, 0, 31, 1, 0, 3, 11, 0, 17, 4, 
	0, 0, 48, 49, 50, 51, 52, 53, 54, 55, 
	56, 57, 0, 8, 0, 5, 115, 0, 2, 0, 
	3, 0, 0, 0
};
/* Regex descriptor; the fifth initializer (35) equals the byte length of
   numess_rxprog. */
static const cst_regex numess_rx = {
	0,
	1,
	NULL,
	0,
	35,
	(char *) numess_rxprog
};

/* Precompiled regex program for explicit ordinals.  On a match,
   us_tokentowords_one() removes the final two characters and expands the
   rest as an ordinal.  The literal two-byte suffixes in the program spell
   "th", "TH", "st", "ST", "nd", "ND", "rd" and "RD" in ASCII; the digit
   sets list 48-57 ('0'-'9'), the second one also including 44 (',').
   NOTE(review): opaque compiled form -- do not edit the bytes by hand. */
static const unsigned char ordinal_number_rxprog[] = {
	156, 
	6, 0, 119, 1, 0, 3, 4, 0, 14, 48, 
	49, 50, 51, 52, 53, 54, 55, 56, 57, 0, 
	10, 0, 18, 4, 0, 0, 48, 49, 50, 51, 
	52, 53, 54, 55, 56, 57, 44, 0, 21, 0, 
	3, 6, 0, 9, 8, 0, 69, 116, 104, 0, 
	6, 0, 9, 8, 0, 60, 84, 72, 0, 6, 
	0, 9, 8, 0, 51, 115, 116, 0, 6, 0, 
	9, 8, 0, 42, 83, 84, 0, 6, 0, 9, 
	8, 0, 33, 110, 100, 0, 6, 0, 9, 8, 
	0, 24, 78, 68, 0, 6, 0, 9, 8, 0, 
	15, 114, 100, 0, 6, 0, 9, 8, 0, 6, 
	82, 68, 0, 31, 0, 3, 2, 0, 3, 0, 
	0, 0
};
/* Regex descriptor; the fifth initializer (123) equals the byte length of
   ordinal_number_rxprog. */
static const cst_regex ordinal_number_rx = {
	0,
	1,
	NULL,
	0,
	123,
	(char *) ordinal_number_rxprog
};

/* Externally visible handles to the file-local precompiled regexes.  Each
   pointer is initialised to the address of the matching static *_rx
   descriptor; the token rules in this file match through these pointers
   (e.g. cst_regex_match(numbertime, name)).  The pointers themselves are
   not const-qualified, only the descriptors they point at. */
const cst_regex *numbertime = &numbertime_rx;
const cst_regex *fourdigits = &fourdigits_rx;
const cst_regex *threedigits = &threedigits_rx;
const cst_regex *romannums = &romannums_rx;
const cst_regex *digitsslashdigits = &digitsslashdigits_rx;
const cst_regex *dottedabbrevs = &dottedabbrevs_rx;
const cst_regex *usmoney = &usmoney_rx;
const cst_regex *digits2dash = &digits2dash_rx;
const cst_regex *sevenphonenumber = &sevenphonenumber_rx;
const cst_regex *illion = &illion_rx;
const cst_regex *drst = &drst_rx;
const cst_regex *numess = &numess_rx;
const cst_regex *ordinal_number = &ordinal_number_rx;

/* Module initialisation hook.  Intentionally empty: every regex in this
   file is a statically initialised table, so there is nothing to set up. */
void us_text_init()
{
}

/* Module tear-down hook.  Intentionally empty: this file allocates nothing
   at initialisation time, so there is nothing to release. */
void us_text_deinit()
{
}

static int rex_like(const cst_item *t)
{
    /* returns 1 if this is in a king like context */
    char *pn = cst_downcase(ffeature_string(t,"p.name"));
    char *ppn = cst_downcase(ffeature_string(t,"p.p.name"));
    int v = 0;
    
    if (cst_streq(pn,"louis") ||
	cst_streq(pn,"henry") ||
	cst_streq(pn,"charles") ||
	cst_streq(pn,"philip") ||
	cst_streq(pn,"george") ||
	cst_streq(pn,"edward") ||
	cst_streq(pn,"pius") ||
	cst_streq(pn,"william") ||
	cst_streq(pn,"richard") ||
	cst_streq(pn,"ptolemy") ||
	cst_streq(pn,"john") ||
	cst_streq(pn,"paul") ||
	cst_streq(pn,"peter") ||
	cst_streq(pn,"nicholas") ||
	cst_streq(pn,"frederick") ||
	cst_streq(pn,"james") ||
	cst_streq(pn,"alfonso") ||
	cst_streq(pn,"ivan") ||
	cst_streq(pn,"napolean") ||
	cst_streq(pn,"leo") ||
	cst_streq(pn,"gregory") ||
	cst_streq(pn,"catherine") ||
	cst_streq(pn,"alexandria") ||
	cst_streq(pn,"pierre") ||
	cst_streq(pn,"elizabeth") ||
	cst_streq(pn,"mary"))
	v = 1;
    else if (cst_streq(ppn,"king") ||
	     cst_streq(ppn,"queen") ||
	     cst_streq(ppn,"pope") ||
	     cst_streq(ppn,"duke") ||
	     cst_streq(ppn,"tsar") ||
	     cst_streq(ppn,"emperor") ||
	     cst_streq(ppn,"shah") ||
	     cst_streq(ppn,"ceasar") ||
	     cst_streq(ppn,"duchess") ||
	     cst_streq(ppn,"tsarina") ||
	     cst_streq(ppn,"empress") ||
	     cst_streq(ppn,"baron") ||
	     cst_streq(ppn,"baroness") ||
	     cst_streq(ppn,"sultan") ||
	     cst_streq(ppn,"count") ||
	     cst_streq(ppn,"countess"))
	v = 1;

    cst_free(pn);
    cst_free(ppn);
    return v;
}

static int section_like(const cst_item *t)
{
    /* returns 1 if this is in a king like context */
    char *pn = cst_downcase(ffeature_string(t,"p.name"));
    int v = 0;

    if (cst_streq(pn,"section") ||
	cst_streq(pn,"chapter") ||
	cst_streq(pn,"part") ||
	cst_streq(pn,"phrase") ||
	cst_streq(pn,"verse") ||
	cst_streq(pn,"scene") ||
	cst_streq(pn,"act") ||
	cst_streq(pn,"book") ||
	cst_streq(pn,"volume") ||
	cst_streq(pn,"chap") ||
	cst_streq(pn,"war") ||
	cst_streq(pn,"apollo") ||
	cst_streq(pn,"trek") ||
	cst_streq(pn,"fortran"))
	v = 1;
    
    cst_free(pn);
    
    return v;
}

/*
 * us_textanalysis - text analysis entry point for US English.
 *
 * Installs us_tokentowords as the utterance's "tokentowords_func" feature
 * unless that feature is already present, then hands the utterance to
 * default_textanalysis() and returns whatever that returns.
 */
cst_utterance *us_textanalysis(cst_utterance *u)
{
    if (feat_present(u->features, "tokentowords_func"))
    {
        /* a token expander is already set on the utterance; keep it */
    }
    else
    {
        utt_set_feat(u, "tokentowords_func", itemfunc_val(us_tokentowords));
    }

    return default_textanalysis(u);
}

static cst_val *us_tokentowords_one(cst_item *token, const char *name);

/*
 * us_tokentowords - expand TOKEN into a list of words.
 *
 * Thin public wrapper: looks up the token's own "name" feature and passes
 * it, together with the token, to us_tokentowords_one().
 */
cst_val *us_tokentowords(cst_item *token)
{
    const char *token_name = item_feat_string(token, "name");

    return us_tokentowords_one(token, token_name);
}

/*
 * add_break - attach the feature (break "1") to the last item of list L.
 *
 * The car of the final cons cell is replaced by a pair made of the original
 * car and a new feature set containing break="1".  The list is modified in
 * place and L itself is returned.
 *
 * An empty list (L == NULL) is returned unchanged.  The check is made
 * before walking the list: the previous version only tested for the empty
 * list after the loop had already called val_cdr() on it.
 */
static cst_val *add_break(cst_val *l)
{
    const cst_val *last;
    cst_val *tagged;
    cst_features *f;

    if (l == NULL)  /* might be empty list */
        return l;

    /* advance to the final cons cell */
    for (last = l; val_cdr(last); last = val_cdr(last))
        ;

    f = new_features();
    feat_set_string(f,"break","1");
    tagged = cons_val(val_car(last),features_val(f));
    /* the walk used a const pointer; the cell itself belongs to L */
    set_car((cst_val *)last,tagged);

    return l;
}

/*
 * us_tokentowords_one - expand one token, or one fragment of it, into words.
 *
 * token  the token item being expanded.  Its neighbours and its "punc",
 *        "nsw" and "phones" features are consulted for context; "punc" and
 *        "nsw" may be rewritten, and "name" is rewritten temporarily while
 *        number rules run on a fragment.
 * name   the text to expand.  us_tokentowords() passes the token's own
 *        "name" feature string; recursive calls pass freshly allocated
 *        fragments, which may be the empty string.
 *
 * Returns the list of word values that NAME expands to.  The rules form one
 * long if/else cascade and the first rule that matches wins, so the order
 * of the tests is significant.
 */
static cst_val *us_tokentowords_one(cst_item *token, const char *name)
{
    /* Return list of words that expand token/name */
    char *p, *aaa, *bbb;
    int i,j;
    cst_val *r, *s;
    const char *nsw = "";
    /* printf("token_name %s name %s\n",item_name(token),name); */
    /* FIXME: For SAPI and friends, any tokens with explicit
       pronunciations need to be passed through as-is.  This should be
       done in the interface code rather than here once the
       tokentowords hook is accessible. AWB: no, they should set the
       nsw feature and this function should deal with it (doesn't yet though)*/
    if (item_feat_present(token,"phones"))
        return cons_val(string_val(name),NULL);

    if (item_feat_present(token,"nsw"))
        nsw = item_feat_string(token,"nsw");

    if ((cst_streq("a",name) || cst_streq("A",name)) &&
        ((item_next(token) == 0) ||
         (!cst_streq(name,item_name(token))) ||
         (!cst_streq("",ffeature_string(token,"punc")))))
    {   /* if A is a sub part of a token, then its ey not ah */
        r = cons_val(string_val("_a"),0);
    }
    else if (cst_regex_match(dottedabbrevs,name))
    {   /* drop the dots and spell out what is left */
        aaa = cst_strdup(name);
        for (i=j=0; aaa[i]; i++)
            if (aaa[i] != '.')
            {
                aaa[j] = aaa[i];
                j++;
            }
        aaa[j] = '\0';
        r = en_exp_letters(aaa);
        cst_free(aaa);
    }
    else if (cst_regex_match(cst_rx_commaint,name))
    {   /* 99,999,999 */
        aaa = cst_strdup(name);
        /* stop on the terminator rather than re-running strlen(name)
           on every iteration */
        for (j=i=0; name[i] != '\0'; i++)
            if (name[i] != ',')
            {
                aaa[j] = name[i];
                j++;
            }
        aaa[j] = '\0';
        r = en_exp_real(aaa);
        cst_free(aaa);
    }
    else if (cst_regex_match(sevenphonenumber,name))
    {   /* 234-3434 telephone numbers */
        p=strchr(name,'-');
        aaa = cst_strdup(name);
        aaa[strlen(name)-strlen(p)] = '\0';
        bbb = cst_strdup(p+1);
        r = val_append(add_break(en_exp_digits(aaa)),
                       en_exp_digits(bbb));
        cst_free(aaa);
        cst_free(bbb);
    }
    else if
     ((cst_regex_match(threedigits,name) &&
      ((!cst_regex_match(cst_rx_digits,ffeature_string(token,"p.name"))
        && cst_regex_match(threedigits,ffeature_string(token,"n.name"))
        && cst_regex_match(fourdigits,ffeature_string(token,"n.n.name"))) ||
       (cst_regex_match(sevenphonenumber,ffeature_string(token,"n.name"))) ||
       (!cst_regex_match(cst_rx_digits,ffeature_string(token,"p.p.name"))
        && cst_regex_match(threedigits,ffeature_string(token,"p.name"))
        && cst_regex_match(fourdigits,ffeature_string(token,"n.name"))))) ||
      (cst_regex_match(fourdigits,name) &&
       (!cst_regex_match(cst_rx_digits,ffeature_string(token,"n.name"))
        && cst_regex_match(threedigits,ffeature_string(token,"p.name"))
        && cst_regex_match(threedigits,ffeature_string(token,"p.p.name")))))
    {
        /* part of a telephone number */
        if (cst_streq("",ffeature_string(token,"punc")))
            item_set_string(token,"punc",",");
        r = add_break(en_exp_digits(name));
    }
    else if (cst_regex_match(numbertime,name))
    {   /* hours ':' minutes; the minutes are left unspoken when "00" */
        p=strchr(name,':');
        aaa = cst_strdup(name);
        aaa[strlen(name)-strlen(p)] = '\0';
        bbb = cst_strdup(p+1);

        r = en_exp_number(aaa);
        if (!cst_streq("00",bbb))
            r = val_append(r,en_exp_id(bbb));
        /* r = add_break(r); */

        cst_free(aaa);
        cst_free(bbb);
    }
    else if (cst_regex_match(digits2dash,name))
    {   /* 999-999-999 etc */
        /* r is built back to front (each reversed group is put in front of
           what came before) and reversed once at the end */
        bbb = cst_strdup(name);
        for (r=0,aaa=p=bbb; *p; p++)
        {
            if (*p == '-')
            {
                *p = '\0';
                r = val_append(val_reverse(add_break(en_exp_digits(aaa))),r);
                aaa = p+1;
            }
        }
        r = val_append(val_reverse(add_break(en_exp_digits(aaa))),r);
        r = val_reverse(r);
        cst_free(bbb);
    }
    else if (cst_regex_match(cst_rx_digits,name))
    {   /* string of digits (use cart to disambiguate) */
        if (cst_streq("nide",nsw))
            r = en_exp_id(name);
        else {
            const cst_val *tv;
            const char *ts;
            char *rname;

            rname = cst_strdup(item_feat_string(token,"name"));
            if (cst_streq(name,rname))
                tv = cart_interpret(token,&us_nums_cart);
            else
            {   /* in a recursive call */
                /* the CART sees the token's name, so swap the fragment in
                   for the duration of the call and then put it back */
                item_set_string(token,"name",name);
                tv = cart_interpret(token,&us_nums_cart);
                item_set_string(token,"name",rname);
            }
            cst_free(rname);
            ts = val_string(tv);
            if (cst_streq(ts,"ordinal"))
                r = en_exp_ordinal(name);
            else if (cst_streq(ts,"digits"))
                r = en_exp_digits(name);
            else if (cst_streq(ts,"year"))
                r = en_exp_id(name);
            else
                r = en_exp_number(name);
        }
    }
    else if (cst_regex_match(romannums,name))
    {   /* Roman numerals */
        if (cst_streq("",ffeature_string(token,"p.punc")))
        {   /* no preceding punc */
            char n[10];
            cst_sprintf(n,"%d",en_exp_roman(name));
            if (rex_like(token))
                r = cons_val(string_val("the"),
                             en_exp_ordinal(n));
            else if (section_like(token))
                r = en_exp_number(n);
            else
                r = en_exp_letters(name);
        }
        else
            r = en_exp_letters(name);
    }
    else if (cst_regex_match(drst,name))
    {   /* St Andrew's St, Dr King Dr */
        const char *street;
        const char *saint;
        if ((name[0] == 's') || (name[0] == 'S'))
        {
            street = "street";
            saint = "saint";
        }
        else
        {
            street = "drive";
            saint = "doctor";
        }
        if ((item_next(token) == 0) ||
            strchr(item_feat_string(token,"punc"),','))
            r = cons_val(string_val(street),NULL);
        else if (strchr(ffeature_string(token,"punc"),','))
            r = cons_val(string_val(saint),NULL);
        else
        {   /* decide from the capitalisation of the neighbouring names */
            const char *pname = ffeature_string(token,"p.name");
            const char *nname = ffeature_string(token,"n.name");
            if ((pname[0] >= 'A') && (pname[0] <= 'Z') &&
                (nname[0] >= 'a') && (nname[0] <= 'z'))
                r = cons_val(string_val(street),NULL);
            else if ((pname[0] >= '0') && (pname[0] <= '9') &&
                     (nname[0] >= 'a') && (nname[0] <= 'z'))
                r = cons_val(string_val(street),NULL);
            else if ((pname[0] >= 'a') && (pname[0] <= 'z') &&
                     (nname[0] >= 'A') && (nname[0] <= 'Z'))
                r = cons_val(string_val(saint),NULL);
            else if (cst_streq(ffeature_string(token,"n.whitespace")," "))
                r = cons_val(string_val(saint),NULL);
            else
                r = cons_val(string_val(street),NULL);
        }
        if (cst_streq(item_feat_string(token,"punc"),"."))
            item_set_string(token,"punc","");
    }
    else if (cst_streq(name,"Mr"))
    {
        item_set_string(token,"punc","");
        r = cons_val(string_val("mister"),NULL);
    }
    else if (cst_streq(name,"Mrs"))
    {
        item_set_string(token,"punc","");
        r = cons_val(string_val("missus"),NULL);
    }
    else if ((strlen(name) == 1) &&
             (name[0] >= 'A') &&
             (name[0] <= 'Z') &&
             (cst_streq(" ",ffeature_string(token,"n.whitespace"))) &&
             (ffeature_string(token,"n.name")[0] >= 'A') &&
             (ffeature_string(token,"n.name")[0] <= 'Z'))
    {   /* single capital followed by a capitalised word: an initial */
        item_set_string(token,"punc","");
        aaa = cst_downcase(name);
        if (cst_streq(aaa,"a"))
            r = cons_val(string_val("_a"),0);
        else
            r = cons_val(string_val(aaa),0);
        cst_free(aaa);
    }
    else if (cst_regex_match(cst_rx_double,name))
    {   /* real numbers */
        r = en_exp_real(name);
    }
    else if (cst_regex_match(ordinal_number,name))
    {   /* explicit ordinals */
        aaa = cst_strdup(name);
        aaa[strlen(name)-2] = '\0';
        r = en_exp_ordinal(aaa);
        cst_free(aaa);
    }
    else if ((cst_regex_match(illion,name)) &&
             (cst_regex_match(usmoney,ffeature_string(token,"p.name"))))
    {   /* "$3 million": the currency word goes after the magnitude word */
        r = cons_val(string_val(name),
                     cons_val(string_val("dollars"),NULL));
    }
    else if (cst_regex_match(usmoney,name))
    {
        /* US money */
/*      printf("money, money, money %s\n", name); */
        p = strchr(name,'.');

        if (cst_regex_match(illion,ffeature_string(token,"n.name")))
        {   /* carl sagan's billions and billions */
            r = en_exp_real(&name[1]);
        }
        else if (!p)
        {   /* whole dollars only */
            aaa = cst_strdup(&name[1]);
            if (cst_streq("1",aaa))
                r = cons_val(string_val("dollar"),NULL);
            else
                r = cons_val(string_val("dollars"),NULL);
            r = val_append(us_tokentowords_one(token,aaa),r);
            cst_free(aaa);
        }
        else if ((strlen(p) == 1) || (strlen(p) > 3))
        {   /* simply read as mumble point mumble */
            r = val_append(en_exp_real(&name[1]),
                           cons_val(string_val("dollars"),NULL));
        }
        else
        {   /* dollars and cents */
            aaa = cst_strdup(name);
            aaa[strlen(name)-strlen(p)] = '\0';
            for (i=j=0; aaa[i] != '\0'; i++)
            {
                if (aaa[i] != ',')
                {
                    aaa[j] = aaa[i];
                    j++;
                }
            }
            aaa[j] = '\0';
            if (cst_streq("00",p+1))
                r = 0;
            else if (cst_streq("01",p+1))
                r = val_append(en_exp_number(p+1),
                               cons_val(string_val("cent"),NULL));
            else
                r = val_append(en_exp_number(p+1),
                               cons_val(string_val("cents"),NULL));

            /* aaa+1 skips the leading currency sign */
            if (cst_streq("1",aaa+1))
                r = cons_val(string_val("dollar"),r);
            else
                r = cons_val(string_val("dollars"),r);

            r = val_append(en_exp_number(aaa+1),r);
            cst_free(aaa);
        }
    }
    /* NAME can be the empty string here: rules further down recurse on
       fragments such as the text before the apostrophe in "'s" or either
       side of a lone "-".  Check for that first, because indexing
       strlen(name)-1 on "" would read before the start of the buffer. */
    else if ((name[0] != '\0') && (name[strlen(name)-1] == '%'))
    {
        aaa = cst_strdup(name);
        aaa[strlen(aaa)-1] = '\0';
        r = val_append(us_tokentowords_one(token,aaa),
                       cons_val(string_val("per"),
                                cons_val(string_val("cent"),NULL)));
        cst_free(aaa);

    }
    else if (cst_regex_match(numess,name))
    {   /* 60s and 7s and 9s */
        aaa = cst_strdup(name);
        aaa[strlen(name)-1] = '\0';
        r = val_append(en_exp_number(aaa),
                       cons_val(string_val("'s"),0));
        cst_free(aaa);
    }
    else if ((p=(cst_strrchr(name,'\''))))
    {
        static const char *pc[] = { "'s", "'ll", "'ve", "'d", NULL };

        bbb = cst_downcase(p);
        if (cst_member_string(bbb, pc))
        {   /* known clitic: expand the stem, keep the clitic as a word */
            aaa = cst_strdup(name);
            aaa[strlen(name)-strlen(p)] = '\0';
            r = val_append(us_tokentowords_one(token,aaa),
                           cons_val(string_val(bbb),0));
            cst_free(aaa);
        }
        else if (cst_streq(p,"'tve")) /* admittedly rare and weird */
        {   /* the +2 keeps "'t" on the stem, leaving "'ve" as the clitic */
            aaa = cst_strdup(name);
            aaa[strlen(name)-strlen(p)+2] = '\0';
            r = val_append(us_tokentowords_one(token,aaa),
                           cons_val(string_val("'ve"),0));
            cst_free(aaa);
        }
        else
        {   /* any other apostrophe is simply removed */
            aaa = cst_strdup(name);
            strcpy(&aaa[strlen(name)-strlen(p)],p+1);
            r = us_tokentowords_one(token,aaa);
            cst_free(aaa);
        }
        cst_free(bbb);
    }
    else if ((cst_regex_match(digitsslashdigits,name)) &&
             (cst_streq(name,item_name(token))))
    {   /* might be fraction, or not */
        p=strchr(name,'/');
        aaa = cst_strdup(name);
        aaa[strlen(name)-strlen(p)] = '\0';
        bbb = cst_strdup(p+1);
        if ((cst_streq("1",aaa)) && (cst_streq("2",bbb)))
            r = cons_val(string_val("a"),
                         cons_val(string_val("half"),0));
        else if (atoi(aaa) < (atoi(bbb)))
        {
            r = val_append(en_exp_number(aaa),
                           en_exp_ordinal(bbb));
            if (atoi(aaa) > 1)
                r = val_append(r,cons_val(string_val("'s"),0));
        }
        else
            r = val_append(en_exp_number(aaa),
                           cons_val(string_val("slash"),
                                    en_exp_number(bbb)));

        if ((cst_regex_match(cst_rx_digits,ffeature_string(token,"p.name")))
            && (item_prev(token)))  /* don't mistake "0" as a number */
            r = cons_val(string_val("and"),r);
        cst_free(aaa);
        cst_free(bbb);
    }
    else if ((p=(strchr(name,'-'))))
    {   /* aaa-bbb */
        aaa = cst_strdup(name);
        aaa[strlen(name)-strlen(p)] = '\0';
        bbb = cst_strdup(p+1);
        if (cst_regex_match(cst_rx_digits,aaa) &&
            cst_regex_match(cst_rx_digits,bbb))
        {
            /* On the outermost call NAME is the token's own "name" feature
               string (see us_tokentowords), so it must not be relied on
               once that feature has been overwritten.  Keep a private copy
               of the token's name and restore from it, the same way the
               digit-string rule above does. */
            char *saved_name = cst_strdup(item_feat_string(token,"name"));

            item_set_string(token,"name",bbb);
            r = us_tokentowords_one(token,bbb);
            item_set_string(token,"name",aaa);
            r = val_append(us_tokentowords_one(token,aaa),
                           cons_val(string_val("to"),r));
            item_set_string(token,"name",saved_name);
            cst_free(saved_name);
        }
        else
            r = val_append(us_tokentowords_one(token,aaa),
                           us_tokentowords_one(token,bbb));
        cst_free(aaa);
        cst_free(bbb);
    }
    else if ((strlen(name) > 1) && (!cst_regex_match(cst_rx_alpha,name)))
    {   /* its not just alphas */
        /* split after the first position where the character class
           changes, then expand both halves */
        for (i=0; name[i] != '\0'; i++)
            if (text_splitable(name,i))
                break;
        aaa = cst_strdup(name);
        aaa[i+1] = '\0';
        bbb = cst_strdup(&name[i+1]);
        item_set_string(token,"nsw","nide");
        r = val_append(us_tokentowords_one(token,aaa),
                       us_tokentowords_one(token,bbb));
        cst_free(aaa);
        cst_free(bbb);
    }
    else if ((s = state_name(name,token)))
    {
        r = s;
    }
    else if ((strlen(name) > 1) &&
             (cst_regex_match(cst_rx_alpha,name)) &&
             (!us_aswd(name)))
        /* Need common exception list */
        /* unpronouncable list of alphas */
        r = en_exp_letters(name);

    /* buckets of other stuff missing */

    else  /* just a word */
    {
        aaa = cst_downcase(name);
        r = cons_val(string_val(aaa),0);
        cst_free(aaa);
    }
    return r;
}

/*
 * text_splitable - should the token text S be split after position I?
 *
 * Returns FALSE when s[i] and s[i+1] are both ASCII letters, or both
 * decimal digits; returns TRUE otherwise.
 *
 * Note that strchr() treats the terminating NUL as part of the string it
 * searches, so a NUL at s[i] or s[i+1] counts as belonging to both
 * classes.  In particular the last character of a string is never
 * splittable when it is a letter or a digit.
 */
static int text_splitable(const char *s,int i)
{
    static const char letters[] =
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
    static const char digits[] = "0123456789";
    const char *at = s + i;

    /* at[1] is only read once at[0] has been found in the same class */
    if ((strchr(letters,at[0]) && strchr(letters,at[1])) ||
        (strchr(digits,at[0]) && strchr(digits,at[1])))
        return FALSE;

    return TRUE;
}


/*
 * US state (and territory) abbreviations and the words they are read as.
 *
 * Each row is laid out as state_name() reads it:
 *   [0]    the abbreviation exactly as written in the text (case matters);
 *   [1]    "ambiguous" if the abbreviation may only be expanded when the
 *          surrounding tokens look like an address, "" if it is always
 *          expanded;
 *   [2..]  the words to speak, terminated by NULL.
 * The table ends with a row whose first entry is NULL.
 *
 * The spoken words are spelled correctly ("minnesota", "montana",
 * "carolina", "tennessee"); earlier revisions carried misspellings of
 * these four words.
 */
static const char *states[][5] =
{
  { "AL", "ambiguous", "alabama" , NULL, NULL },
  { "Al", "ambiguous", "alabama" , NULL, NULL },
  { "Ala", "", "alabama" , NULL, NULL },
  { "AK", "", "alaska" , NULL, NULL },
  { "Ak", "", "alaska" , NULL, NULL },
  { "AZ", "", "arizona" , NULL, NULL },
  { "Az", "", "arizona" , NULL, NULL },
  { "CA", "", "california" , NULL, NULL },
  { "Ca", "", "california" , NULL, NULL },
  { "Cal", "ambiguous", "california" , NULL, NULL },
  { "Calif", "", "california" , NULL, NULL },
  { "CO", "ambiguous", "colorado" , NULL, NULL },
  { "Co", "ambiguous", "colorado" , NULL, NULL },
  { "Colo", "", "colorado" , NULL, NULL },
  { "DC", "", "d" , "c", NULL },
  { "DE", "", "delaware" , NULL, NULL },
  { "De", "ambiguous", "delaware" , NULL, NULL },
  { "Del", "ambiguous", "delaware" , NULL, NULL },
  { "FL", "", "florida" , NULL, NULL },
  { "Fl", "ambiguous", "florida" , NULL, NULL },
  { "Fla", "", "florida" , NULL, NULL },
  { "GA", "", "georgia" , NULL, NULL },
  { "Ga", "", "georgia" , NULL, NULL },
  { "HI", "", "hawaii" , NULL, NULL },
  { "Hi", "ambiguous", "hawaii" , NULL, NULL },
  { "IA", "", "iowa" , NULL, NULL },
  { "Ia", "ambiguous", "iowa" , NULL, NULL },
  { "Ind", "ambiguous", "indiana" , NULL, NULL },
  { "ID", "ambiguous", "idaho" , NULL, NULL },
  { "IL", "ambiguous", "illinois" , NULL, NULL },
  { "Il", "ambiguous", "illinois" , NULL, NULL },
  { "ILL", "ambiguous", "illinois" , NULL, NULL },
  { "KS", "", "kansas" , NULL, NULL },
  { "Ks", "", "kansas" , NULL, NULL },
  { "Kans", "", "kansas" , NULL, NULL },
  { "KY", "ambiguous", "kentucky" , NULL, NULL },
  { "Ky", "ambiguous", "kentucky" , NULL, NULL },
  { "LA", "ambiguous", "louisiana" , NULL, NULL },
  { "La", "ambiguous", "louisiana" , NULL, NULL },
  { "Lou", "ambiguous", "louisiana" , NULL, NULL },
  { "Lous", "ambiguous", "louisiana" , NULL, NULL },
  { "MA", "ambiguous", "massachusetts" , NULL, NULL },
  { "Mass", "ambiguous", "massachusetts" , NULL, NULL },
  { "Ma", "ambiguous", "massachusetts" , NULL, NULL },
  { "MD", "ambiguous", "maryland" , NULL, NULL },
  { "Md", "ambiguous", "maryland" , NULL, NULL },
  { "ME", "ambiguous", "maine" , NULL, NULL },
  { "Me", "ambiguous", "maine" , NULL, NULL },
  { "MI", "", "michigan" , NULL, NULL },
  { "Mi", "ambiguous", "michigan" , NULL, NULL },
  { "Mich", "ambiguous", "michigan" , NULL, NULL },
  { "MN", "ambiguous", "minnesota" , NULL, NULL },
  { "Minn", "ambiguous", "minnesota" , NULL, NULL },
  { "MS", "ambiguous", "mississippi" , NULL, NULL },
  { "Miss", "ambiguous", "mississippi" , NULL, NULL },
  { "MT", "ambiguous", "montana" , NULL, NULL },
  { "Mt", "ambiguous", "montana" , NULL, NULL },
  { "MO", "ambiguous", "missouri" , NULL, NULL },
  { "Mo", "ambiguous", "missouri" , NULL, NULL },
  { "NC", "ambiguous", "north" , "carolina", NULL },
  { "ND", "ambiguous", "north" , "dakota", NULL },
  { "NE", "ambiguous", "nebraska" , NULL, NULL },
  { "Ne", "ambiguous", "nebraska" , NULL, NULL },
  { "Neb", "ambiguous", "nebraska" , NULL, NULL },
  { "NH", "ambiguous", "new" , "hampshire", NULL },
  { "NV", "", "nevada" , NULL, NULL },
  { "Nev", "", "nevada" , NULL, NULL },
  { "NY", "", "new" , "york", NULL },
  { "OH", "ambiguous", "ohio" , NULL, NULL },
  { "OK", "ambiguous", "oklahoma" , NULL, NULL },
  { "Okla", "", "oklahoma" , NULL, NULL },
  { "OR", "ambiguous", "oregon" , NULL, NULL },
  { "Or", "ambiguous", "oregon" , NULL, NULL },
  { "Ore", "ambiguous", "oregon" , NULL, NULL },
  { "PA", "ambiguous", "pennsylvania" , NULL, NULL },
  { "Pa", "ambiguous", "pennsylvania" , NULL, NULL },
  { "Penn", "ambiguous", "pennsylvania" , NULL, NULL },
  { "RI", "ambiguous", "rhode" , "island", NULL },
  { "SC", "ambiguous", "south" , "carolina", NULL },
  { "SD", "ambiguous", "south" , "dakota", NULL },
  { "TN", "ambiguous", "tennessee" , NULL, NULL },
  { "Tn", "ambiguous", "tennessee" , NULL, NULL },
  { "Tenn", "ambiguous", "tennessee" , NULL, NULL },
  { "TX", "ambiguous", "texas" , NULL, NULL },
  { "Tx", "ambiguous", "texas" , NULL, NULL },
  { "Tex", "ambiguous", "texas" , NULL, NULL },
  { "UT", "ambiguous", "utah" , NULL, NULL },
  { "VA", "ambiguous", "virginia" , NULL, NULL },
  { "WA", "ambiguous", "washington" , NULL, NULL },
  { "Wa", "ambiguous", "washington" , NULL, NULL },
  { "Wash", "ambiguous", "washington" , NULL, NULL },
  { "WI", "ambiguous", "wisconsin" , NULL, NULL },
  { "Wi", "ambiguous", "wisconsin" , NULL, NULL },
  { "WV", "ambiguous", "west" , "virginia", NULL },
  { "WY", "ambiguous", "wyoming" , NULL, NULL },
  { "Wy", "ambiguous", "wyoming" , NULL, NULL },
  { "Wyo", "", "wyoming" , NULL, NULL },
  { "PR", "ambiguous", "puerto" , "rico", NULL },
  /* end-of-table marker: only the NULL in column 0 is ever examined */
  { NULL, NULL, NULL, NULL, NULL }
};

/*
 * state_name - expand NAME if it is a US state abbreviation.
 *
 * Scans the states table for rows whose abbreviation equals NAME.  A row
 * not marked "ambiguous" is expanded unconditionally.  An "ambiguous" row
 * is expanded only when the previous token is a capitalised, purely
 * alphabetic word of more than two characters AND at least one of these
 * holds: the next token starts with a lower-case letter, T is the last
 * token, T's punctuation is ".", or the next token is a run of 5 or 10
 * digits.
 *
 * Returns the list of words for the first row that qualifies, or 0 when no
 * row does.
 */
static cst_val *state_name(const char *name,cst_item *t)
{
    int row,col;
    cst_val *words = 0;

    for (row=0; states[row][0]; row++)
    {
        int expand;

        if (!cst_streq(states[row][0],name))
            continue;

        if (!cst_streq(states[row][1],"ambiguous"))
            expand = 1;
        else
        {
            const char *pname = ffeature_string(t,"p.name");
            const char *nname = ffeature_string(t,"n.name");

            expand = 0;
            /* previous name is capitalized */
            if ((strchr("ABCDEFGHIJKLMNOPQRSTUVWXYZ",pname[0])) &&
                (strlen(pname) > 2) &&
                (cst_regex_match(cst_rx_alpha,pname)))
            {
                if ((strchr("abcdefghijklmnopqrstuvwxyz",nname[0])) ||
                    (item_next(t) == 0) ||
                    (cst_streq(".",item_feat_string(t,"punc"))) ||
                    (((strlen(nname) == 5 || (strlen(nname) == 10)) &&
                      cst_regex_match(cst_rx_digits,nname))))
                    expand = 1;
            }
        }

        if (!expand)
            continue;   /* a later row may carry the same abbreviation */

        /* consing builds the list back to front, so reverse it on return */
        for (col=2; states[row][col]; col++)
            words = cons_val(string_val(states[row][col]),words);
        return val_reverse(words);
    }

    return words;

}



By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The BSD License


Written By
CEO bring-it-together s.r.o.
Slovakia Slovakia
Jozef Božek is currently a software engineer at bring-it-together s.r.o. in the area of large scale information systems and mobile applications development.
He has been developing in C++ nearly full time since 2000, in Java since 2004 and in Objective-C since 2009. He programs using Java EE SDK, iOS SDK, COM/DCOM, MFC, ATL, STL and so on :)

Comments and Discussions