/***************************************************************************
* Copyright (C) 2005 to 2007 by Jonathan Duddington *
* email: jonsd@users.sourceforge.net *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write see: *
* <http://www.gnu.org/licenses/>. *
***************************************************************************/
#include "StdAfx.h"
#include <stdlib.h>
#include <stdio.h>
#include <wctype.h>
#include "speak_lib.h"
#include "speech.h"
#include "phoneme.h"
#include "synthesize.h"
#include "voice.h"
#include "translate.h"
extern int GetAmplitude(void);
// convert from words-per-minute to internal speed factor
// Use this to calibrate speed for wpm 80-350
static unsigned char speed_lookup[] = {
255, 255, 255, 255, 255, // 80
253, 249, 245, 242, 238, // 85
235, 232, 228, 225, 222, // 90
218, 216, 213, 210, 207, // 95
204, 201, 198, 196, 193, // 100
191, 188, 186, 183, 181, // 105
179, 176, 174, 172, 169, // 110
168, 165, 163, 161, 159, // 115
158, 155, 153, 152, 150, // 120
148, 146, 145, 143, 141, // 125
139, 137, 136, 135, 133, // 130
131, 130, 129, 127, 126, // 135
124, 123, 122, 120, 119, // 140
118, 117, 115, 114, 113, // 145
112, 111, 110, 109, 107, // 150
106, 105, 104, 103, 102, // 155
101, 100, 99, 98, 97, // 160
96, 95, 94, 93, 92, // 165
91, 90, 89, 89, 88, // 170
87, 86, 85, 84, 83, // 175
82, 82, 81, 80, 80, // 180
79, 78, 77, 76, 76, // 185
75, 75, 74, 73, 72, // 190
71, 71, 70, 69, 69, // 195
68, 67, 67, 66, 66, // 200
65, 64, 64, 63, 62, // 205
62, 61, 61, 60, 59, // 210
59, 58, 58, 57, 57, // 215
56, 56, 55, 54, 54, // 220
53, 53, 52, 52, 52, // 225
51, 50, 50, 49, 49, // 230
48, 48, 47, 47, 46, // 235
46, 46, 45, 45, 44, // 240
44, 44, 43, 43, 42, // 245
41, 40, 40, 40, 39, // 250
39, 39, 38, 38, 38, // 255
37, 37, 37, 36, 36, // 260
35, 35, 35, 35, 34, // 265
34, 34, 33, 33, 33, // 270
32, 32, 31, 31, 31, // 275
30, 30, 30, 29, 29, // 280
29, 29, 28, 28, 27, // 285
27, 27, 27, 26, 26, // 290
26, 26, 25, 25, 25, // 295
24, 24, 24, 24, 23, // 300
23, 23, 23, 22, 22, // 305
22, 21, 21, 21, 21, // 310
20, 20, 20, 20, 19, // 315
19, 19, 18, 18, 17, // 320
17, 17, 16, 16, 16, // 325
16, 16, 16, 15, 15, // 330
15, 15, 14, 14, 14, // 335
13, 13, 13, 12, 12, // 340
12, 12, 11, 11, 11, // 345
11, 10, 10, 10, 9, // 350
9, 9, 8, 8, 8, // 355
};
// speed_factor1 adjustments for speeds 350 to 374: pauses
static unsigned char pause_factor_350[] = {
22,22,22,22,22,22,22,21,21,21, // 350
21,20,20,19,19,18,17,16,15,15, // 360
15,15,15,15,15}; // 370
// wav_factor adjustments for speeds 350 to 450
// Use this to calibrate speed for wpm 350-450
static unsigned char wav_factor_350[] = {
120, 121, 120, 119, 119, // 350
118, 118, 117, 116, 116, // 355
115, 114, 113, 112, 112, // 360
111, 111, 110, 109, 108, // 365
107, 106, 106, 104, 103, // 370
103, 102, 102, 102, 101, // 375
101, 99, 98, 98, 97, // 380
96, 96, 95, 94, 93, // 385
91, 90, 91, 90, 89, // 390
88, 86, 85, 86, 85, // 395
85, 84, 82, 81, 80, // 400
79, 77, 78, 78, 76, // 405
77, 75, 75, 74, 73, // 410
71, 72, 70, 69, 69, // 415
69, 67, 65, 64, 63, // 420
63, 63, 61, 61, 59, // 425
59, 59, 58, 56, 57, // 430
58, 56, 54, 53, 52, // 435
52, 53, 52, 52, 50, // 440
48, 47, 47, 45, 46, // 445
45}; // 450
static int speed1 = 130;
static int speed2 = 121;
static int speed3 = 118;
//#define TEST_SPEED
void SetSpeed(int control)
{//=======================
int x;
int s1;
int wpm;
int wpm2;
speed.loud_consonants = 0;
speed.min_sample_len = 450;
speed.lenmod_factor = 110; // controls the effect of FRFLAG_LEN_MOD reduce length change
speed.lenmod2_factor = 100;
wpm = embedded_value[EMBED_S];
if(control == 2)
wpm = embedded_value[EMBED_S2];
#ifdef TEST_SPEED
if(wpm > 1000)
{
// TESTING
// test = wpm / 1000;
wpm = wpm % 1000;
}
#endif
if(voice->speed_percent > 0)
{
wpm = (wpm * voice->speed_percent)/100;
}
if(wpm > 450)
wpm = 450;
if(wpm > 360)
{
speed.loud_consonants = (wpm - 360) / 8;
}
wpm2 = wpm;
if(wpm > 359) wpm2 = 359;
if(wpm < 80) wpm2 = 80;
x = speed_lookup[wpm2-80];
if(wpm >= 380)
x = 7;
if(wpm >= 400)
x = 6;
if(control & 1)
{
// set speed factors for different syllable positions within a word
// these are used in CalcLengths()
speed1 = (x * voice->speedf1)/256;
speed2 = (x * voice->speedf2)/256;
speed3 = (x * voice->speedf3)/256;
if(x <= 7)
{
speed1 = x;
speed2 = speed3 = x - 1;
}
}
if(control & 2)
{
// these are used in synthesis file
if(wpm > 350)
{
speed.lenmod_factor = 85 - (wpm - 350) / 3;
speed.lenmod2_factor = 60 - (wpm - 350) / 8;
}
else
if(wpm > 250)
{
speed.lenmod_factor = 110 - (wpm - 250)/4;
speed.lenmod2_factor = 110 - (wpm - 250)/2;
}
s1 = (x * voice->speedf1)/256;
if(wpm >= 170)
speed.wav_factor = 110 + (150*s1)/128; // reduced speed adjustment, used for playing recorded sounds
else
speed.wav_factor = 128 + (128*s1)/130; // = 215 at 170 wpm
if(wpm >= 350)
{
speed.wav_factor = wav_factor_350[wpm-350];
}
if(wpm >= 390)
{
speed.min_sample_len = 450 - (wpm - 400)/2;
if(wpm > 440)
speed.min_sample_len = 420 - (wpm - 440);
}
speed.pause_factor = (256 * s1)/115; // full speed adjustment, used for pause length
speed.clause_pause_factor = 0;
if(wpm > 430)
{
speed.pause_factor = 12;
// speed.clause_pause_factor = 15;
}
else
if(wpm > 400)
{
speed.pause_factor = 13;
// speed.clause_pause_factor = 15;
}
else
if(wpm > 374)
{
speed.pause_factor = 14;
}
else
if(wpm > 350)
{
speed.pause_factor = pause_factor_350[wpm - 350];
}
if(speed.clause_pause_factor == 0)
{
// restrict the reduction of pauses between clauses
if((speed.clause_pause_factor = speed.pause_factor) < 16)
speed.clause_pause_factor = 16;
}
if(wpm >= 370)
{
// TESTING
// use experimental fast settings if they have been specified in the Voice
if(speed.fast_settings[0] > 0)
speed.pause_factor = speed.fast_settings[0];
if(speed.fast_settings[1] > 0)
speed.wav_factor = speed.fast_settings[1];
if(speed.fast_settings[2] > 0)
speed.lenmod_factor = speed.lenmod2_factor = speed.fast_settings[2];
}
}
#ifdef TEST_SPEED
//if(control==3)
printf("%3d: speedf %d %d %d pause=%d %d wav=%d lenmod=%d %d\n",wpm,speed1,speed2,speed3, speed.pause_factor,speed.clause_pause_factor, speed.wav_factor,speed.lenmod_factor,speed.lenmod2_factor);
#endif
} // end of SetSpeed
#ifdef deleted
void SetAmplitude(int amp)
{//=======================
static unsigned char amplitude_factor[] = {0,5,6,7,9,11,14,17,21,26, 32, 38,44,50,56,63,70,77,84,91,100 };
if((amp >= 0) && (amp <= 20))
{
option_amplitude = (amplitude_factor[amp] * 480)/256;
}
}
#endif
void SetParameter(int parameter, int value, int relative)
{//======================================================
// parameter: reset-all, amp, pitch, speed, linelength, expression, capitals, number grouping
// relative 0=absolute 1=relative
int new_value = value;
int default_value;
if(relative)
{
if(parameter < 5)
{
default_value = param_defaults[parameter];
new_value = default_value + (default_value * value)/100;
}
}
param_stack[0].parameter[parameter] = new_value;
switch(parameter)
{
case espeakRATE:
embedded_value[EMBED_S] = new_value;
embedded_value[EMBED_S2] = new_value;
SetSpeed(3);
break;
case espeakVOLUME:
embedded_value[EMBED_A] = new_value;
GetAmplitude();
break;
case espeakPITCH:
if(new_value > 99) new_value = 99;
if(new_value < 0) new_value = 0;
embedded_value[EMBED_P] = new_value;
break;
case espeakRANGE:
if(new_value > 99) new_value = 99;
embedded_value[EMBED_R] = new_value;
break;
case espeakLINELENGTH:
option_linelength = new_value;
break;
case espeakWORDGAP:
option_wordgap = new_value;
break;
case espeakINTONATION:
if((new_value & 0xff) != 0)
translator->langopts.intonation_group = new_value & 0xff;
option_tone_flags = new_value;
break;
default:
break;
}
} // end of SetParameter
static void DoEmbedded2(int *embix)
{//================================
// There were embedded commands in the text at this point
unsigned int word;
do {
word = embedded_list[(*embix)++];
if((word & 0x1f) == EMBED_S)
{
// speed
SetEmbedded(word & 0x7f, word >> 8); // adjusts embedded_value[EMBED_S]
SetSpeed(1);
}
} while((word & 0x80) == 0);
}
void CalcLengths(Translator *tr)
{//==============================
int ix;
int ix2;
PHONEME_LIST *prev;
PHONEME_LIST *next;
PHONEME_LIST *next2;
PHONEME_LIST *next3;
PHONEME_LIST *p;
PHONEME_LIST *p2;
int stress;
int type;
static int more_syllables=0;
int pre_sonorant=0;
int pre_voiced=0;
int last_pitch = 0;
int pitch_start;
int length_mod;
int len;
int env2;
int end_of_clause;
int embedded_ix = 0;
int min_drop;
int pitch1;
int emphasized;
int tone_mod;
unsigned char *pitch_env=NULL;
PHONEME_DATA phdata_tone;
for(ix=1; ix<n_phoneme_list; ix++)
{
prev = &phoneme_list[ix-1];
p = &phoneme_list[ix];
stress = p->stresslevel & 0x7;
emphasized = p->stresslevel & 0x8;
next = &phoneme_list[ix+1];
if(p->synthflags & SFLAG_EMBEDDED)
{
DoEmbedded2(&embedded_ix);
}
type = p->type;
if(p->synthflags & SFLAG_SYLLABLE)
type = phVOWEL;
switch(type)
{
case phPAUSE:
last_pitch = 0;
break;
case phSTOP:
last_pitch = 0;
if(prev->type == phFRICATIVE)
p->prepause = 25;
else
if((more_syllables > 0) || (stress < 4))
p->prepause = 48;
else
p->prepause = 60;
if(prev->type == phSTOP)
p->prepause = 60;
if((tr->langopts.word_gap & 0x10) && (p->newword))
p->prepause = 60;
if(p->ph->phflags & phLENGTHENSTOP)
p->prepause += 30;
if(p->synthflags & SFLAG_LENGTHEN)
p->prepause += tr->langopts.long_stop;
break;
case phVFRICATIVE:
case phFRICATIVE:
if(p->newword)
{
if((prev->type == phVOWEL) && (p->ph->phflags & phNOPAUSE))
{
}
else
{
p->prepause = 15;
}
}
if(next->type==phPAUSE && prev->type==phNASAL && !(p->ph->phflags&phFORTIS))
p->prepause = 25;
if(prev->ph->phflags & phBRKAFTER)
p->prepause = 30;
if((tr->langopts.word_gap & 0x10) && (p->newword))
p->prepause = 30;
if((p->ph->phflags & phSIBILANT) && next->type==phSTOP && !next->newword)
{
if(prev->type == phVOWEL)
p->length = 200; // ?? should do this if it's from a prefix
else
p->length = 150;
}
else
p->length = 256;
if(type == phVFRICATIVE)
{
if(next->type==phVOWEL)
{
pre_voiced = 1;
}
if((prev->type==phVOWEL) || (prev->type == phLIQUID))
{
p->length = (255 + prev->length)/2;
}
}
break;
case phVSTOP:
if(prev->type==phVFRICATIVE || prev->type==phFRICATIVE || (prev->ph->phflags & phSIBILANT) || (prev->type == phLIQUID))
p->prepause = 30;
if(next->type==phVOWEL || next->type==phLIQUID)
{
if((next->type==phVOWEL) || !next->newword)
pre_voiced = 1;
p->prepause = 40;
if((prev->type == phPAUSE) || (prev->type == phVOWEL)) // || (prev->ph->mnemonic == ('/'*256+'r')))
p->prepause = 0;
else
if(p->newword==0)
{
if(prev->type==phLIQUID)
p->prepause = 20;
if(prev->type==phNASAL)
p->prepause = 12;
if(prev->type==phSTOP && !(prev->ph->phflags & phFORTIS))
p->prepause = 0;
}
}
if((tr->langopts.word_gap & 0x10) && (p->newword) && (p->prepause < 20))
p->prepause = 20;
break;
case phLIQUID:
case phNASAL:
p->amp = tr->stress_amps[0]; // unless changed later
p->length = 256; // TEMPORARY
min_drop = 0;
if(p->newword)
{
if(prev->type==phLIQUID)
p->prepause = 25;
if(prev->type==phVOWEL)
{
if(!(p->ph->phflags & phNOPAUSE))
p->prepause = 12;
}
}
if(next->type==phVOWEL)
{
pre_sonorant = 1;
}
else
{
p->pitch2 = last_pitch;
if((prev->type==phVOWEL) || (prev->type == phLIQUID))
{
p->length = prev->length;
if(p->type == phLIQUID)
{
p->length = speed1;
}
if(next->type == phVSTOP)
{
p->length = (p->length * 160)/100;
}
if(next->type == phVFRICATIVE)
{
p->length = (p->length * 120)/100;
}
}
else
{
for(ix2=ix; ix2<n_phoneme_list; ix2++)
{
if(phoneme_list[ix2].type == phVOWEL)
{
p->pitch2 = phoneme_list[ix2].pitch2;
break;
}
}
}
p->pitch1 = p->pitch2-16;
if(p->pitch2 < 16)
{
p->pitch1 = 0;
}
p->env = PITCHfall;
pre_voiced = 0;
}
break;
case phVOWEL:
min_drop = 0;
next2 = &phoneme_list[ix+2];
next3 = &phoneme_list[ix+3];
if(stress > 7) stress = 7;
if(stress <= 1)
{
stress = stress ^ 1; // swap diminished and unstressed (until we swap stress_amps,stress_lengths in tr_languages)
}
if(pre_sonorant)
p->amp = tr->stress_amps[stress]-1;
else
p->amp = tr->stress_amps[stress];
if(emphasized)
p->amp = 25;
if(ix >= (n_phoneme_list-3))
{
// last phoneme of a clause, limit its amplitude
if(p->amp > tr->langopts.param[LOPT_MAXAMP_EOC])
p->amp = tr->langopts.param[LOPT_MAXAMP_EOC];
}
// is the last syllable of a word ?
more_syllables=0;
end_of_clause = 0;
for(p2 = p+1; p2->newword== 0; p2++)
{
if((p2->type == phVOWEL) && !(p2->ph->phflags & phNONSYLLABIC))
more_syllables++;
if(p2->ph->code == phonPAUSE_CLAUSE)
end_of_clause = 2;
}
if(p2->ph->code == phonPAUSE_CLAUSE)
end_of_clause = 2;
if((p2->newword & 2) && (more_syllables==0))
{
end_of_clause = 2;
}
// calc length modifier
if((next->ph->code == phonPAUSE_VSHORT) && (next2->type == phPAUSE))
{
// if PAUSE_VSHORT is followed by a pause, then use that
next = next2;
next2 = next3;
next3 = &phoneme_list[ix+4];
}
if(more_syllables==0)
{
len = tr->langopts.length_mods0[next2->ph->length_mod *10+ next->ph->length_mod];
if((next->newword) && (tr->langopts.word_gap & 0x20))
{
// consider as a pause + first phoneme of the next word
length_mod = (len + tr->langopts.length_mods0[next->ph->length_mod *10+ 1])/2;
}
else
length_mod = len;
}
else
{
length_mod = tr->langopts.length_mods[next2->ph->length_mod *10+ next->ph->length_mod];
if((next->type == phNASAL) && (next2->type == phSTOP || next2->type == phVSTOP) && (next3->ph->phflags & phFORTIS))
length_mod -= 15;
}
if(more_syllables==0)
length_mod *= speed1;
else
if(more_syllables==1)
length_mod *= speed2;
else
length_mod *= speed3;
length_mod = length_mod / 128;
if(length_mod < 8)
length_mod = 8; // restrict how much lengths can be reduced
if(stress >= 7)
{
// tonic syllable, include a constant component so it doesn't decrease directly with speed
length_mod += 20;
if(emphasized)
length_mod += 10;
}
else
if(emphasized)
{
length_mod += 20;
}
if((len = tr->stress_lengths[stress]) == 0)
len = tr->stress_lengths[6];
length_mod = length_mod * len;
if(p->tone_ph != 0)
{
if((tone_mod = phoneme_tab[p->tone_ph]->std_length) > 0)
{
// a tone phoneme specifies a percentage change to the length
length_mod = (length_mod * tone_mod) / 100;
}
}
if(end_of_clause == 2)
{
// this is the last syllable in the clause, lengthen it - more for short vowels
len = (p->ph->std_length * 2);
if(tr->langopts.stress_flags & 0x40000)
len=200; // don't lengthen short vowels more than long vowels at end-of-clause
length_mod = length_mod * (256 + (280 - len)/3)/256;
}
if(length_mod > tr->langopts.max_lengthmod*speed1)
{
//limit the vowel length adjustment for some languages
length_mod = (tr->langopts.max_lengthmod*speed1);
}
length_mod = length_mod / 128;
if(p->type != phVOWEL)
{
length_mod = 256; // syllabic consonant
min_drop = 16;
}
p->length = length_mod;
if(p->env >= (N_ENVELOPE_DATA-1))
{
fprintf(stderr,"espeak: Bad intonation data\n");
p->env = 0;
}
// pre-vocalic part
// set last-pitch
env2 = p->env + 1; // version for use with preceding semi-vowel
if(p->tone_ph != 0)
{
InterpretPhoneme2(p->tone_ph, &phdata_tone);
pitch_env = GetEnvelope(phdata_tone.pitch_env);
}
else
{
pitch_env = envelope_data[env2];
}
pitch_start = p->pitch1 + ((p->pitch2-p->pitch1)*pitch_env[0])/256;
if(pre_sonorant || pre_voiced)
{
// set pitch for pre-vocalic part
if(pitch_start == 255)
last_pitch = pitch_start; // pitch is not set
if(pitch_start - last_pitch > 16)
last_pitch = pitch_start - 16;
prev->pitch1 = last_pitch;
prev->pitch2 = pitch_start;
if(last_pitch < pitch_start)
{
prev->env = PITCHrise;
p->env = env2;
}
else
{
prev->env = PITCHfall;
}
prev->length = length_mod;
prev->amp = p->amp;
if((prev->type != phLIQUID) && (prev->amp > 18))
prev->amp = 18;
}
// vowel & post-vocalic part
next->synthflags &= ~SFLAG_SEQCONTINUE;
if(next->type == phNASAL && next2->type != phVOWEL)
next->synthflags |= SFLAG_SEQCONTINUE;
if(next->type == phLIQUID)
{
next->synthflags |= SFLAG_SEQCONTINUE;
if(next2->type == phVOWEL)
{
next->synthflags &= ~SFLAG_SEQCONTINUE;
}
if(next2->type != phVOWEL)
{
if(next->ph->mnemonic == ('/'*256+'r'))
{
next->synthflags &= ~SFLAG_SEQCONTINUE;
// min_drop = 15;
}
}
}
if((min_drop > 0) && ((p->pitch2 - p->pitch1) < min_drop))
{
pitch1 = p->pitch2 - min_drop;
if(pitch1 < 0)
pitch1 = 0;
p->pitch1 = pitch1;
}
last_pitch = p->pitch1 + ((p->pitch2-p->pitch1)*envelope_data[p->env][127])/256;
pre_sonorant = 0;
pre_voiced = 0;
break;
}
}
} // end of CalcLengths