pub mod ipa_table;
use std::path::{Path, PathBuf};
/// Locates the espeak-ng data directory, trying in order:
/// 1. the `ESPEAK_DATA_PATH` environment variable (returned verbatim),
/// 2. an `espeak-ng-data` directory next to the current executable
///    (accepted only when it contains `en_dict`),
/// 3. an `espeak-ng-data` directory under the current working directory
///    (again requiring `en_dict`, returned canonicalized),
/// 4. the conventional system path `/usr/share/espeak-ng-data`.
pub fn default_data_dir() -> String {
    // Explicit override via the environment always wins.
    if let Ok(path) = std::env::var("ESPEAK_DATA_PATH") {
        return path;
    }
    // Data directory shipped alongside the executable.
    let beside_exe = std::env::current_exe()
        .ok()
        .and_then(|exe| exe.parent().map(|dir| dir.join("espeak-ng-data")));
    if let Some(local) = beside_exe {
        if local.join("en_dict").exists() {
            return local.to_string_lossy().into_owned();
        }
    }
    // Data directory under the current working directory.
    let cwd_local = Path::new("espeak-ng-data");
    if cwd_local.join("en_dict").exists() {
        if let Ok(abs) = cwd_local.canonicalize() {
            return abs.to_string_lossy().into_owned();
        }
    }
    // Fall back to the system-wide install location.
    "/usr/share/espeak-ng-data".to_string()
}
use crate::error::{Error, Result};
use crate::phoneme::load::PhonemeData;
use crate::dictionary::file::Dictionary;
use crate::dictionary::lookup::{lookup, LookupCtx};
use crate::dictionary::rules::is_letter_wc;
use crate::dictionary::rules::translate_rules_phdata;
use crate::dictionary::{
FLAG_SUFX, FLAG_SUFX_E_ADDED, FLAG_SUFFIX_REMOVED, FLAG_SUFFIX_VOWEL, FLAG_SUFX_S,
LETTERGP_B, LETTERGP_VOWEL2, SUFX_A, SUFX_E, SUFX_I,
};
use crate::dictionary::stress::{set_word_stress, promote_strend_stress, change_word_stress,
apply_word_final_devoicing, apply_alt_stress_upgrade, StressOpts};
use ipa_table::{
EN_IPA_OVERRIDES,
phoneme_ipa_lang,
IPA_STRESS_PRIMARY, IPA_STRESS_SECONDARY,
PendingStress, PHON_STRESS_P, PHON_STRESS_P2, PHON_STRESS_TONIC,
PHON_STRESS_2, PHON_STRESS_3,
PHON_STRESS_U, PHON_STRESS_D, PHON_STRESS_PREV,
is_pause_code,
};
bitflags::bitflags! {
    /// Packed clause attribute word: the mask constants select multi-bit
    /// fields (pause, intonation, type); the remaining constants are
    /// independent boolean flags.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    pub struct ClauseFlags: u32 {
        /// Bits 0-11: pause length field.
        const PAUSE_MASK = 0x0000_0FFF;
        /// Bits 12-14: intonation type field.
        const INTONATION_MASK = 0x0000_7000;
        /// Bit 15: a space may follow the clause terminator.
        const OPTIONAL_SPACE_AFTER = 0x0000_8000;
        /// Bits 16-19: clause/terminator type field.
        const TYPE_MASK = 0x000F_0000;
        /// Bit 20: punctuation occurred inside a word.
        const PUNCT_IN_WORD = 0x0010_0000;
        /// Bit 21: speak the punctuation character's name.
        const SPEAK_PUNCT_NAME = 0x0020_0000;
        /// Bit 22: the last word ended with a dot.
        const DOT_AFTER_LAST_WORD = 0x0040_0000;
        /// Bit 23: use a long pause.
        const PAUSE_LONG = 0x0080_0000;
    }
}
/// Intonation contour implied by a clause's terminating punctuation
/// (see `Translator::read_clauses`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Intonation {
    /// Terminated by '.' (also the catch-all sentence ender).
    FullStop,
    /// Mid-clause punctuation (not emitted by `read_clauses` in this file).
    Comma,
    /// Terminated by '?'.
    Question,
    /// Terminated by '!'.
    Exclamation,
    /// No terminating punctuation (e.g. end of input).
    None,
}
/// What kind of break produced a clause.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ClauseType {
    None,
    /// End of the input text.
    Eof,
    VoiceChange,
    Clause,
    /// A full sentence terminated by '.', '!' or '?'.
    Sentence,
}
/// One clause of input text together with how it should be spoken.
#[derive(Debug, Clone)]
pub struct Clause {
    /// The clause text, trimmed of surrounding whitespace.
    pub text: String,
    /// Intonation implied by the terminator.
    pub intonation: Intonation,
    /// How the clause was terminated.
    pub clause_type: ClauseType,
    /// Pause to insert after the clause, in milliseconds.
    pub pause_ms: u32,
}
/// Per-language translation/synthesis options.
#[derive(Debug, Clone)]
pub struct LangOptions {
    /// Language code such as "en"; selects the dictionary and phoneme table.
    pub lang: String,
    /// Speaking rate (default 175; presumably words per minute — confirm).
    pub rate: u32,
    /// Base pitch (default 50; units unverified here).
    pub pitch: u32,
    /// Extra gap between words (units unverified here).
    pub word_gap: i32,
    /// Stress placement rule selector (default 2).
    pub stress_rule: u8,
    /// How numbers are read out in this language.
    pub number_grammar: NumberGrammar,
}
/// Language-specific rules for reading numbers aloud.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NumberGrammar {
    pub ordinals: OrdinalGrammar,
    pub tens: TensGrammar,
    pub hundreds: HundredsGrammar,
    pub thousands: ThousandsGrammar,
}
/// How ordinals are written in a language.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct OrdinalGrammar {
    /// A suffix that marks ordinals (e.g. "e" for Dutch/Maltese), if any.
    pub indicator: Option<String>,
    /// Whether a trailing dot marks an ordinal (German "3.").
    pub dot_marks_ordinal: bool,
}
/// Word order and conjunction use for two-digit numbers (see `num3_phonemes`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum TensGrammar {
    /// Tens word then units ("twenty one").
    #[default]
    Standard,
    /// Tens word, conjunction, units (Spanish style).
    WithConjunction,
    /// Units, conjunction, tens (German "einundzwanzig" style).
    UnitsThenConjunction,
}
/// Rules for the hundreds part of a number.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct HundredsGrammar {
    /// Insert the conjunction between hundreds and a non-zero remainder
    /// (English "three hundred and twelve").
    pub use_conjunction_with_remainder: bool,
    /// Say "hundred" rather than "one hundred".
    pub omit_one_prefix: bool,
}
/// Rules for the thousands part of a number.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct ThousandsGrammar {
    /// Say "thousand" rather than "one thousand".
    pub omit_one_prefix: bool,
}
impl NumberGrammar {
    /// Builds the number-reading grammar for a language code, starting from
    /// the defaults and adjusting only what the language needs.
    fn for_lang(lang: &str) -> Self {
        let mut g = Self::default();
        match lang {
            // English joins hundreds to a remainder with a conjunction.
            "en" => g.hundreds.use_conjunction_with_remainder = true,
            "es" => {
                g.tens = TensGrammar::WithConjunction;
                g.hundreds.omit_one_prefix = true;
                g.thousands.omit_one_prefix = true;
            }
            "fr" => g.hundreds.omit_one_prefix = true,
            "de" => {
                g.ordinals.dot_marks_ordinal = true;
                g.tens = TensGrammar::UnitsThenConjunction;
            }
            "nl" | "mt" => {
                g.ordinals.dot_marks_ordinal = true;
                g.ordinals.indicator = Some("e".to_string());
                g.tens = TensGrammar::UnitsThenConjunction;
                g.hundreds.omit_one_prefix = true;
                g.thousands.omit_one_prefix = true;
            }
            // Languages that only use the trailing-dot ordinal convention.
            "da" | "et" | "fi" | "fo" | "kl" | "lt" | "nb" | "no" | "sl" => {
                g.ordinals.dot_marks_ordinal = true;
            }
            _ => {}
        }
        g
    }
}
impl Default for NumberGrammar {
fn default() -> Self {
Self {
ordinals: OrdinalGrammar::default(),
tens: TensGrammar::Standard,
hundreds: HundredsGrammar::default(),
thousands: ThousandsGrammar::default(),
}
}
}
impl Default for LangOptions {
    /// English defaults: rate 175, pitch 50, no extra word gap, stress
    /// rule 2, and the language-neutral number grammar.
    fn default() -> Self {
        Self {
            lang: "en".to_string(),
            rate: 175,
            pitch: 50,
            word_gap: 0,
            stress_rule: 2,
            number_grammar: NumberGrammar::default(),
        }
    }
}
impl LangOptions {
    /// Options for `lang`: its language-specific number grammar plus every
    /// other field from `Default`.
    pub fn for_lang(lang: &str) -> Self {
        let mut opts = Self::default();
        opts.lang = lang.to_string();
        opts.number_grammar = NumberGrammar::for_lang(lang);
        opts
    }
}
/// True for characters in the main CJK ideograph areas: the Unified
/// Repertoire (U+4E00..U+9FFF), Extension A, the plane-2/3 extensions up to
/// U+323AF, the compatibility ideographs, and the Kangxi radicals.
fn is_cjk_ideograph(c: char) -> bool {
    matches!(
        c as u32,
        0x4E00..=0x9FFF
            | 0x3400..=0x4DBF
            | 0x20000..=0x323AF
            | 0xF900..=0xFAFF
            | 0x2F00..=0x2FDF
    )
}
/// A lexical token produced by `tokenize_opts`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// An alphabetic word (CJK ideographs become one single-char word each).
    Word(String),
    /// A number in one of the recognized forms.
    Number(NumberToken),
    /// One or more whitespace characters, collapsed into a single token.
    Space,
    /// Clause-level punctuation: one of . , ! ? ; :
    ClauseBoundary(char),
    /// Any other non-alphanumeric character.
    Punctuation(char),
}
/// The numeric token forms recognized by the tokenizer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NumberToken {
    /// A plain digit string, e.g. "42".
    Cardinal(String),
    /// A decimal like "12.5": digit strings either side of a single dot.
    Decimal { integer: String, fractional: String },
    /// A number carrying an ordinal marker ("3rd", German-style "3.").
    Ordinal(OrdinalNumber),
}
impl NumberToken {
    /// Classifies `word` as a number token, or returns `None` when it is not
    /// numeric.
    ///
    /// Recognized forms, in order: a decimal with exactly one dot and digits
    /// on both sides ("12.5"); a pure digit string ("123"); digits plus a
    /// lone trailing dot when the grammar marks ordinals that way ("3.");
    /// digits plus any other suffix, lowercased ("3rd", "1e").
    fn parse(word: &str, grammar: &NumberGrammar) -> Option<Self> {
        if word.is_empty() {
            return None;
        }
        // Decimal: exactly one '.' with non-empty digit runs on both sides.
        if word.bytes().filter(|&b| b == b'.').count() == 1 {
            if let Some((int_part, frac_part)) = word.split_once('.') {
                let all_digits =
                    |s: &str| !s.is_empty() && s.bytes().all(|b| b.is_ascii_digit());
                if all_digits(int_part) && all_digits(frac_part) {
                    return Some(NumberToken::Decimal {
                        integer: int_part.to_string(),
                        fractional: frac_part.to_string(),
                    });
                }
            }
        }
        // Split into the leading digit run and whatever follows it.
        let digit_end = word
            .bytes()
            .position(|b| !b.is_ascii_digit())
            .unwrap_or(word.len());
        if digit_end == 0 {
            // No leading digits at all: not a number.
            return None;
        }
        if digit_end == word.len() {
            // Entirely digits: a plain cardinal.
            return Some(NumberToken::Cardinal(word.to_string()));
        }
        let (digits, suffix) = word.split_at(digit_end);
        if suffix == "." && grammar.ordinals.dot_marks_ordinal {
            return Some(NumberToken::Ordinal(OrdinalNumber {
                digits: digits.to_string(),
                marker: OrdinalMarker::Dot,
            }));
        }
        Some(NumberToken::Ordinal(OrdinalNumber {
            digits: digits.to_string(),
            marker: OrdinalMarker::Suffix(suffix.to_lowercase()),
        }))
    }
    /// The written form this token was parsed from.
    fn surface(&self) -> String {
        match self {
            NumberToken::Cardinal(digits) => digits.clone(),
            NumberToken::Decimal { integer, fractional } => {
                format!("{integer}.{fractional}")
            }
            NumberToken::Ordinal(ordinal) => ordinal.surface(),
        }
    }
}
/// A number carrying an ordinal marker, e.g. "3rd" or the German-style "3.".
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OrdinalNumber {
    /// The digit string, without the marker.
    pub digits: String,
    /// How the ordinal was marked in the text.
    pub marker: OrdinalMarker,
}
impl OrdinalNumber {
    /// Reconstructs the written form ("3rd", "3.").
    fn surface(&self) -> String {
        let mut written = self.digits.clone();
        match &self.marker {
            OrdinalMarker::Suffix(suffix) => written.push_str(suffix),
            OrdinalMarker::Dot => written.push('.'),
        }
        written
    }
}
/// How an ordinal number was marked in the source text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum OrdinalMarker {
    /// An alphabetic suffix such as "rd" (stored lowercased).
    Suffix(String),
    /// A trailing dot, as in German "3.".
    Dot,
}
/// Tokenizes `text` using the default (language-neutral) number grammar.
/// See `tokenize_opts` for the grammar-sensitive behavior.
pub fn tokenize(text: &str) -> Vec<Token> {
    tokenize_opts(text, &NumberGrammar::default())
}
/// Tokenizes `text` into words, numbers, clause punctuation and whitespace.
///
/// `grammar` only affects number recognition: when
/// `grammar.ordinals.dot_marks_ordinal` is set, a digit run followed by a
/// dot that does not start another number ("3." but not "3.5") becomes an
/// ordinal token instead of a cardinal plus a clause boundary.
pub fn tokenize_opts(text: &str, grammar: &NumberGrammar) -> Vec<Token> {
    let mut tokens = Vec::new();
    let mut chars = text.chars().peekable();
    while let Some(c) = chars.next() {
        if c.is_whitespace() {
            // Collapse any run of whitespace into one Space token.
            while chars.peek().map(|c| c.is_whitespace()).unwrap_or(false) {
                chars.next();
            }
            tokens.push(Token::Space);
        } else if matches!(c, '.' | ',' | '!' | '?' | ';' | ':') {
            // Clause punctuation; the whitespace after it is swallowed, so no
            // separate Space token follows a boundary.
            while chars.peek().map(|ch| ch.is_whitespace()).unwrap_or(false) {
                chars.next();
            }
            tokens.push(Token::ClauseBoundary(c));
        } else if c.is_ascii_digit() {
            // Number: a leading digit run, optionally followed by a
            // fractional part or an ordinal marker.
            let mut digits = String::new();
            digits.push(c);
            let mut has_dot = false;
            let mut fractional = String::new();
            while let Some(&next) = chars.peek() {
                if next.is_ascii_digit() {
                    if has_dot {
                        fractional.push(next);
                    } else {
                        digits.push(next);
                    }
                    chars.next();
                } else if next == '.' && !has_dot {
                    // Consume the dot only when a digit follows, so "3.5" is
                    // a decimal but the dot of "3. " stays available as an
                    // ordinal marker or clause boundary.
                    let mut lookahead = chars.clone();
                    lookahead.next();
                    if lookahead.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
                        has_dot = true;
                        chars.next();
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }
            if has_dot {
                tokens.push(Token::Number(NumberToken::Decimal {
                    integer: digits,
                    fractional,
                }));
                continue;
            }
            // Alphabetic suffix directly after the digits => ordinal
            // ("3rd", and the Spanish masculine/feminine signs "3º"/"3ª").
            let mut suffix = String::new();
            while let Some(&next) = chars.peek() {
                if next.is_alphabetic() || next == 'º' || next == 'ª' {
                    suffix.push(next);
                    chars.next();
                } else {
                    break;
                }
            }
            if !suffix.is_empty() {
                tokens.push(Token::Number(NumberToken::Ordinal(OrdinalNumber {
                    digits,
                    marker: OrdinalMarker::Suffix(suffix.to_lowercase()),
                })));
                continue;
            }
            // Trailing-dot ordinal ("3.") in languages that use it, unless
            // the dot actually starts another number.
            if grammar.ordinals.dot_marks_ordinal && chars.peek() == Some(&'.') {
                let mut lookahead = chars.clone();
                lookahead.next();
                let after_dot = lookahead.peek().copied();
                if !after_dot.map_or(false, |c| c.is_ascii_digit()) {
                    chars.next();
                    tokens.push(Token::Number(NumberToken::Ordinal(OrdinalNumber {
                        digits,
                        marker: OrdinalMarker::Dot,
                    })));
                    continue;
                }
            }
            tokens.push(Token::Number(NumberToken::Cardinal(digits)));
        } else if is_cjk_ideograph(c) {
            // CJK ideographs: each ideograph becomes its own one-char word.
            tokens.push(Token::Word(c.to_string()));
            while let Some(&next) = chars.peek() {
                if is_cjk_ideograph(next) {
                    tokens.push(Token::Word(next.to_string()));
                    chars.next();
                } else {
                    break;
                }
            }
        } else if c.is_alphabetic() || c == '\'' {
            // Alphabetic word; apostrophes belong to the word, and an
            // internal hyphen is kept only when a letter follows it.
            let mut word = String::new();
            word.push(c);
            while let Some(&next) = chars.peek() {
                if is_cjk_ideograph(next) {
                    break;
                } else if next.is_alphabetic() || next == '\'' {
                    word.push(next);
                    chars.next();
                } else if next == '-' {
                    let mut lookahead = chars.clone();
                    lookahead.next();
                    if lookahead.peek().map(|c| c.is_alphabetic()).unwrap_or(false) {
                        word.push(next);
                        chars.next();
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }
            tokens.push(Token::Word(word));
        } else {
            // Anything else is reported as bare punctuation.
            tokens.push(Token::Punctuation(c));
        }
    }
    tokens
}
/// Builds the per-character letter-group bitmap used by the English
/// pronunciation rules: `bits[c]` has bit `g` set when ASCII character `c`
/// belongs to letter group `g`.  Each lowercase letter also sets the same
/// bits for its uppercase counterpart.
pub fn english_letter_bits() -> [u8; 256] {
    // Index = group number, value = the lowercase letters in that group.
    const GROUPS: [&[u8]; 8] = [
        b"aeiou",                // 0: vowels
        b"bcdfgjklmnpqstvxz",    // 1
        b"bcdfghjklmnpqrstvwxz", // 2
        b"hlmnr",                // 3
        b"cfhkpqstx",            // 4
        b"bdgjlmnrvwyz",         // 5
        b"eiy",                  // 6
        b"aeiouy",               // 7: vowels including y
    ];
    let mut bits = [0u8; 256];
    for (group, letters) in GROUPS.iter().enumerate() {
        let mask = 1u8 << group;
        for &c in letters.iter() {
            bits[c as usize] |= mask;
            // Mirror onto the uppercase form (ASCII: lowercase - 32).
            if c.is_ascii_lowercase() {
                bits[(c - 32) as usize] |= mask;
            }
        }
    }
    bits
}
/// IPA rendering with the default settings: English overrides enabled,
/// word-final liaison suppression disabled.  See `phonemes_to_ipa_full`.
pub fn phonemes_to_ipa(
    phoneme_bytes: &[u8],
    phdata: &PhonemeData,
    pending_stress_in: PendingStress,
    word_sep: bool,
) -> (String, PendingStress) {
    phonemes_to_ipa_lang(phoneme_bytes, phdata, pending_stress_in, word_sep, true)
}
/// IPA rendering with a choice of English overrides; word-final liaison
/// suppression disabled.  See `phonemes_to_ipa_full`.
pub fn phonemes_to_ipa_lang(
    phoneme_bytes: &[u8],
    phdata: &PhonemeData,
    pending_stress_in: PendingStress,
    word_sep: bool,
    use_en_overrides: bool,
) -> (String, PendingStress) {
    phonemes_to_ipa_full(phoneme_bytes, phdata, pending_stress_in, word_sep, use_en_overrides, false)
}
/// Renders a 0-terminated phoneme-code string as IPA text.
///
/// * `pending_stress_in` — stress emitted by a previous chunk that has not
///   yet attached to a vowel; the updated value is returned so callers can
///   process phonemes in chunks.
/// * `word_sep` — emit a space before this chunk's first phoneme.
/// * `use_en_overrides` — apply the English IPA override table and the
///   rhotic-liaison handling for vowels.
/// * `suppress_word_final_liaison` — drop word-final liaison consonants
///   (mnemonic second byte '2'/'3' with an empty third byte).
pub fn phonemes_to_ipa_full(
    phoneme_bytes: &[u8],
    phdata: &PhonemeData,
    pending_stress_in: PendingStress,
    word_sep: bool,
    use_en_overrides: bool,
    suppress_word_final_liaison: bool,
) -> (String, PendingStress) {
    let mut out = String::new();
    let mut stress = pending_stress_in;
    let mut need_space = word_sep;
    // Previous emitted phoneme code, used to voice '#'-escaped phonemes.
    let mut prev_phcode: u8 = 0;
    const PH_VOICED_FLAG: u32 = 1 << 4;
    for (idx, &code) in phoneme_bytes.iter().enumerate() {
        // 0 terminates the phoneme string.
        if code == 0 { break; }
        // Fixed stress-marker codes emit nothing themselves; they set the
        // level that attaches to the next vowel.
        match code {
            PHON_STRESS_P | PHON_STRESS_P2 | PHON_STRESS_TONIC => {
                stress = PendingStress::Primary;
                continue;
            }
            PHON_STRESS_2 | PHON_STRESS_3 => {
                stress = PendingStress::Secondary;
                continue;
            }
            PHON_STRESS_U | PHON_STRESS_D | PHON_STRESS_PREV => {
                stress = PendingStress::None;
                continue;
            }
            _ => {}
        }
        if is_pause_code(code) {
            // Code 15 is the word separator: next phoneme starts a new word
            // and any pending stress is dropped.
            if code == 15 {
                need_space = true;
                stress = PendingStress::None;
            }
            continue;
        }
        // Let the phoneme table substitute a stressed variant if one exists.
        let is_primary = stress == PendingStress::Primary;
        let resolved_code = phdata.resolve_stressed_phoneme(code, is_primary);
        let code = resolved_code;
        if let Some(ph) = phdata.get(code) {
            let is_vowel = ph.typ == 2;
            let is_stress_type = ph.typ == 1;
            if is_stress_type {
                // Table-defined stress phonemes: std_length encodes the level.
                if ph.std_length <= 4 && ph.program == 0 {
                    match ph.std_length {
                        4 => { stress = PendingStress::Primary; }
                        2 | 3 => { stress = PendingStress::Secondary; }
                        _ => {}
                    }
                }
                continue;
            }
            if suppress_word_final_liaison {
                // Liaison consonants are marked by '2'/'3' in the mnemonic's
                // second byte with an empty third byte; drop them when only
                // terminators/markers remain in the word.
                let mnemonic = ph.mnemonic;
                let b1 = ((mnemonic >> 8) & 0xff) as u8;
                let b2 = ((mnemonic >> 16) & 0xff) as u8;
                let is_liaison = (b1 == b'2' || b1 == b'3') && b2 == 0 && !is_vowel;
                if is_liaison {
                    let word_final = phoneme_bytes[idx+1..].iter()
                        .all(|&c| c == 0 || c <= 8 || c == 15);
                    if word_final {
                        continue;
                    }
                }
            }
            if need_space {
                out.push(' ');
                need_space = false;
            }
            // Word-final when only terminator/low-control/pause codes remain.
            let word_final = phoneme_bytes[idx+1..].iter()
                .all(|&c| c == 0 || c <= 8 || c == 15);
            if is_vowel {
                // Attach any pending stress mark immediately before the vowel.
                match stress {
                    PendingStress::Primary => { out.push_str(IPA_STRESS_PRIMARY); }
                    PendingStress::Secondary => { out.push_str(IPA_STRESS_SECONDARY); }
                    PendingStress::None => {}
                }
                stress = PendingStress::None;
            }
            // Mnemonics like "d#"/"z#" ('#' in the second byte) assimilate to
            // the previous phoneme's voicing: d/z after voiced, t/s otherwise.
            let b1 = ((ph.mnemonic >> 8) & 0xff) as u8;
            if b1 == b'#' {
                let b0 = (ph.mnemonic & 0xff) as u8;
                let prev_voiced = if let Some(prev_ph) = phdata.get(prev_phcode) {
                    prev_ph.typ == 2 ||
                    prev_ph.typ == 3 ||
                    (prev_ph.phflags & PH_VOICED_FLAG) != 0
                } else { false };
                let ipa_char = if b0 == b'd' {
                    if prev_voiced { "d" } else { "t" }
                } else if b0 == b'z' {
                    if prev_voiced { "z" } else { "s" }
                } else {
                    ""
                };
                if !ipa_char.is_empty() {
                    out.push_str(ipa_char);
                    prev_phcode = code;
                    continue;
                }
            }
            // English override table takes precedence over everything else.
            let override_ipa = if use_en_overrides {
                EN_IPA_OVERRIDES.iter()
                    .find_map(|&(override_code, ipa)| (override_code == code).then(|| ipa.to_string()))
            } else {
                None
            };
            let ipa = if let Some(override_ipa) = override_ipa {
                override_ipa
            } else if let Some(ipa_str) = phdata.phoneme_ipa_string(ph.program) {
                // IPA string compiled into the phoneme program, if present.
                ipa_str
            } else {
                // Otherwise derive IPA from the mnemonic.
                phoneme_ipa_lang(code, ph.mnemonic, is_vowel, false)
            };
            let mut ipa = ipa;
            if use_en_overrides && is_vowel {
                // A '3' in the mnemonic's third/fourth byte marks a vowel
                // with rhotic liaison: sound the /ɹ/ only when another
                // phoneme follows within the word.
                let b1 = ((ph.mnemonic >> 8) & 0xff) as u8;
                let b2 = ((ph.mnemonic >> 16) & 0xff) as u8;
                let b3 = ((ph.mnemonic >> 24) & 0xff) as u8;
                let has_rhotic_liaison = b1 != 0 && (b2 == b'3' || b3 == b'3');
                if has_rhotic_liaison && !word_final {
                    ipa.push('ɹ');
                }
            }
            out.push_str(&ipa);
            prev_phcode = code;
        }
    }
    (out, stress)
}
/// The phoneme translation of a single word.
pub struct WordResult {
    /// Phoneme codes (typically 0-terminated by the producing code path).
    pub phonemes: Vec<u8>,
    /// Dictionary flags of the entry that supplied the pronunciation,
    /// or 0 when no dictionary entry was involved.
    pub dict_flags: u32,
}
/// Appends bytes from `src` to `dst`, stopping at (and excluding) the first
/// 0 terminator.
fn append_raw_phonemes(dst: &mut Vec<u8>, src: &[u8]) {
    dst.extend(src.iter().take_while(|&&b| b != 0));
}
/// Concatenates the main and end phoneme strings of a rules result,
/// dropping each part's 0 terminator and anything after it.
fn combine_rules_result(result: &crate::dictionary::rules::RulesResult) -> Vec<u8> {
    let mut combined = Vec::new();
    for part in [&result.phonemes[..], &result.end_phonemes[..]] {
        append_raw_phonemes(&mut combined, part);
    }
    combined
}
/// English "-e" restoration heuristic used after suffix removal: decides
/// whether the stem should get a trailing 'e' back (e.g. "lov" -> "love").
/// True when the stem ends vowel + B-group letter (minus exceptions such as
/// "ion"), or when it ends with one of a fixed list of extra patterns.
fn english_suffix_needs_e(stem: &str, dict: &Dictionary) -> bool {
    const ADD_E_EXCEPTIONS: &[&str] = &["ion"];
    const ADD_E_ADDITIONS: &[&str] =
        &["c", "rs", "ir", "ur", "ath", "ns", "u", "spong", "rang", "larg"];
    // Need at least two characters to inspect the final letter pair.
    let mut tail = stem.chars().rev();
    let last = match tail.next() {
        Some(c) => c as u32,
        None => return false,
    };
    let penultimate = match tail.next() {
        Some(c) => c as u32,
        None => return false,
    };
    let vowel_then_b =
        is_letter_wc(&dict.letter_bits, penultimate, dict.letter_bits_offset, LETTERGP_VOWEL2)
            && is_letter_wc(&dict.letter_bits, last, dict.letter_bits_offset, LETTERGP_B);
    if vowel_then_b {
        !ADD_E_EXCEPTIONS.iter().any(|suffix| stem.ends_with(suffix))
    } else {
        ADD_E_ADDITIONS.iter().any(|suffix| stem.ends_with(suffix))
    }
}
/// Strips a standard suffix from `word` as signalled by the rules engine's
/// `end_type`, returning `(stem, end_flags, stem_word_flags)` or `None` when
/// the encoding describes no ending (or one longer than the word).
fn remove_standard_suffix(word: &str, end_type: u32, dict: &Dictionary) -> Option<(String, u32, u32)> {
    // Low 6 bits of end_type: number of characters in the removed ending.
    let suffix_len_chars = (end_type & 0x3f) as usize;
    if suffix_len_chars == 0 {
        return None;
    }
    let mut chars: Vec<char> = word.chars().collect();
    if suffix_len_chars > chars.len() {
        return None;
    }
    let ending: String = chars
        .split_off(chars.len() - suffix_len_chars)
        .into_iter()
        .collect();
    // SUFX_I: a stem-final 'i' was originally a 'y' ("flies" -> "fly").
    if (end_type & SUFX_I) != 0 {
        if let Some(last) = chars.last_mut() {
            if *last == 'i' {
                *last = 'y';
            }
        }
    }
    let mut stem: String = chars.into_iter().collect();
    let mut end_flags = (end_type & 0xfff0) | FLAG_SUFX;
    // SUFX_E: the suffix may have displaced a final 'e' ("loving" -> "love").
    if (end_type & SUFX_E) != 0 && dict.lang == "en" && english_suffix_needs_e(&stem, dict) {
        stem.push('e');
        end_flags |= FLAG_SUFX_E_ADDED;
    }
    if matches!(ending.as_str(), "s" | "es") {
        end_flags |= FLAG_SUFX_S;
    }
    // Apostrophe endings ('s) are not treated as real suffixes.
    if ending.starts_with('\'') {
        end_flags &= !FLAG_SUFX;
    }
    let mut stem_word_flags = 0;
    if (end_flags & FLAG_SUFX) != 0 {
        stem_word_flags |= FLAG_SUFFIX_REMOVED;
    }
    if (end_type & SUFX_A) != 0 {
        stem_word_flags |= FLAG_SUFFIX_VOWEL;
    }
    Some((stem, end_flags, stem_word_flags))
}
/// Looks up a special numeric dictionary key (e.g. "_5", "_0C") and returns
/// its phoneme string; missing or empty entries yield an empty vector.
fn lookup_num_phonemes(dict: &Dictionary, key: &str) -> Vec<u8> {
    let ctx = LookupCtx { lookup_symbol: true, ..Default::default() };
    lookup(dict, key, &ctx)
        .map(|r| r.phonemes)
        .filter(|ph| !ph.is_empty())
        .unwrap_or_default()
}
/// Phoneme code for the end-of-word marker.
const PHON_END_WORD: u8 = 15;
/// A growing phoneme string in which words are separated by `PHON_END_WORD`
/// markers; used to assemble spoken numbers word by word.
#[derive(Debug, Clone, Default)]
struct Pronunciation {
    // Raw phoneme codes; no trailing terminator until `finish`.
    bytes: Vec<u8>,
}
impl Pronunciation {
    /// Starts a new word and appends a lookup result (terminator stripped).
    fn push_lookup_word(&mut self, src: &[u8]) {
        self.start_word();
        self.bytes.extend_from_slice(trim_lookup(src));
    }
    /// Appends a lookup result to the current word (no word break).
    fn append_lookup_suffix(&mut self, src: &[u8]) {
        self.bytes.extend_from_slice(trim_lookup(src));
    }
    /// Appends another pronunciation as a new word, dropping its trailing
    /// word markers; an effectively empty pronunciation adds nothing.
    fn push_pronunciation(&mut self, other: &Pronunciation) {
        let len = other.trimmed_len();
        if len > 0 {
            self.start_word();
            self.bytes.extend_from_slice(&other.bytes[..len]);
        }
    }
    /// Finalizes the string: guarantees a closing word marker, then appends
    /// the 0 terminator, and yields the bytes.
    fn finish(mut self) -> Vec<u8> {
        match self.bytes.last() {
            Some(&PHON_END_WORD) => {}
            _ => self.bytes.push(PHON_END_WORD),
        }
        self.bytes.push(0);
        self.bytes
    }
    /// Length of the content once trailing word markers are ignored.
    fn trimmed_len(&self) -> usize {
        self.bytes
            .iter()
            .rposition(|&b| b != PHON_END_WORD)
            .map_or(0, |idx| idx + 1)
    }
    /// Inserts a word separator unless at the start or just after one.
    fn start_word(&mut self) {
        match self.bytes.last() {
            None | Some(&PHON_END_WORD) => {}
            Some(_) => self.bytes.push(PHON_END_WORD),
        }
    }
}
/// Slice of `src` up to (excluding) its first 0 terminator.
fn trim_lookup(src: &[u8]) -> &[u8] {
    match src.iter().position(|&b| b == 0) {
        Some(len) => &src[..len],
        None => src,
    }
}
/// Builds the dictionary key for a numeric lookup: 5 -> "_5", "dpt" -> "_dpt".
fn num_key(raw: impl std::fmt::Display) -> String {
    use std::fmt::Write as _;
    let mut key = String::with_capacity(8);
    key.push('_');
    let _ = write!(key, "{raw}");
    key
}
/// One power-of-1000 group of a number: `value` (0..=999 except possibly the
/// topmost group) plus the scale it sits at: `None` for the units group,
/// `Some(1)` thousands, `Some(2)` millions, `Some(3)` billions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ScaleGroup {
    value: u32,
    scale: Option<u8>,
}
/// Splits `value` into billions / millions / thousands / units groups,
/// most significant first.
fn split_scale_groups(value: u64) -> [ScaleGroup; 4] {
    let billions = (value / 1_000_000_000) as u32;
    let millions = ((value / 1_000_000) % 1_000) as u32;
    let thousands = ((value / 1_000) % 1_000) as u32;
    let units = (value % 1_000) as u32;
    [
        ScaleGroup { value: billions, scale: Some(3) },
        ScaleGroup { value: millions, scale: Some(2) },
        ScaleGroup { value: thousands, scale: Some(1) },
        ScaleGroup { value: units, scale: None },
    ]
}
/// Appends one scale group (`group_value` thousands/millions/...) followed
/// by its scale word.  Dictionary keys: `_0M{n}` is the plain scale word and
/// `_1M{n}` an irregular form for exactly one scale unit.
fn append_scale_word(
    dst: &mut Pronunciation,
    group_value: u32,
    scale: u8,
    dict: &Dictionary,
    grammar: &NumberGrammar,
) {
    let scale_word = lookup_num_phonemes(dict, &format!("_0M{scale}"));
    // Some languages drop the leading "one" before "thousand".
    if scale == 1 && group_value == 1 && grammar.thousands.omit_one_prefix {
        dst.push_lookup_word(&scale_word);
        return;
    }
    // Irregular "one <scale>" form, when the dictionary supplies one.
    if group_value == 1 {
        let singular = lookup_num_phonemes(dict, &format!("_1M{scale}"));
        if !singular.is_empty() {
            dst.push_lookup_word(&singular);
            return;
        }
    }
    // General case: the group's cardinal, then the scale word.
    dst.push_pronunciation(&num3_phonemes(dict, group_value, false, grammar));
    dst.push_lookup_word(&scale_word);
}
/// Appends one scale group of a cardinal number; zero groups are silent.
fn append_cardinal_group(
    dst: &mut Pronunciation,
    group: ScaleGroup,
    dict: &Dictionary,
    grammar: &NumberGrammar,
) {
    if group.value == 0 {
        return;
    }
    match group.scale {
        Some(scale) => append_scale_word(dst, group.value, scale, dict, grammar),
        None => dst.push_pronunciation(&num3_phonemes(dict, group.value, false, grammar)),
    }
}
/// Appends the ordinal form of a scale group ("millionth"-style).  Returns
/// `true` when a dedicated ordinal word (`_1M{n}o` / `_0M{n}o`) was found;
/// `false` means the plain cardinal scale word was used and the caller still
/// needs to add a generic ordinal marker.
fn append_ordinal_scale(
    dst: &mut Pronunciation,
    group_value: u32,
    scale: u8,
    dict: &Dictionary,
    grammar: &NumberGrammar,
) -> bool {
    // Irregular ordinal for exactly one scale unit.
    if group_value == 1 {
        let singular_ord = lookup_num_phonemes(dict, &format!("_1M{scale}o"));
        if !singular_ord.is_empty() {
            dst.push_lookup_word(&singular_ord);
            return true;
        }
    }
    // Regular ordinal scale word, preceded by the group's cardinal (unless
    // the grammar omits "one" before "thousand").
    let ord_scale = lookup_num_phonemes(dict, &format!("_0M{scale}o"));
    if !ord_scale.is_empty() {
        let skip_one = scale == 1 && group_value == 1 && grammar.thousands.omit_one_prefix;
        if !skip_one {
            dst.push_pronunciation(&num3_phonemes(dict, group_value, false, grammar));
        }
        dst.push_lookup_word(&ord_scale);
        return true;
    }
    // No ordinal form available: fall back to the cardinal scale word.
    append_scale_word(dst, group_value, scale, dict, grammar);
    false
}
/// Pronounces a value below 1000 (hundreds plus tens/units).
///
/// Dictionary keys consulted: `_{n}` plain numbers, `_{h}C` irregular
/// hundred compounds, `_{h}C0` exact round-hundred forms, `_0C` the word
/// for "hundred", `_{t}X` tens words, `_0and` the conjunction.
///
/// `suppress_null`: when true, a zero tens/units part produces nothing;
/// it is also forced on once a hundreds part has been spoken, so "300"
/// does not end with a spoken zero.
fn num3_phonemes(
    dict: &Dictionary,
    value: u32,
    suppress_null: bool,
    grammar: &NumberGrammar,
) -> Pronunciation {
    let hundreds = value / 100;
    let tensunits = value % 100;
    let mut hundreds_part = Pronunciation::default();
    let mut tens_part = Pronunciation::default();
    let mut suppress_null = suppress_null;
    if hundreds > 0 {
        // Irregular compound for the whole "n hundred" (key "_{n}C").
        let compound = lookup_num_phonemes(dict, &format!("_{}C", hundreds));
        if !compound.is_empty() {
            hundreds_part.push_lookup_word(&compound);
        } else if tensunits == 0 {
            // Exact round-hundred form, used only when nothing follows.
            let exact = lookup_num_phonemes(dict, &format!("_{}C0", hundreds));
            if !exact.is_empty() {
                hundreds_part.push_lookup_word(&exact);
            } else {
                // Some languages omit "one" before "hundred".
                if !(hundreds == 1 && grammar.hundreds.omit_one_prefix) {
                    hundreds_part.push_lookup_word(&lookup_num_phonemes(dict, &num_key(hundreds)));
                }
                hundreds_part.append_lookup_suffix(&lookup_num_phonemes(dict, "_0C"));
            }
        } else {
            if !(hundreds == 1 && grammar.hundreds.omit_one_prefix) {
                hundreds_part.push_lookup_word(&lookup_num_phonemes(dict, &num_key(hundreds)));
            }
            hundreds_part.append_lookup_suffix(&lookup_num_phonemes(dict, "_0C"));
        }
        // A spoken hundreds part silences a zero remainder.
        suppress_null = true;
    }
    if tensunits != 0 || !suppress_null {
        if tensunits < 20 {
            // 0-19 are single dictionary words.
            tens_part.push_lookup_word(&lookup_num_phonemes(dict, &num_key(tensunits)));
        } else {
            // Prefer a whole two-digit entry ("_42") when the language has one.
            let ph_full = lookup_num_phonemes(dict, &num_key(tensunits));
            if !ph_full.is_empty() {
                tens_part.push_lookup_word(&ph_full);
            } else {
                let tens = tensunits / 10;
                let units = tensunits % 10;
                match grammar.tens {
                    // German-style: units, conjunction, then the tens word.
                    TensGrammar::UnitsThenConjunction if units != 0 => {
                        tens_part.push_lookup_word(&lookup_num_phonemes(dict, &num_key(units)));
                        tens_part.append_lookup_suffix(&lookup_num_phonemes(dict, "_0and"));
                        tens_part.append_lookup_suffix(&lookup_num_phonemes(dict, &format!("_{tens}X")));
                    }
                    TensGrammar::UnitsThenConjunction => {
                        tens_part.push_lookup_word(&lookup_num_phonemes(dict, &format!("_{tens}X")));
                    }
                    // Spanish-style: tens word, conjunction, then units.
                    TensGrammar::WithConjunction => {
                        tens_part.push_lookup_word(&lookup_num_phonemes(dict, &format!("_{tens}X")));
                        if units != 0 {
                            tens_part.append_lookup_suffix(&lookup_num_phonemes(dict, "_0and"));
                            tens_part.append_lookup_suffix(&lookup_num_phonemes(dict, &num_key(units)));
                        }
                    }
                    // Tens word followed directly by the units word.
                    TensGrammar::Standard => {
                        tens_part.push_lookup_word(&lookup_num_phonemes(dict, &format!("_{tens}X")));
                        if units != 0 {
                            tens_part.append_lookup_suffix(&lookup_num_phonemes(dict, &num_key(units)));
                        }
                    }
                }
            }
        }
    }
    // English-style "three hundred AND twelve".
    if hundreds > 0 && tensunits > 0 && grammar.hundreds.use_conjunction_with_remainder {
        hundreds_part.append_lookup_suffix(&lookup_num_phonemes(dict, "_0and"));
    }
    let mut result = Pronunciation::default();
    result.push_pronunciation(&hundreds_part);
    result.push_pronunciation(&tens_part);
    result
}
/// Converts a cardinal or decimal number token into a finished phoneme
/// string.  Ordinals are handled by `try_ordinal_number` and yield `None`
/// here, as do unparseable digit strings.
fn number_token_to_phonemes(
    token: &NumberToken,
    dict: &Dictionary,
    grammar: &NumberGrammar,
) -> Option<Vec<u8>> {
    let pronunciation = match token {
        NumberToken::Cardinal(digits) => cardinal_pronunciation(digits, dict, grammar)?,
        NumberToken::Decimal { integer, fractional } => {
            let mut p = cardinal_pronunciation(integer, dict, grammar)?;
            // Decimal point word ("_dpt"), then each fractional digit spoken
            // individually.
            let decimal_point = lookup_num_phonemes(dict, "_dpt");
            if !decimal_point.is_empty() {
                p.push_lookup_word(&decimal_point);
            }
            for digit in fractional.bytes() {
                p.push_lookup_word(&lookup_num_phonemes(dict, &num_key(digit - b'0')));
            }
            p
        }
        NumberToken::Ordinal(_) => return None,
    };
    Some(pronunciation.finish())
}
/// Builds the pronunciation of a cardinal number from its digit string.
/// Returns `None` when `digits` is empty, non-numeric, or out of u64 range.
fn cardinal_pronunciation(
    digits: &str,
    dict: &Dictionary,
    grammar: &NumberGrammar,
) -> Option<Pronunciation> {
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }
    let value: u64 = digits.parse().ok()?;
    if value == 0 {
        let mut zero = Pronunciation::default();
        zero.push_lookup_word(&lookup_num_phonemes(dict, "_0"));
        return Some(zero);
    }
    // Round hundreds in 1100..=9900 read as "<n> hundred" (year style,
    // e.g. "nineteen hundred").
    let is_year_form =
        (1100..=9999).contains(&value) && value % 100 == 0 && value / 100 >= 11;
    if is_year_form {
        let mut year = num3_phonemes(dict, (value / 100) as u32, false, grammar);
        year.append_lookup_suffix(&lookup_num_phonemes(dict, "_0C"));
        return Some(year);
    }
    // General case: speak each power-of-1000 group, most significant first.
    let mut result = Pronunciation::default();
    for group in split_scale_groups(value) {
        append_cardinal_group(&mut result, group, dict, grammar);
    }
    Some(result)
}
/// Pronounces the final (sub-thousand) group of an ordinal number.
///
/// Returns the pronunciation plus whether a dedicated ordinal word was
/// found; when `false` the caller still needs to append a generic ordinal
/// marker.  Ordinal dictionary keys use an "o" suffix: `_0Co` (ordinal
/// "hundred"), `_{n}o` (whole-number ordinals), `_{t}Xo` (ordinal tens).
fn ordinal_sub_thousand_pronunciation(
    value: u32,
    dict: &Dictionary,
    grammar: &NumberGrammar,
    suffix_ph: &[u8],
) -> (Pronunciation, bool) {
    let hundreds = value / 100;
    let tensunits = value % 100;
    let units = value % 10;
    let tens = tensunits / 10;
    let mut pronunciation = Pronunciation::default();
    let mut found_ordinal = false;
    if hundreds > 0 {
        if tensunits == 0 {
            // Round hundreds: use the ordinal "hundred" word if available.
            let ord_hundreds = lookup_num_phonemes(dict, "_0Co");
            if !ord_hundreds.is_empty() {
                if hundreds > 1 {
                    pronunciation.push_lookup_word(&lookup_num_phonemes(dict, &num_key(hundreds)));
                }
                pronunciation.push_lookup_word(&ord_hundreds);
                found_ordinal = true;
            } else {
                pronunciation.push_pronunciation(&num3_phonemes(dict, hundreds * 100, false, grammar));
            }
        } else {
            // Hundreds spoken as a cardinal; the ordinal part comes later.
            pronunciation.push_pronunciation(&num3_phonemes(dict, hundreds * 100, false, grammar));
        }
    }
    // Whole two-digit ordinal word ("_12o") when the language defines it.
    let full_ord = lookup_num_phonemes(dict, &format!("_{tensunits}o"));
    if !full_ord.is_empty() {
        pronunciation.push_lookup_word(&full_ord);
        found_ordinal = true;
    } else if tens >= 2 && units > 0 {
        // Tens word (ordinal tens form plus the suffix phonemes when
        // present), then the units ordinal.
        let tens_ord = lookup_num_phonemes(dict, &format!("_{tens}Xo"));
        if !tens_ord.is_empty() {
            pronunciation.push_lookup_word(&tens_ord);
            pronunciation.append_lookup_suffix(suffix_ph);
        } else {
            pronunciation.push_lookup_word(&lookup_num_phonemes(dict, &format!("_{tens}X")));
        }
        let units_ord = lookup_num_phonemes(dict, &format!("_{units}o"));
        if !units_ord.is_empty() {
            pronunciation.push_lookup_word(&units_ord);
            found_ordinal = true;
        } else {
            pronunciation.push_lookup_word(&lookup_num_phonemes(dict, &num_key(units)));
        }
    } else if tens >= 2 {
        // Round tens with no units: plain tens word, no ordinal form found.
        pronunciation.push_lookup_word(&lookup_num_phonemes(dict, &format!("_{tens}X")));
    } else if tensunits > 0 {
        pronunciation.push_pronunciation(&num3_phonemes(dict, tensunits, false, grammar));
    }
    (pronunciation, found_ordinal)
}
/// Translates an ordinal number ("3rd", "3.") into phonemes.
///
/// The marker must be recognized: the dictionary defines phonemes for the
/// suffix (`_#rd`), or the grammar declares the suffix as its ordinal
/// indicator, or a dot marker is used in a dot-marks-ordinal language.
/// Returns `None` otherwise, for unparseable digits, and for zero (which
/// has no non-zero scale group).
fn try_ordinal_number(
    ordinal: &OrdinalNumber,
    dict: &Dictionary,
    phdata: &PhonemeData,
    stress_opts: &StressOpts,
    grammar: &NumberGrammar,
) -> Option<WordResult> {
    let suffix = match &ordinal.marker {
        OrdinalMarker::Suffix(suffix) => suffix.as_str(),
        OrdinalMarker::Dot => ".",
    };
    // Phonemes for the marker itself (key "_#<suffix>"); may be empty.
    let suffix_ph = lookup_num_phonemes(dict, &format!("_#{suffix}"));
    let is_ordinal = !suffix_ph.is_empty()
        || grammar.ordinals.indicator.as_deref() == Some(suffix)
        || matches!(ordinal.marker, OrdinalMarker::Dot) && grammar.ordinals.dot_marks_ordinal;
    if !is_ordinal {
        return None;
    }
    let value: u64 = ordinal.digits.parse().ok()?;
    let mut pronunciation = Pronunciation::default();
    let groups = split_scale_groups(value);
    // Only the last non-zero group is spoken as an ordinal; the groups
    // before it are plain cardinals.
    let last_nonzero = groups.iter().rposition(|group| group.value != 0)?;
    for &group in &groups[..last_nonzero] {
        append_cardinal_group(&mut pronunciation, group, dict, grammar);
    }
    let final_group = groups[last_nonzero];
    let found_ordinal = if let Some(scale) = final_group.scale {
        append_ordinal_scale(
            &mut pronunciation,
            final_group.value,
            scale,
            dict,
            grammar,
        )
    } else {
        let (remainder_ordinal, found) =
            ordinal_sub_thousand_pronunciation(final_group.value, dict, grammar, &suffix_ph);
        pronunciation.push_pronunciation(&remainder_ordinal);
        found
    };
    if found_ordinal {
        // A dedicated ordinal word was used; the suffix phonemes (possibly
        // empty) are still appended after it.
        pronunciation.append_lookup_suffix(&suffix_ph);
    } else {
        // No dedicated ordinal word: append the generic marker ("_ord"),
        // falling back to the raw suffix phonemes.
        let ord_ph = lookup_num_phonemes(dict, "_ord");
        if !ord_ph.is_empty() {
            pronunciation.append_lookup_suffix(&ord_ph);
        } else {
            pronunciation.append_lookup_suffix(&suffix_ph);
        }
    }
    let mut phonemes = pronunciation.finish();
    set_word_stress(&mut phonemes, phdata, stress_opts, Some(0), -1, 0);
    Some(WordResult { phonemes, dict_flags: 0 })
}
/// Dispatches a number token to the ordinal or cardinal/decimal pipeline
/// and applies word stress to the resulting phonemes.
fn translate_number_token(
    token: &NumberToken,
    dict: &Dictionary,
    phdata: &PhonemeData,
    stress_opts: &StressOpts,
    grammar: &NumberGrammar,
) -> Option<WordResult> {
    if let NumberToken::Ordinal(ordinal) = token {
        return try_ordinal_number(ordinal, dict, phdata, stress_opts, grammar);
    }
    let mut phonemes = number_token_to_phonemes(token, dict, grammar)?;
    set_word_stress(&mut phonemes, phdata, stress_opts, Some(0), -1, 0);
    Some(WordResult { phonemes, dict_flags: 0 })
}
/// Translates a single word (or numeric token) to phonemes.
///
/// Attempts, in order:
/// 1. direct dictionary lookup;
/// 2. numeric token translation (cardinals, decimals, ordinals);
/// 3. letter-to-phoneme rules, with standard-suffix stripping and a second
///    lookup/rules pass on the bare stem when the rules signal a suffix.
///
/// `dict_flags` in the result carries the flags of whichever dictionary
/// entry supplied the pronunciation (or the attribute flags of a
/// phoneme-less entry; 0 when none was involved).
pub fn word_to_phonemes(
    word: &str,
    dict: &Dictionary,
    phdata: &PhonemeData,
    stress_opts: &StressOpts,
    lang_opts: &LangOptions,
) -> WordResult {
    let ctx = LookupCtx {
        lookup_symbol: true,
        ..Default::default()
    };
    let dict_result = lookup(dict, word, &ctx);
    const FLAG_FOUND_ATTRIBUTES: u32 = 0x4000_0000;
    // Keep the flags of a matched entry even when it carries no phonemes,
    // but only when the found/attribute bits confirm a real match.
    let dict_flags_from_lookup = dict_result.as_ref()
        .filter(|r| r.flags1.0 & (FLAG_FOUND_ATTRIBUTES | 0x8000_0000) != 0)
        .map(|r| r.flags1.0)
        .unwrap_or(0);
    // 1. Exact dictionary pronunciation.
    if let Some(ref result) = dict_result {
        if result.flags1.found() && !result.phonemes.is_empty() {
            let dict_flags = result.flags1.0;
            let mut phonemes = result.phonemes.clone();
            set_word_stress(&mut phonemes, phdata, stress_opts, Some(dict_flags as u32), -1, 0);
            if stress_opts.alt_stress_upgrade {
                apply_alt_stress_upgrade(&mut phonemes, phdata);
            }
            if stress_opts.word_final_devoicing {
                apply_word_final_devoicing(&mut phonemes, phdata);
            }
            return WordResult { phonemes, dict_flags };
        }
    }
    // 2. Numbers.
    if let Some(token) = NumberToken::parse(word, &lang_opts.number_grammar) {
        if let Some(result) =
            translate_number_token(&token, dict, phdata, stress_opts, &lang_opts.number_grammar)
        {
            return result;
        }
    }
    // 3. Letter-to-phoneme rules.  The word is wrapped in spaces and
    // 0-terminated, as the rules engine expects.
    let letter_bits = &*dict.letter_bits;
    let mut vowel_count = 0i32;
    let mut stressed_count = 0i32;
    // NOTE(review): capacity is one byte short of the pushes below
    // (space + word + space + 0); Vec grows, so this is only cosmetic.
    let mut word_buf = Vec::with_capacity(word.len() + 2);
    word_buf.push(b' ');
    word_buf.extend_from_slice(word.as_bytes());
    word_buf.push(b' ');
    word_buf.push(0);
    let result = translate_rules_phdata(
        dict,
        &word_buf,
        1, 0, 0, &letter_bits,
        0, &mut vowel_count,
        &mut stressed_count,
        Some(phdata),
    );
    if !result.phonemes.is_empty() {
        let mut stress_dict_flags = dict_flags_from_lookup;
        // When the rules matched a removable suffix, retry the bare stem so
        // a dictionary entry (or a second rules pass) can pronounce it.
        let mut phonemes = if result.end_type != 0 && result.suffix_start > 1 {
            if let Some((stem, end_flags, stem_word_flags)) =
                remove_standard_suffix(word, result.end_type, dict)
            {
                let stem_lookup = lookup(
                    dict,
                    &stem,
                    &LookupCtx {
                        lookup_symbol: true,
                        end_flags,
                        ..Default::default()
                    },
                );
                let mut combined = Vec::new();
                let mut used_stem = false;
                if let Some(stem_lookup) = stem_lookup {
                    if !stem_lookup.phonemes.is_empty() {
                        combined.extend_from_slice(&stem_lookup.phonemes);
                        stress_dict_flags = stem_lookup.flags1.0;
                        used_stem = true;
                    }
                }
                if !used_stem {
                    // Stem not in the dictionary: run the rules on the stem.
                    let mut stem_buf = Vec::with_capacity(stem.len() + 3);
                    stem_buf.push(b' ');
                    stem_buf.extend_from_slice(stem.as_bytes());
                    stem_buf.push(b' ');
                    stem_buf.push(0);
                    let mut stem_vc = 0i32;
                    let mut stem_sc = 0i32;
                    let stem_rules = translate_rules_phdata(
                        dict,
                        &stem_buf,
                        1,
                        stem_word_flags,
                        0,
                        &letter_bits,
                        0,
                        &mut stem_vc,
                        &mut stem_sc,
                        Some(phdata),
                    );
                    let stem_phonemes = combine_rules_result(&stem_rules);
                    if !stem_phonemes.is_empty() {
                        combined.extend_from_slice(&stem_phonemes);
                        used_stem = true;
                    }
                }
                if used_stem {
                    // Stem pronunciation plus the suffix phonemes from the
                    // first rules pass.
                    append_raw_phonemes(&mut combined, &result.end_phonemes);
                    combined.push(0);
                    combined
                } else {
                    let mut fallback = combine_rules_result(&result);
                    fallback.push(0);
                    fallback
                }
            } else {
                let mut fallback = combine_rules_result(&result);
                fallback.push(0);
                fallback
            }
        } else {
            let mut combined = combine_rules_result(&result);
            combined.push(0);
            combined
        };
        // NOTE(review): both branches produce Some(..); this collapses to
        // Some(stress_dict_flags).
        let flags_for_stress = if stress_dict_flags != 0 {
            Some(stress_dict_flags as u32)
        } else {
            Some(0)
        };
        set_word_stress(&mut phonemes, phdata, stress_opts, flags_for_stress, -1, 0);
        if stress_opts.alt_stress_upgrade {
            apply_alt_stress_upgrade(&mut phonemes, phdata);
        }
        if stress_opts.word_final_devoicing {
            apply_word_final_devoicing(&mut phonemes, phdata);
        }
        return WordResult { phonemes, dict_flags: stress_dict_flags };
    }
    // No pronunciation found; still report any attribute flags from lookup.
    WordResult { phonemes: Vec::new(), dict_flags: dict_flags_from_lookup }
}
/// Top-level text-to-phoneme translator for one language.
pub struct Translator {
    /// Language and synthesis options (language code, number grammar, ...).
    pub options: LangOptions,
    // Root directory of the espeak-ng data files (dictionaries, phontab).
    data_dir: PathBuf,
}
impl Translator {
/// Creates a translator for `lang`, using `data_dir` when given or the
/// `default_data_dir` search otherwise.
pub fn new(lang: &str, data_dir: Option<&Path>) -> Result<Self> {
    let dir = match data_dir {
        Some(path) => path.to_path_buf(),
        None => PathBuf::from(default_data_dir()),
    };
    Ok(Translator {
        options: LangOptions::for_lang(lang),
        data_dir: dir,
    })
}
/// Creates a translator for `lang`, locating the data directory via
/// `default_data_dir`.
pub fn new_default(lang: &str) -> Result<Self> {
    Self::new(lang, None)
}
/// Splits `text` into clauses at sentence-final punctuation ('.', '!', '?').
/// Commas, semicolons and colons stay inside the clause text and do not
/// split.  Sentences get a 400 ms pause; trailing text (or the whole input
/// when no sentence punctuation exists) becomes an `Eof` clause with no
/// pause.  At least one clause is always returned.
pub fn read_clauses(&self, text: &str) -> Result<Vec<Clause>> {
    let mut clauses = Vec::new();
    let mut current = String::new();
    for c in text.chars() {
        current.push(c);
        if matches!(c, '.' | '!' | '?') {
            let intonation = match c {
                '!' => Intonation::Exclamation,
                '?' => Intonation::Question,
                _ => Intonation::FullStop,
            };
            let trimmed = current.trim();
            if !trimmed.is_empty() {
                clauses.push(Clause {
                    text: trimmed.to_string(),
                    intonation,
                    clause_type: ClauseType::Sentence,
                    pause_ms: 400,
                });
            }
            current.clear();
        }
    }
    // Whatever remains after the last sentence terminator.
    let trimmed = current.trim();
    if !trimmed.is_empty() {
        clauses.push(Clause {
            text: trimmed.to_string(),
            intonation: Intonation::None,
            clause_type: ClauseType::Eof,
            pause_ms: 0,
        });
    }
    // Guarantee at least one clause, even for empty/whitespace-only input.
    if clauses.is_empty() {
        clauses.push(Clause {
            text: text.trim().to_string(),
            intonation: Intonation::None,
            clause_type: ClauseType::Eof,
            pause_ms: 0,
        });
    }
    Ok(clauses)
}
pub fn text_to_ipa(&self, text: &str) -> Result<String> {
let lang = &self.options.lang;
let dict_path = self.data_dir.join(format!("{}_dict", lang));
let phontab_path = self.data_dir.join("phontab");
if !dict_path.exists() {
return Err(Error::NotImplemented("text_to_ipa: dict not found"));
}
let dict_bytes = std::fs::read(&dict_path)
.map_err(Error::Io)?;
let dict = Dictionary::from_bytes(lang, dict_bytes)?;
if !phontab_path.exists() {
return Err(Error::NotImplemented("text_to_ipa: phontab not found"));
}
let mut phdata = PhonemeData::load(&self.data_dir)?;
phdata.select_table_by_name(lang)?;
let stress_opts = StressOpts::for_lang(lang);
let tokens = tokenize_opts(text, &self.options.number_grammar);
#[derive(Clone, PartialEq)]
enum EntryKind {
Word,
ClauseBoundary,
Other,
}
struct EntryFull {
phonemes: Vec<u8>,
dict_flags: u32,
kind: EntryKind,
}
let mut entries: Vec<EntryFull> = Vec::new();
for token in &tokens {
match token {
Token::Word(word) => {
let lower = word.to_lowercase();
let wr = word_to_phonemes(&lower, &dict, &phdata, &stress_opts, &self.options);
entries.push(EntryFull {
phonemes: wr.phonemes,
dict_flags: wr.dict_flags,
kind: EntryKind::Word,
});
}
Token::Number(token) => {
let wr = translate_number_token(
token,
&dict,
&phdata,
&stress_opts,
&self.options.number_grammar,
)
.unwrap_or_else(|| {
let surface = token.surface();
word_to_phonemes(&surface, &dict, &phdata, &stress_opts, &self.options)
});
entries.push(EntryFull {
phonemes: wr.phonemes,
dict_flags: wr.dict_flags,
kind: EntryKind::Word,
});
}
Token::ClauseBoundary(_) => {
entries.push(EntryFull {
phonemes: Vec::new(),
dict_flags: 0,
kind: EntryKind::ClauseBoundary,
});
}
_ => {
entries.push(EntryFull {
phonemes: Vec::new(),
dict_flags: 0,
kind: EntryKind::Other,
});
}
}
}
const FLAG_STREND: u32 = 1 << 9; const FLAG_STREND2: u32 = 1 << 10; const PHON_STRESS_P_CODE: u8 = 6;
const PHON_STRESS_P2_CODE: u8 = 7;
fn promote_clause(entries: &mut [EntryFull], phdata: &PhonemeData) {
let n = entries.len();
for i in 0..n {
if entries[i].kind != EntryKind::Word { continue; }
let dict_flags = entries[i].dict_flags;
if dict_flags & (FLAG_STREND | FLAG_STREND2) == 0 { continue; }
let is_last_word = entries[i+1..].iter().all(|e| e.kind != EntryKind::Word);
let following_all_unstressed = entries[i+1..].iter()
.filter(|e| e.kind == EntryKind::Word)
.all(|e| !e.phonemes.iter().any(|&c| c == PHON_STRESS_P_CODE || c == PHON_STRESS_P2_CODE));
promote_strend_stress(
&mut entries[i].phonemes,
phdata,
dict_flags,
is_last_word,
following_all_unstressed,
);
}
let has_primary = entries.iter()
.filter(|e| e.kind == EntryKind::Word)
.any(|e| e.phonemes.iter().any(|&c| c == PHON_STRESS_P_CODE || c == PHON_STRESS_P2_CODE));
if !has_primary {
let last_secondary = entries.iter().enumerate()
.rev()
.find(|(_, e)| e.kind == EntryKind::Word && !e.phonemes.is_empty()
&& e.phonemes.iter().any(|&c| c == 4 || c == 5))
.map(|(i, _)| i);
if let Some(idx) = last_secondary {
change_word_stress(&mut entries[idx].phonemes, phdata, 4);
} else {
let last_word = entries.iter().enumerate()
.rev()
.find(|(_, e)| e.kind == EntryKind::Word && !e.phonemes.is_empty())
.map(|(i, _)| i);
if let Some(idx) = last_word {
change_word_stress(&mut entries[idx].phonemes, phdata, 4);
}
}
}
}
let clause_boundaries: Vec<usize> = {
let mut tmp = Vec::new();
for i in 0..entries.len() {
if entries[i].kind == EntryKind::ClauseBoundary {
tmp.push(i);
}
}
tmp
};
if clause_boundaries.is_empty() {
promote_clause(&mut entries, &phdata);
} else {
let mut prev_end = 0usize;
let mut boundaries_with_end: Vec<usize> = clause_boundaries.clone();
boundaries_with_end.push(entries.len()); for &bound in &boundaries_with_end {
let slice_end = if bound < entries.len() { bound } else { entries.len() };
if slice_end > prev_end {
promote_clause(&mut entries[prev_end..slice_end], &phdata);
}
prev_end = if bound < entries.len() { bound + 1 } else { entries.len() };
}
}
let mut ipa_out = String::new();
let mut first_word = true;
let mut clause_has_output = false;
let mut stress = PendingStress::None;
for (ei, entry) in entries.iter().enumerate() {
match entry.kind {
EntryKind::Word => {
let phonemes = &entry.phonemes;
if phonemes.is_empty() { continue; }
let use_en_overrides = lang == "en";
let next_starts_vowel = entries[ei+1..].iter()
.find(|e| e.kind == EntryKind::Word && !e.phonemes.is_empty())
.map(|e| {
e.phonemes.iter()
.find(|&&c| c > 8 && c != 15)
.and_then(|&c| phdata.get(c))
.map(|ph| ph.typ == 2) .unwrap_or(false)
})
.unwrap_or(false);
let suppress_liaison = !next_starts_vowel;
let (word_ipa, new_stress) = phonemes_to_ipa_full(
phonemes,
&phdata,
stress,
!first_word,
use_en_overrides,
suppress_liaison,
);
stress = new_stress;
if !word_ipa.is_empty() {
ipa_out.push_str(&word_ipa);
first_word = false;
clause_has_output = true;
}
}
EntryKind::ClauseBoundary => {
if clause_has_output {
ipa_out.push('\n');
clause_has_output = false;
stress = PendingStress::None;
}
first_word = true;
}
EntryKind::Other => {}
}
}
let mut ipa_out = ipa_out.trim_end_matches('\n').to_string();
if lang == "fr" {
ipa_out = ipa_out.replace('r', "ʁ");
}
Ok(ipa_out)
}
pub fn translate_to_codes(&self, text: &str) -> Result<Vec<PhonemeCode>> {
let lang = &self.options.lang;
let dict_path = self.data_dir.join(format!("{}_dict", lang));
let phontab_path = self.data_dir.join("phontab");
if !dict_path.exists() {
return Err(Error::NotImplemented("translate_to_codes: dict not found"));
}
let dict_bytes = std::fs::read(&dict_path).map_err(Error::Io)?;
let dict = Dictionary::from_bytes(lang, dict_bytes)?;
if !phontab_path.exists() {
return Err(Error::NotImplemented("translate_to_codes: phontab not found"));
}
let mut phdata = PhonemeData::load(&self.data_dir)?;
phdata.select_table_by_name(lang)?;
let stress_opts = StressOpts::for_lang(lang);
let tokens = tokenize_opts(text, &self.options.number_grammar);
let mut codes: Vec<PhonemeCode> = Vec::new();
for token in &tokens {
match token {
Token::Word(word) => {
let lower = word.to_lowercase();
let wr = word_to_phonemes(&lower, &dict, &phdata, &stress_opts, &self.options);
for &b in &wr.phonemes {
codes.push(PhonemeCode { code: b, is_boundary: false });
}
}
Token::Number(token) => {
let wr = translate_number_token(
token,
&dict,
&phdata,
&stress_opts,
&self.options.number_grammar,
)
.unwrap_or_else(|| {
let surface = token.surface();
word_to_phonemes(&surface, &dict, &phdata, &stress_opts, &self.options)
});
for &b in &wr.phonemes {
codes.push(PhonemeCode { code: b, is_boundary: false });
}
}
Token::Space => {
codes.push(PhonemeCode { code: 15, is_boundary: true }); }
Token::ClauseBoundary(_) => {
codes.push(PhonemeCode { code: 0, is_boundary: true }); }
_ => {}
}
}
Ok(codes)
}
}
/// One element of the output of [`Translator::translate_to_codes`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PhonemeCode {
    /// Raw phoneme code; `translate_to_codes` emits 15 for a space token
    /// and 0 for a clause boundary, both with `is_boundary` set.
    pub code: u8,
    /// True for boundary markers rather than translated word phonemes.
    pub is_boundary: bool,
}
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): data-dependent tests below guard on either
    // /usr/share/espeak-ng-data or a relative espeak-ng-data directory and
    // silently return when the files are absent; the two guard styles are
    // inconsistent with each other — verify they match the translator's
    // actual data-dir resolution.

    /// True when every byte of `needle` occurs in `haystack` in order
    /// (gaps allowed) — a subsequence test, not a substring test.
    fn contains_subsequence(haystack: &[u8], needle: &[u8]) -> bool {
        if needle.is_empty() {
            return true;
        }
        let mut needle_ix = 0;
        for &byte in haystack {
            if byte == needle[needle_ix] {
                needle_ix += 1;
                if needle_ix == needle.len() {
                    return true;
                }
            }
        }
        false
    }

    #[test]
    fn translator_new_default_succeeds() {
        let t = Translator::new_default("en").unwrap();
        assert_eq!(t.options.lang, "en");
        assert_eq!(t.options.rate, 175);
    }

    #[test]
    fn tokenize_hello_world() {
        let tokens = tokenize("hello world");
        assert_eq!(tokens, vec![
            Token::Word("hello".to_string()),
            Token::Space,
            Token::Word("world".to_string()),
        ]);
    }

    #[test]
    fn tokenize_with_punctuation() {
        let tokens = tokenize("hello, world!");
        assert!(tokens.iter().any(|t| t == &Token::Word("hello".to_string())));
        assert!(tokens.iter().any(|t| t == &Token::Word("world".to_string())));
        assert!(tokens.iter().any(|t| t == &Token::ClauseBoundary(',')));
        assert!(tokens.iter().any(|t| t == &Token::ClauseBoundary('!')));
    }

    #[test]
    fn tokenize_empty() {
        assert!(tokenize("").is_empty());
    }

    #[test]
    fn tokenize_apostrophe() {
        // Apostrophes stay inside a single word token.
        let tokens = tokenize("it's");
        assert_eq!(tokens, vec![Token::Word("it's".to_string())]);
    }

    #[test]
    fn clause_flags_fields_do_not_overlap() {
        assert!(
            (ClauseFlags::PAUSE_MASK & ClauseFlags::INTONATION_MASK).is_empty()
        );
        assert!(
            (ClauseFlags::INTONATION_MASK & ClauseFlags::TYPE_MASK).is_empty()
        );
    }

    #[test]
    fn read_clauses_basic() {
        let t = Translator::new_default("en").unwrap();
        let clauses = t.read_clauses("Hello world. How are you?").unwrap();
        assert_eq!(clauses.len(), 2);
        assert_eq!(clauses[0].intonation, Intonation::FullStop);
        assert_eq!(clauses[1].intonation, Intonation::Question);
    }

    #[test]
    fn read_clauses_no_punctuation() {
        // Unpunctuated input yields exactly one (Eof) clause.
        let t = Translator::new_default("en").unwrap();
        let clauses = t.read_clauses("hello world").unwrap();
        assert_eq!(clauses.len(), 1);
        assert_eq!(clauses[0].text, "hello world");
    }

    /// Load phoneme data from the system directory, or `None` when the
    /// espeak-ng data is not installed (callers then skip the test).
    fn make_phdata() -> Option<PhonemeData> {
        let dir = std::path::Path::new("/usr/share/espeak-ng-data");
        if !dir.join("phontab").exists() { return None; }
        let mut phdata = PhonemeData::load(dir).ok()?;
        phdata.select_table_by_name("en").ok()?;
        Some(phdata)
    }

    #[test]
    fn phonemes_to_ipa_the() {
        let phdata = match make_phdata() { Some(d) => d, None => return };
        let (ipa, _) = phonemes_to_ipa(&[87, 115], &phdata, PendingStress::None, false);
        assert_eq!(ipa, "ðə");
    }

    #[test]
    fn phonemes_to_ipa_be() {
        let phdata = match make_phdata() { Some(d) => d, None => return };
        let (ipa, _) = phonemes_to_ipa(&[72, 137], &phdata, PendingStress::None, false);
        assert_eq!(ipa, "biː");
    }

    #[test]
    fn phonemes_to_ipa_with_stress() {
        // Leading code 4 is rendered as a secondary stress mark.
        let phdata = match make_phdata() { Some(d) => d, None => return };
        let (ipa, _) = phonemes_to_ipa(&[4, 50, 129, 47], &phdata, PendingStress::None, false);
        assert_eq!(ipa, "nˌɒt");
    }

    #[test]
    fn text_to_ipa_be() {
        let t = Translator::new_default("en").unwrap();
        if !Path::new("/usr/share/espeak-ng-data/en_dict").exists() { return; }
        let ipa = t.text_to_ipa("be").unwrap();
        assert_eq!(ipa, "bˈiː");
    }

    #[test]
    fn text_to_ipa_he() {
        let t = Translator::new_default("en").unwrap();
        if !Path::new("/usr/share/espeak-ng-data/en_dict").exists() { return; }
        let ipa = t.text_to_ipa("he").unwrap();
        assert_eq!(ipa, "hˈiː");
    }

    #[test]
    fn text_to_ipa_do() {
        let t = Translator::new_default("en").unwrap();
        if !Path::new("/usr/share/espeak-ng-data/en_dict").exists() { return; }
        let ipa = t.text_to_ipa("do").unwrap();
        assert_eq!(ipa, "dˈuː");
    }

    #[test]
    fn text_to_ipa_the() {
        let t = Translator::new_default("en").unwrap();
        if !Path::new("/usr/share/espeak-ng-data/en_dict").exists() { return; }
        let ipa = t.text_to_ipa("the").unwrap();
        assert_eq!(ipa, "ðˈə");
    }

    // CJK tokenization: each Han character becomes its own word token.
    #[test]
    fn tokenize_chinese_chars_are_individual_words() {
        let tokens = tokenize("你好世界");
        assert_eq!(tokens, vec![
            Token::Word("你".to_string()),
            Token::Word("好".to_string()),
            Token::Word("世".to_string()),
            Token::Word("界".to_string()),
        ]);
    }

    #[test]
    fn tokenize_cjk_with_spaces() {
        let tokens = tokenize("你好 世界");
        assert_eq!(tokens, vec![
            Token::Word("你".to_string()),
            Token::Word("好".to_string()),
            Token::Space,
            Token::Word("世".to_string()),
            Token::Word("界".to_string()),
        ]);
    }

    #[test]
    fn tokenize_mixed_cjk_and_latin() {
        // Latin runs stay whole; adjacent CJK still splits per character.
        let tokens = tokenize("Hello你好World世界");
        assert_eq!(tokens, vec![
            Token::Word("Hello".to_string()),
            Token::Word("你".to_string()),
            Token::Word("好".to_string()),
            Token::Word("World".to_string()),
            Token::Word("世".to_string()),
            Token::Word("界".to_string()),
        ]);
    }

    #[test]
    fn tokenize_single_cjk_char() {
        let tokens = tokenize("你");
        assert_eq!(tokens, vec![Token::Word("你".to_string())]);
    }

    #[test]
    fn tokenize_cjk_with_punctuation() {
        // Uses fullwidth punctuation (,!); only checks the word tokens.
        let tokens = tokenize("你好,世界!");
        assert!(tokens.contains(&Token::Word("你".to_string())));
        assert!(tokens.contains(&Token::Word("好".to_string())));
        assert!(tokens.contains(&Token::Word("世".to_string())));
        assert!(tokens.contains(&Token::Word("界".to_string())));
    }

    /// Run a batch of (input, expected-IPA) cases for `lang`, skipping
    /// silently when the relative dictionary file is not present.
    fn run_ipa_table(lang: &str, dict_name: &str, cases: &[(&str, &str)]) {
        let dict_path = format!("espeak-ng-data/{dict_name}");
        if !Path::new(&dict_path).exists() { return; }
        let t = Translator::new_default(lang).unwrap();
        for &(input, expected) in cases {
            let ipa = t.text_to_ipa(input).unwrap();
            assert_eq!(ipa, expected, "lang={lang} input={input:?}");
        }
    }

    #[test]
    fn text_to_ipa_english_rule_regressions() {
        run_ipa_table("en", "en_dict", &[
            ("sky", "skˈaɪ"),
            ("caused", "kˈɔːzd"),
            ("reflection", "ɹɪflˈɛkʃən"),
            ("droplets", "dɹˈɒplɪts"),
            ("appearing", "ɐpˈiəɹɪŋ"),
            ("meteorological", "mˌiːtɪˌɔːɹəlˈɒdʒɪkəl"),
        ]);
    }

    #[test]
    fn text_to_ipa_english_sentence_weak_forms() {
        // Full-sentence regression: weak forms ("is" → ɪz, "a" → ɐ) and the
        // clause break after "reflection," rendered as '\n'.
        let t = Translator::new_default("en").unwrap();
        if !Path::new("/usr/share/espeak-ng-data/en_dict").exists() { return; }
        let ipa = t.text_to_ipa("A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky.").unwrap();
        assert_eq!(
            ipa,
            "ɐ ɹˈeɪnbəʊ ɪz ɐ mˌiːtɪˌɔːɹəlˈɒdʒɪkəl fɪnˈɒmɪnən ðat ɪz kˈɔːzd baɪ ɹɪflˈɛkʃən\nɹɪfɹˈakʃən and dɪspˈɜːʃən ɒv lˈaɪt ɪn wˈɔːtə dɹˈɒplɪts ɹɪzˈʌltɪŋ ɪn ɐ spˈɛktɹəm ɒv lˈaɪt ɐpˈiəɹɪŋ ɪnðə skˈaɪ"
        );
    }

    #[test]
    fn ordinals_english() {
        run_ipa_table("en", "en_dict", &[
            ("1st", "fˈɜːst"),
            ("2nd", "sˈɛkənd"),
            ("3rd", "θˈɜːd"),
            ("4th", "fˈɔːθ"),
            ("21st", "twˈɛnti fˈɜːst"),
            ("100th","wˈɒnhˈʌndɹɪdθ"),
        ]);
    }

    #[test]
    fn ordinals_english_large_scales() {
        run_ipa_table("en", "en_dict", &[
            ("1000th", "wˈɒn θˈaʊzəndθ"),
            ("1001st", "wˈɒn θˈaʊzənd fˈɜːst"),
            ("1000000th", "wˈɒn mˈɪliənθ"),
        ]);
    }

    #[test]
    fn ordinals_spanish() {
        run_ipa_table("es", "es_dict", &[
            ("1º", "pɾimˈɛɾˈo"),
            ("21º", "βixˈɛsimˌo pɾimˈɛɾˈo"),
            ("100º", "θentˈɛsimˈo"),
        ]);
    }

    #[test]
    fn ordinals_spanish_large_scale_do_not_use_hundred_root() {
        // Structural check: "millionth" must not be assembled from the
        // dictionary's hundredth root (_0Co entry).
        let dict_path = "espeak-ng-data/es_dict";
        if !Path::new(dict_path).exists() { return; }
        let data_dir = Path::new("espeak-ng-data");
        let dict = Dictionary::load("es", data_dir).unwrap();
        let mut phdata = PhonemeData::load(data_dir).unwrap();
        phdata.select_table_by_name("es").unwrap();
        let stress_opts = StressOpts::for_lang("es");
        let grammar = LangOptions::for_lang("es").number_grammar;
        let ordinal = OrdinalNumber {
            digits: "1000000".to_string(),
            marker: OrdinalMarker::Suffix("º".to_string()),
        };
        let result = try_ordinal_number(&ordinal, &dict, &phdata, &stress_opts, &grammar).unwrap();
        let hundred_ordinal_lookup = lookup_num_phonemes(&dict, "_0Co");
        let hundred_ordinal = trim_lookup(&hundred_ordinal_lookup);
        assert!(
            !contains_subsequence(&result.phonemes, hundred_ordinal),
            "1000000º should not be built from the hundredth root",
        );
    }

    #[test]
    fn ordinals_dutch() {
        run_ipa_table("nl", "nl_dict", &[
            ("1e", "ˈɪːrstə"),
            ("3e", "dˈɛrdə"),
        ]);
    }

    #[test]
    fn ordinals_german_dot() {
        // German ordinals written with a trailing dot ("1.").
        run_ipa_table("de", "de_dict", &[
            ("1.", "ˈeːrstə"),
            ("3.", "drˈɪtə"),
            ("21.", "tsvˈantsɪɡʰ ˈeːrstə"),
        ]);
    }

    #[test]
    fn cardinals_1234567() {
        // Per-language expected output for "1234567". The fourth tuple
        // field is an unchecked oracle (reference output kept for
        // comparison); only `expected` is asserted.
        let cases: &[(&str, &str, &str, &str)] = &[
            ("en", "en_dict",
                "wˈɒn mˈɪliən tˈuːhˈʌndɹɪdən θˈɜːti fˈɔː θˈaʊzənd fˈaɪvhˈʌndɹɪdən sˈɪksti sˈɛvən",
                "wˈɒn mˈɪliən tˈuːhˈʌndɹɪdən θˈɜːti fˈɔː θˈaʊzənd fˈaɪvhˈʌndɹɪdən sˈɪksti sˈɛvən"),
            ("es", "es_dict",
                "ˈunmiʝˈon dosθjˈentos tɾˈeɪntaikwˈatɾo mˈil kinjˈɛntos sesˈɛntaisjˈetˈe",
                "ˈunmiʝˈon dosθjˈentos tɾˌeɪntaikwˈatɾo mˈil kinjˈɛntos sɛsˌɛntaisjˈete"),
            ("fr", "fr_dict",
                "œ̃ miljɔ̃ døzsɑ̃ tʁɑ̃tkatʁ mil sɛ̃ksɑ̃ swasɑ̃tsˈɛt",
                "œ̃ miljˈɔ̃ døsɑ̃ tʁɑ̃tkatʁ mˈil sɛ̃ksɑ̃ swasɑ̃tsˈɛt"),
            ("de", "de_dict",
                "ˈaɪnə mɪljˈoːn tsvˈaɪhˈʊndɜt fˈiːr ʊntdrˈaɪsɪɡʰ tˈaʊzənt fˈʏnfhˈʊndɜt zˈiːbən ʊntzˈɛçtsɪɡʰ",
                "ˈaɪnə mɪljˈoːn tsvˈaɪhˈʊndɜt fˈiːɾ ʊntdɾˈaɪsɪç tˈaʊzənt fˈynfhˈʊndɜt zˈiːbən ʊntzˈɛçtsɪç"),
            ("nl", "nl_dict",
                "ˈeːn mˈiljun tʋˈeːhˈɔndərt vˈirɛndˈɛrtəx dˈœyzɛnt vˈɛɪfhˈɔndərt zˈeːvənɛnzˈɛstəx",
                "ˈeːn mˌiljun tʋˈeːhˌɔndərt vˌirɛndˌɛrtəx dˌœyzɛnt vˈɛɪfhˌɔndərt zˌeːvənɛnzˌɛstəx"),
        ];
        for &(lang, dict, expected, _oracle) in cases {
            let dict_path = format!("espeak-ng-data/{dict}");
            if !Path::new(&dict_path).exists() { continue; }
            let t = Translator::new_default(lang).unwrap();
            let ipa = t.text_to_ipa("1234567").unwrap();
            assert_eq!(ipa, expected, "lang={lang} input=\"1234567\"");
        }
    }

    #[test]
    fn cardinals_english_billion_scale() {
        // Structural check: 10^9 must include the dictionary's billion
        // scale phonemes (_0M3 entry) as a contiguous run.
        let dict_path = "espeak-ng-data/en_dict";
        if !Path::new(dict_path).exists() { return; }
        let dict = Dictionary::load("en", Path::new("espeak-ng-data")).unwrap();
        let grammar = LangOptions::for_lang("en").number_grammar;
        let pronunciation = cardinal_pronunciation("1000000000", &dict, &grammar).unwrap();
        let billion_lookup = lookup_num_phonemes(&dict, "_0M3");
        let billion = trim_lookup(&billion_lookup);
        assert!(!billion.is_empty(), "en_dict is missing _0M3");
        let trimmed = &pronunciation.bytes[..pronunciation.trimmed_len()];
        assert!(
            trimmed.windows(billion.len()).any(|window| window == billion),
            "1000000000 should include the billion scale phonemes",
        );
    }

    #[test]
    fn cardinals_french() {
        // Smoke test: French cardinals produce non-empty IPA with no raw
        // digits leaking through.
        let dict_path = "espeak-ng-data/fr_dict";
        if !Path::new(dict_path).exists() { return; }
        let t = Translator::new_default("fr").unwrap();
        for input in ["1", "2", "3", "4", "20", "80", "87", "100", "101"] {
            let ipa = t.text_to_ipa(input).unwrap();
            assert!(!ipa.is_empty(), "fr {input} produced empty IPA");
            assert!(!ipa.chars().any(|c| c.is_ascii_digit()),
                "fr {input} has raw digits in IPA: {ipa}");
        }
    }
}