pub mod ipa_table;
use std::path::{Path, PathBuf};
/// Resolves the directory that holds the espeak-ng data files.
///
/// Candidates are tried in order and the first match wins:
/// 1. the value of the `ESPEAK_DATA_PATH` environment variable (returned as-is,
///    without checking that it exists);
/// 2. `espeak-ng-data` next to the running executable, if it contains `en_dict`;
/// 3. `espeak-ng-data` in the current working directory, if it contains `en_dict`
///    and can be canonicalized (the canonical path is returned);
/// 4. the fixed fallback `/usr/share/espeak-ng-data`.
pub fn default_data_dir() -> String {
    const DIR_NAME: &str = "espeak-ng-data";
    const PROBE_FILE: &str = "en_dict";

    let from_env = || std::env::var("ESPEAK_DATA_PATH").ok();

    let beside_exe = || -> Option<String> {
        let exe = std::env::current_exe().ok()?;
        let candidate = exe.parent()?.join(DIR_NAME);
        if candidate.join(PROBE_FILE).exists() {
            Some(candidate.to_string_lossy().into_owned())
        } else {
            None
        }
    };

    let in_cwd = || -> Option<String> {
        let candidate = std::path::Path::new(DIR_NAME);
        if !candidate.join(PROBE_FILE).exists() {
            return None;
        }
        candidate
            .canonicalize()
            .ok()
            .map(|abs| abs.to_string_lossy().into_owned())
    };

    from_env()
        .or_else(beside_exe)
        .or_else(in_cwd)
        .unwrap_or_else(|| String::from("/usr/share/espeak-ng-data"))
}
use crate::dictionary::file::Dictionary;
use crate::dictionary::lookup::{LookupCtx, lookup};
use crate::dictionary::rules::translate_rules_phdata;
use crate::dictionary::stress::{
StressOpts, apply_alt_stress_upgrade, apply_word_final_devoicing, change_word_stress,
promote_strend_stress, set_word_stress,
};
use crate::dictionary::{FLAG_SUFFIX_REMOVED, SUFX_I};
use crate::error::{Error, Result};
use crate::phoneme::load::PhonemeData;
use ipa_table::{
IPA_STRESS_PRIMARY, IPA_STRESS_SECONDARY, PHON_STRESS_2, PHON_STRESS_3, PHON_STRESS_D,
PHON_STRESS_P, PHON_STRESS_P2, PHON_STRESS_PREV, PHON_STRESS_TONIC, PHON_STRESS_U,
PendingStress, is_pause_code, phoneme_ipa_lang,
};
bitflags::bitflags! {
/// Masks and single-bit flags for clause attributes packed into one `u32`.
///
/// The three `*_MASK` constants select multi-bit fields; the remaining
/// constants are individual bits above them. Field meanings are taken from
/// the constant names only — NOTE(review): confirm against the code that
/// produces/consumes these values.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ClauseFlags: u32 {
/// Bits 0-11; presumably the pause length field.
const PAUSE_MASK = 0x0000_0FFF;
/// Bits 12-14; presumably the intonation type field.
const INTONATION_MASK = 0x0000_7000;
/// Bit 15.
const OPTIONAL_SPACE_AFTER = 0x0000_8000;
/// Bits 16-19; presumably the clause type field.
const TYPE_MASK = 0x000F_0000;
/// Bit 20.
const PUNCT_IN_WORD = 0x0010_0000;
/// Bit 21.
const SPEAK_PUNCT_NAME = 0x0020_0000;
/// Bit 22.
const DOT_AFTER_LAST_WORD = 0x0040_0000;
/// Bit 23.
const PAUSE_LONG = 0x0080_0000;
}
}
/// Intonation contour attached to a [`Clause`].
///
/// As written, `Translator::read_clauses` assigns `Exclamation` for `!`,
/// `Question` for `?`, `FullStop` for `.`, and `None` for trailing text that
/// has no terminator. `Comma` is declared but not produced by that method.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Intonation {
FullStop,
Comma,
Question,
Exclamation,
None,
}
/// Kind of boundary that ended a [`Clause`].
///
/// `Translator::read_clauses` uses `Sentence` for text terminated by
/// `.`, `!` or `?`, and `Eof` for whatever remains at the end of the input.
/// The other variants are not produced by that method.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ClauseType {
None,
Eof,
VoiceChange,
Clause,
Sentence,
}
/// One clause of input text together with its prosodic attributes.
#[derive(Debug, Clone)]
pub struct Clause {
// Trimmed clause text; `read_clauses` keeps the terminating punctuation in it.
pub text: String,
pub intonation: Intonation,
pub clause_type: ClauseType,
// Pause after the clause (milliseconds, going by the name); `read_clauses`
// uses 400 for sentence clauses and 0 for the trailing clause.
pub pause_ms: u32,
}
/// Per-language synthesis options carried by a translator.
#[derive(Debug, Clone)]
pub struct LangOptions {
    /// Language code used to select the dictionary and phoneme table.
    pub lang: String,
    /// Speaking rate (defaults to 175).
    pub rate: u32,
    /// Pitch setting (defaults to 50).
    pub pitch: u32,
    /// Extra gap between words (defaults to 0).
    pub word_gap: i32,
    /// Stress rule selector (defaults to 2).
    pub stress_rule: u8,
}

impl Default for LangOptions {
    /// English with rate 175, pitch 50, no word gap and stress rule 2.
    fn default() -> Self {
        let lang = String::from("en");
        Self {
            lang,
            rate: 175,
            pitch: 50,
            word_gap: 0,
            stress_rule: 2,
        }
    }
}
/// A lexical unit produced by [`tokenize`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A run of letters/apostrophes (optionally joined by inner hyphens), or a
    /// number such as `42` or `3.14`.
    Word(String),
    /// One or more consecutive whitespace characters.
    Space,
    /// One of `. , ! ? ; :`; whitespace directly after it is swallowed.
    ClauseBoundary(char),
    /// Any other single character.
    Punctuation(char),
}

/// Splits `text` into words, spaces, clause boundaries and stray punctuation.
///
/// * Whitespace runs collapse into a single [`Token::Space`].
/// * `. , ! ? ; :` become [`Token::ClauseBoundary`], absorbing following whitespace.
/// * A number is a run of ASCII digits with at most one `.` that is directly
///   followed by another digit.
/// * A word starts with a letter or `'`, continues with letters and `'`, and may
///   include `-` when a letter follows the hyphen.
/// * Everything else is emitted as a one-character [`Token::Punctuation`].
pub fn tokenize(text: &str) -> Vec<Token> {
    let mut out = Vec::new();
    let mut rest = text.chars().peekable();

    while let Some(head) = rest.next() {
        let token = match head {
            ws if ws.is_whitespace() => {
                while rest.next_if(|ch| ch.is_whitespace()).is_some() {}
                Token::Space
            }
            '.' | ',' | '!' | '?' | ';' | ':' => {
                while rest.next_if(|ch| ch.is_whitespace()).is_some() {}
                Token::ClauseBoundary(head)
            }
            digit if digit.is_ascii_digit() => {
                let mut number = String::from(digit);
                let mut seen_point = false;
                loop {
                    match rest.peek().copied() {
                        Some(d) if d.is_ascii_digit() => number.push(d),
                        // `nth(1)` on a clone looks one character past the
                        // peeked '.', without consuming anything.
                        Some('.')
                            if !seen_point
                                && rest
                                    .clone()
                                    .nth(1)
                                    .map_or(false, |after| after.is_ascii_digit()) =>
                        {
                            seen_point = true;
                            number.push('.');
                        }
                        _ => break,
                    }
                    rest.next();
                }
                Token::Word(number)
            }
            letter if letter.is_alphabetic() || letter == '\'' => {
                let mut word = String::from(letter);
                loop {
                    match rest.peek().copied() {
                        Some(ch) if ch.is_alphabetic() || ch == '\'' => word.push(ch),
                        // A hyphen stays in the word only when a letter follows it.
                        Some('-')
                            if rest
                                .clone()
                                .nth(1)
                                .map_or(false, |after| after.is_alphabetic()) =>
                        {
                            word.push('-');
                        }
                        _ => break,
                    }
                    rest.next();
                }
                Token::Word(word)
            }
            other => Token::Punctuation(other),
        };
        out.push(token);
    }

    out
}
/// Builds the 256-entry letter-group table for English.
///
/// Entry `c` has bit `g` set when byte `c` belongs to letter group `g`
/// (groups 0 through 7, listed below). Every lowercase member also marks its
/// uppercase ASCII counterpart. All other entries stay 0.
pub fn english_letter_bits() -> [u8; 256] {
    // Index in this array == group number == bit position.
    const GROUPS: [&[u8]; 8] = [
        b"aeiou",
        b"bcdfgjklmnpqstvxz",
        b"bcdfghjklmnpqrstvwxz",
        b"hlmnr",
        b"cfhkpqstx",
        b"bdgjlmnrvwyz",
        b"eiy",
        b"aeiouy",
    ];

    let mut table = [0u8; 256];
    for (group, letters) in GROUPS.iter().enumerate() {
        let mask = 1u8 << group;
        for &lower in letters.iter() {
            table[usize::from(lower)] |= mask;
            if lower.is_ascii_lowercase() {
                table[usize::from(lower.to_ascii_uppercase())] |= mask;
            }
        }
    }
    table
}
/// Converts phoneme codes to IPA with the English overrides enabled.
///
/// Thin wrapper: forwards to [`phonemes_to_ipa_lang`] with
/// `use_en_overrides = true`. Returns the IPA text and the stress that is
/// still pending after the last phoneme.
pub fn phonemes_to_ipa(
phoneme_bytes: &[u8],
phdata: &PhonemeData,
pending_stress_in: PendingStress,
word_sep: bool, ) -> (String, PendingStress) {
phonemes_to_ipa_lang(phoneme_bytes, phdata, pending_stress_in, word_sep, true)
}
/// Converts phoneme codes to IPA, letting the caller choose whether the
/// English overrides apply.
///
/// Thin wrapper: forwards to [`phonemes_to_ipa_full`] with
/// `suppress_word_final_liaison = false` and `add_tones = false`.
pub fn phonemes_to_ipa_lang(
phoneme_bytes: &[u8],
phdata: &PhonemeData,
pending_stress_in: PendingStress,
word_sep: bool,
use_en_overrides: bool,
) -> (String, PendingStress) {
phonemes_to_ipa_full(
phoneme_bytes,
phdata,
pending_stress_in,
word_sep,
use_en_overrides,
// suppress_word_final_liaison
false,
// add_tones
false,
)
}
/// Renders a phoneme-code sequence as an IPA string.
///
/// Processing stops at the first 0 byte. Stress-marker codes and stress-type
/// phonemes update a pending stress that is written as an IPA stress mark
/// immediately before the next vowel.
///
/// * `phoneme_bytes` - phoneme codes, optionally 0-terminated.
/// * `phdata` - phoneme table used to look up each code.
/// * `pending_stress_in` - stress carried over from earlier input.
/// * `word_sep` - when true, a space is written before the first emitted phoneme.
/// * `use_en_overrides` - forwarded to `phoneme_ipa_lang`; when false, an `i`
///   that follows a vowel is written as `j`.
/// * `suppress_word_final_liaison` - drop liaison phonemes (mnemonic `?2`/`?3`,
///   non-vowel) when only codes `<= 8` or 15 follow them.
/// * `add_tones` - keep non-standard stress-type phonemes and insert their IPA
///   after the most recent vowel instead of discarding them.
///
/// Returns the IPA text and the stress still pending at the end.
pub fn phonemes_to_ipa_full(
phoneme_bytes: &[u8],
phdata: &PhonemeData,
pending_stress_in: PendingStress,
word_sep: bool,
use_en_overrides: bool,
suppress_word_final_liaison: bool,
add_tones: bool,
) -> (String, PendingStress) {
let mut out = String::new();
let mut stress = pending_stress_in;
let mut need_space = word_sep;
let mut prev_phcode: u8 = 0; let mut prev_logical_vowel = false;
// Byte offset in `out` just past the most recent vowel (0 = none yet).
let mut last_vowel_end_idx = 0;
const PH_VOICED_FLAG: u32 = 1 << 4;
for (idx, &code) in phoneme_bytes.iter().enumerate() {
// 0 terminates the phoneme string.
if code == 0 {
break;
}
// Explicit stress-marker codes only update the pending stress.
match code {
PHON_STRESS_P | PHON_STRESS_P2 | PHON_STRESS_TONIC => {
stress = PendingStress::Primary;
continue;
}
PHON_STRESS_2 | PHON_STRESS_3 => {
stress = PendingStress::Secondary;
continue;
}
PHON_STRESS_U | PHON_STRESS_D | PHON_STRESS_PREV => {
stress = PendingStress::None;
continue;
}
_ => {}
}
// Pause codes emit nothing. Code 15 additionally requests a space before
// the next emitted phoneme and clears the pending stress.
if is_pause_code(code) {
if code == 15 {
need_space = true;
stress = PendingStress::None;
last_vowel_end_idx = out.len();
}
continue;
}
// The table may substitute another code depending on whether primary
// stress is pending (helper is defined elsewhere).
let is_primary = stress == PendingStress::Primary;
let resolved_code = phdata.resolve_stressed_phoneme(code, is_primary);
let code = resolved_code;
// Codes missing from the table are skipped silently.
if let Some(ph) = phdata.get(code) {
// Type 2 is treated as a vowel and type 1 as a stress/tone phoneme here.
let is_vowel = ph.typ == 2; let is_stress_type = ph.typ == 1;
if is_stress_type {
let mut is_standard_stress = false;
// std_length <= 4 with program == 0 marks a standard stress phoneme:
// 4 sets primary, 2 or 3 sets secondary, other values change nothing.
if ph.std_length <= 4 && ph.program == 0 {
match ph.std_length {
4 => {
stress = PendingStress::Primary;
is_standard_stress = true;
}
2 | 3 => {
stress = PendingStress::Secondary;
is_standard_stress = true;
}
_ => {
is_standard_stress = true;
}
}
}
// Only non-standard stress phonemes continue, and only when tones are wanted.
if !add_tones || is_standard_stress {
continue;
}
}
if suppress_word_final_liaison {
let mnemonic = ph.mnemonic;
let b1 = ((mnemonic >> 8) & 0xff) as u8;
let b2 = ((mnemonic >> 16) & 0xff) as u8;
// Liaison phoneme: two-byte mnemonic ending in '2' or '3', not a vowel.
let is_liaison = (b1 == b'2' || b1 == b'3') && b2 == 0 && !is_vowel;
if is_liaison {
// Word-final when every remaining byte is 0, a code <= 8, or 15.
let word_final = phoneme_bytes[idx + 1..]
.iter()
.all(|&c| c == 0 || c <= 8 || c == 15);
if word_final {
continue; }
}
}
if need_space {
out.push(' ');
need_space = false;
}
// The pending stress mark is written before the vowel, then cleared.
if is_vowel {
match stress {
PendingStress::Primary => {
out.push_str(IPA_STRESS_PRIMARY);
}
PendingStress::Secondary => {
out.push_str(IPA_STRESS_SECONDARY);
}
PendingStress::None => {}
}
stress = PendingStress::None;
}
// Mnemonics "d#" / "z#": emit the voiced form after a voiced previous
// phoneme (type 2, type 3, or voiced flag set), else the voiceless form.
let b1 = ((ph.mnemonic >> 8) & 0xff) as u8;
if b1 == b'#' {
let b0 = (ph.mnemonic & 0xff) as u8;
let prev_voiced = if let Some(prev_ph) = phdata.get(prev_phcode) {
prev_ph.typ == 2 ||
prev_ph.typ == 3 ||
(prev_ph.phflags & PH_VOICED_FLAG) != 0
} else {
false
};
let ipa_char = if b0 == b'd' {
if prev_voiced { "d" } else { "t" }
} else if b0 == b'z' {
if prev_voiced { "z" } else { "s" }
} else {
""
};
if !ipa_char.is_empty() {
out.push_str(ipa_char);
prev_phcode = code;
continue;
}
}
// Prefer the IPA string stored for the phoneme's program; otherwise fall
// back to the mnemonic-based lookup.
let ipa = if let Some(ipa_str) = phdata.phoneme_ipa_string(ph.program) {
ipa_str.to_string()
} else {
phoneme_ipa_lang(code, ph.mnemonic, is_vowel, use_en_overrides)
};
let mut ipa_out_str = ipa.clone();
// Without the English overrides, "i" directly after a vowel becomes "j".
if !use_en_overrides && ipa == "i" && prev_logical_vowel {
ipa_out_str = "j".to_string();
}
let is_tone = is_stress_type
&& add_tones
&& !(ph.std_length <= 4
&& ph.program == 0
&& (ph.std_length == 4 || ph.std_length == 2 || ph.std_length == 3));
// Tones are inserted right after the most recent vowel when there is one;
// everything else is appended.
if is_tone && last_vowel_end_idx > 0 && last_vowel_end_idx <= out.len() {
out.insert_str(last_vowel_end_idx, &ipa_out_str);
last_vowel_end_idx += ipa_out_str.len();
} else {
out.push_str(&ipa_out_str);
}
if !is_stress_type && !is_pause_code(code) {
prev_logical_vowel = is_vowel;
}
// NOTE(review): `need_space` was cleared above before this point, so the
// second operand can never be true here — confirm whether it is needed.
if is_vowel || (need_space && ipa_out_str.is_empty()) {
last_vowel_end_idx = out.len();
}
prev_phcode = code;
}
}
(out, stress)
}
/// Result of translating a single word with [`word_to_phonemes`].
pub struct WordResult {
// Phoneme codes; 0-terminated when non-empty on the paths built in
// `word_to_phonemes`, empty when nothing could be produced.
pub phonemes: Vec<u8>,
// Dictionary flags from the lookup (0 when no flagged entry matched).
pub dict_flags: u32,
}
/// Looks up `key` in the dictionary (symbol lookup enabled) and returns its
/// phonemes, or an empty vector when there is no entry or the entry has no
/// phonemes.
#[allow(dead_code)]
pub(crate) fn lookup_num_phonemes(dict: &Dictionary, key: &str) -> Vec<u8> {
    let symbol_ctx = LookupCtx {
        lookup_symbol: true,
        ..Default::default()
    };
    // A hit with empty phonemes and a miss both yield an empty vector.
    lookup(dict, key, &symbol_ctx)
        .map(|hit| hit.phonemes)
        .unwrap_or_default()
}
/// Translates one word into a 0-terminated phoneme-code buffer.
///
/// Strategies are tried in this order:
/// 1. **Dictionary** - an entry flagged as found, with non-empty phonemes, is
///    used directly (after stress assignment).
/// 2. **Spelling** - a word longer than one byte that contains `.` and at least
///    one letter is translated letter by letter; the letters' phonemes are
///    joined with code 15.
/// 3. **Rules** - the word is run through the translation rules. When the
///    matched suffix carries `SUFX_I` and starts after position 1, the stem is
///    re-translated on its own (a final `i` restored to `y`), stressed, and the
///    suffix phonemes are appended without their stress codes 4-7.
///
/// `dict_flags` in the result holds the dictionary flags for strategy 1; for
/// the other strategies it holds the lookup flags only if the entry had one of
/// the bits `0x4000_0000` / `0x8000_0000` set, else 0. `phonemes` is empty when
/// no strategy produced output.
///
/// Setting the `DBG_SUFFIX` environment variable prints suffix diagnostics to
/// stderr on the rules path.
pub fn word_to_phonemes(
    word: &str,
    dict: &Dictionary,
    phdata: &PhonemeData,
    stress_opts: &StressOpts,
) -> WordResult {
    const FLAG_FOUND_ATTRIBUTES: u32 = 0x4000_0000;

    let ctx = LookupCtx {
        lookup_symbol: true,
        ..Default::default()
    };
    let dict_result = lookup(dict, word, &ctx);

    let dict_flags_from_lookup = match dict_result.as_ref() {
        Some(entry) if (entry.flags1.0 & (FLAG_FOUND_ATTRIBUTES | 0x8000_0000)) != 0 => {
            entry.flags1.0
        }
        _ => 0,
    };

    // 1. Dictionary hit.
    if let Some(hit) = dict_result
        .as_ref()
        .filter(|entry| entry.flags1.found() && !entry.phonemes.is_empty())
    {
        let dict_flags = hit.flags1.0;
        let mut phonemes = hit.phonemes.clone();
        finish_word_stress(&mut phonemes, phdata, stress_opts, dict_flags);
        return WordResult {
            phonemes,
            dict_flags,
        };
    }

    // 2. Dotted word: spell out each letter, ignoring every other character.
    if word.contains('.') && word.len() > 1 && word.chars().any(|c| c.is_alphabetic()) {
        let mut spelled: Vec<u8> = Vec::new();
        for letter in word.chars().filter(|c| c.is_alphabetic()) {
            let letter_word = letter.to_lowercase().to_string();
            let letter_result = word_to_phonemes(&letter_word, dict, phdata, stress_opts);
            if !spelled.is_empty() {
                spelled.push(15);
            }
            spelled.extend_from_slice(until_nul(&letter_result.phonemes));
        }
        if !spelled.is_empty() {
            spelled.push(0);
            return WordResult {
                phonemes: spelled,
                dict_flags: dict_flags_from_lookup,
            };
        }
    }

    // 3. Rule-based translation of " word \0".
    let letter_bits = &*dict.letter_bits;
    let mut vowel_count = 0i32;
    let mut stressed_count = 0i32;
    let word_buf = rule_input(word.as_bytes());
    let result = translate_rules_phdata(
        dict,
        &word_buf,
        1,
        0,
        0,
        &letter_bits,
        0,
        &mut vowel_count,
        &mut stressed_count,
        Some(phdata),
    );

    if result.phonemes.is_empty() {
        return WordResult {
            phonemes: Vec::new(),
            dict_flags: dict_flags_from_lookup,
        };
    }

    if std::env::var("DBG_SUFFIX").is_ok() {
        eprintln!(
            "DBG '{}': end_type={:#010x} SUFX_I={} suffix_start={}",
            word,
            result.end_type,
            result.end_type & SUFX_I != 0,
            result.suffix_start
        );
    }

    let needs_stem_retranslation =
        result.end_type != 0 && (result.end_type & SUFX_I) != 0 && result.suffix_start > 1;

    // `Some` only when the stem was re-translated to something non-empty;
    // every other outcome falls back to the plain rule output below.
    let stem_based: Option<Vec<u8>> = if needs_stem_retranslation {
        let suffix_len = (result.end_type & 0x7f) as usize;
        let stem_end = word.len().saturating_sub(suffix_len);
        let mut stem = word.as_bytes()[..stem_end].to_vec();
        // SUFX_I is set on this path: a stem ending in 'i' gets its 'y' back.
        if let Some(last) = stem.last_mut() {
            if *last == b'i' {
                *last = b'y';
            }
        }

        // The cut may have split a multi-byte character; skip re-translation then.
        if std::str::from_utf8(&stem).is_ok() {
            let stem_buf = rule_input(&stem);
            let mut stem_vc = 0i32;
            let mut stem_sc = 0i32;
            let stem_rules = translate_rules_phdata(
                dict,
                &stem_buf,
                1,
                FLAG_SUFFIX_REMOVED,
                0,
                &letter_bits,
                0,
                &mut stem_vc,
                &mut stem_sc,
                Some(phdata),
            );

            let mut stem_ph = until_nul(&stem_rules.phonemes).to_vec();
            stem_ph.extend_from_slice(until_nul(&stem_rules.end_phonemes));

            if stem_ph.is_empty() {
                None
            } else {
                stem_ph.push(0);
                set_word_stress(&mut stem_ph, phdata, stress_opts, Some(0), -1, 0);
                // The stem keeps its own stress; stress codes of the suffix are dropped.
                Some(join_nul_terminated(&stem_ph, &result.end_phonemes, true))
            }
        } else {
            None
        }
    } else {
        None
    };

    let mut phonemes = stem_based
        .unwrap_or_else(|| join_nul_terminated(&result.phonemes, &result.end_phonemes, false));
    finish_word_stress(&mut phonemes, phdata, stress_opts, dict_flags_from_lookup);

    WordResult {
        phonemes,
        dict_flags: dict_flags_from_lookup,
    }
}

/// Returns the part of `bytes` before its first 0 byte (all of it if none).
fn until_nul(bytes: &[u8]) -> &[u8] {
    let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
    &bytes[..end]
}

/// Wraps `bytes` as `' ' + bytes + ' ' + 0`, the buffer layout passed to the rule translator.
fn rule_input(bytes: &[u8]) -> Vec<u8> {
    let mut buf = Vec::with_capacity(bytes.len() + 3);
    buf.push(b' ');
    buf.extend_from_slice(bytes);
    buf.push(b' ');
    buf.push(0);
    buf
}

/// Concatenates `body` and `tail` (each cut at its first 0 byte) and appends a
/// 0 terminator; with `drop_tail_stress`, codes 4-7 are omitted from `tail`.
fn join_nul_terminated(body: &[u8], tail: &[u8], drop_tail_stress: bool) -> Vec<u8> {
    let mut joined = until_nul(body).to_vec();
    joined.extend(
        until_nul(tail)
            .iter()
            .copied()
            .filter(|&b| !(drop_tail_stress && matches!(b, 4..=7))),
    );
    joined.push(0);
    joined
}

/// Assigns word stress and then applies the optional per-language passes.
fn finish_word_stress(
    phonemes: &mut Vec<u8>,
    phdata: &PhonemeData,
    stress_opts: &StressOpts,
    dict_flags: u32,
) {
    set_word_stress(&mut *phonemes, phdata, stress_opts, Some(dict_flags), -1, 0);
    if stress_opts.alt_stress_upgrade {
        apply_alt_stress_upgrade(&mut *phonemes, phdata);
    }
    if stress_opts.word_final_devoicing {
        apply_word_final_devoicing(&mut *phonemes, phdata);
    }
}
/// Text-to-phoneme / text-to-IPA translator for a single language.
pub struct Translator {
// Language options; `options.lang` selects the `<lang>_dict` file and the phoneme table.
pub options: LangOptions,
// Directory that contains the dictionary files and `phontab`.
data_dir: PathBuf,
}
impl Translator {
/// Creates a translator for `lang`.
///
/// `data_dir` overrides the data directory; when `None`, the directory from
/// [`default_data_dir`] is used. No files are touched here, so this currently
/// always returns `Ok`.
pub fn new(lang: &str, data_dir: Option<&Path>) -> Result<Self> {
let dir = data_dir
.map(|p| p.to_path_buf())
.unwrap_or_else(|| PathBuf::from(default_data_dir()));
Ok(Translator {
options: LangOptions {
lang: lang.to_string(),
..Default::default()
},
data_dir: dir,
})
}
/// Creates a translator for `lang` using the default data directory.
pub fn new_default(lang: &str) -> Result<Self> {
Self::new(lang, None)
}
/// Splits `text` into clauses at `.`, `!` and `?`.
///
/// Each terminated clause keeps its terminator, is trimmed, and is tagged
/// `ClauseType::Sentence` with a 400 ms pause. Non-blank text after the last
/// terminator becomes a final `ClauseType::Eof` clause. `,`, `;` and `:` do not
/// split. The result always holds at least one clause: blank input yields a
/// single `Eof` clause with the trimmed (empty) text.
pub fn read_clauses(&self, text: &str) -> Result<Vec<Clause>> {
let mut clauses = Vec::new();
let mut current = String::new();
for c in text.chars() {
match c {
'.' | '!' | '?' => {
current.push(c);
let intonation = match c {
'!' => Intonation::Exclamation,
'?' => Intonation::Question,
_ => Intonation::FullStop,
};
let text_trim = current.trim().to_string();
if !text_trim.is_empty() {
clauses.push(Clause {
text: text_trim,
intonation,
clause_type: ClauseType::Sentence,
pause_ms: 400,
});
}
current = String::new();
}
// Currently identical to the default arm: the character is just accumulated.
',' | ';' | ':' => {
current.push(c);
}
_ => {
current.push(c);
}
}
}
// Whatever follows the last terminator.
let text_trim = current.trim().to_string();
if !text_trim.is_empty() {
clauses.push(Clause {
text: text_trim,
intonation: Intonation::None,
clause_type: ClauseType::Eof,
pause_ms: 0,
});
}
if clauses.is_empty() {
clauses.push(Clause {
text: text.trim().to_string(),
intonation: Intonation::None,
clause_type: ClauseType::Eof,
pause_ms: 0,
});
}
Ok(clauses)
}
/// Translates `text` to an IPA string, one line per clause.
///
/// Steps: load `<lang>_dict` and the phoneme data from the data directory,
/// tokenize, translate every word (lowercased) to phonemes, adjust stress per
/// clause, then render each word to IPA. Clauses are separated by `\n` and
/// trailing newlines are removed.
///
/// # Errors
/// Returns `Error::NotImplemented` when the dictionary or `phontab` file is
/// missing, `Error::Io` when the dictionary cannot be read, and propagates
/// errors from parsing the dictionary and from loading or selecting phoneme data.
///
/// NOTE(review): the dictionary and phoneme data are re-read from disk on
/// every call.
pub fn text_to_ipa(&self, text: &str) -> Result<String> {
let lang = &self.options.lang;
let dict_path = self.data_dir.join(format!("{}_dict", lang));
let phontab_path = self.data_dir.join("phontab");
if !dict_path.exists() {
return Err(Error::NotImplemented("text_to_ipa: dict not found"));
}
let dict_bytes = std::fs::read(&dict_path).map_err(Error::Io)?;
let dict = Dictionary::from_bytes(lang, dict_bytes)?;
if !phontab_path.exists() {
return Err(Error::NotImplemented("text_to_ipa: phontab not found"));
}
let mut phdata = PhonemeData::load(&self.data_dir)?;
phdata.select_table_by_name(lang)?;
let stress_opts = StressOpts::for_lang(lang);
let tokens = tokenize(text);
// One entry per token; only word entries carry phonemes.
#[derive(Clone, PartialEq)]
enum EntryKind {
Word,
ClauseBoundary,
Other,
}
struct EntryFull {
phonemes: Vec<u8>,
dict_flags: u32,
kind: EntryKind,
}
let mut entries: Vec<EntryFull> = Vec::new();
for token in &tokens {
match token {
Token::Word(word) => {
let lower = word.to_lowercase();
let wr = word_to_phonemes(&lower, &dict, &phdata, &stress_opts);
entries.push(EntryFull {
phonemes: wr.phonemes,
dict_flags: wr.dict_flags,
kind: EntryKind::Word,
});
}
Token::ClauseBoundary(_) => {
entries.push(EntryFull {
phonemes: Vec::new(),
dict_flags: 0,
kind: EntryKind::ClauseBoundary,
});
}
_ => {
entries.push(EntryFull {
phonemes: Vec::new(),
dict_flags: 0,
kind: EntryKind::Other,
});
}
}
}
// Dictionary flag bits 9 and 10, and the two primary-stress phoneme codes,
// as used by the clause-level stress pass below.
const FLAG_STREND: u32 = 1 << 9; const FLAG_STREND2: u32 = 1 << 10; const PHON_STRESS_P_CODE: u8 = 6;
const PHON_STRESS_P2_CODE: u8 = 7;
// Clause-level stress pass over one clause's entries:
// 1. words flagged STREND/STREND2 are handed to `promote_strend_stress`, with
//    whether they are the last word and whether no later word has primary stress;
// 2. if no word ends up with primary stress, the last word containing code 4
//    or 5 (else the last non-empty word) is passed to `change_word_stress`
//    with level 4.
fn promote_clause(entries: &mut [EntryFull], phdata: &PhonemeData) {
let n = entries.len();
for i in 0..n {
if entries[i].kind != EntryKind::Word {
continue;
}
let dict_flags = entries[i].dict_flags;
if dict_flags & (FLAG_STREND | FLAG_STREND2) == 0 {
continue;
}
let is_last_word = entries[i + 1..].iter().all(|e| e.kind != EntryKind::Word);
let following_all_unstressed = entries[i + 1..]
.iter()
.filter(|e| e.kind == EntryKind::Word)
.all(|e| {
!e.phonemes
.iter()
.any(|&c| c == PHON_STRESS_P_CODE || c == PHON_STRESS_P2_CODE)
});
promote_strend_stress(
&mut entries[i].phonemes,
phdata,
dict_flags,
is_last_word,
following_all_unstressed,
);
}
let has_primary = entries
.iter()
.filter(|e| e.kind == EntryKind::Word)
.any(|e| {
e.phonemes
.iter()
.any(|&c| c == PHON_STRESS_P_CODE || c == PHON_STRESS_P2_CODE)
});
if !has_primary {
let last_secondary = entries
.iter()
.enumerate()
.rev()
.find(|(_, e)| {
e.kind == EntryKind::Word
&& !e.phonemes.is_empty()
&& e.phonemes.iter().any(|&c| c == 4 || c == 5)
})
.map(|(i, _)| i);
if let Some(idx) = last_secondary {
change_word_stress(&mut entries[idx].phonemes, phdata, 4);
} else {
let last_word = entries
.iter()
.enumerate()
.rev()
.find(|(_, e)| e.kind == EntryKind::Word && !e.phonemes.is_empty())
.map(|(i, _)| i);
if let Some(idx) = last_word {
change_word_stress(&mut entries[idx].phonemes, phdata, 4);
}
}
}
}
// Indices of all clause-boundary entries.
let clause_boundaries: Vec<usize> = {
let mut tmp = Vec::new();
for i in 0..entries.len() {
if entries[i].kind == EntryKind::ClauseBoundary {
tmp.push(i);
}
}
tmp
};
if clause_boundaries.is_empty() {
promote_clause(&mut entries, &phdata);
} else {
// Run the stress pass on each span between boundaries; `entries.len()`
// is appended as a sentinel so the trailing span is processed too.
let mut prev_end = 0usize;
let mut boundaries_with_end: Vec<usize> = clause_boundaries.clone();
boundaries_with_end.push(entries.len()); for &bound in &boundaries_with_end {
let slice_end = if bound < entries.len() {
bound
} else {
entries.len()
};
if slice_end > prev_end {
promote_clause(&mut entries[prev_end..slice_end], &phdata);
}
prev_end = if bound < entries.len() {
bound + 1
} else {
entries.len()
};
}
}
let mut ipa_out = String::new();
let mut first_word = true;
let mut clause_has_output = false;
// Pending stress is carried from word to word and reset at clause boundaries.
let mut stress = PendingStress::None;
for (ei, entry) in entries.iter().enumerate() {
match entry.kind {
EntryKind::Word => {
let phonemes = &entry.phonemes;
if phonemes.is_empty() {
continue;
}
let use_en_overrides = lang == "en";
// Does the next non-empty word start (after codes <= 8 and 15) with a
// type-2 phoneme, i.e. a vowel?
// NOTE(review): this lookahead is not limited to the current clause, so a
// word in the following clause can keep a liaison alive — confirm intended.
let next_starts_vowel = entries[ei + 1..]
.iter()
.find(|e| e.kind == EntryKind::Word && !e.phonemes.is_empty())
.map(|e| {
e.phonemes
.iter()
.find(|&&c| c > 8 && c != 15)
.and_then(|&c| phdata.get(c))
.map(|ph| ph.typ == 2) .unwrap_or(false)
})
.unwrap_or(false);
let suppress_liaison = !next_starts_vowel;
let (word_ipa, new_stress) = phonemes_to_ipa_full(
phonemes,
&phdata,
stress,
!first_word,
use_en_overrides,
suppress_liaison,
true, );
stress = new_stress;
if !word_ipa.is_empty() {
ipa_out.push_str(&word_ipa);
first_word = false;
clause_has_output = true;
}
}
EntryKind::ClauseBoundary => {
// A newline is written only for clauses that produced output.
if clause_has_output {
ipa_out.push('\n');
clause_has_output = false;
stress = PendingStress::None;
}
first_word = true;
}
EntryKind::Other => {}
}
}
let mut ipa_out = ipa_out.trim_end_matches('\n').to_string();
// For French, every 'r' in the final output is replaced with 'ʁ'.
if lang == "fr" {
ipa_out = ipa_out.replace('r', "ʁ");
}
Ok(ipa_out)
}
/// Translates `text` to a flat list of phoneme codes.
///
/// Words contribute their phoneme bytes (including any 0 terminator) with
/// `is_boundary = false`; each space token contributes code 15 and each clause
/// boundary contributes code 0, both with `is_boundary = true`. Other
/// punctuation is dropped. No clause-level stress pass is run here.
///
/// # Errors
/// Same failure cases as [`Translator::text_to_ipa`], with
/// `translate_to_codes:` prefixed messages for the missing-file cases.
pub fn translate_to_codes(&self, text: &str) -> Result<Vec<PhonemeCode>> {
let lang = &self.options.lang;
let dict_path = self.data_dir.join(format!("{}_dict", lang));
let phontab_path = self.data_dir.join("phontab");
if !dict_path.exists() {
return Err(Error::NotImplemented("translate_to_codes: dict not found"));
}
let dict_bytes = std::fs::read(&dict_path).map_err(Error::Io)?;
let dict = Dictionary::from_bytes(lang, dict_bytes)?;
if !phontab_path.exists() {
return Err(Error::NotImplemented(
"translate_to_codes: phontab not found",
));
}
let mut phdata = PhonemeData::load(&self.data_dir)?;
phdata.select_table_by_name(lang)?;
let stress_opts = StressOpts::for_lang(lang);
let tokens = tokenize(text);
let mut codes: Vec<PhonemeCode> = Vec::new();
for token in &tokens {
match token {
Token::Word(word) => {
let lower = word.to_lowercase();
let wr = word_to_phonemes(&lower, &dict, &phdata, &stress_opts);
for &b in &wr.phonemes {
codes.push(PhonemeCode {
code: b,
is_boundary: false,
});
}
}
Token::Space => {
codes.push(PhonemeCode {
code: 15,
is_boundary: true,
}); }
Token::ClauseBoundary(_) => {
codes.push(PhonemeCode {
code: 0,
is_boundary: true,
}); }
_ => {}
}
}
Ok(codes)
}
}
/// One element of the code stream returned by `Translator::translate_to_codes`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PhonemeCode {
// Phoneme code byte; boundary entries use 15 (space) or 0 (clause boundary).
pub code: u8,
// True for entries generated from space / clause-boundary tokens, false for word phonemes.
pub is_boundary: bool,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Data directory used by every data-dependent test. Tests that need it are
    /// skipped (return early) when the required files are not installed.
    const SYSTEM_DATA_DIR: &str = "/usr/share/espeak-ng-data";

    /// Loads the English phoneme table from `SYSTEM_DATA_DIR`, or `None` when
    /// the data is not available.
    fn make_phdata() -> Option<PhonemeData> {
        let dir = Path::new(SYSTEM_DATA_DIR);
        if !dir.join("phontab").exists() {
            return None;
        }
        let mut phdata = PhonemeData::load(dir).ok()?;
        phdata.select_table_by_name("en").ok()?;
        Some(phdata)
    }

    /// Builds an English translator bound explicitly to `SYSTEM_DATA_DIR`.
    ///
    /// Using `new_default` here would let `ESPEAK_DATA_PATH` or a local
    /// `espeak-ng-data` directory redirect the translator to a different
    /// directory than the one whose existence is being checked.
    fn system_translator() -> Option<Translator> {
        let dir = Path::new(SYSTEM_DATA_DIR);
        if !dir.join("en_dict").exists() || !dir.join("phontab").exists() {
            return None;
        }
        Translator::new("en", Some(dir)).ok()
    }

    /// Asserts the IPA for raw phoneme codes; skipped without system data.
    fn assert_codes_ipa(codes: &[u8], expected: &str) {
        if let Some(phdata) = make_phdata() {
            let (ipa, _) = phonemes_to_ipa(codes, &phdata, PendingStress::None, false);
            assert_eq!(ipa, expected);
        }
    }

    /// Asserts the IPA for a piece of text; skipped without system data.
    fn assert_text_ipa(text: &str, expected: &str) {
        if let Some(t) = system_translator() {
            assert_eq!(t.text_to_ipa(text).unwrap(), expected);
        }
    }

    #[test]
    fn translator_new_default_succeeds() {
        let t = Translator::new_default("en").unwrap();
        assert_eq!(t.options.lang, "en");
        assert_eq!(t.options.rate, 175);
    }

    #[test]
    fn tokenize_hello_world() {
        let tokens = tokenize("hello world");
        assert_eq!(
            tokens,
            vec![
                Token::Word("hello".to_string()),
                Token::Space,
                Token::Word("world".to_string()),
            ]
        );
    }

    #[test]
    fn tokenize_with_punctuation() {
        let tokens = tokenize("hello, world!");
        assert!(tokens.contains(&Token::Word("hello".to_string())));
        assert!(tokens.contains(&Token::Word("world".to_string())));
        assert!(tokens.contains(&Token::ClauseBoundary(',')));
        assert!(tokens.contains(&Token::ClauseBoundary('!')));
    }

    #[test]
    fn tokenize_empty() {
        assert!(tokenize("").is_empty());
    }

    #[test]
    fn tokenize_apostrophe() {
        let tokens = tokenize("it's");
        assert_eq!(tokens, vec![Token::Word("it's".to_string())]);
    }

    #[test]
    fn clause_flags_fields_do_not_overlap() {
        assert!((ClauseFlags::PAUSE_MASK & ClauseFlags::INTONATION_MASK).is_empty());
        assert!((ClauseFlags::INTONATION_MASK & ClauseFlags::TYPE_MASK).is_empty());
    }

    #[test]
    fn read_clauses_basic() {
        let t = Translator::new_default("en").unwrap();
        let clauses = t.read_clauses("Hello world. How are you?").unwrap();
        assert_eq!(clauses.len(), 2);
        assert_eq!(clauses[0].intonation, Intonation::FullStop);
        assert_eq!(clauses[1].intonation, Intonation::Question);
    }

    #[test]
    fn read_clauses_no_punctuation() {
        let t = Translator::new_default("en").unwrap();
        let clauses = t.read_clauses("hello world").unwrap();
        assert_eq!(clauses.len(), 1);
        assert_eq!(clauses[0].text, "hello world");
    }

    #[test]
    fn phonemes_to_ipa_the() {
        assert_codes_ipa(&[87, 115], "ðə");
    }

    #[test]
    fn phonemes_to_ipa_be() {
        assert_codes_ipa(&[72, 137], "biː");
    }

    #[test]
    fn phonemes_to_ipa_with_stress() {
        assert_codes_ipa(&[4, 50, 129, 47], "nˌɒt");
    }

    #[test]
    fn text_to_ipa_be() {
        assert_text_ipa("be", "bˈiː");
    }

    #[test]
    fn text_to_ipa_he() {
        assert_text_ipa("he", "hˈiː");
    }

    #[test]
    fn text_to_ipa_do() {
        assert_text_ipa("do", "dˈuː");
    }

    #[test]
    fn text_to_ipa_the() {
        assert_text_ipa("the", "ðˈə");
    }
}