use std::collections::HashSet;
use std::sync::LazyLock;
use super::token_map::token_to_pua;
use super::{Phonemizer, ProsodyFeature, ProsodyInfo};
use crate::config::PhonemeIdMap;
use crate::error::PiperError;
// IPA characters emitted by the G2P rules.
const IPA_BETA: char = '\u{03B2}'; // β — lenited b/v between vowels
const IPA_ETH: char = '\u{00F0}'; // ð — lenited d between vowels
const IPA_G: char = '\u{0261}'; // ɡ — voiced velar stop
const IPA_GAMMA: char = '\u{0263}'; // ɣ — lenited g between vowels
const IPA_PALATAL_NASAL: char = '\u{0272}'; // ɲ — for ñ
const IPA_TAP: char = '\u{027E}'; // ɾ — single (tapped) r
const IPA_PALATAL_FRIC: char = '\u{029D}'; // ʝ — for ll/y (yeísmo)
const IPA_STRESS: char = '\u{02C8}'; // ˈ — primary stress marker
// Private-use-area placeholders for phonemes that need more than one IPA char.
// NOTE(review): values are presumably aligned with super::token_map — confirm.
const PUA_RR: char = '\u{E01D}'; // trilled rr
const PUA_TCH: char = '\u{E054}'; // ch affricate
/// True for punctuation characters kept as standalone tokens, including the
/// inverted Spanish marks ¡ and ¿.
fn is_punctuation(c: char) -> bool {
    match c {
        ',' | '.' | ';' | ':' | '!' | '?' => true,
        '\u{00A1}' | '\u{00BF}' => true,
        _ => false,
    }
}
/// True for the five plain (unaccented) Spanish vowels.
fn is_vowel(c: char) -> bool {
    "aeiou".contains(c)
}
/// Strong (open) vowels: two adjacent strong vowels form a hiatus.
fn is_strong_vowel(c: char) -> bool {
    c == 'a' || c == 'e' || c == 'o'
}
/// Weak (close) vowels: combine with another vowel into a diphthong
/// unless they carry a written accent.
fn is_weak_vowel(c: char) -> bool {
    c == 'i' || c == 'u'
}
/// Strip an acute accent or diaeresis from a lowercase vowel; any other
/// character is returned unchanged.
fn accent_base(c: char) -> char {
    match c {
        '\u{00E1}' => 'a',
        '\u{00E9}' => 'e',
        '\u{00ED}' => 'i',
        '\u{00F3}' => 'o',
        '\u{00FA}' | '\u{00FC}' => 'u',
        other => other,
    }
}
/// True for a vowel carrying a written acute accent (á é í ó ú).
/// Note: ü (diaeresis) does NOT mark stress and is excluded here.
fn has_stress_accent(c: char) -> bool {
    ['\u{00E1}', '\u{00E9}', '\u{00ED}', '\u{00F3}', '\u{00FA}'].contains(&c)
}
/// True for any vowel grapheme: plain, acute-accented, or ü.
fn is_vowel_or_accented(c: char) -> bool {
    if c == '\u{00FC}' {
        return true;
    }
    is_vowel(c) || has_stress_accent(c)
}
/// True for characters that can appear inside a normalized (lowercased)
/// Spanish word: ASCII a-z plus ñ and the accented vowels.
fn is_spanish_alpha(c: char) -> bool {
    c.is_ascii_lowercase()
        || matches!(
            c,
            '\u{00F1}' | '\u{00E1}' | '\u{00E9}' | '\u{00ED}' | '\u{00F3}' | '\u{00FA}' | '\u{00FC}'
        )
}
/// Lowercase a character for Spanish text: ASCII letters via the standard
/// library, plus the Latin-1 accented vowels, Ü and Ñ. Anything else is
/// returned unchanged.
fn to_lower_sp(c: char) -> char {
    if c.is_ascii_uppercase() {
        // Idiomatic std call instead of the previous `(c as u8 + 32) as char`
        // byte arithmetic; identical result for A-Z.
        return c.to_ascii_lowercase();
    }
    match c {
        '\u{00C1}' => '\u{00E1}', // Á -> á
        '\u{00C9}' => '\u{00E9}', // É -> é
        '\u{00CD}' => '\u{00ED}', // Í -> í
        '\u{00D3}' => '\u{00F3}', // Ó -> ó
        '\u{00DA}' => '\u{00FA}', // Ú -> ú
        '\u{00DC}' => '\u{00FC}', // Ü -> ü
        '\u{00D1}' => '\u{00F1}', // Ñ -> ñ
        _ => c,
    }
}
/// Pre-compose base + combining-mark pairs into single characters.
///
/// Handles exactly the marks Spanish needs: combining acute (U+0301) on
/// vowels, combining diaeresis (U+0308) on u/U, and combining tilde
/// (U+0303) on n/N. Pairs that do not compose are passed through untouched.
fn collapse_combiners(cps: &[char]) -> Vec<char> {
    if cps.len() < 2 {
        return cps.to_vec();
    }
    let mut out = Vec::with_capacity(cps.len());
    let mut iter = cps.iter().copied().peekable();
    while let Some(base) = iter.next() {
        let composed = match iter.peek().copied() {
            Some('\u{0301}') => match base {
                'A' => Some('\u{00C1}'),
                'a' => Some('\u{00E1}'),
                'E' => Some('\u{00C9}'),
                'e' => Some('\u{00E9}'),
                'I' => Some('\u{00CD}'),
                'i' => Some('\u{00ED}'),
                'O' => Some('\u{00D3}'),
                'o' => Some('\u{00F3}'),
                'U' => Some('\u{00DA}'),
                'u' => Some('\u{00FA}'),
                _ => None,
            },
            Some('\u{0308}') => match base {
                'U' => Some('\u{00DC}'),
                'u' => Some('\u{00FC}'),
                _ => None,
            },
            Some('\u{0303}') => match base {
                'N' => Some('\u{00D1}'),
                'n' => Some('\u{00F1}'),
                _ => None,
            },
            _ => None,
        };
        match composed {
            Some(c) => {
                // Consume the combining mark along with its base.
                iter.next();
                out.push(c);
            }
            None => out.push(base),
        }
    }
    out
}
/// Normalize input text for the phonemizer: pre-compose combining accents,
/// then lowercase every character (Spanish-aware).
fn normalize(text: &str) -> Vec<char> {
    let raw: Vec<char> = text.chars().collect();
    let composed = collapse_combiners(&raw);
    composed.into_iter().map(to_lower_sp).collect()
}
/// A lexical token produced by `tokenize` from normalized text.
#[derive(Debug)]
enum Token {
    /// A maximal run of Spanish letters (already lowercased).
    Word(Vec<char>),
    /// A maximal run of punctuation characters.
    Punct(Vec<char>),
}
/// Split normalized characters into word and punctuation tokens.
/// Any character that is neither (whitespace, digits, symbols) is dropped.
fn tokenize(cps: &[char]) -> Vec<Token> {
    let mut tokens = Vec::new();
    let mut i = 0;
    while i < cps.len() {
        if is_spanish_alpha(cps[i]) {
            let start = i;
            while i < cps.len() && is_spanish_alpha(cps[i]) {
                i += 1;
            }
            tokens.push(Token::Word(cps[start..i].to_vec()));
        } else if is_punctuation(cps[i]) {
            let start = i;
            while i < cps.len() && is_punctuation(cps[i]) {
                i += 1;
            }
            tokens.push(Token::Punct(cps[start..i].to_vec()));
        } else {
            // Separator or unsupported character: skip it.
            i += 1;
        }
    }
    tokens
}
/// Closed set of short Spanish function words (articles, clitic pronouns,
/// common prepositions/conjunctions) pronounced without lexical stress;
/// no stress marker is inserted for these.
static UNSTRESSED_FUNCTION_WORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    [
        "el", "la", "los", "las", "un", "una", "de", "del", "al", "a", "en", "con", "por", "y",
        "o", "que", "se", "me", "te", "le", "lo", "nos", "su", "mi", "tu", "es", "no", "si",
    ]
    .into_iter()
    .collect()
});
/// Collect a char slice into an owned `String`.
fn chars_to_string(chars: &[char]) -> String {
    String::from_iter(chars.iter().copied())
}
/// One spelling unit of a word: a single letter, or a two-letter unit
/// ("qu", "ch", "ll", "rr", "gu"/"gü" before e/i, "sc"/"xc" before e/i).
#[derive(Debug, Clone)]
struct GUnit {
    // Original characters, accents preserved.
    chars: Vec<char>,
    // True for a single-vowel unit (accented vowels included).
    is_vowel: bool,
    // True for silent 'h', skipped by syllabification.
    is_silent: bool,
}
fn segment_graphemes(word: &[char]) -> Vec<GUnit> {
let bw: Vec<char> = word.iter().map(|&c| accent_base(c)).collect();
let mut units = Vec::new();
let n = word.len();
let mut i = 0;
while i < n {
let bc = bw[i];
if bc == 'q' && i + 1 < n && bw[i + 1] == 'u' {
units.push(GUnit {
chars: vec![word[i], word[i + 1]],
is_vowel: false,
is_silent: false,
});
i += 2;
continue;
}
if bc == 'g'
&& i + 1 < n
&& word[i + 1] == '\u{00FC}'
&& i + 2 < n
&& (bw[i + 2] == 'e' || bw[i + 2] == 'i')
{
units.push(GUnit {
chars: vec![word[i], word[i + 1]],
is_vowel: false,
is_silent: false,
});
i += 2;
continue;
}
if bc == 'g'
&& i + 1 < n
&& bw[i + 1] == 'u'
&& i + 2 < n
&& (bw[i + 2] == 'e' || bw[i + 2] == 'i')
{
units.push(GUnit {
chars: vec![word[i], word[i + 1]],
is_vowel: false,
is_silent: false,
});
i += 2;
continue;
}
if bc == 'c' && i + 1 < n && bw[i + 1] == 'h' {
units.push(GUnit {
chars: vec![word[i], word[i + 1]],
is_vowel: false,
is_silent: false,
});
i += 2;
continue;
}
if bc == 'l' && i + 1 < n && bw[i + 1] == 'l' {
units.push(GUnit {
chars: vec![word[i], word[i + 1]],
is_vowel: false,
is_silent: false,
});
i += 2;
continue;
}
if bc == 'r' && i + 1 < n && bw[i + 1] == 'r' {
units.push(GUnit {
chars: vec![word[i], word[i + 1]],
is_vowel: false,
is_silent: false,
});
i += 2;
continue;
}
if bc == 's'
&& i + 1 < n
&& bw[i + 1] == 'c'
&& i + 2 < n
&& (bw[i + 2] == 'e' || bw[i + 2] == 'i')
{
units.push(GUnit {
chars: vec![word[i], word[i + 1]],
is_vowel: false,
is_silent: false,
});
i += 2;
continue;
}
if bc == 'x'
&& i + 1 < n
&& bw[i + 1] == 'c'
&& i + 2 < n
&& (bw[i + 2] == 'e' || bw[i + 2] == 'i')
{
units.push(GUnit {
chars: vec![word[i], word[i + 1]],
is_vowel: false,
is_silent: false,
});
i += 2;
continue;
}
if bc == 'h' {
units.push(GUnit {
chars: vec![word[i]],
is_vowel: false,
is_silent: true,
});
i += 1;
continue;
}
if is_vowel(bc) {
units.push(GUnit {
chars: vec![word[i]],
is_vowel: true,
is_silent: false,
});
i += 1;
continue;
}
units.push(GUnit {
chars: vec![word[i]],
is_vowel: false,
is_silent: false,
});
i += 1;
}
units
}
/// True when consonants `c1` + `c2` form an inseparable onset cluster
/// (stop/f + liquid, e.g. "pl", "br") that must start a syllable together.
fn is_inseparable(c1: char, c2: char) -> bool {
    match c2 {
        'l' => matches!(c1, 'b' | 'c' | 'f' | 'g' | 'p' | 't'),
        'r' => matches!(c1, 'b' | 'c' | 'd' | 'f' | 'g' | 'p' | 't'),
        _ => false,
    }
}
/// Accent-stripped final character of a grapheme unit (the character that
/// matters for cluster rules like "bl"/"tr").
fn base_cons_of_unit(u: &GUnit) -> char {
    let last = *u.chars.last().expect("grapheme unit is never empty");
    accent_base(last)
}
/// Compute syllable start positions (indices into `units`).
///
/// Silent units (the letter h) are ignored during the scan, then the
/// resulting boundaries are mapped back to real unit indices. Rules:
/// - two strong vowels split (hiatus, "le-er"); a written accent on a weak
///   vowel also splits ("pa-ís"); other vowel pairs stay as a diphthong;
/// - one consonant between vowels starts the next syllable;
/// - of two consonants, both go to the next syllable only when they form an
///   inseparable cluster; otherwise they split between syllables;
/// - with three or more, only the final one (or final inseparable pair)
///   starts the next syllable.
///
/// Always returns at least `[0]`.
fn find_syllable_boundaries(units: &[GUnit]) -> Vec<usize> {
    // Parallel arrays over the non-silent units: original index + vowel flag.
    let mut ns_idx: Vec<usize> = Vec::new();
    let mut ns_vow: Vec<bool> = Vec::new();
    for (idx, unit) in units.iter().enumerate() {
        if unit.is_silent {
            continue;
        }
        ns_idx.push(idx);
        ns_vow.push(unit.is_vowel);
    }
    let ns_n = ns_idx.len();
    if ns_n == 0 {
        return vec![0];
    }
    let mut ns_bounds: Vec<usize> = vec![0];
    let mut i = 1;
    while i < ns_n {
        if ns_vow[i] {
            if i > 0 && ns_vow[i - 1] {
                // Adjacent vowels: decide hiatus vs diphthong on the raw
                // graphemes (accents matter here).
                let prev_g = *units[ns_idx[i - 1]].chars.last().unwrap();
                let curr_g = *units[ns_idx[i]].chars.last().unwrap();
                let prev_b = accent_base(prev_g);
                let curr_b = accent_base(curr_g);
                if is_strong_vowel(prev_b) && is_strong_vowel(curr_b) {
                    // Hiatus: strong + strong always splits.
                    ns_bounds.push(i);
                } else {
                    // Accented weak vowel breaks the diphthong.
                    if (is_weak_vowel(curr_b) && has_stress_accent(curr_g))
                        || (is_weak_vowel(prev_b) && has_stress_accent(prev_g))
                    {
                        ns_bounds.push(i);
                    }
                }
            }
            i += 1;
        } else {
            // Consume the whole consonant run, then decide where it splits.
            let cons_start = i;
            while i < ns_n && !ns_vow[i] {
                i += 1;
            }
            let cons_count = i - cons_start;
            // Only split when a vowel follows; word-final consonants stay
            // attached to the last syllable.
            if i < ns_n {
                if cons_count == 1 {
                    ns_bounds.push(cons_start);
                } else if cons_count == 2 {
                    let c1 = base_cons_of_unit(&units[ns_idx[cons_start]]);
                    let c2 = base_cons_of_unit(&units[ns_idx[cons_start + 1]]);
                    if is_inseparable(c1, c2) {
                        ns_bounds.push(cons_start);
                    } else {
                        ns_bounds.push(cons_start + 1);
                    }
                } else {
                    // Three or more: check the trailing pair only.
                    let c1 = base_cons_of_unit(&units[ns_idx[i - 2]]);
                    let c2 = base_cons_of_unit(&units[ns_idx[i - 1]]);
                    if is_inseparable(c1, c2) {
                        ns_bounds.push(i - 2);
                    } else {
                        ns_bounds.push(i - 1);
                    }
                }
            }
        }
    }
    // Map non-silent positions back to real unit indices.
    ns_bounds.iter().map(|&b| ns_idx[b]).collect()
}
/// Character index of the first acute-accented vowel in the word, if any.
fn find_accent_index(word: &[char]) -> Option<usize> {
    word.iter()
        .enumerate()
        .find_map(|(i, &c)| has_stress_accent(c).then_some(i))
}
/// Index (into `boundaries`) of the syllable carrying primary stress.
///
/// A written accent always wins. Otherwise the default rules apply: words
/// ending in a vowel, 'n' or 's' stress the penultimate syllable; all
/// other endings stress the final syllable.
fn get_stressed_syllable(word: &[char], units: &[GUnit], boundaries: &[usize]) -> usize {
    let num_syl = boundaries.len();
    if num_syl == 0 {
        return 0;
    }
    if let Some(acc_idx) = find_accent_index(word) {
        // Map the accented character position to its grapheme unit...
        let mut char_off = 0usize;
        let mut acc_unit_idx = 0usize;
        for (uid, unit) in units.iter().enumerate() {
            let u_len = unit.chars.len();
            if char_off <= acc_idx && acc_idx < char_off + u_len {
                acc_unit_idx = uid;
                break;
            }
            char_off += u_len;
        }
        // ...then find the last syllable starting at or before that unit.
        for s in (0..num_syl).rev() {
            if boundaries[s] <= acc_unit_idx {
                return s;
            }
        }
        return 0;
    }
    if num_syl == 1 {
        return 0;
    }
    let last = accent_base(*word.last().unwrap());
    if is_vowel(last) || last == 'n' || last == 's' {
        // "Llana": penultimate stress.
        num_syl.saturating_sub(2)
    } else {
        // "Aguda": final stress.
        num_syl - 1
    }
}
/// Output of `g2p_word`: phonemes plus the syllabification data needed to
/// place the stress marker afterwards.
struct G2PResult {
    // Phoneme characters, without a stress marker yet.
    phonemes: Vec<char>,
    // Index into `boundaries` of the stressed syllable.
    stressed_syl: usize,
    // Grapheme units of the word.
    units: Vec<GUnit>,
    // Unit indices at which each syllable starts.
    boundaries: Vec<usize>,
}
/// Grapheme-to-phoneme conversion for one normalized (lowercase,
/// pre-composed) Spanish word.
///
/// Implements Latin-American pronunciation: seseo (z and c-before-e/i
/// become /s/) and yeísmo (ll and non-final y become ʝ), plus the standard
/// stop-vs-fricative allophony for b/v, d and g. Branch order matters:
/// digraphs are matched before their single-letter fallbacks. Characters
/// with no rule are silently skipped.
fn g2p_word(word: &[char]) -> G2PResult {
    let mut ph: Vec<char> = Vec::new();
    let n = word.len();
    // Accent-stripped view: rules match base letters; accents only matter for stress.
    let bw: Vec<char> = word.iter().map(|&c| accent_base(c)).collect();
    let prev_is_vowel = |idx: usize| -> bool { idx > 0 && is_vowel_or_accented(word[idx - 1]) };
    let is_after_nasal =
        |idx: usize| -> bool { idx > 0 && (bw[idx - 1] == 'm' || bw[idx - 1] == 'n') };
    let is_word_initial = |idx: usize| -> bool { idx == 0 };
    let mut i = 0;
    while i < n {
        let bc = bw[i];
        // Vowels map to themselves (accent already stripped).
        if is_vowel(bc) {
            ph.push(bc);
            i += 1;
            continue;
        }
        // qu -> /k/ with silent u.
        if bc == 'q' && i + 1 < n && bw[i + 1] == 'u' {
            ph.push('k');
            i += 2;
            continue;
        }
        // ch -> affricate (PUA placeholder).
        if bc == 'c' && i + 1 < n && bw[i + 1] == 'h' {
            ph.push(PUA_TCH);
            i += 2;
            continue;
        }
        // ll -> ʝ (yeísmo).
        if bc == 'l' && i + 1 < n && bw[i + 1] == 'l' {
            ph.push(IPA_PALATAL_FRIC);
            i += 2;
            continue;
        }
        // rr -> trill (PUA placeholder).
        if bc == 'r' && i + 1 < n && bw[i + 1] == 'r' {
            ph.push(PUA_RR);
            i += 2;
            continue;
        }
        // güe/güi: diaeresis makes the u audible -> /gw/.
        if bc == 'g'
            && i + 1 < n
            && word[i + 1] == '\u{00FC}'
            && i + 2 < n
            && (bw[i + 2] == 'e' || bw[i + 2] == 'i')
        {
            ph.push(IPA_G);
            ph.push('w');
            i += 2;
            continue;
        }
        // gue/gui: silent u; hard g, lenited to ɣ between vowels.
        if bc == 'g'
            && i + 1 < n
            && bw[i + 1] == 'u'
            && i + 2 < n
            && (bw[i + 2] == 'e' || bw[i + 2] == 'i')
        {
            if prev_is_vowel(i) && !is_after_nasal(i) {
                ph.push(IPA_GAMMA);
            } else {
                ph.push(IPA_G);
            }
            i += 2;
            continue;
        }
        // sce/sci: s + seseo-c collapse to a single /s/.
        if bc == 's'
            && i + 1 < n
            && bw[i + 1] == 'c'
            && i + 2 < n
            && (bw[i + 2] == 'e' || bw[i + 2] == 'i')
        {
            ph.push('s');
            i += 2;
            continue;
        }
        // b and v merge: stop at word start, after nasal or l; β elsewhere.
        if bc == 'b' || bc == 'v' {
            if is_word_initial(i) || is_after_nasal(i) || (i > 0 && bw[i - 1] == 'l') {
                ph.push('b');
            } else {
                ph.push(IPA_BETA);
            }
            i += 1;
            continue;
        }
        // c: /s/ before front vowels (seseo), /k/ otherwise.
        if bc == 'c' {
            if i + 1 < n && (bw[i + 1] == 'e' || bw[i + 1] == 'i') {
                ph.push('s');
            } else {
                ph.push('k');
            }
            i += 1;
            continue;
        }
        // d: stop at word start, after nasal or l; ð elsewhere.
        if bc == 'd' {
            if is_word_initial(i) || is_after_nasal(i) || (i > 0 && bw[i - 1] == 'l') {
                ph.push('d');
            } else {
                ph.push(IPA_ETH);
            }
            i += 1;
            continue;
        }
        if bc == 'f' {
            ph.push('f');
            i += 1;
            continue;
        }
        // g: /x/ before front vowels; otherwise stop vs ɣ by position.
        if bc == 'g' {
            if i + 1 < n && (bw[i + 1] == 'e' || bw[i + 1] == 'i') {
                ph.push('x');
            } else if is_word_initial(i) || is_after_nasal(i) || (i > 0 && bw[i - 1] == 'l') {
                ph.push(IPA_G);
            } else {
                ph.push(IPA_GAMMA);
            }
            i += 1;
            continue;
        }
        // h is silent.
        if bc == 'h' {
            i += 1;
            continue;
        }
        if bc == 'j' {
            ph.push('x');
            i += 1;
            continue;
        }
        if bc == 'k' {
            ph.push('k');
            i += 1;
            continue;
        }
        if bc == 'l' {
            ph.push('l');
            i += 1;
            continue;
        }
        if bc == 'm' {
            ph.push('m');
            i += 1;
            continue;
        }
        if bc == 'n' {
            ph.push('n');
            i += 1;
            continue;
        }
        // ñ -> ɲ.
        if bc == '\u{00F1}' {
            ph.push(IPA_PALATAL_NASAL);
            i += 1;
            continue;
        }
        if bc == 'p' {
            ph.push('p');
            i += 1;
            continue;
        }
        // Single r: trill word-initially and after l/n/s, tap elsewhere.
        if bc == 'r' {
            if is_word_initial(i) {
                ph.push(PUA_RR);
            } else if i > 0 && (bw[i - 1] == 'l' || bw[i - 1] == 'n' || bw[i - 1] == 's') {
                ph.push(PUA_RR);
            } else {
                ph.push(IPA_TAP);
            }
            i += 1;
            continue;
        }
        if bc == 's' {
            ph.push('s');
            i += 1;
            continue;
        }
        if bc == 't' {
            ph.push('t');
            i += 1;
            continue;
        }
        if bc == 'w' {
            ph.push('w');
            i += 1;
            continue;
        }
        // x -> /ks/; in "xce"/"xci" the c is absorbed (e.g. "excelente").
        if bc == 'x' {
            if i + 1 < n && bw[i + 1] == 'c' && i + 2 < n && (bw[i + 2] == 'e' || bw[i + 2] == 'i')
            {
                ph.push('k');
                ph.push('s');
                i += 2;
                continue;
            }
            ph.push('k');
            ph.push('s');
            i += 1;
            continue;
        }
        // y: vowel /i/ word-finally ("hoy"), consonant ʝ elsewhere.
        if bc == 'y' {
            if i == n - 1 {
                ph.push('i');
            } else {
                ph.push(IPA_PALATAL_FRIC);
            }
            i += 1;
            continue;
        }
        // z -> /s/ (seseo).
        if bc == 'z' {
            ph.push('s');
            i += 1;
            continue;
        }
        // Unknown character: skip.
        i += 1;
    }
    let units = segment_graphemes(word);
    let boundaries = find_syllable_boundaries(&units);
    let stressed_syl = get_stressed_syllable(word, &units, &boundaries);
    G2PResult {
        phonemes: ph,
        stressed_syl,
        units,
        boundaries,
    }
}
/// Number of phonemes a grapheme unit produces in `g2p_word`, used to map
/// a unit index to a position in the phoneme vector.
///
/// Silent 'h' yields none; "gü" yields two (g + w); 'x' and "xc" yield two
/// (k + s); every other unit — including the digraphs — yields exactly one.
fn phoneme_count_for_unit(unit: &GUnit) -> usize {
    let b0 = accent_base(unit.chars[0]);
    match unit.chars.len() {
        1 => match b0 {
            'h' => 0,
            'x' => 2,
            _ => 1,
        },
        2 if b0 == 'g' && unit.chars[1] == '\u{00FC}' => 2,
        2 if b0 == 'x' && accent_base(unit.chars[1]) == 'c' => 2,
        _ => 1,
    }
}
fn insert_stress_marker(
phonemes: &mut Vec<char>,
units: &[GUnit],
boundaries: &[usize],
stressed_syl: usize,
) {
if phonemes.is_empty() || boundaries.is_empty() {
return;
}
if stressed_syl >= boundaries.len() {
return;
}
let num_units = units.len();
let syl_start = boundaries[stressed_syl];
let syl_end = if stressed_syl + 1 < boundaries.len() {
boundaries[stressed_syl + 1]
} else {
num_units
};
let stressed_unit_idx = units[syl_start..syl_end.min(num_units)]
.iter()
.enumerate()
.find(|(_, u)| u.is_vowel)
.map(|(offset, _)| syl_start + offset);
let stressed_unit_idx = match stressed_unit_idx {
Some(idx) => idx,
None => return,
};
let mut ph_i = 0usize;
for (uid, unit) in units.iter().enumerate() {
if uid == stressed_unit_idx {
phonemes.insert(ph_i, IPA_STRESS);
return;
}
ph_i += phoneme_count_for_unit(unit);
}
}
/// Replace any phoneme token that has a private-use-area equivalent
/// (per `token_to_pua`) with that single PUA character; other tokens
/// pass through unchanged.
fn map_sequence(tokens: Vec<String>) -> Vec<String> {
    let mut mapped = Vec::with_capacity(tokens.len());
    for token in tokens {
        match token_to_pua(&token) {
            Some(pua_char) => mapped.push(pua_char.to_string()),
            None => mapped.push(token),
        }
    }
    mapped
}
/// Phonemize Spanish `text` into parallel vectors of phoneme strings and
/// per-phoneme prosody info (always `Some` for this phonemizer).
///
/// Punctuation characters pass through as standalone tokens with zeroed
/// prosody; a single space token separates consecutive words. Content
/// words get a stress marker inserted (function words do not). Prosody
/// fields: `a2 == 2` on the stress marker and on the vowel right after it;
/// `a3` is the word's phoneme count excluding the stress marker.
pub fn phonemize_spanish_with_prosody(text: &str) -> (Vec<String>, Vec<Option<ProsodyInfo>>) {
    let cps = normalize(text);
    let tokens = tokenize(&cps);
    if tokens.is_empty() {
        return (Vec::new(), Vec::new());
    }
    let mut phonemes: Vec<String> = Vec::new();
    let mut prosody_list: Vec<Option<ProsodyInfo>> = Vec::new();
    // Becomes true after the first word so spaces only appear between words.
    let mut need_space = false;
    for tok in &tokens {
        match tok {
            Token::Punct(chars) => {
                for &c in chars {
                    phonemes.push(c.to_string());
                    prosody_list.push(Some(ProsodyInfo {
                        a1: 0,
                        a2: 0,
                        a3: 0,
                    }));
                }
            }
            Token::Word(chars) => {
                if need_space {
                    phonemes.push(" ".to_string());
                    prosody_list.push(Some(ProsodyInfo {
                        a1: 0,
                        a2: 0,
                        a3: 0,
                    }));
                }
                let mut res = g2p_word(chars);
                let word_utf8 = chars_to_string(chars);
                let is_function = UNSTRESSED_FUNCTION_WORDS.contains(word_utf8.as_str());
                // Function words carry no lexical stress.
                if !is_function {
                    insert_stress_marker(
                        &mut res.phonemes,
                        &res.units,
                        &res.boundaries,
                        res.stressed_syl,
                    );
                }
                // Word length for prosody, not counting the stress marker.
                let word_phoneme_count =
                    res.phonemes.iter().filter(|&&c| c != IPA_STRESS).count() as i32;
                for (idx, &ph_char) in res.phonemes.iter().enumerate() {
                    if ph_char == IPA_STRESS {
                        phonemes.push(IPA_STRESS.to_string());
                        prosody_list.push(Some(ProsodyInfo {
                            a1: 0,
                            a2: 2,
                            a3: word_phoneme_count,
                        }));
                    } else {
                        // The vowel directly after the marker is the stressed one.
                        let is_stressed_vowel =
                            idx > 0 && res.phonemes[idx - 1] == IPA_STRESS && is_vowel(ph_char);
                        let a2 = if is_stressed_vowel { 2 } else { 0 };
                        phonemes.push(ph_char.to_string());
                        prosody_list.push(Some(ProsodyInfo {
                            a1: 0,
                            a2,
                            a3: word_phoneme_count,
                        }));
                    }
                }
                need_space = true;
            }
        }
    }
    // Swap in PUA placeholders for tokens that have them (see token_map).
    let mapped = map_sequence(phonemes);
    (mapped, prosody_list)
}
/// Phonemize Spanish text, discarding the prosody information.
pub fn phonemize_spanish(text: &str) -> Vec<String> {
    phonemize_spanish_with_prosody(text).0
}
/// Rule-based Spanish phonemizer (Latin-American pronunciation: seseo,
/// yeísmo). Stateless; all work happens in the free functions above.
pub struct SpanishPhonemizer;
impl SpanishPhonemizer {
    /// Create a new phonemizer. Construction is free — the type holds no state.
    pub fn new() -> Self {
        Self
    }
}
impl Default for SpanishPhonemizer {
    /// Equivalent to `SpanishPhonemizer::new()`.
    fn default() -> Self {
        Self::new()
    }
}
impl Phonemizer for SpanishPhonemizer {
    /// Delegates to [`phonemize_spanish_with_prosody`]; this implementation
    /// never fails.
    fn phonemize_with_prosody(
        &self,
        text: &str,
    ) -> Result<(Vec<String>, Vec<Option<ProsodyInfo>>), PiperError> {
        Ok(phonemize_spanish_with_prosody(text))
    }

    /// No phonemizer-local id map; the caller's map is used instead.
    fn get_phoneme_id_map(&self) -> Option<&PhonemeIdMap> {
        None
    }

    /// Interleave a pad id between consecutive phoneme ids and wrap the
    /// sequence with BOS/EOS, keeping the prosody vector aligned (`None`
    /// at every inserted position). Falls back to ids 0/1/2 for the
    /// "_", "^", "$" symbols when the map lacks them.
    fn post_process_ids(
        &self,
        ids: Vec<i64>,
        prosody: Vec<Option<ProsodyFeature>>,
        id_map: &PhonemeIdMap,
    ) -> (Vec<i64>, Vec<Option<ProsodyFeature>>) {
        let lookup = |sym: &str, fallback: i64| -> i64 {
            id_map
                .get(sym)
                .and_then(|v| v.first().copied())
                .unwrap_or(fallback)
        };
        let pad_id = lookup("_", 0);
        let bos_id = lookup("^", 1);
        let eos_id = lookup("$", 2);
        let cap = ids.len() * 2 + 2;
        let mut out_ids: Vec<i64> = Vec::with_capacity(cap);
        let mut out_prosody: Vec<Option<ProsodyFeature>> = Vec::with_capacity(cap);
        out_ids.push(bos_id);
        out_prosody.push(None);
        for (i, &id) in ids.iter().enumerate() {
            if i != 0 {
                out_ids.push(pad_id);
                out_prosody.push(None);
            }
            out_ids.push(id);
            // Prosody may be shorter than ids; missing entries become None.
            out_prosody.push(prosody.get(i).cloned().flatten());
        }
        out_ids.push(eos_id);
        out_prosody.push(None);
        (out_ids, out_prosody)
    }

    fn language_code(&self) -> &str {
        "es"
    }
}
// Unit tests: phoneme rules (seseo, yeísmo, allophony), stress placement,
// prosody alignment, and the id post-processing contract.
#[cfg(test)]
mod tests {
    use super::*;

    // Shorthand helpers for the two public entry points.
    fn ph(text: &str) -> Vec<String> {
        phonemize_spanish(text)
    }
    fn ph_with_prosody(text: &str) -> (Vec<String>, Vec<Option<ProsodyInfo>>) {
        phonemize_spanish_with_prosody(text)
    }

    #[test]
    fn test_simple_word_hola() {
        let result = ph("hola");
        let stress = IPA_STRESS.to_string();
        assert!(
            result.contains(&stress),
            "should have stress marker: {:?}",
            result
        );
        assert!(result.contains(&"o".to_string()));
        assert!(result.contains(&"l".to_string()));
        assert!(result.contains(&"a".to_string()));
        assert!(
            !result.iter().any(|s| s == "h"),
            "h should be silent: {:?}",
            result
        );
    }

    #[test]
    fn test_seseo_c_before_e_and_z() {
        let result_ce = ph("ce");
        assert!(
            result_ce.contains(&"s".to_string()),
            "c before e -> s: {:?}",
            result_ce
        );
        let result_z = ph("zapato");
        assert!(
            result_z.contains(&"s".to_string()),
            "z -> s: {:?}",
            result_z
        );
        assert!(
            !result_z.iter().any(|s| s == "z"),
            "z should not appear: {:?}",
            result_z
        );
    }

    #[test]
    fn test_ch_affricate() {
        let result = ph("chico");
        let tch_str = PUA_TCH.to_string();
        assert!(result.contains(&tch_str), "ch -> PUA_TCH: {:?}", result);
    }

    #[test]
    fn test_ll_yeismo() {
        let result = ph("calle");
        let palatal = IPA_PALATAL_FRIC.to_string();
        assert!(result.contains(&palatal), "ll -> ʝ: {:?}", result);
    }

    #[test]
    fn test_rr_trill_and_word_initial_r() {
        let result_rr = ph("perro");
        let rr_str = PUA_RR.to_string();
        assert!(result_rr.contains(&rr_str), "rr -> PUA_RR: {:?}", result_rr);
        let result_r = ph("rosa");
        assert!(
            result_r.contains(&rr_str),
            "word-initial r -> PUA_RR: {:?}",
            result_r
        );
    }

    #[test]
    fn test_ntilde_palatal_nasal() {
        let result = ph("niño");
        let palatal = IPA_PALATAL_NASAL.to_string();
        assert!(result.contains(&palatal), "ñ -> ɲ: {:?}", result);
    }

    // Lenition: voiced stops weaken to fricatives between vowels.
    #[test]
    fn test_intervocalic_allophony_b_d_g() {
        assert!(
            ph("lobo").contains(&IPA_BETA.to_string()),
            "intervocalic b -> β"
        );
        assert!(
            ph("todo").contains(&IPA_ETH.to_string()),
            "intervocalic d -> ð"
        );
        assert!(
            ph("lago").contains(&IPA_GAMMA.to_string()),
            "intervocalic g -> ɣ"
        );
    }

    #[test]
    fn test_stress_penultimate_and_final() {
        let stress = IPA_STRESS.to_string();
        assert!(ph("casa").contains(&stress), "penultimate stress for casa");
        assert!(ph("ciudad").contains(&stress), "final stress for ciudad");
        assert!(
            ph("teléfono").contains(&stress),
            "accent mark stress for teléfono"
        );
    }

    #[test]
    fn test_function_word_no_stress() {
        let stress = IPA_STRESS.to_string();
        assert!(!ph("el").contains(&stress), "function word 'el' no stress");
        assert!(!ph("de").contains(&stress), "function word 'de' no stress");
    }

    #[test]
    fn test_non_function_word_has_stress() {
        let stress = IPA_STRESS.to_string();
        assert!(ph("sol").contains(&stress), "content word 'sol' has stress");
    }

    #[test]
    fn test_punctuation_preserved() {
        let result = ph("¡hola!");
        assert!(result.contains(&"\u{00A1}".to_string()), "¡ preserved");
        assert!(result.contains(&"!".to_string()), "! preserved");
    }

    #[test]
    fn test_prosody_length_matches_phonemes() {
        let (phonemes, prosody) = ph_with_prosody("hola mundo");
        assert_eq!(phonemes.len(), prosody.len());
    }

    #[test]
    fn test_prosody_stress_a2() {
        let (phonemes, prosody) = ph_with_prosody("casa");
        let stress = IPA_STRESS.to_string();
        if let Some(pos) = phonemes.iter().position(|s| s == &stress) {
            let pi = prosody[pos].unwrap();
            assert_eq!(pi.a2, 2, "stress marker should have a2=2");
        }
    }

    #[test]
    fn test_post_process_ids_bos_eos_padding() {
        let phonemizer = SpanishPhonemizer::new();
        let mut id_map: PhonemeIdMap = std::collections::HashMap::new();
        id_map.insert("_".to_string(), vec![0]);
        id_map.insert("^".to_string(), vec![1]);
        id_map.insert("$".to_string(), vec![2]);
        let ids = vec![10, 20, 30];
        let prosody: Vec<Option<ProsodyFeature>> =
            vec![Some([0, 0, 3]), Some([0, 2, 3]), Some([0, 0, 3])];
        let (out_ids, out_prosody) = phonemizer.post_process_ids(ids, prosody, &id_map);
        // 3 ids + 2 interleaved pads + BOS + EOS = 7.
        assert_eq!(out_ids.len(), 7);
        assert_eq!(out_ids[0], 1, "BOS");
        assert_eq!(out_ids[1], 10);
        assert_eq!(out_ids[2], 0, "pad");
        assert_eq!(out_ids[3], 20);
        assert_eq!(out_ids[4], 0, "pad");
        assert_eq!(out_ids[5], 30);
        assert_eq!(out_ids[6], 2, "EOS");
        assert_eq!(out_prosody.len(), 7);
    }

    #[test]
    fn test_language_code() {
        assert_eq!(SpanishPhonemizer::new().language_code(), "es");
    }

    #[test]
    fn test_uppercase_normalized() {
        assert_eq!(ph("HOLA"), ph("hola"), "uppercase normalizes to lowercase");
    }

    #[test]
    fn test_qu_produces_k() {
        let result = ph("queso");
        assert!(result.contains(&"k".to_string()), "qu -> k: {:?}", result);
    }

    #[test]
    fn test_gu_before_e_silent_u() {
        let result = ph("guerra");
        let g_str = IPA_G.to_string();
        assert!(result.contains(&g_str), "gu before e -> g: {:?}", result);
    }

    #[test]
    fn test_j_and_g_before_e_produce_x() {
        assert!(ph("jardín").contains(&"x".to_string()), "j -> x");
        assert!(ph("gente").contains(&"x".to_string()), "g before e -> x");
    }

    #[test]
    fn test_word_final_y_vowel() {
        let result = ph("hoy");
        assert!(
            result.contains(&"i".to_string()),
            "word-final y -> i: {:?}",
            result
        );
    }

    #[test]
    fn test_x_produces_ks() {
        let result = ph("examen");
        assert!(result.iter().any(|s| s == "k"), "x -> k: {:?}", result);
        assert!(result.iter().any(|s| s == "s"), "x -> s: {:?}", result);
    }

    #[test]
    fn test_v_same_as_b_word_initial() {
        assert!(ph("vino").contains(&"b".to_string()), "word-initial v -> b");
    }

    #[test]
    fn test_empty_text() {
        assert!(ph("").is_empty());
    }

    #[test]
    fn test_space_between_words() {
        assert!(
            ph("el sol").contains(&" ".to_string()),
            "space between words"
        );
    }

    #[test]
    fn test_b_after_nasal_is_stop() {
        let result = ph("amba");
        assert!(result.contains(&"b".to_string()), "b after nasal -> stop");
        assert!(
            !result.contains(&IPA_BETA.to_string()),
            "b after nasal NOT β"
        );
    }

    #[test]
    fn test_r_after_n_is_trill() {
        let result = ph("enrique");
        assert!(
            result.contains(&PUA_RR.to_string()),
            "r after n -> trill: {:?}",
            result
        );
    }

    #[test]
    fn test_multiple_words_sentence() {
        let (phonemes, prosody) = ph_with_prosody("hola, como estas");
        assert_eq!(phonemes.len(), prosody.len());
        let stress = IPA_STRESS.to_string();
        let stress_count = phonemes.iter().filter(|s| **s == stress).count();
        assert!(
            stress_count >= 2,
            "multiple content words have stress: {:?}",
            phonemes
        );
    }

    #[test]
    fn test_sc_before_e_produces_single_s() {
        let result = ph("escena");
        assert!(
            result.iter().any(|s| s == "s"),
            "sc before e -> s: {:?}",
            result
        );
    }
}