use super::token_map::token_to_pua;
use super::{Phonemizer, ProsodyFeature, ProsodyInfo};
use crate::config::PhonemeIdMap;
use crate::error::PiperError;
// Private-Use-Area code points standing in for multi-character phoneme
// tokens; `map_sequence`/`token_to_pua` resolves them to model tokens.
const PUA_Y_VOWEL: char = '\u{E01E}'; // /y/ as in "tu" (see test_y_vowel_pua)
const PUA_NASAL_EIN: char = '\u{E056}'; // nasal vowel of "vin" / "un"
const PUA_NASAL_AN: char = '\u{E057}'; // nasal vowel of "an" / "en"
const PUA_NASAL_ON: char = '\u{E058}'; // nasal vowel of "bon"
// IPA characters emitted directly by `convert_word`.
const IPA_OPEN_E: char = '\u{025B}'; // ɛ
const IPA_OPEN_O: char = '\u{0254}'; // ɔ
const IPA_SCHWA: char = '\u{0259}'; // ə
const IPA_VOICED_G: char = '\u{0261}'; // ɡ
const IPA_ESH: char = '\u{0283}'; // ʃ
const IPA_EZH: char = '\u{0292}'; // ʒ
const IPA_UVULAR_R: char = '\u{0281}'; // ʁ
const IPA_PALATAL_N: char = '\u{0272}'; // ɲ
const IPA_TURNED_H: char = '\u{0265}'; // ɥ
const IPA_SLASHED_O: char = '\u{00F8}'; // ø
const IPA_OE_LIG: char = '\u{0153}'; // œ
/// True for orthographic French vowels, including accented forms and œ.
fn is_vowel_char(ch: char) -> bool {
    const PLAIN: &str = "aeiouy";
    const ACCENTED: &str =
        "\u{00E0}\u{00E2}\u{00E6}\u{00E9}\u{00E8}\u{00EA}\u{00EB}\u{00EE}\u{00EF}\u{00F4}\u{00F9}\u{00FB}\u{00FC}\u{0153}";
    PLAIN.contains(ch) || ACCENTED.contains(ch)
}
/// True for plain ASCII consonant letters (ASCII lowercase minus the
/// six vowel letters a, e, i, o, u, y).
fn is_consonant_char(ch: char) -> bool {
    ch.is_ascii_lowercase() && !"aeiouy".contains(ch)
}
/// Consonants that are typically silent in word-final position.
fn is_silent_final(ch: char) -> bool {
    "dghmnpstxz".contains(ch)
}
/// Punctuation marks carried through the pipeline as their own tokens
/// (ASCII marks plus inverted marks, dashes, ellipsis, guillemets).
fn is_punctuation(ch: char) -> bool {
    ",.;:!?\u{00A1}\u{00BF}\u{2014}\u{2013}\u{2026}\u{00AB}\u{00BB}".contains(ch)
}
/// Front vowels that soften a preceding 'c' (-> /s/) or 'g' (-> /ʒ/).
fn is_front_vowel_for_cg(ch: char) -> bool {
    "eiy\u{00E9}\u{00E8}\u{00EA}\u{00EB}\u{00EE}\u{00EF}".contains(ch)
}
/// True for any lowercase letter usable in normalized French words:
/// ASCII a-z plus the accented letters, œ, ç and ñ.
fn is_letter_fr(ch: char) -> bool {
    ch.is_ascii_lowercase()
        || "\u{00E0}\u{00E2}\u{00E6}\u{00E9}\u{00E8}\u{00EA}\u{00EB}\u{00EE}\u{00EF}\u{00F4}\u{00F9}\u{00FB}\u{00FC}\u{0153}\u{00E7}\u{00F1}".contains(ch)
}
/// Exception words where "-ille" is pronounced /il/ rather than /ij/.
///
/// Compares without allocating: the previous version collected `word`
/// into a fresh `String` on every call just to run a `matches!`.
fn is_ille_as_il(word: &[char]) -> bool {
    const IL_EXCEPTIONS: &[&str] = &["ville", "mille", "tranquille"];
    IL_EXCEPTIONS
        .iter()
        .any(|w| word.iter().copied().eq(w.chars()))
}
/// Exception words (mostly loanwords) where final "-er" is pronounced
/// /ɛʁ/ instead of the regular infinitive ending /e/.
///
/// Compares without allocating: the previous version collected `word`
/// into a fresh `String` on every call just to run a `matches!`.
fn is_er_as_ehr(word: &[char]) -> bool {
    const ER_EXCEPTIONS: &[&str] = &[
        "hiver", "enfer", "amer", "cancer", "super", "laser", "hamster",
        "master", "poster", "cluster", "starter", "leader", "transfer", "fer",
    ];
    ER_EXCEPTIONS
        .iter()
        .any(|w| word.iter().copied().eq(w.chars()))
}
/// Collapses NFD-style (base letter + combining mark) pairs into the
/// precomposed Latin-1 characters used by the rest of the phonemizer.
/// Pairs with no supported composition are copied through unchanged.
fn collapse_nfd(input: &[char]) -> Vec<char> {
    // Precomposed form for a (base, combining-mark) pair, if supported.
    // Covers grave, acute, circumflex, tilde, diaeresis and cedilla for
    // the letters French orthography needs.
    fn compose(base: char, mark: char) -> Option<char> {
        let composed = match (base, mark) {
            ('A', '\u{0300}') => '\u{00C0}',
            ('a', '\u{0300}') => '\u{00E0}',
            ('E', '\u{0300}') => '\u{00C8}',
            ('e', '\u{0300}') => '\u{00E8}',
            ('U', '\u{0300}') => '\u{00D9}',
            ('u', '\u{0300}') => '\u{00F9}',
            ('E', '\u{0301}') => '\u{00C9}',
            ('e', '\u{0301}') => '\u{00E9}',
            ('A', '\u{0302}') => '\u{00C2}',
            ('a', '\u{0302}') => '\u{00E2}',
            ('E', '\u{0302}') => '\u{00CA}',
            ('e', '\u{0302}') => '\u{00EA}',
            ('I', '\u{0302}') => '\u{00CE}',
            ('i', '\u{0302}') => '\u{00EE}',
            ('O', '\u{0302}') => '\u{00D4}',
            ('o', '\u{0302}') => '\u{00F4}',
            ('U', '\u{0302}') => '\u{00DB}',
            ('u', '\u{0302}') => '\u{00FB}',
            ('N', '\u{0303}') => '\u{00D1}',
            ('n', '\u{0303}') => '\u{00F1}',
            ('E', '\u{0308}') => '\u{00CB}',
            ('e', '\u{0308}') => '\u{00EB}',
            ('I', '\u{0308}') => '\u{00CF}',
            ('i', '\u{0308}') => '\u{00EF}',
            ('U', '\u{0308}') => '\u{00DC}',
            ('u', '\u{0308}') => '\u{00FC}',
            ('C', '\u{0327}') => '\u{00C7}',
            ('c', '\u{0327}') => '\u{00E7}',
            _ => return None,
        };
        Some(composed)
    }
    let mut out = Vec::with_capacity(input.len());
    let mut idx = 0;
    while idx < input.len() {
        if idx + 1 < input.len() {
            if let Some(merged) = compose(input[idx], input[idx + 1]) {
                out.push(merged);
                idx += 2;
                continue;
            }
        }
        out.push(input[idx]);
        idx += 1;
    }
    out
}
/// Lowercases a character for French text: ASCII letters, the Latin-1
/// uppercase block, and the Œ ligature. Everything else is unchanged.
fn to_lower_fr(ch: char) -> char {
    // ASCII: use the standard-library mapping rather than the raw byte
    // arithmetic `(ch as u8 + 32) as char` of the original — identical
    // result, but the intent is explicit.
    if ch.is_ascii_uppercase() {
        return ch.to_ascii_lowercase();
    }
    let code = ch as u32;
    // Latin-1 uppercase À..Ö and Ø..Þ lowercase by adding 0x20. The gap
    // deliberately excludes 0x00D7 (×, multiplication sign), and 0x00DF
    // (ß) has no uppercase source here.
    if (0x00C0..=0x00D6).contains(&code) || (0x00D8..=0x00DE).contains(&code) {
        return char::from_u32(code + 0x20).unwrap_or(ch);
    }
    // Œ ligature lowers to œ.
    if code == 0x0152 {
        return '\u{0153}';
    }
    ch
}
/// Normalizes raw input text: composes NFD pairs, lowercases, and
/// collapses runs of whitespace into single spaces with no leading or
/// trailing space in the result.
fn normalize(text: &str) -> Vec<char> {
    let composed = collapse_nfd(&text.chars().collect::<Vec<char>>());
    let mut out: Vec<char> = Vec::with_capacity(composed.len());
    // Defer emitting a separator until the next non-space character, so
    // leading and trailing whitespace never produce a space.
    let mut pending_space = false;
    for ch in composed {
        if matches!(ch, ' ' | '\t' | '\n' | '\r') {
            pending_space = true;
            continue;
        }
        if pending_space && !out.is_empty() {
            out.push(' ');
        }
        pending_space = false;
        out.push(to_lower_fr(ch));
    }
    out
}
/// Replaces straight and curly apostrophes with spaces so that elided
/// forms like "l'ami" split into separate word tokens.
fn normalize_apostrophes(chars: &[char]) -> Vec<char> {
    chars
        .iter()
        .map(|&ch| match ch {
            '\'' | '\u{2019}' | '\u{2018}' => ' ',
            other => other,
        })
        .collect()
}
/// A unit produced by `split_words`: either a run of letters (a word)
/// or a single punctuation mark.
#[derive(Debug)]
struct Token {
    // Characters of the token (lowercase letters, or one punctuation char).
    text: Vec<char>,
    // True when this token is a single punctuation character.
    is_punct: bool,
}
/// Splits normalized text into word and punctuation tokens. Apostrophes
/// become spaces first; spaces and any unrecognized characters are
/// dropped.
fn split_words(text: &[char]) -> Vec<Token> {
    let chars = normalize_apostrophes(text);
    let len = chars.len();
    let mut tokens: Vec<Token> = Vec::new();
    let mut idx = 0;
    while idx < len {
        let c = chars[idx];
        if is_letter_fr(c) {
            // Greedily consume a maximal run of letters as one word.
            let start = idx;
            while idx < len && is_letter_fr(chars[idx]) {
                idx += 1;
            }
            tokens.push(Token {
                text: chars[start..idx].to_vec(),
                is_punct: false,
            });
        } else {
            if is_punctuation(c) {
                tokens.push(Token {
                    text: vec![c],
                    is_punct: true,
                });
            }
            idx += 1;
        }
    }
    tokens
}
/// Number of orthographic vowel letters in `word`.
fn count_vowels(word: &[char]) -> usize {
    word.iter()
        .fold(0, |acc, &ch| acc + usize::from(is_vowel_char(ch)))
}
/// Converts one normalized French word (lowercase letters only) into a
/// sequence of phoneme characters by applying ordered rewrite rules.
///
/// At each index the longest / most specific patterns are tried first;
/// a match pushes its phonemes and advances `i` past the letters it
/// consumed. Rule ORDER is significant — do not reorder. Characters
/// matching no rule are silently dropped.
fn convert_word(word: &[char]) -> Vec<char> {
    let mut phonemes: Vec<char> = Vec::new();
    let mut i = 0;
    let n = word.len();
    while i < n {
        let ch = word[i];
        // Word-final "-er" of a polysyllabic word -> /e/ ("parler"),
        // unless the word is a listed /ɛʁ/ exception ("hiver").
        if ch == 'e' && i + 1 == n - 1 && word[i + 1] == 'r' {
            let vc = count_vowels(word);
            if vc >= 2 && !is_er_as_ehr(word) {
                phonemes.push('e');
                i += 2;
                continue;
            }
        }
        // "eau" -> /o/ ("beau").
        if ch == 'e' && i + 2 < n && word[i + 1] == 'a' && word[i + 2] == 'u' {
            phonemes.push('o');
            i += 3;
            continue;
        }
        // "ouille" -> /uj/ when not followed by a vowel ("grenouille").
        if ch == 'o'
            && i + 5 < n
            && word[i + 1] == 'u'
            && word[i + 2] == 'i'
            && word[i + 3] == 'l'
            && word[i + 4] == 'l'
            && word[i + 5] == 'e'
            && (i + 6 >= n || !is_vowel_char(word[i + 6]))
        {
            phonemes.push('u');
            phonemes.push('j');
            i += 6;
            continue;
        }
        // "aille" -> /aj/ ("bataille").
        if ch == 'a'
            && i + 4 < n
            && word[i + 1] == 'i'
            && word[i + 2] == 'l'
            && word[i + 3] == 'l'
            && word[i + 4] == 'e'
            && (i + 5 >= n || !is_vowel_char(word[i + 5]))
        {
            phonemes.push('a');
            phonemes.push('j');
            i += 5;
            continue;
        }
        // Word-final "euille" -> /œj/ ("feuille").
        if ch == 'e'
            && i + 5 < n
            && word[i + 1] == 'u'
            && word[i + 2] == 'i'
            && word[i + 3] == 'l'
            && word[i + 4] == 'l'
            && word[i + 5] == 'e'
            && i + 6 >= n
        {
            phonemes.push(IPA_OE_LIG);
            phonemes.push('j');
            i += 6;
            continue;
        }
        // Word-final "eil" -> /ɛj/ ("soleil").
        if ch == 'e' && i + 2 < n && word[i + 1] == 'i' && word[i + 2] == 'l' && i + 3 >= n {
            phonemes.push(IPA_OPEN_E);
            phonemes.push('j');
            i += 3;
            continue;
        }
        // "eille" -> /ɛj/ ("abeille").
        if ch == 'e'
            && i + 4 < n
            && word[i + 1] == 'i'
            && word[i + 2] == 'l'
            && word[i + 3] == 'l'
            && word[i + 4] == 'e'
            && (i + 5 >= n || !is_vowel_char(word[i + 5]))
        {
            phonemes.push(IPA_OPEN_E);
            phonemes.push('j');
            i += 5;
            continue;
        }
        // "ain"/"aim" not before a vowel -> nasal /ɛ̃/ ("pain").
        if ch == 'a'
            && i + 2 < n
            && word[i + 1] == 'i'
            && (word[i + 2] == 'n' || word[i + 2] == 'm')
            && (i + 3 >= n || !is_vowel_char(word[i + 3]))
        {
            phonemes.push(PUA_NASAL_EIN);
            i += 3;
            continue;
        }
        // "ein"/"eim" -> nasal /ɛ̃/ ("plein").
        if ch == 'e'
            && i + 2 < n
            && word[i + 1] == 'i'
            && (word[i + 2] == 'n' || word[i + 2] == 'm')
            && (i + 3 >= n || !is_vowel_char(word[i + 3]))
        {
            phonemes.push(PUA_NASAL_EIN);
            i += 3;
            continue;
        }
        // "oin" -> /w/ + nasal /ɛ̃/ ("loin").
        if ch == 'o'
            && i + 2 < n
            && word[i + 1] == 'i'
            && word[i + 2] == 'n'
            && (i + 3 >= n || !is_vowel_char(word[i + 3]))
        {
            phonemes.push('w');
            phonemes.push(PUA_NASAL_EIN);
            i += 3;
            continue;
        }
        // "ien" -> /j/ + nasal /ɛ̃/ ("bien").
        if ch == 'i'
            && i + 2 < n
            && word[i + 1] == 'e'
            && word[i + 2] == 'n'
            && (i + 3 >= n || !is_vowel_char(word[i + 3]))
        {
            phonemes.push('j');
            phonemes.push(PUA_NASAL_EIN);
            i += 3;
            continue;
        }
        // "-tion" -> /sjɔ̃/ ("nation"), but keeps /t/ after 's'
        // ("question" -> /tjɔ̃/).
        if ch == 't'
            && i + 3 < n
            && word[i + 1] == 'i'
            && word[i + 2] == 'o'
            && word[i + 3] == 'n'
            && (i + 4 >= n || !is_vowel_char(word[i + 4]))
        {
            if i > 0 && word[i - 1] == 's' {
                phonemes.push('t');
            } else {
                phonemes.push('s');
            }
            phonemes.push('j');
            phonemes.push(PUA_NASAL_ON);
            i += 4;
            continue;
        }
        // "ille" -> /ij/ ("fille"), or /il/ for the listed exceptions
        // ("ville", "mille", "tranquille").
        if ch == 'i'
            && i + 3 < n
            && word[i + 1] == 'l'
            && word[i + 2] == 'l'
            && word[i + 3] == 'e'
            && (i + 4 >= n || !is_vowel_char(word[i + 4]))
        {
            phonemes.push('i');
            if is_ille_as_il(word) {
                phonemes.push('l');
            } else {
                phonemes.push('j');
            }
            i += 4;
            continue;
        }
        // Consonant digraphs: gn -> /ɲ/, ph -> /f/, th -> /t/,
        // ch -> /ʃ/, qu -> /k/.
        if ch == 'g' && i + 1 < n && word[i + 1] == 'n' {
            phonemes.push(IPA_PALATAL_N);
            i += 2;
            continue;
        }
        if ch == 'p' && i + 1 < n && word[i + 1] == 'h' {
            phonemes.push('f');
            i += 2;
            continue;
        }
        if ch == 't' && i + 1 < n && word[i + 1] == 'h' {
            phonemes.push('t');
            i += 2;
            continue;
        }
        if ch == 'c' && i + 1 < n && word[i + 1] == 'h' {
            phonemes.push(IPA_ESH);
            i += 2;
            continue;
        }
        if ch == 'q' && i + 1 < n && word[i + 1] == 'u' {
            phonemes.push('k');
            i += 2;
            continue;
        }
        // "gu" + front vowel -> hard /ɡ/ with a silent 'u' ("guerre").
        if ch == 'g'
            && i + 1 < n
            && word[i + 1] == 'u'
            && i + 2 < n
            && is_front_vowel_for_cg(word[i + 2])
        {
            phonemes.push(IPA_VOICED_G);
            i += 2;
            continue;
        }
        // Nasal vowels: vowel + n/m at word end, or before a consonant
        // that is not a doubling of the same n/m ("an", "simple";
        // doubled "nn"/"mm" blocks nasalization, as in "année").
        if (ch == 'a' || ch == 'e') && i + 1 < n && (word[i + 1] == 'n' || word[i + 1] == 'm') {
            if i + 2 >= n {
                phonemes.push(PUA_NASAL_AN);
                i += 2;
                continue;
            }
            if !is_vowel_char(word[i + 2]) && word[i + 2] != word[i + 1] {
                phonemes.push(PUA_NASAL_AN);
                i += 2;
                continue;
            }
        }
        if ch == 'i' && i + 1 < n && (word[i + 1] == 'n' || word[i + 1] == 'm') {
            if i + 2 >= n {
                phonemes.push(PUA_NASAL_EIN);
                i += 2;
                continue;
            }
            if !is_vowel_char(word[i + 2]) && word[i + 2] != word[i + 1] {
                phonemes.push(PUA_NASAL_EIN);
                i += 2;
                continue;
            }
        }
        if ch == 'o' && i + 1 < n && (word[i + 1] == 'n' || word[i + 1] == 'm') {
            if i + 2 >= n {
                phonemes.push(PUA_NASAL_ON);
                i += 2;
                continue;
            }
            if !is_vowel_char(word[i + 2]) && word[i + 2] != word[i + 1] {
                phonemes.push(PUA_NASAL_ON);
                i += 2;
                continue;
            }
        }
        if ch == 'u' && i + 1 < n && (word[i + 1] == 'n' || word[i + 1] == 'm') {
            if i + 2 >= n {
                phonemes.push(PUA_NASAL_EIN);
                i += 2;
                continue;
            }
            if !is_vowel_char(word[i + 2]) && word[i + 2] != word[i + 1] {
                phonemes.push(PUA_NASAL_EIN);
                i += 2;
                continue;
            }
        }
        if ch == 'y' && i + 1 < n && (word[i + 1] == 'n' || word[i + 1] == 'm') {
            if i + 2 >= n {
                phonemes.push(PUA_NASAL_EIN);
                i += 2;
                continue;
            }
            if !is_vowel_char(word[i + 2]) && word[i + 2] != word[i + 1] {
                phonemes.push(PUA_NASAL_EIN);
                i += 2;
                continue;
            }
        }
        // Vowel digraphs: ou -> /u/, au -> /o/, oi -> /wa/,
        // ai / ei -> /ɛ/.
        if ch == 'o' && i + 1 < n && word[i + 1] == 'u' {
            phonemes.push('u');
            i += 2;
            continue;
        }
        if ch == 'a' && i + 1 < n && word[i + 1] == 'u' {
            phonemes.push('o');
            i += 2;
            continue;
        }
        if ch == 'o' && i + 1 < n && word[i + 1] == 'i' {
            phonemes.push('w');
            phonemes.push('a');
            i += 2;
            continue;
        }
        if ch == 'a' && i + 1 < n && word[i + 1] == 'i' {
            phonemes.push(IPA_OPEN_E);
            i += 2;
            continue;
        }
        if ch == 'e' && i + 1 < n && word[i + 1] == 'i' {
            phonemes.push(IPA_OPEN_E);
            i += 2;
            continue;
        }
        // "eu"/"œu": open /œ/ before a pronounced consonant ("fleur"),
        // closed /ø/ otherwise ("jeu").
        if (ch == 'e' || ch == '\u{0153}') && i + 1 < n && word[i + 1] == 'u' {
            if i + 2 < n && is_consonant_char(word[i + 2]) && !is_silent_final(word[i + 2]) {
                phonemes.push(IPA_OE_LIG);
            } else {
                phonemes.push(IPA_SLASHED_O);
            }
            i += 2;
            continue;
        }
        // Accented single vowels.
        if ch == '\u{00E9}' {
            phonemes.push('e');
            i += 1;
            continue;
        }
        if ch == '\u{00E8}' || ch == '\u{00EA}' {
            phonemes.push(IPA_OPEN_E);
            i += 1;
            continue;
        }
        if ch == '\u{00EB}' {
            phonemes.push(IPA_OPEN_E);
            i += 1;
            continue;
        }
        if ch == '\u{00E0}' || ch == '\u{00E2}' {
            phonemes.push('a');
            i += 1;
            continue;
        }
        if ch == 'a' {
            phonemes.push('a');
            i += 1;
            continue;
        }
        if ch == '\u{00EE}' || ch == '\u{00EF}' {
            phonemes.push('i');
            i += 1;
            continue;
        }
        // 'i' before a vowel is the glide /j/, except word-final "ie"
        // where the 'e' is silent ("vie" -> /vi/).
        if ch == 'i' {
            if i + 1 < n && is_vowel_char(word[i + 1]) {
                if i + 1 == n - 1 && word[i + 1] == 'e' {
                    phonemes.push('i');
                } else {
                    phonemes.push('j');
                }
            } else {
                phonemes.push('i');
            }
            i += 1;
            continue;
        }
        if ch == '\u{00F4}' {
            phonemes.push('o');
            i += 1;
            continue;
        }
        // 'o': open /ɔ/ when, ignoring a silent final "e"/"es", the rest
        // of the word is all consonants with at least one pronounced
        // ("porte"); closed /o/ otherwise.
        if ch == 'o' {
            let eff_start = i + 1;
            let mut eff_end = n;
            if eff_end > eff_start {
                // Strip a silent final "es" or "e" from consideration.
                if eff_end - eff_start >= 2 && word[eff_end - 2] == 'e' && word[eff_end - 1] == 's'
                {
                    eff_end -= 2;
                } else if word[eff_end - 1] == 'e' {
                    eff_end -= 1;
                }
            }
            let mut has_effective = false;
            let mut all_consonants = true;
            let mut has_pronounced = false;
            for &c in &word[eff_start..eff_end] {
                has_effective = true;
                if !is_consonant_char(c) {
                    all_consonants = false;
                    break;
                }
                if !is_silent_final(c) {
                    has_pronounced = true;
                }
            }
            if has_effective && all_consonants && has_pronounced {
                phonemes.push(IPA_OPEN_O);
            } else {
                phonemes.push('o');
            }
            i += 1;
            continue;
        }
        // 'u' and its accented forms -> /y/ (PUA); "ui" -> /ɥi/ ("lui").
        if ch == '\u{00F9}' || ch == '\u{00FB}' {
            phonemes.push(PUA_Y_VOWEL);
            i += 1;
            continue;
        }
        if ch == '\u{00FC}' {
            phonemes.push(PUA_Y_VOWEL);
            i += 1;
            continue;
        }
        if ch == 'u' {
            if i + 1 < n && word[i + 1] == 'i' {
                phonemes.push(IPA_TURNED_H);
                phonemes.push('i');
                i += 2;
                continue;
            }
            phonemes.push(PUA_Y_VOWEL);
            i += 1;
            continue;
        }
        // 'y': glide /j/ before a vowel, /i/ otherwise.
        if ch == 'y' {
            if i + 1 < n && is_vowel_char(word[i + 1]) {
                phonemes.push('j');
            } else {
                phonemes.push('i');
            }
            i += 1;
            continue;
        }
        if ch == '\u{0153}' {
            phonemes.push(IPA_OE_LIG);
            i += 1;
            continue;
        }
        if ch == '\u{00E6}' {
            phonemes.push('e');
            i += 1;
            continue;
        }
        // Bare 'e': silent at word end; /ɛ/ before a two-consonant
        // cluster or when only pronounced consonants remain; schwa /ə/
        // otherwise.
        if ch == 'e' {
            if i == n - 1 {
                i += 1;
                continue;
            }
            let mut cons_count = 0;
            for &c in &word[(i + 1)..n] {
                if is_consonant_char(c) {
                    cons_count += 1;
                } else {
                    break;
                }
            }
            if cons_count >= 2 {
                phonemes.push(IPA_OPEN_E);
                i += 1;
                continue;
            }
            let remaining = &word[(i + 1)..];
            let all_cons = !remaining.is_empty() && remaining.iter().all(|&c| is_consonant_char(c));
            let has_pronounced = remaining.iter().any(|&c| !is_silent_final(c));
            if !remaining.is_empty() && all_cons && has_pronounced {
                phonemes.push(IPA_OPEN_E);
            } else {
                phonemes.push(IPA_SCHWA);
            }
            i += 1;
            continue;
        }
        // 'c' -> /s/ before a front vowel, /k/ otherwise; 'ç' -> /s/.
        if ch == 'c' {
            if i + 1 < n && is_front_vowel_for_cg(word[i + 1]) {
                phonemes.push('s');
            } else {
                phonemes.push('k');
            }
            i += 1;
            continue;
        }
        if ch == '\u{00E7}' {
            phonemes.push('s');
            i += 1;
            continue;
        }
        // 'g' -> /ʒ/ before a front vowel, /ɡ/ otherwise; 'j' -> /ʒ/.
        if ch == 'g' {
            if i + 1 < n && is_front_vowel_for_cg(word[i + 1]) {
                phonemes.push(IPA_EZH);
            } else {
                phonemes.push(IPA_VOICED_G);
            }
            i += 1;
            continue;
        }
        if ch == 'j' {
            phonemes.push(IPA_EZH);
            i += 1;
            continue;
        }
        // 'r' -> uvular /ʁ/; "rr" collapses to a single phoneme.
        if ch == 'r' {
            phonemes.push(IPA_UVULAR_R);
            if i + 1 < n && word[i + 1] == 'r' {
                i += 2;
            } else {
                i += 1;
            }
            continue;
        }
        // 'x': silent at word end and before a silent final "e"/"es";
        // /ɡz/ between 'e' and a vowel; /ks/ otherwise.
        if ch == 'x' {
            if i == n - 1 {
                i += 1;
                continue;
            }
            {
                let rem_len = n - (i + 1);
                let silent_before = if rem_len == 1 && word[i + 1] == 'e' {
                    true
                } else {
                    rem_len == 2 && word[i + 1] == 'e' && word[i + 2] == 's'
                };
                if silent_before {
                    i += 1;
                    continue;
                }
            }
            if i > 0 && word[i - 1] == 'e' && i + 1 < n && is_vowel_char(word[i + 1]) {
                phonemes.push(IPA_VOICED_G);
                phonemes.push('z');
                i += 1;
                continue;
            }
            phonemes.push('k');
            phonemes.push('s');
            i += 1;
            continue;
        }
        // 'h' is always silent.
        if ch == 'h' {
            i += 1;
            continue;
        }
        // Remaining plain consonants map to themselves.
        let mapped = match ch {
            'b' => Some('b'),
            'd' => Some('d'),
            'f' => Some('f'),
            'k' => Some('k'),
            'l' => Some('l'),
            'm' => Some('m'),
            'n' => Some('n'),
            'p' => Some('p'),
            's' => Some('s'),
            't' => Some('t'),
            'v' => Some('v'),
            'w' => Some('w'),
            'z' => Some('z'),
            _ => None,
        };
        if let Some(mapped_ch) = mapped {
            // Silent-final consonants are dropped at word end, including
            // when only a plural 's' follows ("chats").
            let is_word_final = i == n - 1;
            let is_before_final_s = n >= 2 && i == n - 2 && word[n - 1] == 's';
            let is_final = is_word_final || is_before_final_s;
            if is_final && is_silent_final(ch) {
                i += 1;
                continue;
            }
            // A single 's' between vowels voices to /z/ ("maison").
            if ch == 's' {
                let prev_vowel = i > 0 && is_vowel_char(word[i - 1]);
                let next_vowel = i + 1 < n && is_vowel_char(word[i + 1]);
                let is_single = !(i + 1 < n && word[i + 1] == 's');
                if prev_vowel && next_vowel && is_single {
                    phonemes.push('z');
                    i += 1;
                    continue;
                }
            }
            phonemes.push(mapped_ch);
            // Doubled consonants collapse to a single phoneme ("belle").
            if i + 1 < n && word[i + 1] == ch {
                i += 2;
            } else {
                i += 1;
            }
            continue;
        }
        if is_punctuation(ch) {
            phonemes.push(ch);
            i += 1;
            continue;
        }
        // Anything unrecognized is dropped.
        i += 1;
    }
    phonemes
}
fn is_vowel_phoneme(ch: char) -> bool {
matches!(
ch,
'a' | 'e'
| 'i'
| 'o'
| 'u'
| IPA_OPEN_E
| IPA_OPEN_O
| IPA_SCHWA
| IPA_SLASHED_O
| IPA_OE_LIG
| PUA_Y_VOWEL
| PUA_NASAL_EIN
| PUA_NASAL_AN
| PUA_NASAL_ON
)
}
/// Replaces any token that has a PUA equivalent with that single PUA
/// character; tokens with no mapping pass through unchanged.
fn map_sequence(tokens: Vec<String>) -> Vec<String> {
    tokens
        .into_iter()
        .map(|token| match token_to_pua(&token) {
            Some(pua) => pua.to_string(),
            None => token,
        })
        .collect()
}
/// Phonemizes French `text`, returning one string per phoneme plus a
/// parallel list of prosody features. Words are separated by a single
/// space token; French final-syllable stress is marked (a2 = 2) on the
/// last vowel phoneme of each word, and a3 carries the word's phoneme
/// count. Spaces and punctuation get all-zero prosody.
pub fn phonemize_french_with_prosody(text: &str) -> (Vec<String>, Vec<Option<ProsodyInfo>>) {
    // All-zero prosody used for separators and punctuation.
    fn neutral() -> Option<ProsodyInfo> {
        Some(ProsodyInfo { a1: 0, a2: 0, a3: 0 })
    }
    let tokens = split_words(&normalize(text));
    let mut out: Vec<String> = Vec::new();
    let mut prosody: Vec<Option<ProsodyInfo>> = Vec::new();
    for (idx, tok) in tokens.iter().enumerate() {
        if tok.is_punct {
            // Punctuation attaches directly to the preceding token.
            for &ch in &tok.text {
                out.push(ch.to_string());
                prosody.push(neutral());
            }
        } else {
            // A word is preceded by a space unless it opens the text.
            if idx > 0 {
                out.push(" ".to_string());
                prosody.push(neutral());
            }
            let word = convert_word(&tok.text);
            let total = word.len() as i32;
            // Stress the last vowel phoneme of the word, if any.
            let stressed = word.iter().rposition(|&ph| is_vowel_phoneme(ph));
            for (pos, &ph) in word.iter().enumerate() {
                out.push(ph.to_string());
                prosody.push(Some(ProsodyInfo {
                    a1: 0,
                    a2: if Some(pos) == stressed { 2 } else { 0 },
                    a3: total,
                }));
            }
        }
    }
    (map_sequence(out), prosody)
}
/// Phonemizes French `text`, discarding the prosody information.
pub fn phonemize_french(text: &str) -> Vec<String> {
    phonemize_french_with_prosody(text).0
}
/// Rule-based grapheme-to-phoneme converter for French.
pub struct FrenchPhonemizer;
impl FrenchPhonemizer {
    /// Creates a new phonemizer; the converter holds no state.
    pub fn new() -> Self {
        Self
    }
}
// `Default` simply delegates to `new` (the type is a unit struct).
impl Default for FrenchPhonemizer {
    fn default() -> Self {
        Self::new()
    }
}
impl Phonemizer for FrenchPhonemizer {
    /// Runs the French G2P rules; the conversion itself cannot fail.
    fn phonemize_with_prosody(
        &self,
        text: &str,
    ) -> Result<(Vec<String>, Vec<Option<ProsodyInfo>>), PiperError> {
        Ok(phonemize_french_with_prosody(text))
    }

    /// No phonemizer-specific id map; the caller's map is used.
    fn get_phoneme_id_map(&self) -> Option<&PhonemeIdMap> {
        None
    }

    /// Interleaves a pad id before every phoneme id and wraps the
    /// sequence with BOS/EOS markers: `^ _ p1 _ p2 ... _ $`. Prosody
    /// features follow their phoneme ids; markers and pads carry `None`.
    fn post_process_ids(
        &self,
        ids: Vec<i64>,
        prosody: Vec<Option<ProsodyFeature>>,
        id_map: &PhonemeIdMap,
    ) -> (Vec<i64>, Vec<Option<ProsodyFeature>>) {
        // First id registered for a marker symbol, with a fallback.
        let first_id = |symbol: &str, fallback: i64| -> i64 {
            id_map
                .get(symbol)
                .and_then(|v| v.first().copied())
                .unwrap_or(fallback)
        };
        let bos = first_id("^", 1);
        let eos = first_id("$", 2);
        let pad = first_id("_", 0);
        let capacity = ids.len() * 2 + 3;
        let mut seq: Vec<i64> = Vec::with_capacity(capacity);
        let mut feats: Vec<Option<ProsodyFeature>> = Vec::with_capacity(capacity);
        seq.push(bos);
        feats.push(None);
        for (idx, &id) in ids.iter().enumerate() {
            seq.push(pad);
            feats.push(None);
            seq.push(id);
            feats.push(prosody.get(idx).copied().flatten());
        }
        seq.push(pad);
        feats.push(None);
        seq.push(eos);
        feats.push(None);
        (seq, feats)
    }

    fn language_code(&self) -> &str {
        "fr"
    }
}
// Unit tests for the French phonemizer: single-word rule checks via
// `word_ph`, full-pipeline checks via `ph_str` / the public API, and
// id post-processing.
#[cfg(test)]
mod tests {
    use super::*;
    // Phonemizes a single pre-normalized word and joins the result.
    fn word_ph(word: &str) -> String {
        let chars: Vec<char> = word.chars().collect();
        convert_word(&chars).iter().collect()
    }
    // Runs the full pipeline and joins all output tokens.
    fn ph_str(text: &str) -> String {
        let (tokens, _) = phonemize_french_with_prosody(text);
        tokens.join("")
    }
    #[test]
    fn test_nasal_an() {
        let result = word_ph("france");
        assert!(
            result.contains(PUA_NASAL_AN),
            "expected nasal-an in france: {result}"
        );
    }
    #[test]
    fn test_nasal_on() {
        assert_eq!(word_ph("bon"), format!("b{PUA_NASAL_ON}"));
    }
    #[test]
    fn test_nasal_ein() {
        assert_eq!(word_ph("vin"), format!("v{PUA_NASAL_EIN}"));
    }
    #[test]
    fn test_silent_final_t() {
        let result = word_ph("chat");
        assert!(result.contains(IPA_ESH), "expected esh in chat: {result}");
        assert!(
            !result.ends_with('t'),
            "final t should be silent in chat: {result}"
        );
    }
    #[test]
    fn test_tion_suffix() {
        let result = word_ph("nation");
        assert!(result.contains('s'), "expected 's' from -tion: {result}");
        assert!(result.contains('j'), "expected 'j' from -tion: {result}");
        assert!(result.contains(PUA_NASAL_ON), "expected nasal-on: {result}");
    }
    #[test]
    fn test_ille_default() {
        let result = word_ph("fille");
        assert!(result.contains('j'), "fille should have j: {result}");
        assert!(!result.contains('l'), "fille should not have l: {result}");
    }
    #[test]
    fn test_ille_exception_ville() {
        let result = word_ph("ville");
        assert!(result.contains('l'), "ville should have l: {result}");
    }
    #[test]
    fn test_eau() {
        assert_eq!(word_ph("beau"), "bo");
    }
    #[test]
    fn test_oi() {
        assert_eq!(word_ph("moi"), "mwa");
    }
    #[test]
    fn test_er_verb_ending() {
        let result = word_ph("parler");
        assert!(
            result.ends_with('e'),
            "polysyllabic -er should end /e/: {result}"
        );
    }
    #[test]
    fn test_er_exception() {
        let result = word_ph("hiver");
        assert!(
            result.contains(IPA_UVULAR_R),
            "hiver should have uvular-R: {result}"
        );
    }
    #[test]
    fn test_ch_digraph() {
        let result = word_ph("cher");
        assert!(result.contains(IPA_ESH), "expected esh in cher: {result}");
    }
    #[test]
    fn test_gn_digraph() {
        let result = word_ph("ligne");
        assert!(
            result.contains(IPA_PALATAL_N),
            "expected palatal-N in ligne: {result}"
        );
    }
    #[test]
    fn test_intervocalic_s() {
        let result = word_ph("maison");
        assert!(
            result.contains('z'),
            "intervocalic s should be z in maison: {result}"
        );
    }
    #[test]
    fn test_u_before_i() {
        let result = word_ph("lui");
        assert!(
            result.contains(IPA_TURNED_H),
            "u before i -> turned-h in lui: {result}"
        );
    }
    #[test]
    fn test_uppercase_normalization() {
        let result = ph_str("BONJOUR");
        assert!(result.contains('b'), "uppercase should normalize: {result}");
    }
    #[test]
    fn test_nfd_normalization() {
        let nfd = "e\u{0301}";
        let chars: Vec<char> = nfd.chars().collect();
        let collapsed = collapse_nfd(&chars);
        assert_eq!(collapsed, vec!['\u{00E9}']);
    }
    #[test]
    fn test_post_process_ids_bos_eos_padding() {
        use std::collections::HashMap;
        let phonemizer = FrenchPhonemizer::new();
        let mut id_map: HashMap<String, Vec<i64>> = HashMap::new();
        id_map.insert("^".into(), vec![1]);
        id_map.insert("$".into(), vec![2]);
        id_map.insert("_".into(), vec![0]);
        let ids = vec![10, 20, 30];
        let prosody = vec![Some([0, 0, 3]), Some([0, 2, 3]), Some([0, 0, 3])];
        let (out_ids, out_prosody) = phonemizer.post_process_ids(ids, prosody, &id_map);
        // Expected shape: ^ _ p1 _ p2 _ p3 _ $
        assert_eq!(out_ids, vec![1, 0, 10, 0, 20, 0, 30, 0, 2]);
        assert_eq!(out_prosody.len(), out_ids.len());
        assert!(out_prosody[0].is_none());
        assert_eq!(out_prosody[2], Some([0, 0, 3]));
    }
    #[test]
    fn test_full_sentence() {
        let (tokens, prosody) = phonemize_french_with_prosody("Bonjour, comment allez-vous?");
        assert!(!tokens.is_empty());
        assert_eq!(tokens.len(), prosody.len());
        assert!(tokens.contains(&",".to_string()));
        assert!(tokens.contains(&"?".to_string()));
    }
    #[test]
    fn test_doubled_consonants() {
        let result = word_ph("belle");
        let l_count = result.chars().filter(|&c| c == 'l').count();
        assert_eq!(l_count, 1, "doubled l -> single l in belle: {result}");
    }
    #[test]
    fn test_c_before_front_vowel() {
        let result = word_ph("ciel");
        assert!(result.starts_with('s'), "c before i -> s in ciel: {result}");
    }
    #[test]
    fn test_g_before_front_vowel() {
        let result = word_ph("gel");
        assert!(
            result.starts_with(IPA_EZH),
            "g before e -> ezh in gel: {result}"
        );
    }
    #[test]
    fn test_pua_nasal_in_output() {
        let (tokens, _) = phonemize_french_with_prosody("bon");
        let nasal_on_pua = PUA_NASAL_ON.to_string();
        assert!(
            tokens.contains(&nasal_on_pua),
            "bon -> PUA nasal-on: {:?}",
            tokens
        );
    }
    #[test]
    fn test_language_code() {
        assert_eq!(FrenchPhonemizer::new().language_code(), "fr");
    }
    #[test]
    fn test_prosody_stress_on_last_vowel() {
        let (_, prosody) = phonemize_french_with_prosody("bonjour");
        let stressed: Vec<_> = prosody
            .iter()
            .filter(|p| p.map_or(false, |pi| pi.a2 == 2))
            .collect();
        assert!(!stressed.is_empty(), "should have stressed phoneme");
    }
    #[test]
    fn test_empty_input() {
        let (tokens, prosody) = phonemize_french_with_prosody("");
        assert!(tokens.is_empty());
        assert!(prosody.is_empty());
    }
    #[test]
    fn test_oin_nasal() {
        let result = word_ph("loin");
        assert!(result.contains('w'), "oin -> w: {result}");
        assert!(result.contains(PUA_NASAL_EIN), "oin -> nasal-ein: {result}");
    }
    #[test]
    fn test_qu_digraph() {
        assert_eq!(word_ph("que"), "k");
    }
    #[test]
    fn test_eu_open() {
        let result = word_ph("fleur");
        assert!(
            result.contains(IPA_OE_LIG),
            "eu before r -> open in fleur: {result}"
        );
    }
    #[test]
    fn test_eu_closed() {
        let result = word_ph("jeu");
        assert!(
            result.contains(IPA_SLASHED_O),
            "eu at end -> closed in jeu: {result}"
        );
    }
    #[test]
    fn test_phonemizer_trait() {
        let p = FrenchPhonemizer::new();
        let result = p.phonemize_with_prosody("Bonjour");
        assert!(result.is_ok());
        let (tokens, prosody) = result.unwrap();
        assert!(!tokens.is_empty());
        assert_eq!(tokens.len(), prosody.len());
    }
    #[test]
    fn test_gu_before_front_vowel() {
        let result = word_ph("guerre");
        assert!(
            result.contains(IPA_VOICED_G),
            "gu+e -> voiced-g in guerre: {result}"
        );
    }
    #[test]
    fn test_c_cedilla() {
        let result = word_ph("gar\u{00E7}on");
        assert!(result.contains('s'), "c-cedilla -> s: {result}");
    }
    #[test]
    fn test_eille_pattern() {
        let result = word_ph("abeille");
        assert!(result.contains(IPA_OPEN_E), "eille -> open-e: {result}");
        assert!(result.contains('j'), "eille -> j: {result}");
    }
    #[test]
    fn test_y_vowel_pua() {
        let (tokens, _) = phonemize_french_with_prosody("tu");
        let y_pua = PUA_Y_VOWEL.to_string();
        assert!(tokens.contains(&y_pua), "tu -> PUA y_vowel: {:?}", tokens);
    }
    #[test]
    fn test_doubled_r() {
        let result = word_ph("terre");
        let r_count = result.chars().filter(|&c| c == IPA_UVULAR_R).count();
        assert_eq!(r_count, 1, "doubled r -> single R in terre: {result}");
    }
    #[test]
    fn test_ien_nasal() {
        let result = word_ph("bien");
        assert!(result.contains('j'), "ien -> j: {result}");
        assert!(result.contains(PUA_NASAL_EIN), "ien -> nasal-ein: {result}");
    }
    #[test]
    fn test_ph_digraph() {
        let result = word_ph("photo");
        assert!(result.starts_with('f'), "ph -> f in photo: {result}");
    }
    #[test]
    fn test_apostrophe_word_boundary() {
        let result = ph_str("l'ami");
        assert!(result.contains('l'), "expected l in l'ami: {result}");
        assert!(result.contains('a'), "expected a in l'ami: {result}");
    }
}