pub mod pls;
pub mod ssml;
use alloc::string::String;
use crate::dictionary::PronunciationDict;
use crate::dictionary::entry::{DictEntry, Pronunciation, Region};
use crate::error::{Result, ShabdakoshError};
/// Parses CMUdict-style text into a [`PronunciationDict`].
///
/// Every non-blank line (after trimming) is handled as one of:
///
/// - **Comment** — a line starting with `;;;`. Whitespace-separated tokens of
///   the form `@freq=<f32>` or `@region=<code>` inside the comment are
///   remembered and attached to the *next* pronunciation line; any other
///   comment token is ignored.
/// - **Pronunciation** — a headword, a separator, then whitespace-separated
///   ARPABET symbols. A variant marker such as `(2)` following the headword is
///   dropped and the headword is lowercased, so all variants of a word are
///   accumulated (in file order) under one key and handed to
///   [`DictEntry::from_pronunciations`]. Words for which that constructor
///   returns `None` are skipped.
///
/// A `(` in the very first position of the headword is treated as part of the
/// word rather than as a variant marker, so punctuation entries such as
/// `(begin-parens` do not collapse to an empty key.
///
/// # Errors
///
/// Returns [`ShabdakoshError::DictParseError`] carrying a 1-based line number
/// when an `@freq` value is not a valid `f32`, an `@region` code is not
/// recognised by `Region::from_code`, the word/phoneme separator is missing,
/// an ARPABET symbol is unknown, or a line has no phonemes.
pub fn parse_cmudict(input: &str) -> Result<PronunciationDict> {
    use crate::arpabet;
    use alloc::collections::BTreeMap;

    let mut entries: BTreeMap<String, alloc::vec::Vec<Pronunciation>> = BTreeMap::new();
    // Annotations read from a `;;;` comment, waiting for the next pronunciation line.
    let mut pending_freq: Option<f32> = None;
    let mut pending_region: Option<Region> = None;

    for (line_num, line) in input.lines().enumerate() {
        let line = line.trim();
        if line.is_empty() {
            continue;
        }

        if let Some(comment) = line.strip_prefix(";;;") {
            for token in comment.split_whitespace() {
                if let Some(val) = token.strip_prefix("@freq=") {
                    pending_freq = Some(val.parse::<f32>().map_err(|_| {
                        ShabdakoshError::DictParseError(alloc::format!(
                            "line {}: invalid @freq value: {val}",
                            line_num + 1
                        ))
                    })?);
                } else if let Some(val) = token.strip_prefix("@region=") {
                    pending_region = Some(Region::from_code(val).ok_or_else(|| {
                        ShabdakoshError::DictParseError(alloc::format!(
                            "line {}: unknown region code: {val}",
                            line_num + 1
                        ))
                    })?);
                }
            }
            continue;
        }

        // NOTE(review): the error text below mentions a two-space separator while
        // the literal matched here is a single space — confirm which is intended.
        let (word_part, phonemes_str) = line.split_once(" ").ok_or_else(|| {
            ShabdakoshError::DictParseError(alloc::format!(
                "line {}: missing two-space separator",
                line_num + 1
            ))
        })?;

        // Cut the headword at the first `(` that is NOT its first character.
        // Skipping position 0 keeps entries like `(begin-parens` intact instead
        // of reducing them to an empty word.
        let headword = word_part.trim();
        let variant_start = headword
            .char_indices()
            .skip(1)
            .find(|&(_, ch)| ch == '(')
            .map(|(idx, _)| idx);
        let base = match variant_start {
            Some(idx) => &headword[..idx],
            None => headword,
        };
        let word = base.to_lowercase();

        let mut phonemes = alloc::vec::Vec::new();
        for sym in phonemes_str.split_whitespace() {
            let phoneme = arpabet::arpabet_to_phoneme_with_stress(sym).ok_or_else(|| {
                ShabdakoshError::DictParseError(alloc::format!(
                    "line {}: unknown ARPABET symbol '{sym}'",
                    line_num + 1
                ))
            })?;
            phonemes.push(phoneme);
        }
        if phonemes.is_empty() {
            return Err(ShabdakoshError::DictParseError(alloc::format!(
                "line {}: no phonemes for word '{word}'",
                line_num + 1
            )));
        }

        // `take()` consumes the annotations so they apply to this line only.
        let mut pron = Pronunciation::new(phonemes);
        if let Some(freq) = pending_freq.take() {
            pron = pron.with_frequency(freq);
        }
        if let Some(region) = pending_region.take() {
            pron = pron.with_region(region);
        }

        // `word` is already an owned `String`; move it into the map instead of cloning.
        entries.entry(word).or_default().push(pron);
    }

    let mut dict = PronunciationDict::new();
    for (word, pronunciations) in entries {
        if let Some(entry) = DictEntry::from_pronunciations(pronunciations) {
            dict.insert_entry(&word, entry);
        }
    }
    Ok(dict)
}
/// Serialises the dictionary's entries to CMUdict text.
///
/// The output starts with a `;;; Generated by shabdakosh` comment line and is
/// followed by every entry of `dict.entries()` in ascending word order, each
/// formatted by `write_entry_cmudict`.
#[must_use]
pub fn to_cmudict(dict: &PronunciationDict) -> String {
    let table = dict.entries();

    // Sort the keys so the output does not depend on the map's iteration order.
    let mut sorted_keys: alloc::vec::Vec<&str> = table.keys().map(|k| k.as_str()).collect();
    sorted_keys.sort_unstable();

    let mut text = String::new();
    text.push_str(";;; Generated by shabdakosh\n");
    for key in sorted_keys {
        if let Some(entry) = table.get(key) {
            write_entry_cmudict(&mut text, key, entry);
        }
    }
    text
}
/// Serialises base and user entries to CMUdict text.
///
/// The output starts with a `;;; Generated by shabdakosh` comment line. Base
/// entries (`dict.entries()`) follow in ascending word order, skipping any
/// word that also appears in `dict.user_entries()`. The user entries are then
/// appended, also in ascending word order, so the output is deterministic
/// regardless of the iteration order of the underlying maps.
#[must_use]
pub fn to_cmudict_with_user(dict: &PronunciationDict) -> String {
    let mut output = String::new();
    output.push_str(";;; Generated by shabdakosh\n");

    let mut base_words: alloc::vec::Vec<&str> =
        dict.entries().keys().map(|s| s.as_str()).collect();
    base_words.sort_unstable();
    for word in base_words {
        // A user entry with the same word is written in the user section below.
        if dict.user_entries().contains_key(word) {
            continue;
        }
        let Some(entry) = dict.entries().get(word) else {
            continue;
        };
        write_entry_cmudict(&mut output, word, entry);
    }

    // Sort the user section too; iterating the map directly would make the
    // order of these lines depend on the map implementation.
    let mut user_entries: alloc::vec::Vec<_> = dict.user_entries().iter().collect();
    user_entries.sort_unstable_by(|a, b| a.0.cmp(b.0));
    for (word, entry) in user_entries {
        write_entry_cmudict(&mut output, word, entry);
    }

    output
}
/// Appends every pronunciation of `entry` to `output` as CMUdict lines.
///
/// For each pronunciation returned by `entry.all()`:
///
/// - if it carries a frequency and/or a region, a `;;;` comment line with
///   `@freq=…` / `@region=…` annotations is written first;
/// - the word is written, followed by `(N)` (1-based) for the second and later
///   pronunciations, then the separator and the space-joined ARPABET symbols.
///
/// Phonemes for which `arpabet::phoneme_to_arpabet` returns `None` are omitted.
fn write_entry_cmudict(output: &mut String, word: &str, entry: &DictEntry) {
    use crate::arpabet;
    use core::fmt::Write;

    for (index, pron) in entry.all().iter().enumerate() {
        let frequency = pron.frequency();
        let region = pron.region();

        // Annotation comment preceding the pronunciation line.
        if frequency.is_some() || region.is_some() {
            output.push_str(";;;");
            if let Some(value) = frequency {
                // Writing into a `String` cannot fail, so the result is ignored.
                let _ = write!(output, " @freq={value}");
            }
            if let Some(value) = region {
                let _ = write!(output, " @region={}", value.code());
            }
            output.push('\n');
        }

        // Headword, with a variant marker for every pronunciation after the first.
        output.push_str(word);
        if index != 0 {
            let _ = write!(output, "({})", index + 1);
        }
        output.push_str(" ");

        // Space-joined symbols, streamed straight into the output buffer.
        let mut needs_space = false;
        for symbol in pron
            .phonemes()
            .iter()
            .filter_map(arpabet::phoneme_to_arpabet)
        {
            if needs_space {
                output.push(' ');
            }
            output.push_str(symbol);
            needs_space = true;
        }
        output.push('\n');
    }
}
/// Parses a plain-text IPA word list into a [`PronunciationDict`].
///
/// Blank lines and lines starting with `#` are skipped. Each remaining line is
/// read in one of two layouts:
///
/// - `word /ipa/` — the line is split at the first `/`; trailing `/` characters
///   are stripped from the transcription.
/// - `word ipa` — used when the line has no `/`; the line is split at the first
///   whitespace character.
///
/// Both parts are trimmed, the transcription is parsed with
/// `ipa::parse_ipa_word`, and the result is stored with `dict.insert`.
///
/// # Errors
///
/// Returns [`ShabdakoshError::DictParseError`] with a 1-based line number when
/// a line has no transcription part or when no phonemes could be parsed from it.
pub fn parse_ipa(input: &str) -> Result<PronunciationDict> {
    use crate::ipa;

    let mut dict = PronunciationDict::new();
    for (index, raw_line) in input.lines().enumerate() {
        let line_no = index + 1;
        let text = raw_line.trim();
        if text.is_empty() || text.starts_with('#') {
            continue;
        }

        let (word, ipa_str) = match text.split_once('/') {
            // Slash-delimited layout: everything after the first `/` is IPA.
            Some((head, tail)) => (head.trim(), tail.trim_end_matches('/').trim()),
            // Whitespace-delimited layout.
            None => match text.split_once(char::is_whitespace) {
                Some((head, tail)) => (head.trim(), tail.trim()),
                None => {
                    return Err(ShabdakoshError::DictParseError(alloc::format!(
                        "line {}: missing IPA transcription",
                        line_no
                    )));
                }
            },
        };

        let phonemes = ipa::parse_ipa_word(ipa_str);
        if phonemes.is_empty() {
            return Err(ShabdakoshError::DictParseError(alloc::format!(
                "line {}: no phonemes parsed from IPA '{ipa_str}'",
                line_no
            )));
        }
        dict.insert(word, &phonemes);
    }
    Ok(dict)
}
/// Serialises the dictionary to a plain-text IPA word list.
///
/// Writes one `word /ipa/` line per entry of `dict.entries()`, in ascending
/// word order, using each entry's primary phonemes rendered by
/// `ipa::phonemes_to_ipa`.
#[must_use]
pub fn to_ipa(dict: &PronunciationDict) -> String {
    use crate::ipa;

    let table = dict.entries();

    // Sort the keys so the output does not depend on the map's iteration order.
    let mut sorted_keys: alloc::vec::Vec<&str> = table.keys().map(|k| k.as_str()).collect();
    sorted_keys.sort_unstable();

    let mut text = String::new();
    for key in sorted_keys {
        if let Some(entry) = table.get(key) {
            let transcription = ipa::phonemes_to_ipa(entry.primary_phonemes());
            text.push_str(key);
            text.push_str(" /");
            text.push_str(&transcription);
            text.push_str("/\n");
        }
    }
    text
}
/// Deserialises a [`PronunciationDict`] from a JSON string via `serde_json`.
///
/// # Errors
///
/// Returns [`ShabdakoshError::DictParseError`] wrapping the `serde_json`
/// error message when the input cannot be deserialised.
#[cfg(feature = "json")]
pub fn from_json(input: &str) -> Result<PronunciationDict> {
    match serde_json::from_str::<PronunciationDict>(input) {
        Ok(dict) => Ok(dict),
        Err(e) => Err(ShabdakoshError::DictParseError(alloc::format!(
            "JSON parse error: {e}"
        ))),
    }
}
/// Serialises a [`PronunciationDict`] to a JSON string via `serde_json`.
///
/// # Errors
///
/// Returns [`ShabdakoshError::DictParseError`] wrapping the `serde_json`
/// error message when serialisation fails.
#[cfg(feature = "json")]
pub fn to_json(dict: &PronunciationDict) -> Result<String> {
    match serde_json::to_string(dict) {
        Ok(json) => Ok(json),
        Err(e) => Err(ShabdakoshError::DictParseError(alloc::format!(
            "JSON serialize error: {e}"
        ))),
    }
}
/// Reads the file at `path` as UTF-8 text and parses it with [`parse_cmudict`].
///
/// # Errors
///
/// Returns [`ShabdakoshError::DictParseError`] when the file cannot be read,
/// or any error produced by [`parse_cmudict`] for its contents.
#[cfg(feature = "std")]
pub fn load_cmudict_file(path: &std::path::Path) -> Result<PronunciationDict> {
    match std::fs::read_to_string(path) {
        Ok(contents) => parse_cmudict(&contents),
        Err(e) => Err(ShabdakoshError::DictParseError(alloc::format!(
            "failed to read file: {e}"
        ))),
    }
}
/// Writes the dictionary to `path` in the text format produced by [`to_cmudict`].
///
/// # Errors
///
/// Returns [`ShabdakoshError::DictParseError`] wrapping the I/O error message
/// when the file cannot be written.
#[cfg(feature = "std")]
pub fn save_cmudict_file(dict: &PronunciationDict, path: &std::path::Path) -> Result<()> {
    let serialized = to_cmudict(dict);
    match std::fs::write(path, serialized) {
        Ok(()) => Ok(()),
        Err(e) => Err(ShabdakoshError::DictParseError(alloc::format!(
            "failed to write file: {e}"
        ))),
    }
}
/// Escapes the five XML special characters in `s`.
///
/// `&`, `<`, `>`, `"` and `'` are replaced by the predefined XML entities
/// `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&apos;`; every other character is
/// copied through unchanged. The result is safe to embed in both element
/// content and quoted attribute values.
pub(crate) fn xml_escape(s: &str) -> String {
    // Escaping only ever grows the text, so the input length is a lower bound.
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => out.push_str("&amp;"),
            '<' => out.push_str("&lt;"),
            '>' => out.push_str("&gt;"),
            '"' => out.push_str("&quot;"),
            '\'' => out.push_str("&apos;"),
            _ => out.push(ch),
        }
    }
    out
}
/// Decodes the five predefined XML entities in `s`.
///
/// `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&apos;` are replaced by `&`, `<`,
/// `>`, `"` and `'`. Any other `&` (including unknown entities) is copied
/// through unchanged.
///
/// The input is scanned once from left to right, so text produced by decoding
/// is never decoded again: `&amp;lt;` becomes the literal text `&lt;`, not `<`.
pub(crate) fn xml_unescape(s: &str) -> String {
    const ENTITIES: [(&str, char); 5] = [
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&apos;", '\''),
    ];

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(pos) = rest.find('&') {
        // Copy the text before the ampersand verbatim.
        out.push_str(&rest[..pos]);
        rest = &rest[pos..];

        match ENTITIES.iter().find(|(name, _)| rest.starts_with(*name)) {
            Some((name, ch)) => {
                out.push(*ch);
                rest = &rest[name.len()..];
            }
            None => {
                // Not a known entity: keep the bare ampersand and move past it.
                out.push('&');
                rest = &rest[1..];
            }
        }
    }
    out.push_str(rest);
    out
}