shabdakosh 2.0.0

shabdakosh — Pronunciation dictionary with ARPABET/CMUdict support for svara phonemes
Documentation
//! Import/export for pronunciation dictionaries.
//!
//! Supports multiple formats:
//! - **CMUdict** text format with variant/annotation extensions
//! - **IPA** text format (`word /IPA/`)
//! - **W3C PLS** (Pronunciation Lexicon Specification) XML — [`pls`] module
//! - **SSML** `<phoneme>` tags — [`ssml`] module
//! - **JSON** (requires `json` feature)

#[cfg(feature = "binary")]
pub mod binary;
pub mod pls;
pub mod ssml;

use alloc::string::String;

use crate::dictionary::PronunciationDict;
use crate::dictionary::entry::{DictEntry, Pronunciation, Region};
use crate::error::{Result, ShabdakoshError};

/// Parses an extended CMUdict-format string into a [`PronunciationDict`].
///
/// Each entry line has the form `WORD  SYM1 SYM2 ...`: the word, a two-space
/// separator, then whitespace-separated ARPABET symbols. Alternate
/// pronunciations carry a numeric variant marker (`WORD(2)`); the marker is
/// removed and all variants are collected under the lowercased word, in the
/// order they appear in `input`.
///
/// Lines starting with `;;;` are comments. `@freq=<f32>` and
/// `@region=<code>` tokens inside a comment are remembered and attached to
/// the next entry line, then cleared. Blank lines are skipped.
///
/// # Errors
///
/// Returns [`ShabdakoshError::DictParseError`] (with a 1-based line number)
/// if an annotation value is invalid, the two-space separator is missing,
/// an ARPABET symbol is unrecognized, or an entry has no phonemes.
pub fn parse_cmudict(input: &str) -> Result<PronunciationDict> {
    use crate::arpabet;
    use alloc::collections::BTreeMap;

    let mut entries: BTreeMap<String, alloc::vec::Vec<Pronunciation>> = BTreeMap::new();
    // Annotations seen in comment lines, waiting for the next entry line.
    let mut pending_freq: Option<f32> = None;
    let mut pending_region: Option<Region> = None;

    for (line_num, line) in input.lines().enumerate() {
        let line = line.trim();
        if line.is_empty() {
            continue;
        }

        // Handle comment lines (may contain annotations)
        if let Some(comment) = line.strip_prefix(";;;") {
            for token in comment.split_whitespace() {
                if let Some(val) = token.strip_prefix("@freq=") {
                    pending_freq = Some(val.parse::<f32>().map_err(|_| {
                        ShabdakoshError::DictParseError(alloc::format!(
                            "line {}: invalid @freq value: {val}",
                            line_num + 1
                        ))
                    })?);
                } else if let Some(val) = token.strip_prefix("@region=") {
                    pending_region = Some(Region::from_code(val).ok_or_else(|| {
                        ShabdakoshError::DictParseError(alloc::format!(
                            "line {}: unknown region code: {val}",
                            line_num + 1
                        ))
                    })?);
                }
            }
            continue;
        }

        let (word_part, phonemes_str) = line.split_once("  ").ok_or_else(|| {
            ShabdakoshError::DictParseError(alloc::format!(
                "line {}: missing two-space separator",
                line_num + 1
            ))
        })?;

        // Only a trailing numeric `(n)` marker is a variant suffix. Cutting at
        // the first `(` would turn CMUdict punctuation entries such as
        // `(PAREN` into the empty word.
        let word = strip_variant_suffix(word_part.trim()).to_lowercase();

        // Stops at the first unrecognized symbol.
        let phonemes = phonemes_str
            .split_whitespace()
            .map(|sym| {
                arpabet::arpabet_to_phoneme_with_stress(sym).ok_or_else(|| {
                    ShabdakoshError::DictParseError(alloc::format!(
                        "line {}: unknown ARPABET symbol '{sym}'",
                        line_num + 1
                    ))
                })
            })
            .collect::<Result<alloc::vec::Vec<_>>>()?;

        if phonemes.is_empty() {
            return Err(ShabdakoshError::DictParseError(alloc::format!(
                "line {}: no phonemes for word '{word}'",
                line_num + 1
            )));
        }

        // `take()` consumes the pending annotations so they apply to exactly
        // one entry line.
        let mut pron = Pronunciation::new(phonemes);
        if let Some(freq) = pending_freq.take() {
            pron = pron.with_frequency(freq);
        }
        if let Some(region) = pending_region.take() {
            pron = pron.with_region(region);
        }

        // `word` is already an owned `String`; move it into the map.
        entries.entry(word).or_default().push(pron);
    }

    // Build the dictionary from collected entries
    let mut dict = PronunciationDict::new();
    for (word, pronunciations) in entries {
        if let Some(entry) = DictEntry::from_pronunciations(pronunciations) {
            dict.insert_entry(&word, entry);
        }
    }

    Ok(dict)
}

/// Removes a trailing numeric variant marker from a CMUdict headword
/// (`READ(2)` -> `READ`).
///
/// The input is returned unchanged unless it ends with `(<ascii digits>)`
/// preceded by at least one other character, so headwords that merely
/// contain a parenthesis (e.g. `(PAREN`) are preserved.
fn strip_variant_suffix(word: &str) -> &str {
    let Some(inner) = word.strip_suffix(')') else {
        return word;
    };
    let Some(open) = inner.rfind('(') else {
        return word;
    };
    let digits = &inner[open + 1..];
    if open == 0 || digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
        return word;
    }
    &inner[..open]
}

/// Renders a [`PronunciationDict`] as extended CMUdict text.
///
/// The output starts with a `;;; Generated by shabdakosh` comment line,
/// followed by the entries of `dict.entries()` in ascending word order.
/// Only base entries are exported (not user overlay entries);
/// use [`to_cmudict_with_user`] to include user entries.
#[must_use]
pub fn to_cmudict(dict: &PronunciationDict) -> String {
    let mut text = String::from(";;; Generated by shabdakosh\n");

    let base = dict.entries();

    // Sorting the keys keeps the output stable between runs.
    let mut keys: alloc::vec::Vec<&str> = base.keys().map(|key| key.as_str()).collect();
    keys.sort_unstable();

    for key in keys {
        if let Some(entry) = base.get(key) {
            write_entry_cmudict(&mut text, key, entry);
        }
    }

    text
}

/// Renders a [`PronunciationDict`] as extended CMUdict text, covering both
/// the base entries and the user overlay entries.
///
/// The output starts with a `;;; Generated by shabdakosh` comment line.
/// Base entries come first in ascending word order, skipping any word that
/// also has a user entry; the user entries follow in the overlay map's own
/// iteration order.
#[must_use]
pub fn to_cmudict_with_user(dict: &PronunciationDict) -> String {
    let mut text = String::from(";;; Generated by shabdakosh\n");

    let base = dict.entries();
    let overlay = dict.user_entries();

    // Sorted so the base section is deterministic.
    let mut keys: alloc::vec::Vec<&str> = base.keys().map(|key| key.as_str()).collect();
    keys.sort_unstable();

    // A user entry replaces the base entry of the same word, so the base
    // version is left out here and written from the overlay below.
    keys.into_iter()
        .filter(|key| !overlay.contains_key(*key))
        .filter_map(|key| base.get(key).map(|entry| (key, entry)))
        .for_each(|(key, entry)| write_entry_cmudict(&mut text, key, entry));

    // User entries (BTreeMap, already sorted)
    for (key, entry) in overlay {
        write_entry_cmudict(&mut text, key, entry);
    }

    text
}

/// Appends every pronunciation of `entry` to `output` in extended CMUdict
/// format.
///
/// For each pronunciation this writes, in order:
/// 1. an optional `;;; @freq=<f> @region=<code>` comment line, emitted only
///    when a frequency and/or region is set;
/// 2. the entry line `word  SYM SYM ...`, where every pronunciation after the
///    first gets a `(n)` variant suffix (`(2)`, `(3)`, ...).
///
/// Phonemes for which `arpabet::phoneme_to_arpabet` yields `None` are left
/// out of the entry line.
fn write_entry_cmudict(output: &mut String, word: &str, entry: &DictEntry) {
    use crate::arpabet;
    use core::fmt::Write;

    for (index, pron) in entry.all().iter().enumerate() {
        let freq = pron.frequency();
        let region = pron.region();

        // Metadata goes on its own comment line ahead of the entry.
        if freq.is_some() || region.is_some() {
            output.push_str(";;;");
            if let Some(value) = freq {
                let _ = write!(output, " @freq={value}");
            }
            if let Some(value) = region {
                let _ = write!(output, " @region={}", value.code());
            }
            output.push('\n');
        }

        // Headword; the primary pronunciation carries no suffix.
        output.push_str(word);
        if index != 0 {
            let _ = write!(output, "({})", index + 1);
        }
        output.push_str("  ");

        // Space-separated ARPABET symbols, written directly into `output`.
        let symbols = pron
            .phonemes()
            .iter()
            .filter_map(arpabet::phoneme_to_arpabet);
        for (position, symbol) in symbols.enumerate() {
            if position != 0 {
                output.push(' ');
            }
            output.push_str(symbol);
        }
        output.push('\n');
    }
}

/// Parses an IPA-format dictionary string into a [`PronunciationDict`].
///
/// Each line: `word /IPA/` or `word IPA` (one word per line).
/// Blank lines and comment lines starting with `#` are ignored.
///
/// If a line contains a `/`, everything before the first `/` is the word and
/// the remainder (with trailing `/` characters removed) is the transcription.
/// Otherwise the line is split at the first whitespace character.
///
/// # Errors
///
/// Returns [`ShabdakoshError::DictParseError`] (with a 1-based line number)
/// if a line has no transcription, has no word before the transcription, or
/// if no phonemes can be parsed from the transcription.
pub fn parse_ipa(input: &str) -> Result<PronunciationDict> {
    use crate::ipa;

    let mut dict = PronunciationDict::new();

    for (line_num, line) in input.lines().enumerate() {
        let line = line.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        // Try "word /IPA/" format first, then "word IPA"
        let (word, ipa_str) = match line.split_once('/') {
            Some((word, rest)) => (word.trim(), rest.trim_end_matches('/').trim()),
            None => {
                // Split on first whitespace
                let Some((word, ipa_str)) = line.split_once(char::is_whitespace) else {
                    return Err(ShabdakoshError::DictParseError(alloc::format!(
                        "line {}: missing IPA transcription",
                        line_num + 1
                    )));
                };
                (word.trim(), ipa_str.trim())
            }
        };

        // A line that begins with `/` has no headword; reject it instead of
        // storing a pronunciation under the empty string.
        if word.is_empty() {
            return Err(ShabdakoshError::DictParseError(alloc::format!(
                "line {}: missing word before IPA transcription",
                line_num + 1
            )));
        }

        let phonemes = ipa::parse_ipa_word(ipa_str);
        if phonemes.is_empty() {
            return Err(ShabdakoshError::DictParseError(alloc::format!(
                "line {}: no phonemes parsed from IPA '{ipa_str}'",
                line_num + 1
            )));
        }

        dict.insert(word, &phonemes);
    }

    Ok(dict)
}

/// Renders a [`PronunciationDict`] in IPA text format.
///
/// Produces one `word /IPA/` line per entry of `dict.entries()`, in
/// ascending word order. Only the primary pronunciation of each word is
/// written.
#[must_use]
pub fn to_ipa(dict: &PronunciationDict) -> String {
    use crate::ipa;

    let base = dict.entries();

    // Sorted keys give a reproducible line order.
    let mut keys: alloc::vec::Vec<&str> = base.keys().map(|key| key.as_str()).collect();
    keys.sort_unstable();

    let mut text = String::new();
    for key in keys {
        if let Some(entry) = base.get(key) {
            let transcription = ipa::phonemes_to_ipa(entry.primary_phonemes());
            text.push_str(key);
            text.push_str(" /");
            text.push_str(&transcription);
            text.push_str("/\n");
        }
    }

    text
}

/// Deserializes a [`PronunciationDict`] from a JSON string.
///
/// The JSON is expected to be what [`to_json`] produces, i.e. a direct
/// serialization of [`PronunciationDict`].
///
/// # Errors
///
/// Returns [`ShabdakoshError::DictParseError`] carrying the `serde_json`
/// error message if deserialization fails.
#[cfg(feature = "json")]
pub fn from_json(input: &str) -> Result<PronunciationDict> {
    match serde_json::from_str::<PronunciationDict>(input) {
        Ok(dict) => Ok(dict),
        Err(e) => Err(ShabdakoshError::DictParseError(alloc::format!(
            "JSON parse error: {e}"
        ))),
    }
}

/// Serializes a [`PronunciationDict`] into a JSON string via `serde_json`.
///
/// # Errors
///
/// Returns [`ShabdakoshError::DictParseError`] carrying the `serde_json`
/// error message if serialization fails.
#[cfg(feature = "json")]
pub fn to_json(dict: &PronunciationDict) -> Result<String> {
    match serde_json::to_string(dict) {
        Ok(json) => Ok(json),
        Err(e) => Err(ShabdakoshError::DictParseError(alloc::format!(
            "JSON serialize error: {e}"
        ))),
    }
}

/// Reads the file at `path` as UTF-8 text and parses it with
/// [`parse_cmudict`].
///
/// # Errors
///
/// Returns [`ShabdakoshError::DictParseError`] if the file cannot be read
/// or if its contents fail to parse.
#[cfg(feature = "std")]
pub fn load_cmudict_file(path: &std::path::Path) -> Result<PronunciationDict> {
    match std::fs::read_to_string(path) {
        Ok(contents) => parse_cmudict(&contents),
        // I/O failures are reported through the same error variant as parse
        // failures.
        Err(e) => Err(ShabdakoshError::DictParseError(alloc::format!(
            "failed to read file: {e}"
        ))),
    }
}

/// Writes a [`PronunciationDict`] to `path` in extended CMUdict format.
///
/// The file contents are exactly what [`to_cmudict`] returns for `dict`.
///
/// # Errors
///
/// Returns [`ShabdakoshError::DictParseError`] if the file cannot be written.
#[cfg(feature = "std")]
pub fn save_cmudict_file(dict: &PronunciationDict, path: &std::path::Path) -> Result<()> {
    std::fs::write(path, to_cmudict(dict)).map_err(|e| {
        ShabdakoshError::DictParseError(alloc::format!("failed to write file: {e}"))
    })
}

// --- XML helpers (shared by PLS and SSML modules) ---

/// Replaces the five XML special characters in `s` with their predefined
/// entities.
///
/// `&`, `<`, `>`, `"` and `'` become `&amp;`, `&lt;`, `&gt;`, `&quot;` and
/// `&apos;`; every other character is copied through unchanged.
pub(crate) fn xml_escape(s: &str) -> String {
    s.chars()
        .fold(String::with_capacity(s.len()), |mut escaped, ch| {
            let entity = match ch {
                '&' => Some("&amp;"),
                '<' => Some("&lt;"),
                '>' => Some("&gt;"),
                '"' => Some("&quot;"),
                '\'' => Some("&apos;"),
                _ => None,
            };
            match entity {
                Some(text) => escaped.push_str(text),
                None => escaped.push(ch),
            }
            escaped
        })
}

/// Unescapes the five predefined XML entities in text content.
///
/// `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&apos;` are decoded to `&`, `<`,
/// `>`, `"` and `'`. Any other `&` (unknown or unterminated entity) is copied
/// through unchanged.
///
/// The input is scanned once from left to right and decoded text is never
/// rescanned, so `&amp;lt;` decodes to the literal text `&lt;` rather than
/// being unescaped a second time into `<`.
pub(crate) fn xml_unescape(s: &str) -> String {
    const ENTITIES: [(&str, char); 5] = [
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&apos;", '\''),
    ];

    let mut out = String::with_capacity(s.len());
    let mut rest = s;

    while let Some(pos) = rest.find('&') {
        // Copy everything before the ampersand verbatim.
        out.push_str(&rest[..pos]);
        let tail = &rest[pos..];

        match ENTITIES.iter().find(|(name, _)| tail.starts_with(name)) {
            Some(&(name, ch)) => {
                out.push(ch);
                rest = &tail[name.len()..];
            }
            None => {
                // Not a recognized entity: keep the `&` and continue after it.
                out.push('&');
                rest = &tail[1..];
            }
        }
    }

    out.push_str(rest);
    out
}