// piper-phoneme-streaming 0.1.1
//
// A high-performance Rust library for streaming Text-to-Phoneme (G2P) conversion.
// Documentation
use std::collections::HashMap;
use std::sync::Arc;

use crate::embedded_data::materialized_data_dir;
use crate::error::Result;
use crate::lang_detect::StreamingLanguageDetector;
use crate::phoneme::PhonemeData;
use crate::{
    G2pToken, Language, SentenceUnit, StreamingSentencePhonemeUpgrade,
    StreamingSentencePhonemeUpgradeSession, TextExpand, TextUnit, WordPhonemizer,
};

/// A stateful session for streaming grapheme-to-phoneme conversion.
///
/// This session holds the partial text expansions and sentence syntax state
/// between character pushes. It must be created via [`StreamingG2P::new_session`].
///
/// The session holds no phonemizer of its own: it is handed by mutable reference to
/// [`StreamingG2P::push_text`] and [`StreamingG2P::finish`], which drive both fields.
pub struct StreamingG2pSession {
    /// Text expander fed one `char` at a time; every unit it yields comes paired with
    /// the [`Language`] used to pick a phonemizer for that unit.
    expander: TextExpand,
    /// Sentence-level state that receives each phonemized [`SentenceUnit`] and hands
    /// back the [`G2pToken`]s to emit.
    session: StreamingSentencePhonemeUpgradeSession,
}

/// The core engine for streaming Text-to-Phoneme (G2P) conversion.
///
/// `StreamingG2P` processes text incrementally, expanding abbreviations/numbers
/// and resolving phonemes dynamically. It supports multi-language text using
/// internal heuristics.
///
/// The engine is only ever borrowed immutably by its methods; all per-stream mutable
/// state lives in the [`StreamingG2pSession`] returned by [`StreamingG2P::new_session`].
pub struct StreamingG2P {
    /// Word-level phonemizers keyed by language. Looked up for every emitted unit, with
    /// the entry for `default_language` used as the fallback.
    phonemizers: HashMap<Language, WordPhonemizer>,
    /// Factory for the sentence-level state stored in each new session; constructed for
    /// `default_language`.
    stream_sentence_upgrade: StreamingSentencePhonemeUpgrade,
    /// Language used when a unit's language has no phonemizer, and the language of the
    /// expander in single-language sessions.
    default_language: Language,
    /// Copy of the candidate language list given at construction; its length decides
    /// whether new sessions get a language detector.
    languages: Vec<Language>,
}

impl StreamingG2P {
    /// Constructs a single-language streaming G2P engine.
    ///
    /// Shorthand for [`StreamingG2P::with_languages`] with `lang` as both the only
    /// candidate language and the default language.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`StreamingG2P::with_languages`].
    pub fn new(lang: Language) -> Result<Self> {
        Self::with_languages(&[lang], lang)
    }

    /// Constructs a multi-language streaming G2P engine.
    ///
    /// It intelligently detects the language of the incoming stream from the provided
    /// `languages` list. If detection fails or is ambiguous, it falls back to the
    /// `default_language`.
    ///
    /// A phonemizer is always loaded for `default_language`, even when it is not listed
    /// in `languages` (or `languages` is empty), so the fallback lookup can never come
    /// up empty. Repeated entries in `languages` are loaded only once. The list handed
    /// to the language detector is kept exactly as supplied.
    ///
    /// # Errors
    ///
    /// Returns an error if the embedded data directory cannot be materialized, if the
    /// phoneme data fails to load, or if any phonemizer or the sentence-level upgrade
    /// stage cannot be constructed.
    pub fn with_languages(languages: &[Language], default_language: Language) -> Result<Self> {
        let data_dir = materialized_data_dir()?;
        // One shared copy of the phoneme data; every phonemizer gets a cheap Arc clone.
        let phdata = Arc::new(PhonemeData::load(data_dir)?);

        let mut phonemizers: HashMap<Language, WordPhonemizer> =
            HashMap::with_capacity(languages.len() + 1);
        // The default language is appended so that `phonemizer_for` always has a fallback
        // entry, regardless of what the caller listed.
        for lang in languages.iter().copied().chain(std::iter::once(default_language)) {
            // Only build a phonemizer the first time a language is seen.
            if let std::collections::hash_map::Entry::Vacant(slot) = phonemizers.entry(lang) {
                slot.insert(WordPhonemizer::new_with_data(lang, Arc::clone(&phdata))?);
            }
        }

        Ok(Self {
            phonemizers,
            stream_sentence_upgrade: StreamingSentencePhonemeUpgrade::new(default_language)?,
            default_language,
            languages: languages.to_vec(),
        })
    }

    /// Creates a new isolated phonemization session.
    ///
    /// The session must be mutated with `push_text` and finalized with `finish`.
    ///
    /// When the engine was built with at most one candidate language there is nothing
    /// to detect, so the session's expander is pinned to the default language. Otherwise
    /// a language detector over the candidate list is attached to the expander.
    pub fn new_session(&self) -> StreamingG2pSession {
        // `<= 1` rather than `== 1`: an empty candidate list must not reach the detector.
        // NOTE(review): presumably the detector needs at least two candidates — confirm
        // against `StreamingLanguageDetector::with_lingua`.
        let expander = if self.languages.len() <= 1 {
            TextExpand::with_language(self.default_language)
        } else {
            let detector =
                StreamingLanguageDetector::with_lingua(&self.languages, self.default_language);
            TextExpand::with_detector(&self.languages, self.default_language, detector)
        };

        StreamingG2pSession {
            session: self.stream_sentence_upgrade.new_session(),
            expander,
        }
    }

    /// Pushes a chunk of text into the streaming g2p processor.
    ///
    /// Yields any phonemes that have been fully resolved (typically at word boundaries).
    /// Partial words or unresolved expansions are buffered internally within the `session`.
    ///
    /// # Errors
    ///
    /// Returns an error if a unit produced by the expander cannot be phonemized. The
    /// function returns at the first such failure: tokens already collected during this
    /// call are dropped, and the characters consumed so far remain consumed by `session`.
    pub fn push_text(
        &self,
        session: &mut StreamingG2pSession,
        text: &str,
    ) -> Result<Vec<G2pToken>> {
        let mut outs = Vec::new();
        for ch in text.chars() {
            // The expander only yields a unit once it has seen enough input to close one.
            if let Some((unit, lang)) = session.expander.push(ch) {
                let text_unit = TextUnit::from_expand_unit(unit, lang);
                let phonemizer = self.phonemizer_for(lang);
                let su = SentenceUnit::from_text_unit(text_unit, phonemizer)?;
                outs.extend(session.session.push(su));
            }
        }
        Ok(outs)
    }

    /// Finalizes the stream and flushes any pending text buffered in the session.
    ///
    /// This should be called exactly once at the end of the text stream to guarantee
    /// the final phonemes are emitted.
    ///
    /// # Errors
    ///
    /// Returns an error if a flushed unit cannot be phonemized; tokens collected earlier
    /// in this call are dropped in that case.
    pub fn finish(&self, session: &mut StreamingG2pSession) -> Result<Vec<G2pToken>> {
        let mut outs = Vec::new();
        // Drain the expander first: it may release several buffered units.
        while let Some((unit, lang)) = session.expander.finish() {
            let text_unit = TextUnit::from_expand_unit(unit, lang);
            let phonemizer = self.phonemizer_for(lang);
            let su = SentenceUnit::from_text_unit(text_unit, phonemizer)?;
            outs.extend(session.session.push(su));
        }
        // Then flush whatever the sentence-level stage is still holding.
        outs.extend(session.session.finish());
        Ok(outs)
    }

    /// Returns the phonemizer for `lang`, falling back to the default language's
    /// phonemizer when `lang` has no entry.
    ///
    /// # Panics
    ///
    /// Panics only if neither entry exists. `with_languages` always loads the default
    /// language, so this indicates a broken internal invariant.
    fn phonemizer_for(&self, lang: Language) -> &WordPhonemizer {
        self.phonemizers
            .get(&lang)
            .or_else(|| self.phonemizers.get(&self.default_language))
            .expect("at least the default language phonemizer must be present")
    }
}