piper-phoneme-streaming 0.1.1

A high-performance Rust library for streaming Text-to-Phoneme (G2P) conversion.
Documentation
use std::collections::{HashMap, VecDeque};

use crate::text_expand::{ExpandResult, ExpandTask, ExpandUnit};

/// One node of the short-form dictionary trie.
///
/// Edges are labelled with `ExpandUnit` tokens, so a path from the root spells
/// out the token sequence of a dictionary key.
#[derive(Default, Debug)]
struct TrieNode {
    /// Child nodes, keyed by the next token on the path.
    children: HashMap<ExpandUnit, TrieNode>,
    /// Replacement tokens when the path from the root to this node is a
    /// complete dictionary key; `None` for nodes that are only a prefix.
    value: Option<Vec<ExpandUnit>>,
}

/// Expansion task that replaces short forms with their expanded token
/// sequences, looked up in a trie built from `short=expanded` dictionary lines.
pub struct ShortDictExpandTask {
    /// Root of the trie; it corresponds to the empty token sequence.
    root: TrieNode,
}

impl ShortDictExpandTask {
    /// Builds the dictionary from its textual form.
    ///
    /// Each line is trimmed and then interpreted as follows:
    ///
    /// * empty lines and lines starting with `#` are skipped;
    /// * lines without a `=` are ignored;
    /// * otherwise the line is split at the first `=` into a short form and
    ///   its expansion, both of which are trimmed and tokenized with
    ///   `ExpandUnit::tokenize`.
    ///
    /// Entries whose short form tokenizes to nothing are dropped. When the
    /// same short form appears more than once, the last occurrence wins.
    pub fn new(content: &str) -> Self {
        let mut root = TrieNode::default();

        let entries = content
            .lines()
            .map(str::trim)
            .filter(|line| !line.is_empty() && !line.starts_with('#'))
            .filter_map(|line| line.split_once('='));

        for (short, expanded) in entries {
            let key = ExpandUnit::tokenize(short.trim());
            let replacement = ExpandUnit::tokenize(expanded.trim());

            if key.is_empty() {
                continue;
            }

            // Walk down from the root, creating missing nodes on the way, and
            // attach the replacement to the node that ends the key.
            let leaf = key
                .into_iter()
                .fold(&mut root, |node, unit| node.children.entry(unit).or_default());
            leaf.value = Some(replacement);
        }

        Self { root }
    }

    /// Builds the dictionary from the file at `path`, falling back to
    /// `default_content` when the file cannot be read as a string.
    ///
    /// The read error itself is discarded.
    pub fn load_from_file_or_default(path: &str, default_content: &str) -> Self {
        match std::fs::read_to_string(path) {
            Ok(content) => Self::new(&content),
            Err(_) => Self::new(default_content),
        }
    }
}

impl ExpandTask for ShortDictExpandTask {
    /// Looks for the longest dictionary key that is a prefix of `queue`.
    ///
    /// Returns:
    ///
    /// * `Some(ExpandResult::Maybe)` when every unit in `queue` was matched
    ///   and the reached trie node still has children, i.e. a longer key
    ///   could still match once more units arrive. This takes precedence
    ///   over any shorter key already matched, and it is also the result for
    ///   an empty queue when the dictionary is not empty.
    /// * `Some(ExpandResult::Replace(len, replacement))` when a key matched;
    ///   `len` is the number of leading queue units covered by the longest
    ///   matching key and `replacement` is a copy of its expansion.
    /// * `None` when no key matches.
    fn expand(&self, queue: &VecDeque<ExpandUnit>) -> Option<ExpandResult> {
        let mut node = &self.root;
        // Longest complete key seen so far: (units consumed, borrowed expansion).
        // Only a borrow is kept here so that the expansion is cloned at most
        // once, and not at all when the answer turns out to be `Maybe`.
        let mut longest: Option<(usize, &[ExpandUnit])> = None;
        let mut consumed_whole_queue = true;

        for (index, unit) in queue.iter().enumerate() {
            match node.children.get(unit) {
                Some(next) => {
                    node = next;
                    if let Some(replacement) = node.value.as_deref() {
                        longest = Some((index + 1, replacement));
                    }
                }
                None => {
                    consumed_whole_queue = false;
                    break;
                }
            }
        }

        // If we ran out of queue items but the current node still has children,
        // we might match a longer prefix if more items arrive.
        if consumed_whole_queue && !node.children.is_empty() {
            return Some(ExpandResult::Maybe);
        }

        longest.map(|(len, replacement)| ExpandResult::Replace(len, replacement.to_vec()))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a word token.
    fn word(text: &str) -> ExpandUnit {
        ExpandUnit::Word(text.to_string())
    }

    /// Shorthand for a single-character mark token.
    fn mark(symbol: char) -> ExpandUnit {
        ExpandUnit::Mark(symbol)
    }

    /// Runs `task` against a queue holding exactly `units`, in order.
    fn run(task: &ShortDictExpandTask, units: Vec<ExpandUnit>) -> Option<ExpandResult> {
        task.expand(&VecDeque::from(units))
    }

    #[test]
    fn test_short_dict_expansion() {
        let dict = "
# Comment
TP=thành phố
v.v.=vân vân
A-B=a trừ b
";
        let task = ShortDictExpandTask::new(dict);

        // A single-token key that matches the whole queue and has no longer
        // continuation is replaced immediately.
        assert_eq!(
            run(&task, vec![word("TP")]),
            Some(ExpandResult::Replace(
                1,
                vec![word("thành"), mark(' '), word("phố")]
            ))
        );

        // A multi-token key followed by a unit that is not part of any key:
        // only the four units of the key are replaced.
        assert_eq!(
            run(
                &task,
                vec![word("v"), mark('.'), word("v"), mark('.'), mark(' ')]
            ),
            Some(ExpandResult::Replace(
                4,
                vec![word("vân"), mark(' '), word("vân")]
            ))
        );

        // The queue is a strict prefix of the key `A-B`: every unit matched
        // and the reached node still has children, so more input is needed.
        assert_eq!(
            run(&task, vec![word("A"), mark('-')]),
            Some(ExpandResult::Maybe)
        );

        // Lookup compares tokens exactly, so `tp` does not match `TP`.
        assert_eq!(run(&task, vec![word("tp")]), None);

        // A token with no edge from the root fails on the very first lookup,
        // so nothing is matched and there is nothing to wait for.
        assert_eq!(run(&task, vec![word("xyz")]), None);
    }
}