use std::collections::{HashMap, VecDeque};
use crate::expand_tasks::get_tasks_for_language;
use crate::lang_detect::StreamingLanguageDetector;
use crate::semantic::Language;
/// A classified unit of text, as produced by [`TextUnit::from_expand_unit`]
/// from an [`ExpandUnit`] and a language tag.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TextUnit {
/// Text taken from a word or number unit, together with its language.
Word(String, Language),
/// A mark whose character is whitespace.
Space,
/// A mark that is one of `,` `.` `!` `?` `;` `:`.
ClauseBoundary(char),
/// Any other single-character mark.
Punctuation(char),
}
impl TextUnit {
    /// Converts an [`ExpandUnit`] into a [`TextUnit`].
    ///
    /// Words and numbers both become [`TextUnit::Word`] tagged with
    /// `language`. Marks are classified by their character: whitespace
    /// becomes [`TextUnit::Space`], one of `, . ! ? ; :` becomes
    /// [`TextUnit::ClauseBoundary`], and anything else becomes
    /// [`TextUnit::Punctuation`]. `language` is only used for words/numbers.
    pub fn from_expand_unit(unit: ExpandUnit, language: Language) -> Self {
        // Textual units are handled up front; only marks need classification.
        let mark = match unit {
            ExpandUnit::Word(text) | ExpandUnit::Number(text) => {
                return TextUnit::Word(text, language);
            }
            ExpandUnit::Mark(mark) => mark,
        };

        // Whitespace is tested first, matching the priority of the checks.
        if mark.is_whitespace() {
            TextUnit::Space
        } else if matches!(mark, ',' | '.' | '!' | '?' | ';' | ':') {
            TextUnit::ClauseBoundary(mark)
        } else {
            TextUnit::Punctuation(mark)
        }
    }
}
/// A raw token of text: a word, a number, or a single mark character.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ExpandUnit {
    /// Word text. When produced by [`ExpandUnit::tokenize`] this is a run of
    /// alphabetic characters and apostrophes.
    Word(String),
    /// A single character that is neither alphabetic, an apostrophe, nor an
    /// ASCII digit (when produced by [`ExpandUnit::tokenize`]).
    Mark(char),
    /// Number text. When produced by [`ExpandUnit::tokenize`] this is a run
    /// of ASCII digits.
    Number(String),
}

impl ExpandUnit {
    /// Splits `input` into units in a single left-to-right pass.
    ///
    /// Consecutive alphabetic characters (and `'`) are grouped into one
    /// [`ExpandUnit::Word`], consecutive ASCII digits into one
    /// [`ExpandUnit::Number`], and every other character becomes its own
    /// [`ExpandUnit::Mark`]. A letter directly following digits (or the
    /// reverse) starts a new unit. An empty input yields an empty vector.
    pub fn tokenize(input: &str) -> Vec<Self> {
        // Which kind of run the pending text currently belongs to.
        #[derive(Clone, Copy, PartialEq, Eq)]
        enum Run {
            Letters,
            Digits,
        }

        // Moves any pending text into `out` as a unit of kind `run`;
        // does nothing when there is no pending text.
        fn emit(pending: &mut String, run: Run, out: &mut Vec<ExpandUnit>) {
            if pending.is_empty() {
                return;
            }
            let text = std::mem::take(pending);
            out.push(match run {
                Run::Letters => ExpandUnit::Word(text),
                Run::Digits => ExpandUnit::Number(text),
            });
        }

        let mut out = Vec::new();
        let mut pending = String::new();
        let mut run = Run::Letters;

        for ch in input.chars() {
            let kind = if ch.is_alphabetic() || ch == '\'' {
                Some(Run::Letters)
            } else if ch.is_ascii_digit() {
                Some(Run::Digits)
            } else {
                None
            };

            match kind {
                Some(kind) => {
                    // A change of run kind closes the previous run first.
                    if kind != run {
                        emit(&mut pending, run, &mut out);
                    }
                    pending.push(ch);
                    run = kind;
                }
                None => {
                    // Marks always terminate the pending run and stand alone.
                    emit(&mut pending, run, &mut out);
                    out.push(ExpandUnit::Mark(ch));
                }
            }
        }

        // Whatever is still pending at end of input forms the last unit.
        emit(&mut pending, run, &mut out);
        out
    }
}
/// The outcome an [`ExpandTask`] reports for the current pending queue.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExpandResult {
/// The task cannot decide yet. While more input may still arrive,
/// `TextExpand` stops processing and waits; once input is final, the
/// result is ignored and the next task is tried.
Maybe,
/// Remove this many units from the front of the pending queue and put the
/// given units in their place. `TextExpand` expects the count to be
/// greater than zero (checked with a `debug_assert!`).
Replace(usize, Vec<ExpandUnit>),
}
/// A rewrite rule applied by `TextExpand` to the front of its pending queue.
///
/// Implementations must be `Send + Sync`.
pub trait ExpandTask: Send + Sync {
/// Inspects `queue`, whose front is the oldest pending unit.
///
/// Returns `None` when the task does not apply, in which case
/// `TextExpand` moves on to the next task. Otherwise returns an
/// [`ExpandResult`] describing what to do.
fn expand(&self, queue: &VecDeque<ExpandUnit>) -> Option<ExpandResult>;
}
/// Streaming text expander: consumes characters one at a time, groups them
/// into [`ExpandUnit`]s, runs the registered [`ExpandTask`]s over the pending
/// units and yields the resulting units tagged with a language.
pub struct TextExpand {
/// Expansion tasks per language; the tasks used are those registered for
/// the language of the unit at the front of the pending queue.
tasks_by_lang: HashMap<Language, Vec<Box<dyn ExpandTask>>>,
/// Language assigned to units when no detector is configured.
current_language: Language,
/// Optional detector; when present, every unit is pushed to it and the
/// language it returns is recorded for that unit.
lang_detector: Option<StreamingLanguageDetector>,
/// Units waiting to be expanded (front = oldest).
input_units: VecDeque<ExpandUnit>,
/// Language of each pending unit; kept parallel to `input_units`
/// (same length, same order).
input_langs: VecDeque<Language>,
/// Finished units with their language, handed out by `push`/`finish`.
output_units: VecDeque<(ExpandUnit, Language)>,
/// Characters of the word or number currently being accumulated.
buffer: String,
/// Whether `buffer` currently holds digits (a number) rather than a word.
buffer_is_number: bool,
}
impl TextExpand {
    /// Builds an expander from an already-populated task table.
    ///
    /// All public constructors funnel through here so the queues and the
    /// character buffer are initialised in exactly one place.
    fn from_task_table(
        tasks_by_lang: HashMap<Language, Vec<Box<dyn ExpandTask>>>,
        current_language: Language,
        lang_detector: Option<StreamingLanguageDetector>,
    ) -> Self {
        Self {
            tasks_by_lang,
            current_language,
            lang_detector,
            input_units: VecDeque::new(),
            input_langs: VecDeque::new(),
            output_units: VecDeque::new(),
            buffer: String::new(),
            buffer_is_number: false,
        }
    }

    /// Creates an expander for a single fixed `language`, using the tasks
    /// returned by `get_tasks_for_language(language)` and no detector.
    pub fn with_language(language: Language) -> Self {
        let mut tasks_by_lang = HashMap::new();
        tasks_by_lang.insert(language, get_tasks_for_language(language));
        Self::from_task_table(tasks_by_lang, language, None)
    }

    /// Creates an expander that asks `detector` for the language of every
    /// unit.
    ///
    /// Tasks are registered for each entry of `languages`. Units whose
    /// detected language has no registered tasks pass through unexpanded.
    /// `default_language` is stored as the current language; it is not
    /// automatically given tasks unless it also appears in `languages`.
    pub fn with_detector(
        languages: &[Language],
        default_language: Language,
        detector: StreamingLanguageDetector,
    ) -> Self {
        let mut tasks_by_lang = HashMap::new();
        for &lang in languages {
            tasks_by_lang.insert(lang, get_tasks_for_language(lang));
        }
        Self::from_task_table(tasks_by_lang, default_language, Some(detector))
    }

    /// Creates an expander from an explicit task list, registered under
    /// `Language::English`, which is also used as the current language.
    pub fn new(tasks: Vec<Box<dyn ExpandTask>>) -> Self {
        let mut tasks_by_lang = HashMap::new();
        tasks_by_lang.insert(Language::English, tasks);
        Self::from_task_table(tasks_by_lang, Language::English, None)
    }

    /// Feeds one character into the expander.
    ///
    /// Returns at most one finished unit per call. Any further finished
    /// units stay queued and are returned by later calls to `push` or
    /// [`finish`](Self::finish).
    pub fn push(&mut self, ch: char) -> Option<(ExpandUnit, Language)> {
        self.process_char(ch);
        self.try_expand(false);
        self.output_units.pop_front()
    }

    /// Signals end of input and returns the next finished unit, if any.
    ///
    /// Flushes the partially accumulated word/number and runs expansion with
    /// the input treated as final, so tasks answering
    /// [`ExpandResult::Maybe`] no longer stall processing. Call repeatedly
    /// until it returns `None` to drain all remaining units.
    pub fn finish(&mut self) -> Option<(ExpandUnit, Language)> {
        self.flush_buffer();
        self.try_expand(true);
        self.output_units.pop_front()
    }

    /// Classifies `ch` and updates the buffer / pending queue accordingly.
    ///
    /// Alphabetic characters and `'` extend the current word, ASCII digits
    /// extend the current number, and a switch between the two closes the
    /// previous run. Any other character closes the current run and is
    /// queued as its own [`ExpandUnit::Mark`].
    fn process_char(&mut self, ch: char) {
        if ch.is_alphabetic() || ch == '\'' {
            if !self.buffer.is_empty() && self.buffer_is_number {
                self.flush_buffer();
            }
            self.buffer.push(ch);
            self.buffer_is_number = false;
        } else if ch.is_ascii_digit() {
            if !self.buffer.is_empty() && !self.buffer_is_number {
                self.flush_buffer();
            }
            self.buffer.push(ch);
            self.buffer_is_number = true;
        } else {
            self.flush_buffer();
            self.enqueue(ExpandUnit::Mark(ch));
            // The detector's context is reset after `.`, `?` and `!`, once
            // the mark itself has been pushed to it.
            if matches!(ch, '.' | '?' | '!') {
                if let Some(detector) = &mut self.lang_detector {
                    detector.reset_context();
                }
            }
        }
    }

    /// Moves the accumulated word or number, if any, into the pending queue.
    fn flush_buffer(&mut self) {
        if self.buffer.is_empty() {
            return;
        }
        let content = std::mem::take(&mut self.buffer);
        let unit = if self.buffer_is_number {
            ExpandUnit::Number(content)
        } else {
            ExpandUnit::Word(content)
        };
        self.enqueue(unit);
    }

    /// Determines the language for `unit` (from the detector when present,
    /// otherwise the current language) and appends both to the parallel
    /// pending queues.
    fn enqueue(&mut self, unit: ExpandUnit) {
        let lang = match &mut self.lang_detector {
            Some(detector) => detector.push(&unit),
            None => self.current_language,
        };
        self.input_units.push_back(unit);
        self.input_langs.push_back(lang);
    }

    /// Runs the expansion tasks over the pending queue and moves finished
    /// units to the output queue.
    ///
    /// For the unit at the front, the tasks registered for its language are
    /// tried in order:
    /// * `Replace(n, units)` swaps the first `n` pending units for `units`
    ///   (tagged with the front unit's language) and restarts on the new
    ///   front, so replacements can themselves be expanded.
    /// * `Maybe` stops all processing until more input arrives, unless
    ///   `is_final` is set, in which case the next task is tried.
    /// * `None` moves on to the next task.
    ///
    /// If no task replaces anything, the front unit is emitted unchanged.
    fn try_expand(&mut self, is_final: bool) {
        'outer: while !self.input_units.is_empty() {
            debug_assert_eq!(
                self.input_units.len(),
                self.input_langs.len(),
                "parallel queue invariant violated"
            );
            // Checked access instead of indexing: if the invariant above is
            // ever broken in a release build, fall back to the current
            // language (as the emit path below does) rather than panicking.
            let front_lang = self
                .input_langs
                .front()
                .copied()
                .unwrap_or(self.current_language);
            let tasks = self
                .tasks_by_lang
                .get(&front_lang)
                .map(Vec::as_slice)
                .unwrap_or(&[]);
            for task in tasks {
                match task.expand(&self.input_units) {
                    Some(ExpandResult::Maybe) => {
                        if !is_final {
                            break 'outer;
                        }
                    }
                    Some(ExpandResult::Replace(n, new_units)) => {
                        debug_assert!(n > 0, "ExpandTask::expand must consume at least one unit");
                        if n == 0 {
                            // `debug_assert!` is compiled out in release
                            // builds. Without this guard a zero-length
                            // replacement would restart the loop on an
                            // unchanged (or growing) queue forever, so the
                            // result is ignored and the next task is tried.
                            continue;
                        }
                        for _ in 0..n {
                            self.input_units.pop_front();
                            self.input_langs.pop_front();
                        }
                        // Push in reverse so the replacement keeps its order
                        // at the front of the queue.
                        for unit in new_units.into_iter().rev() {
                            self.input_units.push_front(unit);
                            self.input_langs.push_front(front_lang);
                        }
                        continue 'outer;
                    }
                    None => {}
                }
            }
            // No task rewrote the front unit: emit it as-is.
            if let Some(unit) = self.input_units.pop_front() {
                let lang = self
                    .input_langs
                    .pop_front()
                    .unwrap_or(self.current_language);
                self.output_units.push_back((unit, lang));
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::semantic::Language;

    /// Shorthand for an expected word unit.
    fn w(text: &str) -> ExpandUnit {
        ExpandUnit::Word(text.into())
    }

    /// Shorthand for an expected mark unit.
    fn m(ch: char) -> ExpandUnit {
        ExpandUnit::Mark(ch)
    }

    /// Streams `input` through a fresh single-language expander, drains it
    /// and compares the produced units (languages ignored) with `expected`.
    fn run_test(lang: Language, input: &str, expected: Vec<ExpandUnit>) {
        let mut expander = TextExpand::with_language(lang);

        // Units released while characters are still being fed in.
        let mut units: Vec<ExpandUnit> = input
            .chars()
            .filter_map(|ch| expander.push(ch))
            .map(|(unit, _lang)| unit)
            .collect();

        // Everything still queued once the input has ended.
        units.extend(std::iter::from_fn(|| expander.finish()).map(|(unit, _lang)| unit));

        assert_eq!(
            units, expected,
            "Failed for input: '{}' in {:?}",
            input, lang
        );
    }

    #[test]
    fn test_text_expand_cases_en() {
        let cases = vec![
            // Times.
            ("12:30", vec![w("twelve"), w("thirty")]),
            ("12:00", vec![w("twelve"), w("o'clock")]),
            ("12:05", vec![w("twelve"), w("oh"), w("five")]),
            // Dates.
            (
                "24/03/2026",
                vec![
                    w("March"),
                    w("twenty"),
                    w("fourth"),
                    m(','),
                    w("two"),
                    w("thousand"),
                    w("and"),
                    w("twenty"),
                    w("six"),
                ],
            ),
            // Plain numbers.
            (
                "hello 123",
                vec![
                    w("hello"),
                    m(' '),
                    w("one"),
                    w("hundred"),
                    w("and"),
                    w("twenty"),
                    w("three"),
                ],
            ),
            // Upper-case letter sequences are spelled out letter by letter.
            (
                "ABC HFP",
                vec![
                    w("A"),
                    m(' '),
                    w("B"),
                    m(' '),
                    w("C"),
                    m(' '),
                    w("H"),
                    m(' '),
                    w("F"),
                    m(' '),
                    w("P"),
                ],
            ),
            // Abbreviations.
            (
                "Dr Smith vs Mr John",
                vec![
                    w("doctor"),
                    m(' '),
                    w("Smith"),
                    m(' '),
                    w("versus"),
                    m(' '),
                    w("mister"),
                    m(' '),
                    w("John"),
                ],
            ),
        ];

        for (input, expected) in cases {
            run_test(Language::English, input, expected);
        }
    }

    #[test]
    fn test_text_expand_cases_vi() {
        let cases = vec![
            // Times.
            (
                "12:30",
                vec![w("mười"), w("hai"), w("giờ"), w("ba"), w("mươi"), w("phút")],
            ),
            // Dates.
            (
                "24/03",
                vec![w("ngày"), w("hai"), w("mươi"), w("tư"), w("tháng"), w("ba")],
            ),
            // Plain numbers.
            ("105", vec![w("một"), w("trăm"), w("linh"), w("năm")]),
            ("21", vec![w("hai"), w("mươi"), w("mốt")]),
            ("15", vec![w("mười"), w("lăm")]),
            // Letter sequences and abbreviations.
            (
                "FPT abc TP hcm v.v.",
                vec![
                    w("F"),
                    m(' '),
                    w("P"),
                    m(' '),
                    w("T"),
                    m(' '),
                    w("abc"),
                    m(' '),
                    w("thành"),
                    m(' '),
                    w("phố"),
                    m(' '),
                    w("hcm"),
                    m(' '),
                    w("v"),
                    m('.'),
                    w("v"),
                    m('.'),
                ],
            ),
        ];

        for (input, expected) in cases {
            run_test(Language::Vietnamese, input, expected);
        }
    }
}