use std::collections::VecDeque;
use crate::semantic::Language;
use crate::text_expand::ExpandUnit;
/// Maximum number of recent words retained in the rolling context window;
/// the oldest word is dropped once this size is exceeded.
const CONTEXT_WINDOW_SIZE: usize = 5;
/// Minimum confidence at which a single word, detected in isolation as a
/// language other than the current one, switches the language immediately.
const SINGLE_WORD_SWITCH_THRESHOLD: f64 = 0.80;
/// Minimum confidence at which the space-joined context window, detected as
/// a language other than the current one, switches the language.
const CONTEXT_SWITCH_THRESHOLD: f64 = 0.70;
/// Language-identification backend used by `StreamingLanguageDetector`.
///
/// The `Send + Sync` supertraits are required of every implementation.
pub(crate) trait LanguageDetector: Send + Sync {
/// Identifies the language of `context`.
///
/// Returns the detected language paired with a confidence score, or `None`
/// when the backend yields no candidate. Callers in this file compare the
/// score against `SINGLE_WORD_SWITCH_THRESHOLD` and
/// `CONTEXT_SWITCH_THRESHOLD`.
fn detect(&self, context: &str) -> Option<(Language, f64)>;
}
/// Tracks the language of a stream of `ExpandUnit`s, one unit at a time.
///
/// State consists of the current language plus a bounded window of the most
/// recently seen words.
pub struct StreamingLanguageDetector {
/// Backend that scores a piece of text.
detector: Box<dyn LanguageDetector>,
/// Language currently in effect; starts as the default passed at construction.
current_language: Language,
/// Most recent words, oldest first, capped at `CONTEXT_WINDOW_SIZE` entries.
context_window: VecDeque<String>,
}
impl StreamingLanguageDetector {
    /// Creates a detector that starts in `default_language`, has an empty
    /// context window, and scores text with the supplied `detector` backend.
    pub(crate) fn new(default_language: Language, detector: Box<dyn LanguageDetector>) -> Self {
        Self {
            context_window: VecDeque::new(),
            current_language: default_language,
            detector,
        }
    }

    /// Creates a detector whose backend is a `LinguaDetector` built from
    /// `languages`, starting in `default_language`.
    pub fn with_lingua(languages: &[Language], default_language: Language) -> Self {
        let backend = LinguaDetector::new(languages);
        Self::new(default_language, Box::new(backend))
    }

    /// Feeds one unit into the detector and returns the language in effect
    /// for that unit.
    ///
    /// Numbers and marks never touch the detector state; they simply report
    /// the current language. Words go through detection and may change it.
    pub fn push(&mut self, unit: &ExpandUnit) -> Language {
        match unit {
            ExpandUnit::Number(_) | ExpandUnit::Mark(_) => self.current_language,
            ExpandUnit::Word(word) => self.detect_for_word(word),
        }
    }

    /// Forgets every word accumulated in the context window.
    ///
    /// The current language is left as it is.
    pub fn reset_context(&mut self) {
        self.context_window.clear();
    }

    /// Runs the two-stage decision for a single word and returns the
    /// resulting current language.
    ///
    /// 1. If the word on its own is confidently a different language
    ///    (`SINGLE_WORD_SWITCH_THRESHOLD`), switch at once and restart the
    ///    context window with just this word.
    /// 2. Otherwise append the word to the window (dropping the oldest entry
    ///    once the window exceeds `CONTEXT_WINDOW_SIZE`) and switch only if
    ///    the space-joined window is confidently a different language
    ///    (`CONTEXT_SWITCH_THRESHOLD`).
    fn detect_for_word(&mut self, word: &str) -> Language {
        if let Some(language) = self.switch_candidate(word, SINGLE_WORD_SWITCH_THRESHOLD) {
            // The previous window belongs to the old language, so start over.
            self.context_window.clear();
            self.context_window.push_back(word.to_owned());
            self.current_language = language;
            return language;
        }

        self.context_window.push_back(word.to_owned());
        if self.context_window.len() > CONTEXT_WINDOW_SIZE {
            self.context_window.pop_front();
        }

        // `make_contiguous` exposes the deque as one slice in logical order,
        // so the words can be joined without an intermediate `Vec`.
        let joined = self.context_window.make_contiguous().join(" ");
        if let Some(language) = self.switch_candidate(&joined, CONTEXT_SWITCH_THRESHOLD) {
            // The window is kept: it is the evidence that justified the switch.
            self.current_language = language;
        }
        self.current_language
    }

    /// Returns the language detected for `text` when it differs from the
    /// current language and its confidence is at least `threshold`;
    /// otherwise returns `None`.
    fn switch_candidate(&self, text: &str, threshold: f64) -> Option<Language> {
        match self.detector.detect(text) {
            Some((language, confidence))
                if language != self.current_language && confidence >= threshold =>
            {
                Some(language)
            }
            _ => None,
        }
    }
}
/// `LanguageDetector` backend that delegates to the `lingua` crate.
struct LinguaDetector {
/// The configured lingua detector that performs the scoring.
detector: lingua::LanguageDetector,
}
impl LinguaDetector {
    /// Builds a lingua detector limited to `languages`, configured with a
    /// minimum relative distance of `0.25`.
    ///
    /// NOTE(review): lingua's builder is documented to reject fewer than two
    /// languages — confirm that callers always pass at least two.
    fn new(languages: &[Language]) -> Self {
        // Translate the crate's language enum into lingua's equivalent.
        let mut candidates: Vec<lingua::Language> = Vec::with_capacity(languages.len());
        for language in languages {
            candidates.push(match language {
                Language::English => lingua::Language::English,
                Language::Vietnamese => lingua::Language::Vietnamese,
            });
        }

        Self {
            detector: lingua::LanguageDetectorBuilder::from_languages(&candidates)
                .with_minimum_relative_distance(0.25)
                .build(),
        }
    }
}
impl LanguageDetector for LinguaDetector {
    /// Scores `context` with lingua and reports the first entry of the
    /// returned confidence list, translated to the crate's `Language`.
    ///
    /// Returns `None` when lingua returns an empty list.
    ///
    /// NOTE(review): this relies on lingua returning the list sorted by
    /// descending confidence, so that the first entry is the best match —
    /// confirm against the lingua version in use.
    fn detect(&self, context: &str) -> Option<(Language, f64)> {
        let ranked = self.detector.compute_language_confidence_values(context);
        let (top_language, top_confidence) = ranked.first()?;
        let language = match top_language {
            lingua::Language::English => Language::English,
            lingua::Language::Vietnamese => Language::Vietnamese,
        };
        Some((language, *top_confidence))
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::text_expand::ExpandUnit;
    use Language::{English as EN, Vietnamese as VI};

    /// Five English words used to establish an English context.
    const EN_WARM_UP: [&str; 5] = ["the", "quick", "brown", "fox", "jumps"];

    /// Builds a lingua-backed detector over English and Vietnamese.
    fn det(default: Language) -> StreamingLanguageDetector {
        StreamingLanguageDetector::with_lingua(&[EN, VI], default)
    }

    /// Pushes a single word and returns the language reported for it.
    fn say(d: &mut StreamingLanguageDetector, word: &str) -> Language {
        d.push(&ExpandUnit::Word(word.to_string()))
    }

    /// Pushes every word in `words`, ignoring the reported languages.
    fn warm_up(d: &mut StreamingLanguageDetector, words: &[&str]) {
        for word in words {
            say(d, word);
        }
    }

    /// Feeds `steps` word by word into a fresh detector that starts in
    /// `preferred`, asserting the language reported after each word.
    fn check(preferred: Language, steps: &[(&str, Language)]) {
        let mut d = det(preferred);
        for &(word, expected) in steps {
            let got = say(&mut d, word);
            assert_eq!(
                got, expected,
                "preferred={preferred:?} word={word:?} expected={expected:?} got={got:?}"
            );
        }
    }

    #[test]
    fn case_pure_english() {
        let steps = EN_WARM_UP.map(|w| (w, EN));
        check(EN, &steps);
    }

    #[test]
    fn case_pure_vietnamese() {
        let steps = ["xin", "chào", "bạn", "tên", "là"].map(|w| (w, VI));
        check(VI, &steps);
    }

    #[test]
    fn case_en_to_vi_transition() {
        let steps = [
            ("the", EN),
            ("quick", EN),
            ("brown", EN),
            ("fox", EN),
            ("jumps", EN),
            // The first Vietnamese word switches immediately.
            ("chào", VI),
            ("bạn", VI),
            ("tôi", VI),
            ("muốn", VI),
            ("học", VI),
        ];
        check(EN, &steps);
    }

    #[test]
    fn case_vi_to_en_transition() {
        let steps = [
            ("xin", VI),
            ("chào", VI),
            ("bạn", VI),
            ("tên", VI),
            ("là", VI),
            // The first English word switches immediately.
            ("the", EN),
            ("quick", EN),
            ("brown", EN),
            ("fox", EN),
            ("jumps", EN),
        ];
        check(VI, &steps);
    }

    #[test]
    fn case_reset_vi_context_then_en() {
        let mut d = det(VI);
        warm_up(&mut d, &["xin", "chào", "bạn"]);
        d.reset_context();

        for word in EN_WARM_UP {
            let expected = EN;
            let got = say(&mut d, word);
            assert_eq!(
                got, expected,
                "after reset word={word:?} expected={expected:?} got={got:?}"
            );
        }
    }

    #[test]
    fn case_number_inherits_en() {
        let mut d = det(EN);
        for digits in ["42", "0"] {
            assert_eq!(d.push(&ExpandUnit::Number(digits.into())), EN);
        }
    }

    #[test]
    fn case_number_inherits_vi() {
        let mut d = det(VI);
        for digits in ["100", "1000"] {
            assert_eq!(d.push(&ExpandUnit::Number(digits.into())), VI);
        }
    }

    #[test]
    fn case_mark_inherits_en() {
        let mut d = det(EN);
        for mark in [' ', ',', '.'] {
            assert_eq!(d.push(&ExpandUnit::Mark(mark)), EN);
        }
    }

    #[test]
    fn case_mark_inherits_vi() {
        let mut d = det(VI);
        for mark in [' ', ','] {
            assert_eq!(d.push(&ExpandUnit::Mark(mark)), VI);
        }
    }

    #[test]
    fn case_number_does_not_affect_transition_timing() {
        let mut d = det(EN);
        warm_up(&mut d, &EN_WARM_UP);

        assert_eq!(say(&mut d, "chào"), VI);
        assert_eq!(d.push(&ExpandUnit::Number("100".into())), VI);
        assert_eq!(say(&mut d, "bạn"), VI);
        assert_eq!(d.push(&ExpandUnit::Number("7".into())), VI);
    }

    #[test]
    fn case_mark_does_not_affect_transition_timing() {
        let mut d = det(EN);
        warm_up(&mut d, &EN_WARM_UP);

        assert_eq!(say(&mut d, "chào"), VI);
        assert_eq!(d.push(&ExpandUnit::Mark(',')), VI);
        assert_eq!(say(&mut d, "bạn"), VI);
    }

    #[test]
    fn case_vi_sentence_with_embedded_en_words() {
        let steps = [
            ("trong", VI),
            ("tiếng", VI),
            ("anh", VI),
            // Embedded English words switch to English...
            ("hello", EN),
            ("world", EN),
            // ...and the next Vietnamese word switches back.
            ("có", VI),
            ("nghĩa", VI),
            ("là", VI),
            ("xin", VI),
            ("chào", VI),
        ];
        check(VI, &steps);
    }

    #[test]
    fn case_en_sentence_with_single_embedded_vi_word() {
        let steps = [
            ("the", EN),
            ("quick", EN),
            ("brown", EN),
            ("fox", EN),
            // A lone Vietnamese word switches for exactly one step.
            ("chào", VI),
            ("jumps", EN),
            ("over", EN),
            ("lazy", EN),
            ("dog", EN),
            ("today", EN),
        ];
        check(EN, &steps);
    }

    #[test]
    fn case_number_in_vi_then_reset_to_en() {
        let mut d = det(VI);
        warm_up(&mut d, &["giá", "tiền", "là"]);
        assert_eq!(d.push(&ExpandUnit::Number("100".into())), VI);

        d.reset_context();
        assert_eq!(say(&mut d, "the"), EN);
        assert_eq!(say(&mut d, "price"), EN);
    }

    #[test]
    fn case_number_tracks_language_through_en_to_vi_transition() {
        let mut d = det(EN);
        warm_up(&mut d, &EN_WARM_UP);

        assert_eq!(say(&mut d, "chào"), VI);
        assert_eq!(d.push(&ExpandUnit::Number("42".into())), VI);
        assert_eq!(say(&mut d, "bạn"), VI);
        assert_eq!(d.push(&ExpandUnit::Number("7".into())), VI);
    }
}