Skip to main content

piper_phoneme_streaming/
lang_detect.rs

1use std::collections::VecDeque;
2
3use crate::semantic::Language;
4use crate::text_expand::ExpandUnit;
5
/// Upper bound on how many recent words the sliding context window keeps.
const CONTEXT_WINDOW_SIZE: usize = 5;
/// Fast path: one word detected on its own as a different language with at
/// least this confidence switches the language immediately and resets the
/// context window.
const SINGLE_WORD_SWITCH_THRESHOLD: f64 = 0.8;
/// Slow path: the joined context window must be detected as a different
/// language with at least this confidence before the language switches.
/// 0.7 = 0.5 base + 0.2 hysteresis.
const CONTEXT_SWITCH_THRESHOLD: f64 = 0.7;
13
14/// Trait for language detection backends.
15///
16/// Implementations receive a context string and return the most likely language
17/// plus its confidence score, or `None` if detection fails.
18pub(crate) trait LanguageDetector: Send + Sync {
19    fn detect(&self, context: &str) -> Option<(Language, f64)>;
20}
21
22/// Streaming language detector that wraps a [`LanguageDetector`] backend.
23///
24/// Takes [`ExpandUnit`] values one at a time and returns the detected
25/// [`Language`] for each:
26///
27/// - `Word` units trigger detection via a sliding context window with
28///   hysteresis — language only switches when confidence exceeds 0.5 + 0.20.
29/// - `Number` and `Mark` units inherit the current language without detection.
30/// - Sentence boundaries (`.`, `?`, `!`) should be signalled via
31///   [`reset_context`](Self::reset_context) to clear the sliding window.
32pub struct StreamingLanguageDetector {
33    detector: Box<dyn LanguageDetector>,
34    current_language: Language,
35    context_window: VecDeque<String>,
36}
37
38impl StreamingLanguageDetector {
39    pub(crate) fn new(default_language: Language, detector: Box<dyn LanguageDetector>) -> Self {
40        Self {
41            detector,
42            current_language: default_language,
43            context_window: VecDeque::new(),
44        }
45    }
46
47    /// Build a detector backed by the `lingua` library.
48    ///
49    /// `languages` must contain at least two languages; `default_language` is
50    /// the starting assumption before any text is seen.
51    ///
52    pub fn with_lingua(languages: &[Language], default_language: Language) -> Self {
53        Self::new(default_language, Box::new(LinguaDetector::new(languages)))
54    }
55
56    /// Push one expand unit and get back the language for that unit.
57    ///
58    /// Words run through the detection algorithm; numbers and marks inherit
59    /// the current language without running detection.
60    pub fn push(&mut self, unit: &ExpandUnit) -> Language {
61        match unit {
62            ExpandUnit::Word(word) => self.detect_for_word(word),
63            ExpandUnit::Number(_) | ExpandUnit::Mark(_) => self.current_language,
64        }
65    }
66
67    /// Clear the context window on sentence boundaries (`.`, `?`, `!`).
68    pub fn reset_context(&mut self) {
69        self.context_window.clear();
70    }
71
72    fn detect_for_word(&mut self, word: &str) -> Language {
73        // Fast path: a single unambiguous word with high confidence triggers an
74        // immediate switch and resets the context window so the new language is
75        // not dragged back by old context.
76        if let Some((word_lang, word_conf)) = self.detector.detect(word)
77            && word_lang != self.current_language
78            && word_conf >= SINGLE_WORD_SWITCH_THRESHOLD
79        {
80            self.current_language = word_lang;
81            self.context_window.clear();
82            self.context_window.push_back(word.to_string());
83            return self.current_language;
84        }
85
86        // Slow path: accumulate a sliding context window and switch only when
87        // the aggregated confidence exceeds the hysteresis threshold.
88        self.context_window.push_back(word.to_string());
89        if self.context_window.len() > CONTEXT_WINDOW_SIZE {
90            self.context_window.pop_front();
91        }
92
93        let context: String = self
94            .context_window
95            .iter()
96            .map(String::as_str)
97            .collect::<Vec<_>>()
98            .join(" ");
99
100        if let Some((ctx_lang, ctx_conf)) = self.detector.detect(&context)
101            && ctx_lang != self.current_language
102            && ctx_conf >= CONTEXT_SWITCH_THRESHOLD
103        {
104            self.current_language = ctx_lang;
105        }
106
107        self.current_language
108    }
109}
110
111// ---------------------------------------------------------------------------
112// lingua backend
113// ---------------------------------------------------------------------------
114
115struct LinguaDetector {
116    detector: lingua::LanguageDetector,
117}
118
119impl LinguaDetector {
120    fn new(languages: &[Language]) -> Self {
121        let lingua_langs: Vec<lingua::Language> = languages
122            .iter()
123            .map(|l| match l {
124                Language::English => lingua::Language::English,
125                Language::Vietnamese => lingua::Language::Vietnamese,
126            })
127            .collect();
128        let detector = lingua::LanguageDetectorBuilder::from_languages(&lingua_langs)
129            .with_minimum_relative_distance(0.25)
130            .build();
131        Self { detector }
132    }
133}
134
135impl LanguageDetector for LinguaDetector {
136    fn detect(&self, context: &str) -> Option<(Language, f64)> {
137        let confidences = self.detector.compute_language_confidence_values(context);
138        confidences.first().map(|(lingua_lang, confidence)| {
139            let lang = match lingua_lang {
140                lingua::Language::English => Language::English,
141                lingua::Language::Vietnamese => Language::Vietnamese,
142            };
143            (lang, *confidence)
144        })
145    }
146}
147
148// ---------------------------------------------------------------------------
149// Tests
150//
151// Each test case is: (preferred_language, &[(word, expected_language)])
152// Every word is pushed as ExpandUnit::Word; the output language is asserted
153// per word so failures point to the exact token where behavior diverges.
154// ---------------------------------------------------------------------------
155
#[cfg(test)]
mod tests {
    use super::*;
    use crate::text_expand::ExpandUnit;
    use Language::{English as EN, Vietnamese as VI};

    /// Lingua-backed detector over English + Vietnamese starting in `default`.
    fn det(default: Language) -> StreamingLanguageDetector {
        let candidates = [EN, VI];
        StreamingLanguageDetector::with_lingua(&candidates, default)
    }

    /// Pushes `text` as a word unit and returns the language reported for it.
    fn push_word(d: &mut StreamingLanguageDetector, text: &str) -> Language {
        d.push(&ExpandUnit::Word(text.to_owned()))
    }

    /// Pushes every word in order, ignoring the reported languages.
    fn feed(d: &mut StreamingLanguageDetector, words: &[&str]) {
        for text in words {
            push_word(d, text);
        }
    }

    /// Pairs each word with the same expected language.
    fn expect_all<'a>(words: &[&'a str], language: Language) -> Vec<(&'a str, Language)> {
        words.iter().map(|&text| (text, language)).collect()
    }

    /// Core helper: push each `(word, expected_language)` pair on a fresh
    /// detector and assert after every single step.
    fn check(preferred: Language, steps: &[(&str, Language)]) {
        let mut d = det(preferred);
        for &(word, expected) in steps {
            let got = push_word(&mut d, word);
            assert_eq!(
                got, expected,
                "preferred={preferred:?}  word={word:?}  expected={expected:?}  got={got:?}"
            );
        }
    }

    // -------------------------------------------------------------------------
    // Pure-language — window fills with one language, never switches
    // -------------------------------------------------------------------------

    #[test]
    fn case_pure_english() {
        // The window grows from [the] to [the, quick, brown, fox, jumps].
        check(EN, &expect_all(&["the", "quick", "brown", "fox", "jumps"], EN));
    }

    #[test]
    fn case_pure_vietnamese() {
        // The window grows from [xin] to [xin, chào, bạn, tên, là].
        check(VI, &expect_all(&["xin", "chào", "bạn", "tên", "là"], VI));
    }

    // -------------------------------------------------------------------------
    // EN → VI transition
    //
    // "chào" has single-word VI confidence = 1.0 ≥ 0.80 → fast switch at the
    // first clear Vietnamese word.  Context resets; VI stable thereafter.
    // -------------------------------------------------------------------------

    #[test]
    fn case_en_to_vi_transition() {
        // Build up an EN context first ...
        let mut steps = expect_all(&["the", "quick", "brown", "fox", "jumps"], EN);
        // ... then "chào" fast-switches (context reset to [chào]) and the
        // detector stays settled in VI.
        steps.extend(expect_all(&["chào", "bạn", "tôi", "muốn", "học"], VI));
        check(EN, &steps);
    }

    // -------------------------------------------------------------------------
    // VI → EN transition
    //
    // "the" has single-word EN confidence ≥ 0.80 → fast switch immediately.
    // Context resets to ["the"]; EN stable for all following EN words.
    // -------------------------------------------------------------------------

    #[test]
    fn case_vi_to_en_transition() {
        // Build up a VI context first ...
        let mut steps = expect_all(&["xin", "chào", "bạn", "tên", "là"], VI);
        // ... then "the" fast-switches (context reset to [the]) and the
        // detector stays settled in EN.
        steps.extend(expect_all(&["the", "quick", "brown", "fox", "jumps"], EN));
        check(VI, &steps);
    }

    // -------------------------------------------------------------------------
    // Reset clears context — next language dominates immediately
    //
    // With an empty window after reset, even a single unambiguous word exceeds
    // the 0.70 threshold and switches language.
    // -------------------------------------------------------------------------

    #[test]
    fn case_reset_vi_context_then_en() {
        let mut d = det(VI);
        feed(&mut d, &["xin", "chào", "bạn"]);
        d.reset_context();

        // Empty window → the very first EN word already triggers the switch.
        for word in ["the", "quick", "brown", "fox", "jumps"] {
            let expected = EN;
            let got = push_word(&mut d, word);
            assert_eq!(
                got, expected,
                "after reset  word={word:?}  expected={expected:?}  got={got:?}"
            );
        }
    }

    // -------------------------------------------------------------------------
    // Numbers and marks — always inherit current language, never enter window
    // -------------------------------------------------------------------------

    #[test]
    fn case_number_inherits_en() {
        let mut d = det(EN);
        assert_eq!(d.push(&ExpandUnit::Number("42".into())), EN);
        assert_eq!(d.push(&ExpandUnit::Number("0".into())), EN);
    }

    #[test]
    fn case_number_inherits_vi() {
        let mut d = det(VI);
        assert_eq!(d.push(&ExpandUnit::Number("100".into())), VI);
        assert_eq!(d.push(&ExpandUnit::Number("1000".into())), VI);
    }

    #[test]
    fn case_mark_inherits_en() {
        let mut d = det(EN);
        for mark in [' ', ',', '.'] {
            assert_eq!(d.push(&ExpandUnit::Mark(mark)), EN);
        }
    }

    #[test]
    fn case_mark_inherits_vi() {
        let mut d = det(VI);
        for mark in [' ', ','] {
            assert_eq!(d.push(&ExpandUnit::Mark(mark)), VI);
        }
    }

    /// Numbers do not enter the context window; they inherit the current language.
    /// With fast-path switching, "chào" switches to VI immediately.
    #[test]
    fn case_number_does_not_affect_transition_timing() {
        let mut d = det(EN);
        feed(&mut d, &["the", "quick", "brown", "fox", "jumps"]);

        // fast path: VI 1.0 → switch
        assert_eq!(push_word(&mut d, "chào"), VI);
        // inherits VI
        assert_eq!(d.push(&ExpandUnit::Number("100".into())), VI);
        // still VI
        assert_eq!(push_word(&mut d, "bạn"), VI);
        // inherits VI
        assert_eq!(d.push(&ExpandUnit::Number("7".into())), VI);
    }

    /// Marks behave identically — no window effect.
    #[test]
    fn case_mark_does_not_affect_transition_timing() {
        let mut d = det(EN);
        feed(&mut d, &["the", "quick", "brown", "fox", "jumps"]);

        // fast path: VI 1.0 → switch
        assert_eq!(push_word(&mut d, "chào"), VI);
        // inherits VI
        assert_eq!(d.push(&ExpandUnit::Mark(',')), VI);
        // still VI
        assert_eq!(push_word(&mut d, "bạn"), VI);
    }

    // -------------------------------------------------------------------------
    // Mixed sentences — fast path switches on clear single-word signals
    //
    // "trong tiếng anh hello world có nghĩa là xin chào"
    // = "in English, 'hello world' means 'xin chào'"
    //
    // "hello" (EN 0.886) and "world" (EN 0.851) both exceed the fast-path
    // threshold → immediate EN switch.  "có" exceeds the VI threshold →
    // fast switch back to VI.
    // -------------------------------------------------------------------------

    #[test]
    fn case_vi_sentence_with_embedded_en_words() {
        // Pure VI context.
        let mut steps = expect_all(&["trong", "tiếng", "anh"], VI);
        // "hello" fast-switches to EN (0.886 ≥ 0.80, context reset to
        // [hello]); "world" stays EN (0.851 ≥ 0.80 but already EN).
        steps.extend(expect_all(&["hello", "world"], EN));
        // "có" fast-switches back to VI, which then holds.
        steps.extend(expect_all(&["có", "nghĩa", "là", "xin", "chào"], VI));
        check(VI, &steps);
    }

    /// A single Vietnamese word with high single-word confidence fast-switches
    /// out of EN, and the next clear English word fast-switches back.
    #[test]
    fn case_en_sentence_with_single_embedded_vi_word() {
        let mut steps = expect_all(&["the", "quick", "brown", "fox"], EN);
        // Single VI word: VI 1.0 ≥ 0.80 → fast switch, context reset to [chào].
        steps.extend(expect_all(&["chào"], VI));
        // "jumps" (EN ≥ 0.80) fast-switches back; EN holds afterwards.
        steps.extend(expect_all(&["jumps", "over", "lazy", "dog", "today"], EN));
        check(EN, &steps);
    }

    // -------------------------------------------------------------------------
    // Numbers in mixed-language context
    // -------------------------------------------------------------------------

    /// "giá 100 đồng" — number throughout inherits VI; reset then switches EN.
    #[test]
    fn case_number_in_vi_then_reset_to_en() {
        let mut d = det(VI);
        feed(&mut d, &["giá", "tiền", "là"]);
        assert_eq!(d.push(&ExpandUnit::Number("100".into())), VI);

        d.reset_context();
        // immediate switch after reset
        assert_eq!(push_word(&mut d, "the"), EN);
        assert_eq!(push_word(&mut d, "price"), EN);
    }

    /// Number during the EN→VI transition: with fast-path, "chào" switches
    /// immediately and the number inherits VI right away.
    #[test]
    fn case_number_tracks_language_through_en_to_vi_transition() {
        let mut d = det(EN);
        feed(&mut d, &["the", "quick", "brown", "fox", "jumps"]);

        // fast path: VI 1.0 → switch
        assert_eq!(push_word(&mut d, "chào"), VI);
        // inherits VI immediately
        assert_eq!(d.push(&ExpandUnit::Number("42".into())), VI);
        // still VI
        assert_eq!(push_word(&mut d, "bạn"), VI);
        // inherits VI
        assert_eq!(d.push(&ExpandUnit::Number("7".into())), VI);
    }
}