scirs2_text/
language_detection.rs

1//! Language detection module
2//!
3//! Provides multiple strategies for detecting the language of a text sample:
4//!
5//! - **N-gram profile comparison**: Compares character n-gram frequency
6//!   profiles against reference profiles for known languages.
//! - **Common-word frequency analysis**: Counts occurrences of high-frequency
//!   function words typical of each language (the word lists may overlap).
9//! - **Unicode script detection**: Uses Unicode block analysis for scripts
10//!   like CJK, Cyrillic, Arabic, Devanagari, etc.
11//! - **Combined detection**: Merges evidence from all strategies.
12//!
13//! The unified entry point is [`detect_language`].
14
15use crate::error::{Result, TextError};
16use std::collections::HashMap;
17
18// ---------------------------------------------------------------------------
19// Types
20// ---------------------------------------------------------------------------
21
/// Languages the detector can report.
///
/// Concrete variants are named after the language's two-letter ISO 639-1
/// code. [`DetectedLanguage::Unknown`] is the marker used when no decision
/// could be made.
///
/// The declaration order of the variants is stable and may be relied on as a
/// deterministic ordering key (`variant as u8`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum DetectedLanguage {
    /// English (`en`).
    En,
    /// Spanish (`es`).
    Es,
    /// French (`fr`).
    Fr,
    /// German (`de`).
    De,
    /// Italian (`it`).
    It,
    /// Portuguese (`pt`).
    Pt,
    /// Dutch (`nl`).
    Nl,
    /// Russian (`ru`).
    Ru,
    /// Chinese, Mandarin (`zh`).
    Zh,
    /// Japanese (`ja`).
    Ja,
    /// Korean (`ko`).
    Ko,
    /// Arabic (`ar`).
    Ar,
    /// Hindi (`hi`).
    Hi,
    /// Turkish (`tr`).
    Tr,
    /// Swedish (`sv`).
    Sv,
    /// Polish (`pl`).
    Pl,
    /// Unknown / unrecognised language.
    Unknown,
}
60
61impl DetectedLanguage {
62    /// Return the ISO 639-1 code as a string.
63    pub fn iso_code(&self) -> &'static str {
64        match self {
65            Self::En => "en",
66            Self::Es => "es",
67            Self::Fr => "fr",
68            Self::De => "de",
69            Self::It => "it",
70            Self::Pt => "pt",
71            Self::Nl => "nl",
72            Self::Ru => "ru",
73            Self::Zh => "zh",
74            Self::Ja => "ja",
75            Self::Ko => "ko",
76            Self::Ar => "ar",
77            Self::Hi => "hi",
78            Self::Tr => "tr",
79            Self::Sv => "sv",
80            Self::Pl => "pl",
81            Self::Unknown => "und",
82        }
83    }
84
85    /// Human-readable name.
86    pub fn name(&self) -> &'static str {
87        match self {
88            Self::En => "English",
89            Self::Es => "Spanish",
90            Self::Fr => "French",
91            Self::De => "German",
92            Self::It => "Italian",
93            Self::Pt => "Portuguese",
94            Self::Nl => "Dutch",
95            Self::Ru => "Russian",
96            Self::Zh => "Chinese",
97            Self::Ja => "Japanese",
98            Self::Ko => "Korean",
99            Self::Ar => "Arabic",
100            Self::Hi => "Hindi",
101            Self::Tr => "Turkish",
102            Self::Sv => "Swedish",
103            Self::Pl => "Polish",
104            Self::Unknown => "Unknown",
105        }
106    }
107}
108
109/// Result of language detection.
110#[derive(Debug, Clone)]
111pub struct LanguageDetectionOutput {
112    /// The most likely language.
113    pub language: DetectedLanguage,
114    /// Confidence in [0, 1].
115    pub confidence: f64,
116    /// Alternative candidates ranked by confidence (descending).
117    pub alternatives: Vec<(DetectedLanguage, f64)>,
118}
119
/// Selects which detector [`detect_language_with_strategy`] runs.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DetectionStrategy {
    /// Compare the text's character-trigram profile with per-language
    /// reference profiles.
    Ngram,
    /// Count how many words of the text appear in per-language lists of
    /// common words.
    WordFrequency,
    /// Decide from the Unicode scripts the characters belong to.
    UnicodeScript,
    /// Use the script detector first and otherwise merge the other detectors.
    Combined,
}
132
133// ---------------------------------------------------------------------------
134// Unified API
135// ---------------------------------------------------------------------------
136
137/// Detect the language of `text` using the combined strategy by default.
138///
139/// Returns a [`LanguageDetectionOutput`] containing the best guess, its
140/// confidence, and a list of alternatives.
141///
142/// # Errors
143///
144/// Returns an error only if `text` is empty or too short for meaningful
145/// detection (fewer than 3 characters). For very short texts the confidence
146/// will simply be low.
147pub fn detect_language(text: &str) -> Result<LanguageDetectionOutput> {
148    detect_language_with_strategy(text, DetectionStrategy::Combined)
149}
150
151/// Detect language using a specific strategy.
152pub fn detect_language_with_strategy(
153    text: &str,
154    strategy: DetectionStrategy,
155) -> Result<LanguageDetectionOutput> {
156    let trimmed = text.trim();
157    if trimmed.is_empty() {
158        return Err(TextError::InvalidInput(
159            "Cannot detect language of empty text".to_string(),
160        ));
161    }
162
163    match strategy {
164        DetectionStrategy::Ngram => detect_by_ngram(trimmed),
165        DetectionStrategy::WordFrequency => detect_by_word_frequency(trimmed),
166        DetectionStrategy::UnicodeScript => detect_by_unicode_script(trimmed),
167        DetectionStrategy::Combined => detect_combined(trimmed),
168    }
169}
170
171// ---------------------------------------------------------------------------
172// N-gram profile comparison
173// ---------------------------------------------------------------------------
174
175fn detect_by_ngram(text: &str) -> Result<LanguageDetectionOutput> {
176    let text_profile = build_ngram_profile(text, 3);
177    if text_profile.is_empty() {
178        return Ok(unknown_result());
179    }
180
181    let reference_profiles = reference_ngram_profiles();
182    let mut scores: Vec<(DetectedLanguage, f64)> = Vec::new();
183
184    for (lang, ref_profile) in &reference_profiles {
185        let similarity = profile_similarity(&text_profile, ref_profile);
186        scores.push((*lang, similarity));
187    }
188
189    scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
190
191    if scores.is_empty() {
192        return Ok(unknown_result());
193    }
194
195    let best = scores[0];
196    let confidence = best.1.clamp(0.0, 1.0);
197
198    Ok(LanguageDetectionOutput {
199        language: best.0,
200        confidence,
201        alternatives: scores.into_iter().skip(1).collect(),
202    })
203}
204
/// Build a relative-frequency profile of the character n-grams in `text`.
///
/// The text is lower-cased first, then every window of `n` consecutive
/// characters (including spaces and punctuation) is counted. Counts are
/// divided by the number of windows, so the returned values sum to 1.
///
/// Returns an empty map when `n` is 0 or when the lower-cased text has fewer
/// than `n` characters.
fn build_ngram_profile(text: &str, n: usize) -> HashMap<String, f64> {
    let chars: Vec<char> = text.to_lowercase().chars().collect();
    let mut profile: HashMap<String, f64> = HashMap::new();

    // `slice::windows` panics on a zero window size, so reject it up front.
    if n == 0 || chars.len() < n {
        return profile;
    }

    for window in chars.windows(n) {
        let gram: String = window.iter().collect();
        *profile.entry(gram).or_insert(0.0) += 1.0;
    }

    // A slice of length `len` has exactly `len - n + 1` windows of size `n`.
    let window_count = (chars.len() - n + 1) as f64;
    for weight in profile.values_mut() {
        *weight /= window_count;
    }

    profile
}
230
/// Cosine similarity of two n-gram profiles.
///
/// Each profile is treated as a sparse vector indexed by n-gram. The result
/// is `dot(a, b) / (|a| * |b|)`, or `0.0` when either profile has zero length
/// (for example when it is empty).
fn profile_similarity(a: &HashMap<String, f64>, b: &HashMap<String, f64>) -> f64 {
    // One pass over `a` yields both the dot product and |a|^2.
    let (dot, sq_len_a) = a.iter().fold(
        (0.0_f64, 0.0_f64),
        |(dot_acc, sq_acc), (gram, &weight_a)| {
            let dot_next = match b.get(gram) {
                Some(&weight_b) => dot_acc + weight_a * weight_b,
                None => dot_acc,
            };
            (dot_next, sq_acc + weight_a * weight_a)
        },
    );
    let sq_len_b = b
        .values()
        .fold(0.0_f64, |sq_acc, &weight_b| sq_acc + weight_b * weight_b);

    let denom = sq_len_a.sqrt() * sq_len_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
254
255/// Reference n-gram profiles for known languages.
256fn reference_ngram_profiles() -> HashMap<DetectedLanguage, HashMap<String, f64>> {
257    let mut profiles = HashMap::new();
258
259    // English trigrams.
260    profiles.insert(
261        DetectedLanguage::En,
262        build_ref_profile(&[
263            ("the", 50.0),
264            ("and", 30.0),
265            ("ing", 25.0),
266            ("tion", 20.0),
267            ("her", 18.0),
268            ("ent", 17.0),
269            ("ion", 16.0),
270            ("tio", 16.0),
271            ("for", 15.0),
272            ("ate", 14.0),
273            ("hat", 13.0),
274            ("tha", 13.0),
275            ("ere", 12.0),
276            ("his", 12.0),
277            ("hin", 11.0),
278            ("ter", 11.0),
279            ("was", 10.0),
280            ("all", 10.0),
281            ("ith", 9.0),
282            ("ver", 9.0),
283        ]),
284    );
285
286    // Spanish trigrams.
287    profiles.insert(
288        DetectedLanguage::Es,
289        build_ref_profile(&[
290            ("que", 45.0),
291            ("ent", 30.0),
292            ("los", 28.0),
293            ("ion", 25.0),
294            ("aci", 22.0),
295            ("cion", 20.0),
296            ("del", 19.0),
297            ("las", 18.0),
298            ("con", 17.0),
299            ("est", 16.0),
300            ("por", 15.0),
301            ("nte", 14.0),
302            ("ado", 13.0),
303            ("una", 13.0),
304            ("tra", 12.0),
305            ("par", 11.0),
306            ("com", 10.0),
307            ("ero", 10.0),
308            ("ien", 9.0),
309            ("sta", 9.0),
310        ]),
311    );
312
313    // French trigrams.
314    profiles.insert(
315        DetectedLanguage::Fr,
316        build_ref_profile(&[
317            ("les", 45.0),
318            ("ent", 35.0),
319            ("que", 30.0),
320            ("des", 28.0),
321            ("ion", 25.0),
322            ("ait", 22.0),
323            ("ous", 20.0),
324            ("est", 18.0),
325            ("une", 17.0),
326            ("ant", 16.0),
327            ("par", 15.0),
328            ("eur", 14.0),
329            ("sur", 13.0),
330            ("tre", 12.0),
331            ("eme", 11.0),
332            ("dan", 10.0),
333            ("pas", 10.0),
334            ("tio", 9.0),
335            ("pou", 9.0),
336            ("ais", 8.0),
337        ]),
338    );
339
340    // German trigrams.
341    profiles.insert(
342        DetectedLanguage::De,
343        build_ref_profile(&[
344            ("ein", 45.0),
345            ("ich", 40.0),
346            ("der", 35.0),
347            ("die", 33.0),
348            ("und", 30.0),
349            ("den", 25.0),
350            ("sch", 23.0),
351            ("cht", 20.0),
352            ("ung", 18.0),
353            ("gen", 17.0),
354            ("ber", 16.0),
355            ("ver", 15.0),
356            ("auf", 14.0),
357            ("eit", 13.0),
358            ("ach", 12.0),
359            ("mit", 11.0),
360            ("aus", 10.0),
361            ("ine", 10.0),
362            ("das", 9.0),
363            ("ent", 8.0),
364        ]),
365    );
366
367    // Italian trigrams.
368    profiles.insert(
369        DetectedLanguage::It,
370        build_ref_profile(&[
371            ("che", 45.0),
372            ("ell", 30.0),
373            ("per", 28.0),
374            ("del", 25.0),
375            ("ato", 22.0),
376            ("ion", 20.0),
377            ("ent", 18.0),
378            ("con", 17.0),
379            ("lla", 16.0),
380            ("azi", 15.0),
381            ("tta", 14.0),
382            ("gli", 13.0),
383            ("sta", 12.0),
384            ("nte", 11.0),
385            ("one", 10.0),
386            ("ere", 10.0),
387            ("tto", 9.0),
388            ("ato", 9.0),
389            ("ment", 8.0),
390            ("pre", 8.0),
391        ]),
392    );
393
394    // Portuguese trigrams.
395    profiles.insert(
396        DetectedLanguage::Pt,
397        build_ref_profile(&[
398            ("que", 45.0),
399            ("ent", 30.0),
400            ("nte", 25.0),
401            ("ado", 22.0),
402            ("ica", 20.0),
403            ("est", 18.0),
404            ("dos", 17.0),
405            ("con", 16.0),
406            ("par", 15.0),
407            ("men", 14.0),
408            ("com", 13.0),
409            ("aco", 12.0),
410            ("tra", 11.0),
411            ("ida", 10.0),
412            ("pro", 10.0),
413            ("uma", 9.0),
414            ("mos", 9.0),
415            ("oes", 8.0),
416            ("ter", 8.0),
417            ("ais", 7.0),
418        ]),
419    );
420
421    // Dutch trigrams.
422    profiles.insert(
423        DetectedLanguage::Nl,
424        build_ref_profile(&[
425            ("een", 45.0),
426            ("van", 40.0),
427            ("het", 35.0),
428            ("aar", 28.0),
429            ("ing", 25.0),
430            ("oor", 22.0),
431            ("ver", 20.0),
432            ("den", 18.0),
433            ("ijk", 16.0),
434            ("ond", 15.0),
435            ("ent", 14.0),
436            ("erd", 13.0),
437            ("sch", 12.0),
438            ("ter", 11.0),
439            ("and", 10.0),
440            ("ede", 10.0),
441            ("aat", 9.0),
442            ("met", 9.0),
443            ("nde", 8.0),
444            ("dat", 8.0),
445        ]),
446    );
447
448    // Turkish trigrams.
449    profiles.insert(
450        DetectedLanguage::Tr,
451        build_ref_profile(&[
452            ("lar", 45.0),
453            ("bir", 40.0),
454            ("ler", 35.0),
455            ("eri", 30.0),
456            ("ara", 25.0),
457            ("ini", 22.0),
458            ("rin", 20.0),
459            ("yor", 18.0),
460            ("ile", 16.0),
461            ("dir", 15.0),
462            ("dan", 14.0),
463            ("rak", 13.0),
464            ("len", 12.0),
465            ("ası", 11.0),
466            ("lik", 10.0),
467            ("olu", 10.0),
468            ("ind", 9.0),
469            ("yan", 9.0),
470            ("ama", 8.0),
471            ("aki", 8.0),
472        ]),
473    );
474
475    profiles
476}
477
/// Turn `(n-gram, weight)` pairs into a normalised frequency profile.
///
/// Every weight is divided by the sum of all weights, so the values of the
/// returned map sum to 1. If the same n-gram is listed more than once its
/// weights are added together; previously the later entry overwrote the
/// earlier one while both still counted towards the total.
///
/// Returns an empty profile when the weights do not add up to a positive
/// number, since dividing by such a total would yield NaN or infinite values.
fn build_ref_profile(data: &[(&str, f64)]) -> HashMap<String, f64> {
    let mut profile: HashMap<String, f64> = HashMap::with_capacity(data.len());

    let total: f64 = data.iter().map(|&(_, freq)| freq).sum();
    if total.is_nan() || total <= 0.0 {
        return profile;
    }

    for &(gram, freq) in data {
        *profile.entry(gram.to_string()).or_insert(0.0) += freq / total;
    }
    profile
}
486
487// ---------------------------------------------------------------------------
488// Common-word frequency analysis
489// ---------------------------------------------------------------------------
490
491fn detect_by_word_frequency(text: &str) -> Result<LanguageDetectionOutput> {
492    let lower = text.to_lowercase();
493    let words: Vec<&str> = lower.split_whitespace().collect();
494    if words.is_empty() {
495        return Ok(unknown_result());
496    }
497
498    let word_lists = common_word_lists();
499    let mut scores: Vec<(DetectedLanguage, f64)> = Vec::new();
500
501    for (lang, common_words) in &word_lists {
502        let matches = words.iter().filter(|w| common_words.contains(*w)).count();
503        let ratio = matches as f64 / words.len() as f64;
504        scores.push((*lang, ratio));
505    }
506
507    scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
508
509    if scores.is_empty() || scores[0].1 < 0.01 {
510        return Ok(unknown_result());
511    }
512
513    let best = scores[0];
514    let confidence = (best.1 * 2.5).clamp(0.0, 1.0);
515
516    Ok(LanguageDetectionOutput {
517        language: best.0,
518        confidence,
519        alternatives: scores.into_iter().skip(1).collect(),
520    })
521}
522
523fn common_word_lists() -> HashMap<DetectedLanguage, Vec<&'static str>> {
524    let mut lists = HashMap::new();
525
526    lists.insert(
527        DetectedLanguage::En,
528        vec![
529            "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
530            "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
531            "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
532            "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
533            "go", "me", "when",
534        ],
535    );
536
537    lists.insert(
538        DetectedLanguage::Es,
539        vec![
540            "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por", "un",
541            "para", "con", "no", "una", "su", "al", "es", "lo", "como", "pero", "sus", "le", "ya",
542            "o", "este", "ha", "si", "porque", "esta", "entre", "cuando", "muy", "sin", "sobre",
543            "ser", "tambien", "me", "hasta", "hay", "donde", "quien",
544        ],
545    );
546
547    lists.insert(
548        DetectedLanguage::Fr,
549        vec![
550            "de", "la", "le", "et", "les", "des", "en", "un", "du", "une", "que", "est", "dans",
551            "qui", "par", "pour", "au", "il", "sur", "pas", "plus", "ce", "ne", "se", "avec",
552            "mais", "on", "son", "tout", "je", "nous", "vous", "elle", "ou", "bien", "ces", "sont",
553            "sans", "comme", "peut", "fait", "aux", "entre", "deux",
554        ],
555    );
556
557    lists.insert(
558        DetectedLanguage::De,
559        vec![
560            "der", "die", "und", "in", "den", "von", "zu", "das", "mit", "sich", "des", "auf",
561            "nicht", "ein", "ist", "dem", "eine", "auch", "es", "an", "als", "nach", "wie", "aber",
562            "vor", "hat", "nur", "oder", "ich", "bei", "noch", "unter", "bis", "kann", "wird",
563            "so", "wenn", "sie", "sehr", "wir", "uber", "schon", "dann",
564        ],
565    );
566
567    lists.insert(
568        DetectedLanguage::It,
569        vec![
570            "di", "che", "il", "la", "in", "un", "per", "del", "non", "una", "con", "sono", "gli",
571            "le", "si", "da", "al", "lo", "ha", "come", "ma", "anche", "io", "suo", "dei", "nel",
572            "alla", "piu", "questo", "era", "essere", "tutto", "fra", "stato", "ancora", "dove",
573            "hanno", "ogni", "alle", "nella",
574        ],
575    );
576
577    lists.insert(
578        DetectedLanguage::Pt,
579        vec![
580            "de", "que", "o", "a", "do", "da", "em", "para", "com", "um", "uma", "os", "no", "se",
581            "na", "por", "mais", "as", "dos", "como", "mas", "ao", "ele", "das", "seu", "sua",
582            "ou", "quando", "muito", "nos", "ja", "eu", "tambem", "so", "pelo", "pela", "ate",
583            "isso", "ela", "entre", "depois", "sem", "mesmo",
584        ],
585    );
586
587    lists.insert(
588        DetectedLanguage::Nl,
589        vec![
590            "de", "het", "een", "van", "en", "in", "is", "dat", "op", "te", "zijn", "voor", "met",
591            "die", "niet", "aan", "er", "maar", "om", "ook", "als", "dan", "bij", "nog", "uit",
592            "kan", "al", "wel", "zo", "was", "worden", "tot", "naar", "heeft", "over", "meer",
593            "hun", "dit", "door", "onder", "heel", "deze", "dus",
594        ],
595    );
596
597    lists.insert(
598        DetectedLanguage::Tr,
599        vec![
600            "bir", "bu", "da", "ve", "ile", "olan", "icin", "var", "ama", "den", "daha", "gibi",
601            "sonra", "kadar", "olarak", "hem", "her", "ya", "mi", "ne", "ben", "sen", "biz", "siz",
602            "o", "onlar", "ise", "ancak", "yok", "cok",
603        ],
604    );
605
606    lists
607}
608
609// ---------------------------------------------------------------------------
610// Unicode script detection
611// ---------------------------------------------------------------------------
612
613fn detect_by_unicode_script(text: &str) -> Result<LanguageDetectionOutput> {
614    let chars: Vec<char> = text.chars().filter(|c| !c.is_whitespace()).collect();
615    if chars.is_empty() {
616        return Ok(unknown_result());
617    }
618
619    let total = chars.len() as f64;
620    let mut script_counts: HashMap<&str, usize> = HashMap::new();
621
622    for &ch in &chars {
623        let script = classify_char(ch);
624        *script_counts.entry(script).or_insert(0) += 1;
625    }
626
627    // Map scripts to languages.
628    let mut lang_scores: HashMap<DetectedLanguage, f64> = HashMap::new();
629
630    if let Some(&count) = script_counts.get("cjk") {
631        // Distinguish Chinese, Japanese, Korean by auxiliary scripts.
632        let hiragana = *script_counts.get("hiragana").unwrap_or(&0) as f64;
633        let katakana = *script_counts.get("katakana").unwrap_or(&0) as f64;
634        let hangul = *script_counts.get("hangul").unwrap_or(&0) as f64;
635
636        if hiragana + katakana > hangul {
637            *lang_scores.entry(DetectedLanguage::Ja).or_insert(0.0) +=
638                (count as f64 + hiragana + katakana) / total;
639        } else if hangul > 0.0 {
640            *lang_scores.entry(DetectedLanguage::Ko).or_insert(0.0) +=
641                (count as f64 + hangul) / total;
642        } else {
643            *lang_scores.entry(DetectedLanguage::Zh).or_insert(0.0) += count as f64 / total;
644        }
645    }
646
647    if let Some(&count) = script_counts.get("hiragana") {
648        *lang_scores.entry(DetectedLanguage::Ja).or_insert(0.0) += count as f64 / total;
649    }
650    if let Some(&count) = script_counts.get("katakana") {
651        *lang_scores.entry(DetectedLanguage::Ja).or_insert(0.0) += count as f64 / total;
652    }
653    if let Some(&count) = script_counts.get("hangul") {
654        *lang_scores.entry(DetectedLanguage::Ko).or_insert(0.0) += count as f64 / total;
655    }
656    if let Some(&count) = script_counts.get("cyrillic") {
657        *lang_scores.entry(DetectedLanguage::Ru).or_insert(0.0) += count as f64 / total;
658    }
659    if let Some(&count) = script_counts.get("arabic") {
660        *lang_scores.entry(DetectedLanguage::Ar).or_insert(0.0) += count as f64 / total;
661    }
662    if let Some(&count) = script_counts.get("devanagari") {
663        *lang_scores.entry(DetectedLanguage::Hi).or_insert(0.0) += count as f64 / total;
664    }
665
666    // Latin script -> delegate to n-gram for Latin-script languages.
667    if let Some(&count) = script_counts.get("latin") {
668        let latin_ratio = count as f64 / total;
669        if latin_ratio > 0.5 {
670            // Fall back to ngram for Latin languages.
671            return detect_by_ngram(text);
672        }
673    }
674
675    let mut scores: Vec<(DetectedLanguage, f64)> = lang_scores.into_iter().collect();
676    scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
677
678    if scores.is_empty() {
679        return Ok(unknown_result());
680    }
681
682    let best = scores[0];
683    let confidence = best.1.clamp(0.0, 1.0);
684
685    Ok(LanguageDetectionOutput {
686        language: best.0,
687        confidence,
688        alternatives: scores.into_iter().skip(1).collect(),
689    })
690}
691
/// Classify a character into a coarse script category.
///
/// Returns one of `"latin"`, `"cyrillic"`, `"arabic"`, `"devanagari"`,
/// `"cjk"`, `"hiragana"`, `"katakana"`, `"hangul"`, `"thai"`, `"greek"`,
/// `"hebrew"` or `"other"`. Digits, punctuation, symbols and anything outside
/// the listed ranges are `"other"`.
///
/// Only letters count as `"latin"`. The ASCII punctuation between and after
/// the two letter ranges (`[`, `_`, `{`, ...), the C1 controls, the Latin-1
/// punctuation and the `×` / `÷` signs used to fall inside one wide range and
/// were wrongly reported as Latin.
fn classify_char(ch: char) -> &'static str {
    match ch as u32 {
        // ASCII letters A-Z and a-z.
        0x0041..=0x005A | 0x0061..=0x007A => "latin",
        // Ordinal indicators (U+00AA, U+00BA) and the Latin-1 letters,
        // skipping the multiplication (U+00D7) and division (U+00F7) signs.
        0x00AA | 0x00BA | 0x00C0..=0x00D6 | 0x00D8..=0x00F6 | 0x00F8..=0x00FF => "latin",
        // Latin Extended-A, Latin Extended-B and part of IPA Extensions
        // (upper bound kept from the original range).
        0x0100..=0x024F => "latin",
        // Latin Extended Additional
        0x1E00..=0x1EFF => "latin",
        // Cyrillic and Cyrillic Supplement
        0x0400..=0x052F => "cyrillic",
        // Arabic, Arabic Supplement and the presentation forms. U+FEFF is the
        // byte-order mark and is deliberately left out.
        0x0600..=0x06FF | 0x0750..=0x077F | 0xFB50..=0xFDFF | 0xFE70..=0xFEFE => "arabic",
        // Devanagari
        0x0900..=0x097F => "devanagari",
        // CJK Unified Ideographs, Extension A and Extension B
        0x4E00..=0x9FFF | 0x3400..=0x4DBF | 0x20000..=0x2A6DF => "cjk",
        // Hiragana
        0x3040..=0x309F => "hiragana",
        // Katakana and Katakana Phonetic Extensions
        0x30A0..=0x30FF | 0x31F0..=0x31FF => "katakana",
        // Hangul syllables, Jamo and Compatibility Jamo
        0xAC00..=0xD7AF | 0x1100..=0x11FF | 0x3130..=0x318F => "hangul",
        // Thai
        0x0E00..=0x0E7F => "thai",
        // Greek
        0x0370..=0x03FF => "greek",
        // Hebrew
        0x0590..=0x05FF => "hebrew",
        _ => "other",
    }
}
723
724// ---------------------------------------------------------------------------
725// Combined detection
726// ---------------------------------------------------------------------------
727
728fn detect_combined(text: &str) -> Result<LanguageDetectionOutput> {
729    // 1. Try Unicode script first (fast, decisive for non-Latin scripts).
730    let script_result = detect_by_unicode_script(text)?;
731    if script_result.language != DetectedLanguage::Unknown && script_result.confidence > 0.6 {
732        return Ok(script_result);
733    }
734
735    // 2. For Latin-script text, combine n-gram and word frequency.
736    let ngram_result = detect_by_ngram(text)?;
737    let word_result = detect_by_word_frequency(text)?;
738
739    // Merge scores (weighted average).
740    let mut combined: HashMap<DetectedLanguage, f64> = HashMap::new();
741    let ngram_weight = 0.55;
742    let word_weight = 0.45;
743
744    // Add n-gram scores.
745    *combined.entry(ngram_result.language).or_insert(0.0) += ngram_weight * ngram_result.confidence;
746    for (lang, score) in &ngram_result.alternatives {
747        *combined.entry(*lang).or_insert(0.0) += ngram_weight * score;
748    }
749
750    // Add word frequency scores.
751    *combined.entry(word_result.language).or_insert(0.0) += word_weight * word_result.confidence;
752    for (lang, score) in &word_result.alternatives {
753        *combined.entry(*lang).or_insert(0.0) += word_weight * score;
754    }
755
756    let mut scores: Vec<(DetectedLanguage, f64)> = combined.into_iter().collect();
757    scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
758
759    if scores.is_empty() {
760        return Ok(unknown_result());
761    }
762
763    let best = scores[0];
764    let confidence = best.1.clamp(0.0, 1.0);
765
766    Ok(LanguageDetectionOutput {
767        language: best.0,
768        confidence,
769        alternatives: scores.into_iter().skip(1).collect(),
770    })
771}
772
773fn unknown_result() -> LanguageDetectionOutput {
774    LanguageDetectionOutput {
775        language: DetectedLanguage::Unknown,
776        confidence: 0.0,
777        alternatives: Vec::new(),
778    }
779}
780
781// ---------------------------------------------------------------------------
782// Tests
783// ---------------------------------------------------------------------------
784
785#[cfg(test)]
786mod tests {
787    use super::*;
788
789    // ---- English detection ----
790
791    #[test]
792    fn test_detect_english() {
793        let result = detect_language(
794            "The quick brown fox jumps over the lazy dog. This is a test of the language detection system.",
795        )
796        .expect("Should succeed");
797        assert_eq!(result.language, DetectedLanguage::En);
798        assert!(result.confidence > 0.0);
799    }
800
801    #[test]
802    fn test_detect_english_short() {
803        let result = detect_language("Hello world, how are you today?").expect("Should succeed");
804        assert_eq!(result.language, DetectedLanguage::En);
805    }
806
807    #[test]
808    fn test_detect_english_ngram_strategy() {
809        let result = detect_language_with_strategy(
810            "The weather is wonderful and everything looks beautiful in the morning light.",
811            DetectionStrategy::Ngram,
812        )
813        .expect("ok");
814        assert_eq!(result.language, DetectedLanguage::En);
815    }
816
817    #[test]
818    fn test_detect_english_word_frequency() {
819        let result = detect_language_with_strategy(
820            "This is a test of the word frequency detection method.",
821            DetectionStrategy::WordFrequency,
822        )
823        .expect("ok");
824        assert_eq!(result.language, DetectedLanguage::En);
825    }
826
827    #[test]
828    fn test_english_has_alternatives() {
829        let result = detect_language(
830            "The system provides comprehensive analysis and detailed reporting for all users.",
831        )
832        .expect("ok");
833        assert!(!result.alternatives.is_empty());
834    }
835
836    // ---- Spanish detection ----
837
838    #[test]
839    fn test_detect_spanish() {
840        let result = detect_language(
841            "El gato se sienta en la alfombra. Esta es una prueba del sistema de deteccion de idioma.",
842        )
843        .expect("ok");
844        assert_eq!(result.language, DetectedLanguage::Es);
845    }
846
847    #[test]
848    fn test_detect_spanish_ngram() {
849        let result = detect_language_with_strategy(
850            "Los estudiantes que asistieron a la conferencia disfrutaron de las presentaciones.",
851            DetectionStrategy::Ngram,
852        )
853        .expect("ok");
854        assert_eq!(result.language, DetectedLanguage::Es);
855    }
856
857    #[test]
858    fn test_detect_spanish_word_frequency() {
859        let result = detect_language_with_strategy(
860            "Para los que no saben, el libro es una de las mejores novelas del siglo.",
861            DetectionStrategy::WordFrequency,
862        )
863        .expect("ok");
864        assert_eq!(result.language, DetectedLanguage::Es);
865    }
866
867    #[test]
868    fn test_detect_spanish_combined() {
869        let result = detect_language_with_strategy(
870            "La empresa ha contratado a nuevos empleados para el departamento de marketing.",
871            DetectionStrategy::Combined,
872        )
873        .expect("ok");
874        assert_eq!(result.language, DetectedLanguage::Es);
875    }
876
877    #[test]
878    fn test_spanish_confidence_range() {
879        let result =
880            detect_language("Buenos dias, como estas? Espero que todo vaya bien con la familia.")
881                .expect("ok");
882        assert!(result.confidence >= 0.0 && result.confidence <= 1.0);
883    }
884
885    // ---- French detection ----
886
887    #[test]
888    fn test_detect_french() {
889        let result = detect_language(
890            "Le chat est assis sur le tapis. Les enfants jouent dans le jardin avec leurs amis.",
891        )
892        .expect("ok");
893        assert_eq!(result.language, DetectedLanguage::Fr);
894    }
895
896    #[test]
897    fn test_detect_french_ngram() {
898        let result = detect_language_with_strategy(
899            "Les resultats des elections ont ete publies dans les journaux ce matin.",
900            DetectionStrategy::Ngram,
901        )
902        .expect("ok");
903        assert_eq!(result.language, DetectedLanguage::Fr);
904    }
905
906    #[test]
907    fn test_detect_french_word() {
908        let result = detect_language_with_strategy(
909            "Je ne suis pas sur que nous puissions terminer ce projet dans les delais prevus.",
910            DetectionStrategy::WordFrequency,
911        )
912        .expect("ok");
913        assert_eq!(result.language, DetectedLanguage::Fr);
914    }
915
916    #[test]
917    fn test_french_confidence() {
918        let result = detect_language("Bonjour, comment allez-vous? Je suis content de vous voir.")
919            .expect("ok");
920        assert!(result.confidence > 0.0);
921    }
922
923    #[test]
924    fn test_detect_french_combined() {
925        let result = detect_language(
926            "Les entreprises francaises investissent dans les nouvelles technologies pour une meilleure productivite.",
927        )
928        .expect("ok");
929        assert_eq!(result.language, DetectedLanguage::Fr);
930    }
931
932    // ---- German detection ----
933
934    #[test]
935    fn test_detect_german() {
936        let result = detect_language(
937            "Die Katze sitzt auf der Matte. Die Kinder spielen im Garten mit ihren Freunden.",
938        )
939        .expect("ok");
940        assert_eq!(result.language, DetectedLanguage::De);
941    }
942
943    #[test]
944    fn test_detect_german_word() {
945        let result = detect_language_with_strategy(
946            "Ich bin nicht sicher, ob wir dieses Projekt noch rechtzeitig fertigstellen werden.",
947            DetectionStrategy::WordFrequency,
948        )
949        .expect("ok");
950        assert_eq!(result.language, DetectedLanguage::De);
951    }
952
953    #[test]
954    fn test_detect_german_ngram() {
955        let result = detect_language_with_strategy(
956            "Die Ergebnisse der Untersuchung wurden gestern veroffentlicht und haben grosse Aufmerksamkeit erregt.",
957            DetectionStrategy::Ngram,
958        )
959        .expect("ok");
960        assert_eq!(result.language, DetectedLanguage::De);
961    }
962
963    #[test]
964    fn test_german_confidence() {
965        let result = detect_language("Guten Tag, wie geht es Ihnen? Ich hoffe, es geht Ihnen gut.")
966            .expect("ok");
967        assert!(result.confidence > 0.0);
968    }
969
970    #[test]
971    fn test_detect_german_combined() {
972        let result = detect_language(
973            "Die Wissenschaftler haben einen wichtigen Durchbruch in der Forschung erzielt.",
974        )
975        .expect("ok");
976        assert_eq!(result.language, DetectedLanguage::De);
977    }
978
979    // ---- CJK / non-Latin script detection ----
980
981    #[test]
982    fn test_detect_chinese() {
983        let result =
984            detect_language("今天天气很好，我们去公园散步吧。这是一个美丽的城市。").expect("ok");
985        assert_eq!(result.language, DetectedLanguage::Zh);
986    }
987
988    #[test]
989    fn test_detect_japanese() {
990        let result =
991            detect_language("今日はとてもいい天気です。公園で散歩しましょう。").expect("ok");
992        assert_eq!(result.language, DetectedLanguage::Ja);
993    }
994
995    #[test]
996    fn test_detect_korean() {
997        let result =
998            detect_language("오늘 날씨가 정말 좋습니다. 공원에서 산책합시다.").expect("ok");
999        assert_eq!(result.language, DetectedLanguage::Ko);
1000    }
1001
1002    #[test]
1003    fn test_detect_russian() {
1004        let result = detect_language("Сегодня прекрасная погода. Давайте пойдем гулять в парк.")
1005            .expect("ok");
1006        assert_eq!(result.language, DetectedLanguage::Ru);
1007    }
1008
1009    #[test]
1010    fn test_detect_arabic() {
1011        let result = detect_language("الطقس جميل اليوم. دعونا نذهب للمشي في الحديقة.").expect("ok");
1012        assert_eq!(result.language, DetectedLanguage::Ar);
1013    }
1014
1015    // ---- Edge cases ----
1016
1017    #[test]
1018    fn test_empty_text_error() {
1019        let result = detect_language("");
1020        assert!(result.is_err());
1021    }
1022
1023    #[test]
1024    fn test_whitespace_only_error() {
1025        let result = detect_language("   \t\n  ");
1026        assert!(result.is_err());
1027    }
1028
1029    #[test]
1030    fn test_very_short_text() {
1031        // Very short text may have low confidence but should not error.
1032        let result = detect_language("Hi").expect("ok");
1033        // Confidence may be low but it should return something.
1034        assert!(result.confidence >= 0.0);
1035    }
1036
1037    #[test]
1038    fn test_iso_code_round_trip() {
1039        let lang = DetectedLanguage::En;
1040        assert_eq!(lang.iso_code(), "en");
1041        assert_eq!(lang.name(), "English");
1042    }
1043
1044    #[test]
1045    fn test_unknown_iso_code() {
1046        let lang = DetectedLanguage::Unknown;
1047        assert_eq!(lang.iso_code(), "und");
1048    }
1049
1050    // ---- Unicode script strategy tests ----
1051
1052    #[test]
1053    fn test_unicode_script_cjk() {
1054        let result =
1055            detect_language_with_strategy("这是一个测试。", DetectionStrategy::UnicodeScript)
1056                .expect("ok");
1057        assert_eq!(result.language, DetectedLanguage::Zh);
1058    }
1059
1060    #[test]
1061    fn test_unicode_script_cyrillic() {
1062        let result = detect_language_with_strategy(
1063            "Привет мир, как дела?",
1064            DetectionStrategy::UnicodeScript,
1065        )
1066        .expect("ok");
1067        assert_eq!(result.language, DetectedLanguage::Ru);
1068    }
1069
1070    #[test]
1071    fn test_unicode_script_arabic() {
1072        let result =
1073            detect_language_with_strategy("مرحبا بالعالم", DetectionStrategy::UnicodeScript)
1074                .expect("ok");
1075        assert_eq!(result.language, DetectedLanguage::Ar);
1076    }
1077
1078    #[test]
1079    fn test_unicode_script_devanagari() {
1080        let result =
1081            detect_language_with_strategy("नमस्ते दुनिया, कैसे हो?", DetectionStrategy::UnicodeScript)
1082                .expect("ok");
1083        assert_eq!(result.language, DetectedLanguage::Hi);
1084    }
1085
1086    #[test]
1087    fn test_unicode_script_latin_falls_back() {
1088        // Latin text should fall back to n-gram detection within unicode_script strategy.
1089        let result = detect_language_with_strategy(
1090            "The quick brown fox jumps over the lazy dog.",
1091            DetectionStrategy::UnicodeScript,
1092        )
1093        .expect("ok");
1094        // Should still detect English.
1095        assert_eq!(result.language, DetectedLanguage::En);
1096    }
1097}
scirs2_text/language_detection.rs

scirs2_text/
language_detection.rs