Skip to main content

lang_check/
detection.rs

1use whatlang::{Detector, Info, Lang};
2
/// Outcome of natural-language detection for a single piece of text.
///
/// Values of this type are built by the detection functions in this module from
/// the language, confidence and reliability flag reported by the detector.
#[derive(Debug, Clone, PartialEq)]
pub struct DetectedLanguage {
    /// Short language tag in BCP-47 style, such as "en", "fr" or "de".
    pub tag: String,
    /// Detector confidence, ranging from 0.0 to 1.0.
    pub confidence: f64,
    /// `true` when the detector regards this result as reliable.
    pub reliable: bool,
}
13
14/// Detect the natural language of a text segment.
15///
16/// Returns `None` if the text is too short or ambiguous for reliable detection.
17#[must_use]
18pub fn detect(text: &str) -> Option<DetectedLanguage> {
19    let info = whatlang::detect(text)?;
20    Some(DetectedLanguage {
21        tag: lang_to_tag(info.lang()),
22        confidence: info.confidence(),
23        reliable: info.is_reliable(),
24    })
25}
26
27/// Detect with a language allowlist (e.g. only detect among supported languages).
28#[must_use]
29pub fn detect_with_allowlist(text: &str, allowed: &[&str]) -> Option<DetectedLanguage> {
30    let langs: Vec<Lang> = allowed.iter().filter_map(|t| tag_to_lang(t)).collect();
31    if langs.is_empty() {
32        return detect(text);
33    }
34    let detector = Detector::with_allowlist(langs);
35    let info: Info = detector.detect(text)?;
36    Some(DetectedLanguage {
37        tag: lang_to_tag(info.lang()),
38        confidence: info.confidence(),
39        reliable: info.is_reliable(),
40    })
41}
42
43/// Convert `whatlang::Lang` to a BCP-47 language tag.
44fn lang_to_tag(lang: Lang) -> String {
45    match lang {
46        Lang::Eng => "en",
47        Lang::Fra => "fr",
48        Lang::Deu => "de",
49        Lang::Spa => "es",
50        Lang::Por => "pt",
51        Lang::Ita => "it",
52        Lang::Nld => "nl",
53        Lang::Rus => "ru",
54        Lang::Pol => "pl",
55        Lang::Swe => "sv",
56        Lang::Dan => "da",
57        Lang::Nob => "no",
58        Lang::Fin => "fi",
59        Lang::Ukr => "uk",
60        Lang::Ces => "cs",
61        Lang::Ron => "ro",
62        Lang::Hun => "hu",
63        Lang::Tur => "tr",
64        Lang::Jpn => "ja",
65        Lang::Cmn => "zh",
66        Lang::Kor => "ko",
67        Lang::Ara => "ar",
68        Lang::Hin => "hi",
69        _ => "und", // undetermined
70    }
71    .to_string()
72}
73
74/// Convert a BCP-47 tag back to `whatlang::Lang`, if supported.
75fn tag_to_lang(tag: &str) -> Option<Lang> {
76    // Normalize: take the primary subtag only (e.g. "en-US" -> "en")
77    let primary = tag.split('-').next().unwrap_or(tag);
78    match primary {
79        "en" => Some(Lang::Eng),
80        "fr" => Some(Lang::Fra),
81        "de" => Some(Lang::Deu),
82        "es" => Some(Lang::Spa),
83        "pt" => Some(Lang::Por),
84        "it" => Some(Lang::Ita),
85        "nl" => Some(Lang::Nld),
86        "ru" => Some(Lang::Rus),
87        "pl" => Some(Lang::Pol),
88        "sv" => Some(Lang::Swe),
89        "da" => Some(Lang::Dan),
90        "no" => Some(Lang::Nob),
91        "fi" => Some(Lang::Fin),
92        "uk" => Some(Lang::Ukr),
93        "cs" => Some(Lang::Ces),
94        "ro" => Some(Lang::Ron),
95        "hu" => Some(Lang::Hun),
96        "tr" => Some(Lang::Tur),
97        "ja" => Some(Lang::Jpn),
98        "zh" => Some(Lang::Cmn),
99        "ko" => Some(Lang::Kor),
100        "ar" => Some(Lang::Ara),
101        "hi" => Some(Lang::Hin),
102        _ => None,
103    }
104}
105
#[cfg(test)]
mod tests {
    use super::*;

    /// Run unrestricted detection, failing the test when nothing is detected.
    fn detected(text: &str) -> DetectedLanguage {
        detect(text).expect("expected a language to be detected")
    }

    #[test]
    fn detect_english() {
        let found =
            detected("The quick brown fox jumped over the lazy dog. It was a beautiful day.");
        assert_eq!(found.tag, "en");
        assert!(found.confidence > 0.5);
    }

    #[test]
    fn detect_french() {
        let found =
            detected("Bonjour le monde. Comment allez-vous aujourd'hui? C'est une belle journée.");
        assert_eq!(found.tag, "fr");
    }

    #[test]
    fn detect_german() {
        let found = detected(
            "Die schnelle braune Fuchs springt über den faulen Hund. Es war ein schöner Tag.",
        );
        assert_eq!(found.tag, "de");
    }

    #[test]
    fn detect_spanish() {
        let found =
            detected("El rápido zorro marrón salta sobre el perro perezoso. Fue un día hermoso.");
        assert_eq!(found.tag, "es");
    }

    #[test]
    fn detect_too_short_returns_none() {
        // Very short text may or may not produce a detection; when it does,
        // the detection must not be both reliable and highly confident.
        let acceptable = detect("Hi").map_or(true, |d| !d.reliable || d.confidence < 0.9);
        assert!(acceptable);
    }

    #[test]
    fn detect_with_allowlist_restricts() {
        let sample = "The quick brown fox jumped over the lazy dog.";
        let found = detect_with_allowlist(sample, &["en", "fr"])
            .expect("expected a language to be detected");
        assert!(matches!(found.tag.as_str(), "en" | "fr"));
    }

    #[test]
    fn tag_roundtrip() {
        for tag in ["en", "fr", "de", "es", "ja", "zh", "ko"] {
            let lang = tag_to_lang(tag).unwrap_or_else(|| panic!("tag_to_lang failed for {tag}"));
            assert_eq!(lang_to_tag(lang), tag, "roundtrip failed for {tag}");
        }
    }

    #[test]
    fn tag_from_bcp47_with_region() {
        assert_eq!(tag_to_lang("en-US"), Some(Lang::Eng));
    }
}