1use crate::settings::SETTINGS;
3use crate::{config::Mode, error::LibCfgError};
4pub(crate) use lingua::IsoCode639_1;
5use lingua::{LanguageDetector, LanguageDetectorBuilder};
6use std::collections::HashMap; #[cfg(feature = "lang-detection")]
13pub(crate) fn get_lang(input: &str) -> Result<Vec<String>, LibCfgError> {
14    use itertools::Itertools;
15
16    let input = input.trim();
17    if input.is_empty() {
19        return Ok(vec![]);
20    }
21
22    let settings = SETTINGS.read_recursive();
23
24    match &settings.get_lang_filter.mode {
26        Mode::Disabled => return Ok(vec![]),
27
28        Mode::Error(e) => return Err(e.clone()),
29        _ => {}
30    }
31
32    let detector: LanguageDetector = if !&settings.get_lang_filter.language_candidates.is_empty() {
34        log::trace!(
35            "Execute template filter `get_lang` \
36                        with languages candidates: {:?}",
37            &settings.get_lang_filter.language_candidates,
38        );
39
40        LanguageDetectorBuilder::from_iso_codes_639_1(&settings.get_lang_filter.language_candidates)
41            .with_minimum_relative_distance(settings.get_lang_filter.relative_distance_min)
42            .build()
43    } else {
44        log::trace!(
45            "Execute template filter `get_lang` \
46                        with all available languages",
47        );
48        LanguageDetectorBuilder::from_all_languages()
49            .with_minimum_relative_distance(settings.get_lang_filter.relative_distance_min)
50            .build()
51    };
52
53    let detected_languages: Vec<String> = match &settings.get_lang_filter.mode {
55        Mode::Multilingual => {
56            let consecutive_words_min = settings.get_lang_filter.consecutive_words_min;
58            let words_total_percentage_min = settings.get_lang_filter.words_total_percentage_min;
59
60            let words_total = input.split_whitespace().count();
61            let words_min = [consecutive_words_min, words_total / 3];
63            let words_min = words_min.iter().min().unwrap();
64            log::trace!(
65                "Language snippets with less than {} words will be ignored.",
66                words_min
67            );
68
69            let words_distribution: HashMap<String, usize> = detector
70                .detect_multiple_languages_of(input)
71                .into_iter()
72                .filter(|l| {
74                    let allow_through = l.word_count() >= *words_min;
75                    log::trace!(
76                        "Language(s) detected: {}, {}, {}: {:?}",
77                        l.language().iso_code_639_1(),
78                        l.word_count(),
79                        allow_through,
80                        input[l.start_index()..l.end_index()]
81                            .chars()
82                            .take(50)
83                            .collect::<String>()
84                    );
85                    allow_through
86                })
87                .map(|l| (l.language().iso_code_639_1().to_string(), l.word_count()))
88                .into_grouping_map_by(|n| n.0.clone())
89                .aggregate(|acc, _key, val| Some(acc.unwrap_or(0) + val.1));
90
91            let words_distribution: Vec<(String, usize)> = words_distribution
93                .into_iter()
94                .sorted_by_key(|l| usize::MAX - l.1)
95                .collect();
96            log::debug!(
97                "Languages distribution per word count:\n {:?}",
98                words_distribution
99            );
100
101            let words_distribution_total: usize = words_distribution.iter().map(|l| l.1).sum();
103            let words_total_min: usize =
104                words_distribution_total * words_total_percentage_min / 100;
105
106            words_distribution
108                .into_iter()
109                .filter(|(l, wc)| {
110                    if *wc >= words_total_min {
111                        true
112                    } else {
113                        let words_percentage = wc * 100 / words_distribution_total;
114                        log::info!(
115                            "Language `{}` rejected: not enough words in total ({}%<{}%)",
116                            l,
117                            words_percentage,
118                            words_total_percentage_min
119                        );
120                        false
121                    }
122                })
123                .map(|(l, _)| l)
124                .collect::<Vec<String>>()
125        }
126
127        Mode::Monolingual => detector
128            .detect_language_of(input)
129            .into_iter()
130            .map(|l| l.iso_code_639_1().to_string())
131            .inspect(|l| log::debug!("Language: '{}' in input detected.", l))
132            .collect(),
133
134        Mode::Disabled => unreachable!(), Mode::Error(_) => unreachable!(), };
138
139    Ok(detected_languages)
140}
141
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        config::{GetLang, Mode},
        settings::Settings,
    };
    use lingua::IsoCode639_1;
    use parking_lot::RwLockWriteGuard;

    /// Builds the filter configuration shared by all tests; only the
    /// detection `mode` differs between them.
    fn filter_config(mode: Mode) -> GetLang {
        GetLang {
            mode,
            language_candidates: vec![IsoCode639_1::DE, IsoCode639_1::EN, IsoCode639_1::FR],
            relative_distance_min: 0.2,
            consecutive_words_min: 5,
            words_total_percentage_min: 10,
        }
    }

    /// Exercises `get_lang()` in `Mode::Multilingual`.
    #[test]
    fn test_get_lang() {
        // Install the test configuration, then keep a read lock for the
        // whole test; `get_lang()` acquires its own lock with
        // `read_recursive()` while this one is held.
        let mut write_guard = SETTINGS.write();
        *write_guard = Settings::default();
        write_guard.get_lang_filter = filter_config(Mode::Multilingual);
        let read_guard = RwLockWriteGuard::<'_, _>::downgrade(write_guard);

        // Inputs written in one language only.
        for (text, expected) in [
            ("Das große Haus", "de"),
            ("Il est venu trop tard", "fr"),
            ("How to set up a roof rack", "en"),
        ] {
            let langs = get_lang(text).unwrap();
            assert_eq!(expected, langs[0]);
        }

        // Inputs without any recognizable language.
        for text in ["1917039480 50198%-328470", " \t\n "] {
            let langs = get_lang(text).unwrap();
            assert!(langs.is_empty());
        }

        // Three languages mixed in one input.
        let text = "Parlez-vous français? \
        Ich spreche Französisch nur ein bisschen. \
        A little bit is better than nothing. \
        Noch mehr Deutsch. \
        Bien-sûr, je parle un peu. Qu'est-ce que tu veux?";
        let langs = get_lang(text).unwrap();
        assert_eq!(langs, ["fr", "de", "en"]);

        // The same input shortened to its first three sentences.
        let text = "Parlez-vous français? \
        Ich spreche Französisch nur ein bisschen. \
        A little bit is better than nothing.";
        let langs = get_lang(text).unwrap();
        assert_eq!(langs, ["de", "en"]);

        drop(read_guard);
    }

    /// Exercises `get_lang()` in `Mode::Monolingual`.
    #[test]
    fn test_get_lang2() {
        // Same locking scheme as in `test_get_lang()`.
        let mut write_guard = SETTINGS.write();
        *write_guard = Settings::default();
        write_guard.get_lang_filter = filter_config(Mode::Monolingual);
        let read_guard = RwLockWriteGuard::<'_, _>::downgrade(write_guard);

        // Inputs written in one language only.
        for (text, expected) in [
            ("Das große Haus", "de"),
            ("Il est venu trop tard", "fr"),
            ("How to set up a roof rack", "en"),
        ] {
            let langs = get_lang(text).unwrap();
            assert_eq!(expected, langs[0]);
        }

        // Inputs without any recognizable language.
        for text in ["1917039480 50198%-328470", " \t\n "] {
            let langs = get_lang(text).unwrap();
            assert!(langs.is_empty());
        }

        // Mixed input: exactly one language is reported in this mode.
        let text = "Parlez-vous français? \
        Ich spreche Französisch nur ein bisschen. \
        A little bit is better than nothing.";
        let langs = get_lang(text).unwrap();
        assert_eq!(langs.len(), 1);
        assert_eq!("de", langs[0]);

        drop(read_guard);
    }
}