tpnote_lib/
lingua.rs

1//! This module abstracts the Lingua library API.
2use crate::settings::SETTINGS;
3use crate::{config::Mode, error::LibCfgError};
4pub(crate) use lingua::IsoCode639_1;
5use lingua::{LanguageDetector, LanguageDetectorBuilder};
6use std::collections::HashMap; // Reexport this type.
7
/// A filter telling in which natural language(s) the input text is written.
///
/// Returns the ISO 639-1 codes of the detected languages as strings. The
/// result is empty if `input` contains only whitespace, if the filter mode is
/// `Mode::Disabled`, or if no language passes the detection criteria.
///
/// The behavior is driven by `SETTINGS.get_lang_filter`, which is read under
/// a (recursive) read guard held until the function returns:
///
/// * `Mode::Monolingual`: the language reported by `detect_language_of()`,
///   if any, is returned.
/// * `Mode::Multilingual`: the input is split into single-language snippets
///   by `detect_multiple_languages_of()`. Snippets with fewer than
///   `min(consecutive_words_min, words_total / 3)` words are ignored. The
///   word counts of the remaining snippets are summed up per language and
///   languages contributing less than `words_total_percentage_min` percent
///   of these words are rejected. The result is sorted by descending word
///   count; languages with equal word counts are ordered by their language
///   code, so that the output is deterministic.
///
/// # Errors
///
/// Returns a clone of the stored error if the filter mode is `Mode::Error`.
#[cfg(feature = "lang-detection")]
pub(crate) fn get_lang(input: &str) -> Result<Vec<String>, LibCfgError> {
    let input = input.trim();
    // Return early if there is no input text.
    if input.is_empty() {
        return Ok(vec![]);
    }

    let settings = SETTINGS.read_recursive();
    let filter = &settings.get_lang_filter;

    // Check if we can return early. The match is exhaustive on purpose: a
    // new `Mode` variant must be considered here explicitly.
    match &filter.mode {
        Mode::Disabled => return Ok(vec![]),
        Mode::Error(e) => return Err(e.clone()),
        Mode::Multilingual | Mode::Monolingual => {}
    }

    // Build `LanguageDetector`. An empty candidate list means: consider all
    // languages the library knows about.
    let mut builder = if filter.language_candidates.is_empty() {
        log::trace!(
            "Execute template filter `get_lang` \
                        with all available languages",
        );
        LanguageDetectorBuilder::from_all_languages()
    } else {
        log::trace!(
            "Execute template filter `get_lang` \
                        with languages candidates: {:?}",
            &filter.language_candidates,
        );
        LanguageDetectorBuilder::from_iso_codes_639_1(&filter.language_candidates)
    };
    let detector: LanguageDetector = builder
        .with_minimum_relative_distance(filter.relative_distance_min)
        .build();

    // Detect languages.
    let detected_languages: Vec<String> = match &filter.mode {
        Mode::Multilingual => {
            let words_total_percentage_min = filter.words_total_percentage_min;

            let words_total = input.split_whitespace().count();
            // `words_total / 3` relaxes the criteria for very short input texts.
            let words_min = filter.consecutive_words_min.min(words_total / 3);
            log::trace!(
                "Language snippets with less than {} words will be ignored.",
                words_min
            );

            // Sum up the word counts of all sufficiently long snippets,
            // grouped by language code.
            let mut words_per_language: HashMap<String, usize> = HashMap::new();
            for snippet in detector.detect_multiple_languages_of(input) {
                let word_count = snippet.word_count();
                // Filter too short word sequences.
                let allow_through = word_count >= words_min;
                log::trace!(
                    "Language(s) detected: {}, {}, {}: {:?}",
                    snippet.language().iso_code_639_1(),
                    word_count,
                    allow_through,
                    // `get()` instead of indexing: logging must never panic.
                    input
                        .get(snippet.start_index()..snippet.end_index())
                        .unwrap_or("")
                        .chars()
                        .take(50)
                        .collect::<String>()
                );
                if allow_through {
                    *words_per_language
                        .entry(snippet.language().iso_code_639_1().to_string())
                        .or_insert(0) += word_count;
                }
            }

            // Descending order sort by word count. The iteration order of a
            // `HashMap` is unspecified, therefore ties are broken by the
            // language code to obtain a reproducible result.
            let mut words_distribution: Vec<(String, usize)> =
                words_per_language.into_iter().collect();
            words_distribution.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
            log::debug!(
                "Languages distribution per word count:\n {:?}",
                words_distribution
            );

            // Minimum number of words a language must contribute in total.
            let words_distribution_total: usize = words_distribution.iter().map(|l| l.1).sum();
            let words_total_min: usize =
                words_distribution_total * words_total_percentage_min / 100;

            // Filter languages with too few words and return language list.
            words_distribution
                .into_iter()
                .filter(|(l, wc)| {
                    if *wc >= words_total_min {
                        true
                    } else {
                        // Only reached when `words_distribution` is not
                        // empty, so the divisor is never zero here.
                        let words_percentage = wc * 100 / words_distribution_total;
                        log::info!(
                            "Language `{}` rejected: not enough words in total ({}%<{}%)",
                            l,
                            words_percentage,
                            words_total_percentage_min
                        );
                        false
                    }
                })
                .map(|(l, _)| l)
                .collect::<Vec<String>>()
        }

        Mode::Monolingual => detector
            .detect_language_of(input)
            .into_iter()
            .map(|l| l.iso_code_639_1().to_string())
            .inspect(|l| log::debug!("Language: '{}' in input detected.", l))
            .collect(),

        // Both modes are handled by the early return above.
        Mode::Disabled | Mode::Error(_) => unreachable!(),
    };

    Ok(detected_languages)
}
141
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        config::{GetLang, Mode},
        settings::Settings,
    };
    use lingua::IsoCode639_1;
    use parking_lot::RwLockWriteGuard;

    /// Inputs written in exactly one language, paired with the language code
    /// expected at the first position of the filter output.
    const SINGLE_LANGUAGE_CASES: [(&str, &str); 3] = [
        ("Das große Haus", "de"),
        ("Il est venu trop tard", "fr"),
        ("How to set up a roof rack", "en"),
    ];

    /// Inputs for which the filter is expected to return no language at all.
    const NO_LANGUAGE_CASES: [&str; 2] = ["1917039480 50198%-328470", " \t\n "];

    /// Returns the filter configuration used by the tests below; only the
    /// detection `mode` differs between them.
    fn filter_config(mode: Mode) -> GetLang {
        GetLang {
            mode,
            language_candidates: vec![IsoCode639_1::DE, IsoCode639_1::EN, IsoCode639_1::FR],
            relative_distance_min: 0.2,
            consecutive_words_min: 5,
            words_total_percentage_min: 10,
        }
    }

    /// Checks the filter in `Mode::Multilingual`.
    #[test]
    fn test_get_lang() {
        // The `get_lang` filter requires an initialized `SETTINGS` object.
        let get_lang_filter = filter_config(Mode::Multilingual);

        let mut write_guard = SETTINGS.write();
        *write_guard = Settings::default();
        write_guard.get_lang_filter = get_lang_filter;
        // Keep a read guard for the rest of the test, so that nobody else
        // can modify `SETTINGS` in the meantime.
        let read_guard = RwLockWriteGuard::downgrade(write_guard);

        for (input, expected) in SINGLE_LANGUAGE_CASES {
            let output = get_lang(input).unwrap();
            assert_eq!(expected, output[0]);
        }

        for input in NO_LANGUAGE_CASES {
            let output = get_lang(input).unwrap();
            assert!(output.is_empty());
        }

        let input = "Parlez-vous français? \
        Ich spreche Französisch nur ein bisschen. \
        A little bit is better than nothing. \
        Noch mehr Deutsch. \
        Bien-sûr, je parle un peu. Qu'est-ce que tu veux?";
        let output = get_lang(input).unwrap();

        // Recorded trace output for this input. Note: the recording lists
        // the candidates [EN, FR, DE, ET], which differs from the candidate
        // list configured in this test.
        //
        // Language(s) detected: fr, 2, false: "Parlez-vous français?"
        // Language(s) detected: de, 7, true: "Ich spreche Französisch nur ein bisschen."
        // Language(s) detected: en, 6, true: "little bit is better than nothing."
        // Language(s) detected: de, 3, false: "Noch mehr Deutsch."
        // Language(s) detected: fr, 9, true: "Bien-sûr, je parle un peu. Qu'est-ce que tu veux?"
        // Languages distribution per word count: [("fr", 9), ("de", 7), ("en", 6)]
        assert_eq!(output, ["fr", "de", "en"]);

        let input = "Parlez-vous français? \
        Ich spreche Französisch nur ein bisschen. \
        A little bit is better than nothing.";
        let output = get_lang(input).unwrap();

        // Recorded trace output for this input (same remark about the
        // candidate list as above).
        //
        // Scheme index: 0, applying the content template: `tmpl.from_clipboard_content`
        // Language(s) detected: fr, 2, false: "Parlez-vous français?"
        // Language(s) detected: de, 7, true: "Ich spreche Französisch nur ein bisschen."
        // Language(s) detected: en, 6, true: "little bit is better than nothing."
        // Languages distribution per word count: [("de", 7), ("en", 6)]
        assert_eq!(output, ["de", "en"]);

        // Release the lock.
        drop(read_guard);
    }

    /// Checks the filter in `Mode::Monolingual`.
    #[test]
    fn test_get_lang2() {
        // The `get_lang` filter requires an initialized `SETTINGS` object.
        let get_lang_filter = filter_config(Mode::Monolingual);

        let mut write_guard = SETTINGS.write();
        *write_guard = Settings::default();
        write_guard.get_lang_filter = get_lang_filter;
        // Keep a read guard for the rest of the test, so that nobody else
        // can modify `SETTINGS` in the meantime.
        let read_guard = RwLockWriteGuard::downgrade(write_guard);

        for (input, expected) in SINGLE_LANGUAGE_CASES {
            let output = get_lang(input).unwrap();
            assert_eq!(expected, output[0]);
        }

        for input in NO_LANGUAGE_CASES {
            let output = get_lang(input).unwrap();
            assert!(output.is_empty());
        }

        // Mixed-language input: exactly one language is expected here.
        let input = "Parlez-vous français? \
        Ich spreche Französisch nur ein bisschen. \
        A little bit is better than nothing.";
        let output = get_lang(input).unwrap();
        assert_eq!(output.len(), 1);
        assert_eq!("de", output[0]);

        // Release the lock.
        drop(read_guard);
    }
}