tpnote_lib/
lingua.rs

1//! This module abstracts the Lingua library API.
2use crate::settings::SETTINGS;
3use crate::{config::Mode, error::LibCfgError};
4pub(crate) use lingua::IsoCode639_1;
5use lingua::{LanguageDetector, LanguageDetectorBuilder};
6use parse_hyperlinks::iterator::MarkupLink;
7use parse_hyperlinks::parser::Link;
8use std::collections::HashMap; // Reexport this type.
9
/// A filter telling in which natural language(s) the input text is written.
///
/// It returns an array of ISO 639-1 code representations listing the detected
/// languages. If no language can be reliably identified, the output is the
/// empty array. The same holds for empty or whitespace-only input.
///
/// Hyperlinks and images found in `input` contribute only their human
/// readable parts (link text, image alt text, title). Link destinations and
/// labels are not passed to the language detector.
///
/// The behaviour is controlled by `SETTINGS.get_lang_filter`:
///
/// * `Mode::Disabled`: the filter always returns the empty array.
/// * `Mode::Error(e)`: the filter always fails with `e`.
/// * `Mode::Monolingual`: the detector is asked for the language of each
///   text snippet.
/// * `Mode::Multilingual`: the detector searches for single-language
///   sections. Sections with fewer words than
///   `min(consecutive_words_min, words_total / 3)` are ignored. A language
///   is rejected when its accumulated word count is below
///   `words_total_percentage_min` percent of all retained words. The
///   result is ordered by descending word count; languages with the same
///   word count are ordered by their ISO code.
///
/// # Errors
///
/// Returns a clone of the `LibCfgError` stored in `Mode::Error`, if the
/// filter is configured with this mode.
#[cfg(feature = "lang-detection")]
pub(crate) fn get_lang(input: &str) -> Result<Vec<String>, LibCfgError> {
    use crate::FlattenWithIndexExt;
    use itertools::Itertools;
    use std::borrow::Cow;

    /// Appends `part` to `sniplets`, unless `part` is empty.
    fn push_non_empty<'a>(sniplets: &mut Vec<Cow<'a, str>>, part: Cow<'a, str>) {
        if !part.is_empty() {
            sniplets.push(part);
        }
    }

    let input = input.trim();
    // Return early if there is no input text.
    if input.is_empty() {
        return Ok(vec![]);
    }

    let settings = SETTINGS.read_recursive();

    // Check if we can return early.
    match &settings.get_lang_filter.mode {
        Mode::Disabled => return Ok(vec![]),
        Mode::Error(e) => return Err(e.clone()),
        Mode::Monolingual | Mode::Multilingual => {}
    }

    // Build `LanguageDetector`. An empty candidate list means: consider all
    // languages the detector knows about.
    let candidates = &settings.get_lang_filter.language_candidates;
    let mut builder = if candidates.is_empty() {
        log::trace!("Execute template filter `get_lang` with all available languages",);
        LanguageDetectorBuilder::from_all_languages()
    } else {
        log::trace!(
            "Execute template filter `get_lang` with languages candidates: {:?}",
            candidates,
        );
        LanguageDetectorBuilder::from_iso_codes_639_1(candidates)
    };
    let detector: LanguageDetector = builder
        .with_minimum_relative_distance(settings.get_lang_filter.relative_distance_min)
        .build();

    // Remove URLs: keep the text between the links and the readable parts
    // of the links themselves.
    let mut sniplets: Vec<Cow<str>> = Vec::new();
    let mut remnant = "";
    for ((skipped, _, r), link) in MarkupLink::new(input, false) {
        sniplets.push(Cow::from(skipped));
        // Remember the text following the link; after the last iteration
        // this is the tail of the input.
        remnant = r;
        match link {
            Link::Text2Dest(text, _, title) => {
                push_non_empty(&mut sniplets, text);
                push_non_empty(&mut sniplets, title);
            }
            Link::Text2Label(text, _) => push_non_empty(&mut sniplets, text),
            Link::TextLabel2Dest(text, _, _) => push_non_empty(&mut sniplets, text),
            Link::Image(alt_text, _) => push_non_empty(&mut sniplets, alt_text),
            Link::Image2Dest(text1, img_alt, _, text2, _, title) => {
                push_non_empty(&mut sniplets, text1);
                push_non_empty(&mut sniplets, img_alt);
                push_non_empty(&mut sniplets, text2);
                push_non_empty(&mut sniplets, title);
            }
            // The remaining link kinds carry no text worth analysing.
            _ => {}
        }
    }
    if !remnant.is_empty() {
        sniplets.push(Cow::from(remnant));
    }
    // No link was found: analyse the whole input.
    if sniplets.is_empty() {
        sniplets.push(Cow::from(input));
    }
    // End of remove URLs.

    let texts = sniplets.as_slice();

    // Detect languages.
    let detected_languages: Vec<String> = match &settings.get_lang_filter.mode {
        Mode::Multilingual => {
            let consecutive_words_min = settings.get_lang_filter.consecutive_words_min;
            let words_total_percentage_min = settings.get_lang_filter.words_total_percentage_min;

            let words_total: usize = texts
                .iter()
                .map(|slice| slice.split_whitespace().count())
                .sum();
            // `words_total / 3` relaxes the criteria for very short input texts.
            let words_min = consecutive_words_min.min(words_total / 3);
            log::trace!(
                "Language snippets with less than {} words will be ignored.",
                words_min
            );

            // Accumulate the word count per language.
            let words_distribution: HashMap<String, usize> = detector
                .detect_multiple_languages_in_parallel_of(texts)
                .into_iter()
                .flatten_with_index()
                // Filter too short word sequences.
                .filter(|(i, l)| {
                    let allow_through = l.word_count() >= words_min;
                    log::trace!(
                        "Language(s) detected in [{}]: {}, {}, {}: {:?}",
                        i,
                        l.language().iso_code_639_1(),
                        l.word_count(),
                        allow_through,
                        texts[*i][l.start_index()..l.end_index()]
                            .chars()
                            .take(60)
                            .collect::<String>()
                    );
                    allow_through
                })
                .map(|(_, l)| (l.language().iso_code_639_1().to_string(), l.word_count()))
                .into_grouping_map()
                .sum();

            // Descending order sort. The `HashMap` iteration order is
            // arbitrary, so ties are broken by the language code to keep
            // the result reproducible.
            let words_distribution: Vec<(String, usize)> = words_distribution
                .into_iter()
                .sorted_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)))
                .collect();
            log::debug!(
                "Languages distribution per word count:\n {:?}",
                words_distribution
            );

            // Filter languages, whose words do not occur sufficiently in total.
            let words_distribution_total: usize = words_distribution.iter().map(|l| l.1).sum();
            let words_total_min: usize =
                words_distribution_total * words_total_percentage_min / 100;

            // Filter languages with too few words and return language list.
            words_distribution
                .into_iter()
                .filter(|(l, wc)| {
                    if *wc >= words_total_min {
                        return true;
                    }
                    // Reaching this point implies `words_total_min > 0`, hence
                    // `words_distribution_total > 0`: the division is safe.
                    let words_percentage = wc * 100 / words_distribution_total;
                    log::info!(
                        "Language `{}` rejected: not enough words in total ({}%<{}%)",
                        l,
                        words_percentage,
                        words_total_percentage_min
                    );
                    false
                })
                .map(|(l, _)| l)
                .collect::<Vec<String>>()
        }

        // NOTE(review): this presumably yields one entry per text snippet,
        // so the same code may appear more than once when the input
        // contains links. Confirm whether callers rely on this.
        Mode::Monolingual => detector
            .detect_languages_in_parallel_of(texts)
            .into_iter()
            .flatten()
            .map(|l| l.iso_code_639_1().to_string())
            .inspect(|l| log::debug!("Language: '{}' in input detected.", l))
            .collect(),

        Mode::Disabled => unreachable!(), // See early return above.

        Mode::Error(_) => unreachable!(), // See early return above.
    };

    Ok(detected_languages)
}
211
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{GetLang, Mode};
    use crate::settings::Settings;
    use lingua::IsoCode639_1;
    use parking_lot::RwLockWriteGuard;

    /// Builds the `get_lang` filter configuration used by the tests below.
    /// Only the detection `mode` differs between them.
    fn filter_config(mode: Mode) -> GetLang {
        GetLang {
            mode,
            language_candidates: vec![IsoCode639_1::DE, IsoCode639_1::EN, IsoCode639_1::FR],
            relative_distance_min: 0.2,
            consecutive_words_min: 5,
            words_total_percentage_min: 10,
        }
    }

    /// Exercises `get_lang` in `Mode::Multilingual`.
    #[test]
    fn test_get_lang() {
        // The `get_lang` filter requires an initialized `SETTINGS` object.
        // Install the test configuration while holding the write lock.
        let mut write_guard = SETTINGS.write();
        *write_guard = Settings::default();
        write_guard.get_lang_filter = filter_config(Mode::Multilingual);
        // Downgrade to a read lock: this locks `SETTINGS` for further write
        // access until the guard is dropped.
        let read_guard = RwLockWriteGuard::downgrade(write_guard);

        // Single-language inputs: the first reported language must match.
        let single_language_cases = [
            ("Das große Haus", "de"),
            ("Il est venu trop tard", "fr"),
            ("How to set up a roof rack", "en"),
        ];
        for &(input, expected) in single_language_cases.iter() {
            let output = get_lang(input).unwrap();
            assert_eq!(expected, output[0]);
        }

        // Inputs without natural language text yield no language.
        let no_language_cases = ["1917039480 50198%-328470", " \t\n "];
        for &input in no_language_cases.iter() {
            let output = get_lang(input).unwrap();
            assert!(output.is_empty());
        }

        // Execute template filter `get_lang` with languages candidates: [EN, FR, DE, ET]
        // Language(s) detected: fr, 2, false: "Parlez-vous français?"
        // Language(s) detected: de, 7, true: "Ich spreche Französisch nur ein bisschen."
        // Language(s) detected: en, 6, true: "little bit is better than nothing."
        // Language(s) detected: de, 3, false: "Noch mehr Deutsch."
        // Language(s) detected: fr, 9, true: "Bien-sûr, je parle un peu. Qu'est-ce que tu veux?"
        // Languages distribution per word count: [("fr", 9), ("de", 7), ("en", 6)]
        let five_sections = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing. \
            Noch mehr Deutsch. \
            Bien-sûr, je parle un peu. Qu'est-ce que tu veux?";
        assert_eq!(get_lang(five_sections).unwrap(), ["fr", "de", "en"]);

        // Scheme index: 0, applying the content template: `tmpl.from_clipboard_content`
        // Execute template filter `get_lang` with languages candidates: [EN, FR, DE, ET]
        // Language(s) detected: fr, 2, false: "Parlez-vous français?"
        // Language(s) detected: de, 7, true: "Ich spreche Französisch nur ein bisschen."
        // Language(s) detected: en, 6, true: "little bit is better than nothing."
        // Languages distribution per word count: [("de", 7), ("en", 6)]
        let three_sections = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing.";
        assert_eq!(get_lang(three_sections).unwrap(), ["de", "en"]);

        // Release the lock.
        drop(read_guard);
    }

    /// Exercises `get_lang` in `Mode::Monolingual`.
    #[test]
    fn test_get_lang2() {
        // The `get_lang` filter requires an initialized `SETTINGS` object.
        // Install the test configuration while holding the write lock.
        let mut write_guard = SETTINGS.write();
        *write_guard = Settings::default();
        write_guard.get_lang_filter = filter_config(Mode::Monolingual);
        // Downgrade to a read lock: this locks `SETTINGS` for further write
        // access until the guard is dropped.
        let read_guard = RwLockWriteGuard::downgrade(write_guard);

        // Single-language inputs: the first reported language must match.
        let single_language_cases = [
            ("Das große Haus", "de"),
            ("Il est venu trop tard", "fr"),
            ("How to set up a roof rack", "en"),
        ];
        for &(input, expected) in single_language_cases.iter() {
            let output = get_lang(input).unwrap();
            assert_eq!(expected, output[0]);
        }

        // Inputs without natural language text yield no language.
        let no_language_cases = ["1917039480 50198%-328470", " \t\n "];
        for &input in no_language_cases.iter() {
            let output = get_lang(input).unwrap();
            assert!(output.is_empty());
        }

        // A mixed-language text is reported as exactly one language.
        let three_sections = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing.";
        let output = get_lang(three_sections).unwrap();
        assert_eq!(output.len(), 1);
        assert_eq!("de", output[0]);

        // Release the lock.
        drop(read_guard);
    }
}