Skip to main content

tpnote_lib/
lingua.rs

1//! This module abstracts the Lingua library API.
2use crate::settings::SETTINGS;
3use crate::{config::Mode, error::LibCfgError};
4pub(crate) use lingua::IsoCode639_1;
5use lingua::{LanguageDetector, LanguageDetectorBuilder};
6use parse_hyperlinks::iterator::MarkupLink;
7use parse_hyperlinks::parser::Link;
8use std::collections::HashMap; // Reexport this type.
9
/// A filter telling in which natural language(s) the input text is written.
///
/// The input is trimmed and then scanned for hyperlinks and images: only the
/// text between links plus the human-readable parts of each link (link text,
/// title, image alt text) are passed to the language detector; link
/// destinations and labels are left out.
///
/// The behaviour depends on `SETTINGS.get_lang_filter.mode`:
///
/// * `Mode::Disabled`: always returns an empty vector.
/// * `Mode::Monolingual`: every analysed text fragment contributes at most
///   one ISO 639-1 code, in fragment order. As a consequence the same code
///   may appear more than once when the input contains links.
/// * `Mode::Multilingual`: word sequences shorter than
///   `min(consecutive_words_min, words_total / 3)` are ignored. The remaining
///   word counts are summed up per language, and languages contributing less
///   than `words_total_percentage_min` percent are dropped. The result is
///   ordered by descending word count; languages with equal word counts are
///   ordered alphabetically by their code, so the output is deterministic.
///
/// If the (trimmed) input is empty or no language can be reliably
/// identified, the output is the empty vector.
///
/// # Errors
///
/// Returns a clone of the stored `LibCfgError` when the filter mode is
/// `Mode::Error`.
#[cfg(feature = "lang-detection")]
pub(crate) fn get_lang(input: &str) -> Result<Vec<String>, LibCfgError> {
    use crate::FlattenWithIndexExt;
    use std::borrow::Cow;

    /// Appends `part` to `snippets`, unless `part` is empty.
    fn push_non_empty<'a>(snippets: &mut Vec<Cow<'a, str>>, part: Cow<'a, str>) {
        if !part.is_empty() {
            snippets.push(part);
        }
    }

    let input = input.trim();
    // Return early if there is no input text.
    if input.is_empty() {
        return Ok(vec![]);
    }

    // The read lock is held until the end of the function.
    let settings = SETTINGS.read_recursive();
    let lang_cfg = &settings.get_lang_filter;

    // Check if we can return early.
    match &lang_cfg.mode {
        Mode::Disabled => return Ok(vec![]),
        Mode::Error(e) => return Err(e.clone()),
        Mode::Multilingual | Mode::Monolingual => {}
    }

    // Build `LanguageDetector`.
    let candidates = &lang_cfg.language_candidates;
    let relative_distance_min = lang_cfg.relative_distance_min;
    let detector: LanguageDetector = if candidates.is_empty() {
        log::trace!(
            "Execute template filter `get_lang` \
                        with all available languages",
        );
        LanguageDetectorBuilder::from_all_languages()
            .with_minimum_relative_distance(relative_distance_min)
            .build()
    } else {
        log::trace!(
            "Execute template filter `get_lang` \
                        with languages candidates: {:?}",
            candidates,
        );
        LanguageDetectorBuilder::from_iso_codes_639_1(candidates)
            .with_minimum_relative_distance(relative_distance_min)
            .build()
    };

    // Remove URLs: keep the text between the links and the readable parts of
    // every link, drop destinations and labels.
    let mut snippets: Vec<Cow<str>> = Vec::new();
    let mut remnant = "";
    for ((skipped, _, rest), link) in MarkupLink::new(input, false) {
        snippets.push(Cow::from(skipped));
        // After the last iteration this is the text following the last link.
        remnant = rest;
        match link {
            Link::Text2Dest(text, _, title) => {
                push_non_empty(&mut snippets, text);
                push_non_empty(&mut snippets, title);
            }
            Link::Text2Label(text, _) | Link::TextLabel2Dest(text, _, _) => {
                push_non_empty(&mut snippets, text);
            }
            Link::Image(alt_text, _) => {
                push_non_empty(&mut snippets, alt_text);
            }
            Link::Image2Dest(text1, img_alt, _, text2, _, title) => {
                push_non_empty(&mut snippets, text1);
                push_non_empty(&mut snippets, img_alt);
                push_non_empty(&mut snippets, text2);
                push_non_empty(&mut snippets, title);
            }
            _ => {}
        }
    }
    if !remnant.is_empty() {
        snippets.push(Cow::from(remnant));
    }
    // No link was found at all: analyse the whole input.
    if snippets.is_empty() {
        snippets.push(Cow::from(input));
    }
    // End of remove URLs.

    let texts = snippets.as_slice();

    // Detect languages.
    let detected_languages: Vec<String> = match &lang_cfg.mode {
        Mode::Multilingual => {
            let words_total: usize = texts
                .iter()
                .map(|text| text.split_whitespace().count())
                .sum();
            // `words_total / 3` relaxes the criteria for very short input texts.
            let words_min = lang_cfg.consecutive_words_min.min(words_total / 3);
            log::trace!(
                "Language snippets with less than {} words will be ignored.",
                words_min
            );

            // Sum up the word counts per language, ignoring too short word
            // sequences.
            let mut words_per_language: HashMap<String, usize> = HashMap::new();
            for (i, l) in detector
                .detect_multiple_languages_in_parallel_of(texts)
                .into_iter()
                .flatten_with_index()
            {
                let allow_through = l.word_count() >= words_min;
                log::trace!(
                    "Language(s) detected in [{}]: {}, {}, {}: {:?}",
                    i,
                    l.language().iso_code_639_1(),
                    l.word_count(),
                    allow_through,
                    // Checked access: a log statement must never panic.
                    texts
                        .get(i)
                        .and_then(|text| text.get(l.start_index()..l.end_index()))
                        .unwrap_or_default()
                        .chars()
                        .take(60)
                        .collect::<String>()
                );
                if allow_through {
                    *words_per_language
                        .entry(l.language().iso_code_639_1().to_string())
                        .or_insert(0) += l.word_count();
                }
            }

            // Descending order sort by word count. The language code breaks
            // ties, otherwise the order of equally frequent languages would
            // depend on the random iteration order of the `HashMap`.
            let mut words_distribution: Vec<(String, usize)> =
                words_per_language.into_iter().collect();
            words_distribution.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
            log::debug!(
                "Languages distribution per word count:\n {:?}",
                words_distribution
            );

            // Minimum number of words a language must contribute in total.
            let words_distribution_total: usize = words_distribution.iter().map(|l| l.1).sum();
            let words_total_min: usize =
                words_distribution_total * lang_cfg.words_total_percentage_min / 100;

            // Filter languages with too few words and return language list.
            words_distribution
                .into_iter()
                .filter(|(l, wc)| {
                    if *wc >= words_total_min {
                        true
                    } else {
                        // `wc < words_total_min` implies a non-zero total, so
                        // this division is safe.
                        let words_percentage = wc * 100 / words_distribution_total;
                        log::info!(
                            "Language `{}` rejected: not enough words in total ({}%<{}%)",
                            l,
                            words_percentage,
                            lang_cfg.words_total_percentage_min
                        );
                        false
                    }
                })
                .map(|(l, _)| l)
                .collect::<Vec<String>>()
        }

        Mode::Monolingual => detector
            .detect_languages_in_parallel_of(texts)
            .into_iter()
            .flatten()
            .map(|l| l.iso_code_639_1().to_string())
            .inspect(|l| log::debug!("Language: '{}' in input detected.", l))
            .collect(),

        Mode::Disabled => unreachable!(), // See early return above.

        Mode::Error(_) => unreachable!(), // See early return above.
    };

    Ok(detected_languages)
}
210
#[cfg(test)]
mod tests {
    use super::*;
    use parking_lot::RwLockWriteGuard;

    /// Exercises `get_lang` in `Mode::Multilingual` with the candidate
    /// languages German, English and French.
    #[test]
    fn test_get_lang() {
        use crate::{
            config::{GetLang, Mode},
            settings::Settings,
        };
        use lingua::IsoCode639_1;

        // `get_lang` reads its configuration from the global `SETTINGS`
        // object, so install the configuration for this test there.
        let multilingual_cfg = GetLang {
            mode: Mode::Multilingual,
            language_candidates: vec![IsoCode639_1::DE, IsoCode639_1::EN, IsoCode639_1::FR],
            relative_distance_min: 0.2,
            consecutive_words_min: 5,
            words_total_percentage_min: 10,
        };

        let mut write_guard = SETTINGS.write();
        *write_guard = Settings::default();
        write_guard.get_lang_filter = multilingual_cfg;
        // Keep a read guard for the rest of the test: this blocks further
        // write access to `SETTINGS` in this scope.
        let read_guard = RwLockWriteGuard::<'_, _>::downgrade(write_guard);

        // Inputs written in exactly one language.
        for (input, expected) in [
            ("Das große Haus", "de"),
            ("Il est venu trop tard", "fr"),
            ("How to set up a roof rack", "en"),
        ] {
            let output = get_lang(input).unwrap();
            assert_eq!(expected, output[0]);
        }

        // Inputs without any recognizable language.
        for input in ["1917039480 50198%-328470", " \t\n "] {
            let output = get_lang(input).unwrap();
            assert!(output.is_empty());
        }

        let input = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing. \
            Noch mehr Deutsch. \
            Bien-sûr, je parle un peu. Qu'est-ce que tu veux?";
        let output = get_lang(input).unwrap();

        // Sample trace log for this input (note: the candidate list shown in
        // the log differs from the configuration used in this test):
        //
        // Execute template filter `get_lang` with languages candidates: [EN, FR, DE, ET]
        // Language(s) detected: fr, 2, false: "Parlez-vous français?"
        // Language(s) detected: de, 7, true: "Ich spreche Französisch nur ein bisschen."
        // Language(s) detected: en, 6, true: "little bit is better than nothing."
        // Language(s) detected: de, 3, false: "Noch mehr Deutsch."
        // Language(s) detected: fr, 9, true: "Bien-sûr, je parle un peu. Qu'est-ce que tu veux?"
        // Languages distribution per word count: [("fr", 9), ("de", 7), ("en", 6)]
        assert_eq!(output, ["fr", "de", "en"]);

        let input = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing.";
        let output = get_lang(input).unwrap();

        // Sample trace log for this input (same remark as above):
        //
        // Scheme index: 0, applying the content template: `tmpl.from_clipboard_content`
        // Execute template filter `get_lang` with languages candidates: [EN, FR, DE, ET]
        // Language(s) detected: fr, 2, false: "Parlez-vous français?"
        // Language(s) detected: de, 7, true: "Ich spreche Französisch nur ein bisschen."
        // Language(s) detected: en, 6, true: "little bit is better than nothing."
        // Languages distribution per word count: [("de", 7), ("en", 6)]
        assert_eq!(output, ["de", "en"]);

        // Release the lock.
        drop(read_guard);
    }

    /// Exercises `get_lang` in `Mode::Monolingual` with the candidate
    /// languages German, English and French.
    #[test]
    fn test_get_lang2() {
        use crate::{
            config::{GetLang, Mode},
            settings::Settings,
        };
        use lingua::IsoCode639_1;

        // `get_lang` reads its configuration from the global `SETTINGS`
        // object, so install the configuration for this test there.
        let monolingual_cfg = GetLang {
            mode: Mode::Monolingual,
            language_candidates: vec![IsoCode639_1::DE, IsoCode639_1::EN, IsoCode639_1::FR],
            relative_distance_min: 0.2,
            consecutive_words_min: 5,
            words_total_percentage_min: 10,
        };

        let mut write_guard = SETTINGS.write();
        *write_guard = Settings::default();
        write_guard.get_lang_filter = monolingual_cfg;
        // Keep a read guard for the rest of the test: this blocks further
        // write access to `SETTINGS` in this scope.
        let read_guard = RwLockWriteGuard::<'_, _>::downgrade(write_guard);

        // Inputs written in exactly one language.
        for (input, expected) in [
            ("Das große Haus", "de"),
            ("Il est venu trop tard", "fr"),
            ("How to set up a roof rack", "en"),
        ] {
            let output = get_lang(input).unwrap();
            assert_eq!(expected, output[0]);
        }

        // Inputs without any recognizable language.
        for input in ["1917039480 50198%-328470", " \t\n "] {
            let output = get_lang(input).unwrap();
            assert!(output.is_empty());
        }

        // Mixed-language input: exactly one language is reported.
        let input = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing.";
        let output = get_lang(input).unwrap();
        assert_eq!(output.len(), 1);
        assert_eq!("de", output[0]);

        // Release the lock.
        drop(read_guard);
    }
}