1use crate::settings::SETTINGS;
3use crate::{config::Mode, error::LibCfgError};
4pub(crate) use lingua::IsoCode639_1;
5use lingua::{LanguageDetector, LanguageDetectorBuilder};
6use parse_hyperlinks::iterator::MarkupLink;
7use parse_hyperlinks::parser::Link;
8use std::collections::HashMap; #[cfg(feature = "lang-detection")]
15pub(crate) fn get_lang(input: &str) -> Result<Vec<String>, LibCfgError> {
16 use std::borrow::Cow;
17
18 use itertools::Itertools;
19
20 let input = input.trim();
21 if input.is_empty() {
23 return Ok(vec![]);
24 }
25
26 let settings = SETTINGS.read_recursive();
27
28 match &settings.get_lang_filter.mode {
30 Mode::Disabled => return Ok(vec![]),
31
32 Mode::Error(e) => return Err(e.clone()),
33 _ => {}
34 }
35
36 let detector: LanguageDetector = if !&settings.get_lang_filter.language_candidates.is_empty() {
38 log::trace!(
39 "Execute template filter `get_lang` \
40 with languages candidates: {:?}",
41 &settings.get_lang_filter.language_candidates,
42 );
43
44 LanguageDetectorBuilder::from_iso_codes_639_1(&settings.get_lang_filter.language_candidates)
45 .with_minimum_relative_distance(settings.get_lang_filter.relative_distance_min)
46 .build()
47 } else {
48 log::trace!(
49 "Execute template filter `get_lang` \
50 with all available languages",
51 );
52 LanguageDetectorBuilder::from_all_languages()
53 .with_minimum_relative_distance(settings.get_lang_filter.relative_distance_min)
54 .build()
55 };
56
57 let mut sniplets: Vec<Cow<str>> = Vec::new();
59 let mut remnant = "";
60 for ((skipped, _, r), link) in MarkupLink::new(input, false) {
61 sniplets.push(Cow::from(skipped));
62 remnant = r;
63 match link {
64 Link::Text2Dest(text, _, title) => {
65 if !text.is_empty() {
66 sniplets.push(text)
67 };
68 if !title.is_empty() {
69 sniplets.push(title)
70 };
71 }
72 Link::Text2Label(text, _) => {
73 if !text.is_empty() {
74 sniplets.push(text)
75 };
76 }
77 Link::TextLabel2Dest(text, _, _) => {
78 if !text.is_empty() {
79 sniplets.push(text)
80 };
81 }
82
83 Link::Image(alt_text, _) => {
84 if !alt_text.is_empty() {
85 sniplets.push(alt_text)
86 };
87 }
88 Link::Image2Dest(text1, img_alt, _, text2, _, title) => {
89 if !text1.is_empty() {
90 sniplets.push(text1)
91 };
92 if !img_alt.is_empty() {
93 sniplets.push(img_alt)
94 };
95 if !text2.is_empty() {
96 sniplets.push(text2)
97 };
98 if !title.is_empty() {
99 sniplets.push(title)
100 };
101 }
102 _ => {}
103 }
104 }
105 if !remnant.is_empty() {
106 sniplets.push(Cow::from(remnant));
107 }
108 if sniplets.is_empty() {
109 sniplets.push(Cow::from(input));
110 }
111 let texts = sniplets.as_slice();
114
115 use crate::FlattenWithIndexExt;
117 let detected_languages: Vec<String> = match &settings.get_lang_filter.mode {
118 Mode::Multilingual => {
119 let consecutive_words_min = settings.get_lang_filter.consecutive_words_min;
121 let words_total_percentage_min = settings.get_lang_filter.words_total_percentage_min;
122
123 let words_total: usize = texts
124 .iter()
125 .map(|slice| slice.split_whitespace().count())
126 .sum();
127 let words_min = [consecutive_words_min, words_total / 3];
129 let words_min = words_min.iter().min().unwrap();
130 log::trace!(
131 "Language snippets with less than {} words will be ignored.",
132 words_min
133 );
134
135 let words_distribution: HashMap<String, usize> = detector
136 .detect_multiple_languages_in_parallel_of(texts)
137 .into_iter()
138 .flatten_with_index()
139 .filter(|(i, l)| {
141 let allow_through = l.word_count() >= *words_min;
142 log::trace!(
143 "Language(s) detected in [{}]: {}, {}, {}: {:?}",
144 i,
145 l.language().iso_code_639_1(),
146 l.word_count(),
147 allow_through,
148 texts[*i][l.start_index()..l.end_index()]
149 .chars()
150 .take(60)
151 .collect::<String>()
152 );
153 allow_through
154 })
155 .map(|(_, l)| (l.language().iso_code_639_1().to_string(), l.word_count()))
156 .into_grouping_map_by(|n| n.0.clone())
157 .aggregate(|acc, _key, val| Some(acc.unwrap_or(0) + val.1));
158
159 let words_distribution: Vec<(String, usize)> = words_distribution
161 .into_iter()
162 .sorted_by_key(|l| usize::MAX - l.1)
163 .collect();
164 log::debug!(
165 "Languages distribution per word count:\n {:?}",
166 words_distribution
167 );
168
169 let words_distribution_total: usize = words_distribution.iter().map(|l| l.1).sum();
171 let words_total_min: usize =
172 words_distribution_total * words_total_percentage_min / 100;
173
174 words_distribution
176 .into_iter()
177 .filter(|(l, wc)| {
178 if *wc >= words_total_min {
179 true
180 } else {
181 let words_percentage = wc * 100 / words_distribution_total;
182 log::info!(
183 "Language `{}` rejected: not enough words in total ({}%<{}%)",
184 l,
185 words_percentage,
186 words_total_percentage_min
187 );
188 false
189 }
190 })
191 .map(|(l, _)| l)
192 .collect::<Vec<String>>()
193 }
194
195 Mode::Monolingual => detector
196 .detect_languages_in_parallel_of(texts)
197 .into_iter()
198 .flatten()
199 .map(|l| l.iso_code_639_1().to_string())
200 .inspect(|l| log::debug!("Language: '{}' in input detected.", l))
201 .collect(),
202
203 Mode::Disabled => unreachable!(), Mode::Error(_) => unreachable!(), };
207
208 Ok(detected_languages)
209}
210
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{GetLang, Mode};
    use crate::settings::Settings;
    use lingua::IsoCode639_1;
    use parking_lot::RwLockWriteGuard;

    /// Builds the filter configuration used by every test; only the
    /// detection `mode` differs between them.
    fn filter_config(mode: Mode) -> GetLang {
        GetLang {
            mode,
            language_candidates: vec![IsoCode639_1::DE, IsoCode639_1::EN, IsoCode639_1::FR],
            relative_distance_min: 0.2,
            consecutive_words_min: 5,
            words_total_percentage_min: 10,
        }
    }

    /// Checks the inputs for which both tests expect the same outcome:
    /// short single-language phrases and inputs without any words.
    fn assert_mode_independent_cases() {
        for (input, expected) in [
            ("Das große Haus", "de"),
            ("Il est venu trop tard", "fr"),
            ("How to set up a roof rack", "en"),
        ] {
            let output = get_lang(input).unwrap();
            assert_eq!(expected, output[0]);
        }

        for input in ["1917039480 50198%-328470", " \t\n "] {
            let output = get_lang(input).unwrap();
            assert!(output.is_empty());
        }
    }

    /// Multilingual mode: several languages may be reported for one input.
    #[test]
    fn test_get_lang() {
        // Install the test configuration, then keep a read lock for the
        // whole test so that no other test can replace the settings.
        let mut write_guard = SETTINGS.write();
        *write_guard = Settings::default();
        write_guard.get_lang_filter = filter_config(Mode::Multilingual);
        let read_guard = RwLockWriteGuard::<'_, _>::downgrade(write_guard);

        assert_mode_independent_cases();

        let mixed_long = "Parlez-vous français? \
             Ich spreche Französisch nur ein bisschen. \
             A little bit is better than nothing. \
             Noch mehr Deutsch. \
             Bien-sûr, je parle un peu. Qu'est-ce que tu veux?";
        let output = get_lang(mixed_long).unwrap();
        assert_eq!(output, ["fr", "de", "en"]);

        let mixed_short = "Parlez-vous français? \
             Ich spreche Französisch nur ein bisschen. \
             A little bit is better than nothing.";
        let output = get_lang(mixed_short).unwrap();
        assert_eq!(output, ["de", "en"]);

        drop(read_guard);
    }

    /// Monolingual mode: exactly one language is reported per snippet.
    #[test]
    fn test_get_lang2() {
        // Same locking scheme as in `test_get_lang`.
        let mut write_guard = SETTINGS.write();
        *write_guard = Settings::default();
        write_guard.get_lang_filter = filter_config(Mode::Monolingual);
        let read_guard = RwLockWriteGuard::<'_, _>::downgrade(write_guard);

        assert_mode_independent_cases();

        let mixed_short = "Parlez-vous français? \
             Ich spreche Französisch nur ein bisschen. \
             A little bit is better than nothing.";
        let output = get_lang(mixed_short).unwrap();
        assert_eq!(output.len(), 1);
        assert_eq!("de", output[0]);

        drop(read_guard);
    }
}