1use crate::settings::SETTINGS;
3use crate::{config::Mode, error::LibCfgError};
4pub(crate) use lingua::IsoCode639_1;
5use lingua::{LanguageDetector, LanguageDetectorBuilder};
6use parse_hyperlinks::iterator::MarkupLink;
7use parse_hyperlinks::parser::Link;
8use std::collections::HashMap; #[cfg(feature = "lang-detection")]
15pub(crate) fn get_lang(input: &str) -> Result<Vec<String>, LibCfgError> {
16 use std::borrow::Cow;
17
18 use itertools::Itertools;
19
20 let input = input.trim();
21 if input.is_empty() {
23 return Ok(vec![]);
24 }
25
26 let settings = SETTINGS.read_recursive();
27
28 match &settings.get_lang_filter.mode {
30 Mode::Disabled => return Ok(vec![]),
31
32 Mode::Error(e) => return Err(e.clone()),
33 _ => {}
34 }
35
36 let detector: LanguageDetector = if !&settings.get_lang_filter.language_candidates.is_empty() {
38 log::trace!(
39 "Execute template filter `get_lang` \
40 with languages candidates: {:?}",
41 &settings.get_lang_filter.language_candidates,
42 );
43
44 LanguageDetectorBuilder::from_iso_codes_639_1(&settings.get_lang_filter.language_candidates)
45 .with_minimum_relative_distance(settings.get_lang_filter.relative_distance_min)
46 .build()
47 } else {
48 log::trace!(
49 "Execute template filter `get_lang` \
50 with all available languages",
51 );
52 LanguageDetectorBuilder::from_all_languages()
53 .with_minimum_relative_distance(settings.get_lang_filter.relative_distance_min)
54 .build()
55 };
56
57 let mut sniplets: Vec<Cow<str>> = Vec::new();
59 let mut remnant = "";
60 for ((skipped, _, r), link) in MarkupLink::new(input, false) {
61 sniplets.push(Cow::from(skipped));
62 remnant = r;
63 match link {
64 Link::Text2Dest(text, _, title) => {
65 if !text.is_empty() {
66 sniplets.push(text)
67 };
68 if !title.is_empty() {
69 sniplets.push(title)
70 };
71 }
72 Link::Text2Label(text, _) => {
73 if !text.is_empty() {
74 sniplets.push(text)
75 };
76 }
77 Link::TextLabel2Dest(text, _, _) => {
78 if !text.is_empty() {
79 sniplets.push(text)
80 };
81 }
82
83 Link::Image(alt_text, _) => {
84 if !alt_text.is_empty() {
85 sniplets.push(alt_text)
86 };
87 }
88 Link::Image2Dest(text1, img_alt, _, text2, _, title) => {
89 if !text1.is_empty() {
90 sniplets.push(text1)
91 };
92 if !img_alt.is_empty() {
93 sniplets.push(img_alt)
94 };
95 if !text2.is_empty() {
96 sniplets.push(text2)
97 };
98 if !title.is_empty() {
99 sniplets.push(title)
100 };
101 }
102 _ => {}
103 }
104 }
105 if !remnant.is_empty() {
106 sniplets.push(Cow::from(remnant));
107 }
108 if sniplets.is_empty() {
109 sniplets.push(Cow::from(input));
110 }
111 let texts = sniplets.as_slice();
114
115
116 use crate::FlattenWithIndexExt;
118 let detected_languages: Vec<String> = match &settings.get_lang_filter.mode {
119 Mode::Multilingual => {
120 let consecutive_words_min = settings.get_lang_filter.consecutive_words_min;
122 let words_total_percentage_min = settings.get_lang_filter.words_total_percentage_min;
123
124 let words_total: usize = texts
125 .iter()
126 .map(|slice| slice.split_whitespace().count())
127 .sum();
128 let words_min = [consecutive_words_min, words_total / 3];
130 let words_min = words_min.iter().min().unwrap();
131 log::trace!(
132 "Language snippets with less than {} words will be ignored.",
133 words_min
134 );
135
136 let words_distribution: HashMap<String, usize> = detector
137 .detect_multiple_languages_in_parallel_of(texts)
138 .into_iter()
139 .flatten_with_index()
140 .filter(|(i, l)| {
142 let allow_through = l.word_count() >= *words_min;
143 log::trace!(
144 "Language(s) detected in [{}]: {}, {}, {}: {:?}",
145 i,
146 l.language().iso_code_639_1(),
147 l.word_count(),
148 allow_through,
149 texts[*i][l.start_index()..l.end_index()]
150 .chars()
151 .take(60)
152 .collect::<String>()
153 );
154 allow_through
155 })
156 .map(|(_, l)| (l.language().iso_code_639_1().to_string(), l.word_count()))
157 .into_grouping_map_by(|n| n.0.clone())
158 .aggregate(|acc, _key, val| Some(acc.unwrap_or(0) + val.1));
159
160 let words_distribution: Vec<(String, usize)> = words_distribution
162 .into_iter()
163 .sorted_by_key(|l| usize::MAX - l.1)
164 .collect();
165 log::debug!(
166 "Languages distribution per word count:\n {:?}",
167 words_distribution
168 );
169
170 let words_distribution_total: usize = words_distribution.iter().map(|l| l.1).sum();
172 let words_total_min: usize =
173 words_distribution_total * words_total_percentage_min / 100;
174
175 words_distribution
177 .into_iter()
178 .filter(|(l, wc)| {
179 if *wc >= words_total_min {
180 true
181 } else {
182 let words_percentage = wc * 100 / words_distribution_total;
183 log::info!(
184 "Language `{}` rejected: not enough words in total ({}%<{}%)",
185 l,
186 words_percentage,
187 words_total_percentage_min
188 );
189 false
190 }
191 })
192 .map(|(l, _)| l)
193 .collect::<Vec<String>>()
194 }
195
196 Mode::Monolingual => detector
197 .detect_languages_in_parallel_of(texts)
198 .into_iter()
199 .flatten()
200 .map(|l| l.iso_code_639_1().to_string())
201 .inspect(|l| log::debug!("Language: '{}' in input detected.", l))
202 .collect(),
203
204 Mode::Disabled => unreachable!(), Mode::Error(_) => unreachable!(), };
208
209 Ok(detected_languages)
210}
211
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{GetLang, Mode};
    use crate::settings::Settings;
    use lingua::IsoCode639_1;
    use parking_lot::RwLockWriteGuard;

    /// Inputs written in a single language, paired with the code expected at
    /// the first position of the result.
    const SINGLE_LANGUAGE_CASES: [(&str, &str); 3] = [
        ("Das große Haus", "de"),
        ("Il est venu trop tard", "fr"),
        ("How to set up a roof rack", "en"),
    ];

    /// Inputs for which no language is expected to be reported.
    const NO_LANGUAGE_CASES: [&str; 2] = ["1917039480 50198%-328470", " \t\n "];

    /// Builds the filter configuration used by the tests; only `mode` varies.
    fn filter_config(mode: Mode) -> GetLang {
        GetLang {
            mode,
            language_candidates: vec![IsoCode639_1::DE, IsoCode639_1::EN, IsoCode639_1::FR],
            relative_distance_min: 0.2,
            consecutive_words_min: 5,
            words_total_percentage_min: 10,
        }
    }

    /// Runs the checks that are common to both detection modes.
    fn check_common_cases() {
        for (input, expected) in SINGLE_LANGUAGE_CASES {
            let output = get_lang(input).unwrap();
            assert_eq!(expected, output[0]);
        }

        for input in NO_LANGUAGE_CASES {
            let output = get_lang(input).unwrap();
            assert!(output.is_empty());
        }
    }

    #[test]
    fn test_get_lang() {
        // Install the test configuration, then keep a read lock for the whole
        // test so that the settings stay as configured here.
        let mut guard = SETTINGS.write();
        *guard = Settings::default();
        guard.get_lang_filter = filter_config(Mode::Multilingual);
        let read_guard = RwLockWriteGuard::<'_, _>::downgrade(guard);

        check_common_cases();

        // Three languages, each with enough words to be reported.
        let output = get_lang(
            "Parlez-vous français? \
             Ich spreche Französisch nur ein bisschen. \
             A little bit is better than nothing. \
             Noch mehr Deutsch. \
             Bien-sûr, je parle un peu. Qu'est-ce que tu veux?",
        )
        .unwrap();
        assert_eq!(output, ["fr", "de", "en"]);

        // Here the French part is not expected in the result.
        let output = get_lang(
            "Parlez-vous français? \
             Ich spreche Französisch nur ein bisschen. \
             A little bit is better than nothing.",
        )
        .unwrap();
        assert_eq!(output, ["de", "en"]);

        drop(read_guard);
    }

    #[test]
    fn test_get_lang2() {
        // Same setup as above, but with monolingual detection.
        let mut guard = SETTINGS.write();
        *guard = Settings::default();
        guard.get_lang_filter = filter_config(Mode::Monolingual);
        let read_guard = RwLockWriteGuard::<'_, _>::downgrade(guard);

        check_common_cases();

        // Mixed input still yields exactly one language in this mode.
        let output = get_lang(
            "Parlez-vous français? \
             Ich spreche Französisch nur ein bisschen. \
             A little bit is better than nothing.",
        )
        .unwrap();
        assert_eq!(output.len(), 1);
        assert_eq!("de", output[0]);

        drop(read_guard);
    }
}