1use crate::settings::SETTINGS;
3use crate::{config::Mode, error::LibCfgError};
4pub(crate) use lingua::IsoCode639_1;
5use lingua::{LanguageDetector, LanguageDetectorBuilder};
6use std::collections::HashMap; #[cfg(feature = "lang-detection")]
13pub(crate) fn get_lang(input: &str) -> Result<Vec<String>, LibCfgError> {
14 use itertools::Itertools;
15
16 let input = input.trim();
17 if input.is_empty() {
19 return Ok(vec![]);
20 }
21
22 let settings = SETTINGS.read_recursive();
23
24 match &settings.get_lang_filter.mode {
26 Mode::Disabled => return Ok(vec![]),
27
28 Mode::Error(e) => return Err(e.clone()),
29 _ => {}
30 }
31
32 let detector: LanguageDetector = if !&settings.get_lang_filter.language_candidates.is_empty() {
34 log::trace!(
35 "Execute template filter `get_lang` \
36 with languages candidates: {:?}",
37 &settings.get_lang_filter.language_candidates,
38 );
39
40 LanguageDetectorBuilder::from_iso_codes_639_1(&settings.get_lang_filter.language_candidates)
41 .with_minimum_relative_distance(settings.get_lang_filter.relative_distance_min)
42 .build()
43 } else {
44 log::trace!(
45 "Execute template filter `get_lang` \
46 with all available languages",
47 );
48 LanguageDetectorBuilder::from_all_languages()
49 .with_minimum_relative_distance(settings.get_lang_filter.relative_distance_min)
50 .build()
51 };
52
53 let detected_languages: Vec<String> = match &settings.get_lang_filter.mode {
55 Mode::Multilingual => {
56 let consecutive_words_min = settings.get_lang_filter.consecutive_words_min;
58 let words_total_percentage_min = settings.get_lang_filter.words_total_percentage_min;
59
60 let words_total = input.split_whitespace().count();
61 let words_min = [consecutive_words_min, words_total / 3];
63 let words_min = words_min.iter().min().unwrap();
64 log::trace!(
65 "Language snippets with less than {} words will be ignored.",
66 words_min
67 );
68
69 let words_distribution: HashMap<String, usize> = detector
70 .detect_multiple_languages_of(input)
71 .into_iter()
72 .filter(|l| {
74 let allow_through = l.word_count() >= *words_min;
75 log::trace!(
76 "Language(s) detected: {}, {}, {}: {:?}",
77 l.language().iso_code_639_1(),
78 l.word_count(),
79 allow_through,
80 input[l.start_index()..l.end_index()]
81 .chars()
82 .take(50)
83 .collect::<String>()
84 );
85 allow_through
86 })
87 .map(|l| (l.language().iso_code_639_1().to_string(), l.word_count()))
88 .into_grouping_map_by(|n| n.0.clone())
89 .aggregate(|acc, _key, val| Some(acc.unwrap_or(0) + val.1));
90
91 let words_distribution: Vec<(String, usize)> = words_distribution
93 .into_iter()
94 .sorted_by_key(|l| usize::MAX - l.1)
95 .collect();
96 log::debug!(
97 "Languages distribution per word count:\n {:?}",
98 words_distribution
99 );
100
101 let words_distribution_total: usize = words_distribution.iter().map(|l| l.1).sum();
103 let words_total_min: usize =
104 words_distribution_total * words_total_percentage_min / 100;
105
106 words_distribution
108 .into_iter()
109 .filter(|(l, wc)| {
110 if *wc >= words_total_min {
111 true
112 } else {
113 let words_percentage = wc * 100 / words_distribution_total;
114 log::info!(
115 "Language `{}` rejected: not enough words in total ({}%<{}%)",
116 l,
117 words_percentage,
118 words_total_percentage_min
119 );
120 false
121 }
122 })
123 .map(|(l, _)| l)
124 .collect::<Vec<String>>()
125 }
126
127 Mode::Monolingual => detector
128 .detect_language_of(input)
129 .into_iter()
130 .map(|l| l.iso_code_639_1().to_string())
131 .inspect(|l| log::debug!("Language: '{}' in input detected.", l))
132 .collect(),
133
134 Mode::Disabled => unreachable!(), Mode::Error(_) => unreachable!(), };
138
139 Ok(detected_languages)
140}
141
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        config::{GetLang, Mode},
        settings::Settings,
    };
    use lingua::IsoCode639_1;
    use parking_lot::RwLockWriteGuard;

    /// Builds the filter configuration used by every test in this module;
    /// only the detection `mode` differs between the tests.
    fn filter_config(mode: Mode) -> GetLang {
        GetLang {
            mode,
            language_candidates: vec![IsoCode639_1::DE, IsoCode639_1::EN, IsoCode639_1::FR],
            relative_distance_min: 0.2,
            consecutive_words_min: 5,
            words_total_percentage_min: 10,
        }
    }

    /// Multilingual mode: every sufficiently represented language is
    /// reported, the most frequent one first.
    #[test]
    fn test_get_lang() {
        let mut writer = SETTINGS.write();
        *writer = Settings::default();
        writer.get_lang_filter = filter_config(Mode::Multilingual);
        // The guard is kept as a read lock until the end of the test,
        // presumably so that no other test replaces the global settings
        // in the meantime.
        let settings_guard = RwLockWriteGuard::downgrade(writer);

        // Single-language inputs: the first reported code must match.
        for (input, expected) in [
            ("Das große Haus", "de"),
            ("Il est venu trop tard", "fr"),
            ("How to set up a roof rack", "en"),
        ] {
            let output = get_lang(input).unwrap();
            assert_eq!(expected, output[0]);
        }

        // Inputs without recognisable words yield no language at all.
        for input in ["1917039480 50198%-328470", " \t\n "] {
            let output = get_lang(input).unwrap();
            assert!(output.is_empty());
        }

        let input = "Parlez-vous français? \
                     Ich spreche Französisch nur ein bisschen. \
                     A little bit is better than nothing. \
                     Noch mehr Deutsch. \
                     Bien-sûr, je parle un peu. Qu'est-ce que tu veux?";
        let output = get_lang(input).unwrap();
        assert_eq!(output, ["fr", "de", "en"]);

        let input = "Parlez-vous français? \
                     Ich spreche Französisch nur ein bisschen. \
                     A little bit is better than nothing.";
        let output = get_lang(input).unwrap();
        assert_eq!(output, ["de", "en"]);

        drop(settings_guard);
    }

    /// Monolingual mode: at most one language is reported, even for
    /// mixed-language input.
    #[test]
    fn test_get_lang2() {
        let mut writer = SETTINGS.write();
        *writer = Settings::default();
        writer.get_lang_filter = filter_config(Mode::Monolingual);
        // See `test_get_lang`: the read guard stays alive until the end.
        let settings_guard = RwLockWriteGuard::downgrade(writer);

        for (input, expected) in [
            ("Das große Haus", "de"),
            ("Il est venu trop tard", "fr"),
            ("How to set up a roof rack", "en"),
        ] {
            let output = get_lang(input).unwrap();
            assert_eq!(expected, output[0]);
        }

        for input in ["1917039480 50198%-328470", " \t\n "] {
            let output = get_lang(input).unwrap();
            assert!(output.is_empty());
        }

        let input = "Parlez-vous français? \
                     Ich spreche Französisch nur ein bisschen. \
                     A little bit is better than nothing.";
        let output = get_lang(input).unwrap();
        assert_eq!(output.len(), 1);
        assert_eq!("de", output[0]);

        drop(settings_guard);
    }
}