1use std::io::{Cursor, ErrorKind, Read};
18
19use brotli::Decompressor;
20use include_dir::Dir;
21
22#[cfg(feature = "afrikaans")]
23use lingua_afrikaans_language_model::AFRIKAANS_MODELS_DIRECTORY;
24#[cfg(feature = "albanian")]
25use lingua_albanian_language_model::ALBANIAN_MODELS_DIRECTORY;
26#[cfg(feature = "arabic")]
27use lingua_arabic_language_model::ARABIC_MODELS_DIRECTORY;
28#[cfg(feature = "armenian")]
29use lingua_armenian_language_model::ARMENIAN_MODELS_DIRECTORY;
30#[cfg(feature = "azerbaijani")]
31use lingua_azerbaijani_language_model::AZERBAIJANI_MODELS_DIRECTORY;
32#[cfg(feature = "basque")]
33use lingua_basque_language_model::BASQUE_MODELS_DIRECTORY;
34#[cfg(feature = "belarusian")]
35use lingua_belarusian_language_model::BELARUSIAN_MODELS_DIRECTORY;
36#[cfg(feature = "bengali")]
37use lingua_bengali_language_model::BENGALI_MODELS_DIRECTORY;
38#[cfg(feature = "bokmal")]
39use lingua_bokmal_language_model::BOKMAL_MODELS_DIRECTORY;
40#[cfg(feature = "bosnian")]
41use lingua_bosnian_language_model::BOSNIAN_MODELS_DIRECTORY;
42#[cfg(feature = "bulgarian")]
43use lingua_bulgarian_language_model::BULGARIAN_MODELS_DIRECTORY;
44#[cfg(feature = "catalan")]
45use lingua_catalan_language_model::CATALAN_MODELS_DIRECTORY;
46#[cfg(feature = "chinese")]
47use lingua_chinese_language_model::CHINESE_MODELS_DIRECTORY;
48#[cfg(feature = "croatian")]
49use lingua_croatian_language_model::CROATIAN_MODELS_DIRECTORY;
50#[cfg(feature = "czech")]
51use lingua_czech_language_model::CZECH_MODELS_DIRECTORY;
52#[cfg(feature = "danish")]
53use lingua_danish_language_model::DANISH_MODELS_DIRECTORY;
54#[cfg(feature = "dutch")]
55use lingua_dutch_language_model::DUTCH_MODELS_DIRECTORY;
56#[cfg(feature = "english")]
57use lingua_english_language_model::ENGLISH_MODELS_DIRECTORY;
58#[cfg(feature = "esperanto")]
59use lingua_esperanto_language_model::ESPERANTO_MODELS_DIRECTORY;
60#[cfg(feature = "estonian")]
61use lingua_estonian_language_model::ESTONIAN_MODELS_DIRECTORY;
62#[cfg(feature = "finnish")]
63use lingua_finnish_language_model::FINNISH_MODELS_DIRECTORY;
64#[cfg(feature = "french")]
65use lingua_french_language_model::FRENCH_MODELS_DIRECTORY;
66#[cfg(feature = "ganda")]
67use lingua_ganda_language_model::GANDA_MODELS_DIRECTORY;
68#[cfg(feature = "georgian")]
69use lingua_georgian_language_model::GEORGIAN_MODELS_DIRECTORY;
70#[cfg(feature = "german")]
71use lingua_german_language_model::GERMAN_MODELS_DIRECTORY;
72#[cfg(feature = "greek")]
73use lingua_greek_language_model::GREEK_MODELS_DIRECTORY;
74#[cfg(feature = "gujarati")]
75use lingua_gujarati_language_model::GUJARATI_MODELS_DIRECTORY;
76#[cfg(feature = "hebrew")]
77use lingua_hebrew_language_model::HEBREW_MODELS_DIRECTORY;
78#[cfg(feature = "hindi")]
79use lingua_hindi_language_model::HINDI_MODELS_DIRECTORY;
80#[cfg(feature = "hungarian")]
81use lingua_hungarian_language_model::HUNGARIAN_MODELS_DIRECTORY;
82#[cfg(feature = "icelandic")]
83use lingua_icelandic_language_model::ICELANDIC_MODELS_DIRECTORY;
84#[cfg(feature = "indonesian")]
85use lingua_indonesian_language_model::INDONESIAN_MODELS_DIRECTORY;
86#[cfg(feature = "irish")]
87use lingua_irish_language_model::IRISH_MODELS_DIRECTORY;
88#[cfg(feature = "italian")]
89use lingua_italian_language_model::ITALIAN_MODELS_DIRECTORY;
90#[cfg(feature = "japanese")]
91use lingua_japanese_language_model::JAPANESE_MODELS_DIRECTORY;
92#[cfg(feature = "kazakh")]
93use lingua_kazakh_language_model::KAZAKH_MODELS_DIRECTORY;
94#[cfg(feature = "korean")]
95use lingua_korean_language_model::KOREAN_MODELS_DIRECTORY;
96#[cfg(feature = "latin")]
97use lingua_latin_language_model::LATIN_MODELS_DIRECTORY;
98#[cfg(feature = "latvian")]
99use lingua_latvian_language_model::LATVIAN_MODELS_DIRECTORY;
100#[cfg(feature = "lithuanian")]
101use lingua_lithuanian_language_model::LITHUANIAN_MODELS_DIRECTORY;
102#[cfg(feature = "macedonian")]
103use lingua_macedonian_language_model::MACEDONIAN_MODELS_DIRECTORY;
104#[cfg(feature = "malay")]
105use lingua_malay_language_model::MALAY_MODELS_DIRECTORY;
106#[cfg(feature = "maori")]
107use lingua_maori_language_model::MAORI_MODELS_DIRECTORY;
108#[cfg(feature = "marathi")]
109use lingua_marathi_language_model::MARATHI_MODELS_DIRECTORY;
110#[cfg(feature = "mongolian")]
111use lingua_mongolian_language_model::MONGOLIAN_MODELS_DIRECTORY;
112#[cfg(feature = "nynorsk")]
113use lingua_nynorsk_language_model::NYNORSK_MODELS_DIRECTORY;
114#[cfg(feature = "persian")]
115use lingua_persian_language_model::PERSIAN_MODELS_DIRECTORY;
116#[cfg(feature = "polish")]
117use lingua_polish_language_model::POLISH_MODELS_DIRECTORY;
118#[cfg(feature = "portuguese")]
119use lingua_portuguese_language_model::PORTUGUESE_MODELS_DIRECTORY;
120#[cfg(feature = "punjabi")]
121use lingua_punjabi_language_model::PUNJABI_MODELS_DIRECTORY;
122#[cfg(feature = "romanian")]
123use lingua_romanian_language_model::ROMANIAN_MODELS_DIRECTORY;
124#[cfg(feature = "russian")]
125use lingua_russian_language_model::RUSSIAN_MODELS_DIRECTORY;
126#[cfg(feature = "serbian")]
127use lingua_serbian_language_model::SERBIAN_MODELS_DIRECTORY;
128#[cfg(feature = "shona")]
129use lingua_shona_language_model::SHONA_MODELS_DIRECTORY;
130#[cfg(feature = "slovak")]
131use lingua_slovak_language_model::SLOVAK_MODELS_DIRECTORY;
132#[cfg(feature = "slovene")]
133use lingua_slovene_language_model::SLOVENE_MODELS_DIRECTORY;
134#[cfg(feature = "somali")]
135use lingua_somali_language_model::SOMALI_MODELS_DIRECTORY;
136#[cfg(feature = "sotho")]
137use lingua_sotho_language_model::SOTHO_MODELS_DIRECTORY;
138#[cfg(feature = "spanish")]
139use lingua_spanish_language_model::SPANISH_MODELS_DIRECTORY;
140#[cfg(feature = "swahili")]
141use lingua_swahili_language_model::SWAHILI_MODELS_DIRECTORY;
142#[cfg(feature = "swedish")]
143use lingua_swedish_language_model::SWEDISH_MODELS_DIRECTORY;
144#[cfg(feature = "tagalog")]
145use lingua_tagalog_language_model::TAGALOG_MODELS_DIRECTORY;
146#[cfg(feature = "tamil")]
147use lingua_tamil_language_model::TAMIL_MODELS_DIRECTORY;
148#[cfg(feature = "telugu")]
149use lingua_telugu_language_model::TELUGU_MODELS_DIRECTORY;
150#[cfg(feature = "thai")]
151use lingua_thai_language_model::THAI_MODELS_DIRECTORY;
152#[cfg(feature = "tsonga")]
153use lingua_tsonga_language_model::TSONGA_MODELS_DIRECTORY;
154#[cfg(feature = "tswana")]
155use lingua_tswana_language_model::TSWANA_MODELS_DIRECTORY;
156#[cfg(feature = "turkish")]
157use lingua_turkish_language_model::TURKISH_MODELS_DIRECTORY;
158#[cfg(feature = "ukrainian")]
159use lingua_ukrainian_language_model::UKRAINIAN_MODELS_DIRECTORY;
160#[cfg(feature = "urdu")]
161use lingua_urdu_language_model::URDU_MODELS_DIRECTORY;
162#[cfg(feature = "vietnamese")]
163use lingua_vietnamese_language_model::VIETNAMESE_MODELS_DIRECTORY;
164#[cfg(feature = "welsh")]
165use lingua_welsh_language_model::WELSH_MODELS_DIRECTORY;
166#[cfg(feature = "xhosa")]
167use lingua_xhosa_language_model::XHOSA_MODELS_DIRECTORY;
168#[cfg(feature = "yoruba")]
169use lingua_yoruba_language_model::YORUBA_MODELS_DIRECTORY;
170#[cfg(feature = "zulu")]
171use lingua_zulu_language_model::ZULU_MODELS_DIRECTORY;
172
173use crate::Language;
174
175pub(crate) fn load_json(language: Language, file_name: &str) -> std::io::Result<String> {
176 let directory = get_language_models_directory(language);
177 let compressed_file = directory.get_file(file_name).ok_or(ErrorKind::NotFound)?;
178 let compressed_file_reader = Cursor::new(compressed_file.contents());
179 let mut uncompressed_file = Decompressor::new(compressed_file_reader, 4096);
180 let mut uncompressed_file_content = String::new();
181 uncompressed_file.read_to_string(&mut uncompressed_file_content)?;
182 Ok(uncompressed_file_content)
183}
184
/// Returns the models directory that belongs to `language`.
///
/// Each match arm is compiled only when the feature named after its language
/// is enabled, mirroring the feature-gated `use` declarations at the top of
/// this file.
///
/// NOTE(review): the match has no fallback arm, so the `Language` variants are
/// presumably gated by the same features; confirm against the enum definition.
fn get_language_models_directory(language: Language) -> Dir<'static> {
    match language {
        #[cfg(feature = "afrikaans")]
        Language::Afrikaans => AFRIKAANS_MODELS_DIRECTORY,

        #[cfg(feature = "albanian")]
        Language::Albanian => ALBANIAN_MODELS_DIRECTORY,

        #[cfg(feature = "arabic")]
        Language::Arabic => ARABIC_MODELS_DIRECTORY,

        #[cfg(feature = "armenian")]
        Language::Armenian => ARMENIAN_MODELS_DIRECTORY,

        #[cfg(feature = "azerbaijani")]
        Language::Azerbaijani => AZERBAIJANI_MODELS_DIRECTORY,

        #[cfg(feature = "basque")]
        Language::Basque => BASQUE_MODELS_DIRECTORY,

        #[cfg(feature = "belarusian")]
        Language::Belarusian => BELARUSIAN_MODELS_DIRECTORY,

        #[cfg(feature = "bengali")]
        Language::Bengali => BENGALI_MODELS_DIRECTORY,

        #[cfg(feature = "bokmal")]
        Language::Bokmal => BOKMAL_MODELS_DIRECTORY,

        #[cfg(feature = "bosnian")]
        Language::Bosnian => BOSNIAN_MODELS_DIRECTORY,

        #[cfg(feature = "bulgarian")]
        Language::Bulgarian => BULGARIAN_MODELS_DIRECTORY,

        #[cfg(feature = "catalan")]
        Language::Catalan => CATALAN_MODELS_DIRECTORY,

        #[cfg(feature = "chinese")]
        Language::Chinese => CHINESE_MODELS_DIRECTORY,

        #[cfg(feature = "croatian")]
        Language::Croatian => CROATIAN_MODELS_DIRECTORY,

        #[cfg(feature = "czech")]
        Language::Czech => CZECH_MODELS_DIRECTORY,

        #[cfg(feature = "danish")]
        Language::Danish => DANISH_MODELS_DIRECTORY,

        #[cfg(feature = "dutch")]
        Language::Dutch => DUTCH_MODELS_DIRECTORY,

        #[cfg(feature = "english")]
        Language::English => ENGLISH_MODELS_DIRECTORY,

        #[cfg(feature = "esperanto")]
        Language::Esperanto => ESPERANTO_MODELS_DIRECTORY,

        #[cfg(feature = "estonian")]
        Language::Estonian => ESTONIAN_MODELS_DIRECTORY,

        #[cfg(feature = "finnish")]
        Language::Finnish => FINNISH_MODELS_DIRECTORY,

        #[cfg(feature = "french")]
        Language::French => FRENCH_MODELS_DIRECTORY,

        #[cfg(feature = "ganda")]
        Language::Ganda => GANDA_MODELS_DIRECTORY,

        #[cfg(feature = "georgian")]
        Language::Georgian => GEORGIAN_MODELS_DIRECTORY,

        #[cfg(feature = "german")]
        Language::German => GERMAN_MODELS_DIRECTORY,

        #[cfg(feature = "greek")]
        Language::Greek => GREEK_MODELS_DIRECTORY,

        #[cfg(feature = "gujarati")]
        Language::Gujarati => GUJARATI_MODELS_DIRECTORY,

        #[cfg(feature = "hebrew")]
        Language::Hebrew => HEBREW_MODELS_DIRECTORY,

        #[cfg(feature = "hindi")]
        Language::Hindi => HINDI_MODELS_DIRECTORY,

        #[cfg(feature = "hungarian")]
        Language::Hungarian => HUNGARIAN_MODELS_DIRECTORY,

        #[cfg(feature = "icelandic")]
        Language::Icelandic => ICELANDIC_MODELS_DIRECTORY,

        #[cfg(feature = "indonesian")]
        Language::Indonesian => INDONESIAN_MODELS_DIRECTORY,

        #[cfg(feature = "irish")]
        Language::Irish => IRISH_MODELS_DIRECTORY,

        #[cfg(feature = "italian")]
        Language::Italian => ITALIAN_MODELS_DIRECTORY,

        #[cfg(feature = "japanese")]
        Language::Japanese => JAPANESE_MODELS_DIRECTORY,

        #[cfg(feature = "kazakh")]
        Language::Kazakh => KAZAKH_MODELS_DIRECTORY,

        #[cfg(feature = "korean")]
        Language::Korean => KOREAN_MODELS_DIRECTORY,

        #[cfg(feature = "latin")]
        Language::Latin => LATIN_MODELS_DIRECTORY,

        #[cfg(feature = "latvian")]
        Language::Latvian => LATVIAN_MODELS_DIRECTORY,

        #[cfg(feature = "lithuanian")]
        Language::Lithuanian => LITHUANIAN_MODELS_DIRECTORY,

        #[cfg(feature = "macedonian")]
        Language::Macedonian => MACEDONIAN_MODELS_DIRECTORY,

        #[cfg(feature = "malay")]
        Language::Malay => MALAY_MODELS_DIRECTORY,

        #[cfg(feature = "maori")]
        Language::Maori => MAORI_MODELS_DIRECTORY,

        #[cfg(feature = "marathi")]
        Language::Marathi => MARATHI_MODELS_DIRECTORY,

        #[cfg(feature = "mongolian")]
        Language::Mongolian => MONGOLIAN_MODELS_DIRECTORY,

        #[cfg(feature = "nynorsk")]
        Language::Nynorsk => NYNORSK_MODELS_DIRECTORY,

        #[cfg(feature = "persian")]
        Language::Persian => PERSIAN_MODELS_DIRECTORY,

        #[cfg(feature = "polish")]
        Language::Polish => POLISH_MODELS_DIRECTORY,

        #[cfg(feature = "portuguese")]
        Language::Portuguese => PORTUGUESE_MODELS_DIRECTORY,

        #[cfg(feature = "punjabi")]
        Language::Punjabi => PUNJABI_MODELS_DIRECTORY,

        #[cfg(feature = "romanian")]
        Language::Romanian => ROMANIAN_MODELS_DIRECTORY,

        #[cfg(feature = "russian")]
        Language::Russian => RUSSIAN_MODELS_DIRECTORY,

        #[cfg(feature = "serbian")]
        Language::Serbian => SERBIAN_MODELS_DIRECTORY,

        #[cfg(feature = "shona")]
        Language::Shona => SHONA_MODELS_DIRECTORY,

        #[cfg(feature = "slovak")]
        Language::Slovak => SLOVAK_MODELS_DIRECTORY,

        #[cfg(feature = "slovene")]
        Language::Slovene => SLOVENE_MODELS_DIRECTORY,

        #[cfg(feature = "somali")]
        Language::Somali => SOMALI_MODELS_DIRECTORY,

        #[cfg(feature = "sotho")]
        Language::Sotho => SOTHO_MODELS_DIRECTORY,

        #[cfg(feature = "spanish")]
        Language::Spanish => SPANISH_MODELS_DIRECTORY,

        #[cfg(feature = "swahili")]
        Language::Swahili => SWAHILI_MODELS_DIRECTORY,

        #[cfg(feature = "swedish")]
        Language::Swedish => SWEDISH_MODELS_DIRECTORY,

        #[cfg(feature = "tagalog")]
        Language::Tagalog => TAGALOG_MODELS_DIRECTORY,

        #[cfg(feature = "tamil")]
        Language::Tamil => TAMIL_MODELS_DIRECTORY,

        #[cfg(feature = "telugu")]
        Language::Telugu => TELUGU_MODELS_DIRECTORY,

        #[cfg(feature = "thai")]
        Language::Thai => THAI_MODELS_DIRECTORY,

        #[cfg(feature = "tsonga")]
        Language::Tsonga => TSONGA_MODELS_DIRECTORY,

        #[cfg(feature = "tswana")]
        Language::Tswana => TSWANA_MODELS_DIRECTORY,

        #[cfg(feature = "turkish")]
        Language::Turkish => TURKISH_MODELS_DIRECTORY,

        #[cfg(feature = "ukrainian")]
        Language::Ukrainian => UKRAINIAN_MODELS_DIRECTORY,

        #[cfg(feature = "urdu")]
        Language::Urdu => URDU_MODELS_DIRECTORY,

        #[cfg(feature = "vietnamese")]
        Language::Vietnamese => VIETNAMESE_MODELS_DIRECTORY,

        #[cfg(feature = "welsh")]
        Language::Welsh => WELSH_MODELS_DIRECTORY,

        #[cfg(feature = "xhosa")]
        Language::Xhosa => XHOSA_MODELS_DIRECTORY,

        #[cfg(feature = "yoruba")]
        Language::Yoruba => YORUBA_MODELS_DIRECTORY,

        #[cfg(feature = "zulu")]
        Language::Zulu => ZULU_MODELS_DIRECTORY,
    }
}
413
#[cfg(test)]
mod tests {
    use crate::minify;

    use super::*;

    // Expected content of the English unigram model. It is passed through
    // `minify` before being compared with the loaded JSON.
    //
    // NOTE(review): the entries `ﬀ` (U+FB00), `ﬁ` (U+FB01) and `ｍ` (U+FF4D)
    // are single code points. Their ASCII look-alikes `ff`, `fi` and `m` would
    // put two-character entries into a unigram table and list `m` under two
    // different frequencies, so this fixture must not be run through a
    // compatibility normalization. The restored forms are presumed to match
    // the bundled model; confirm by running this test.
    const EXPECTED_UNIGRAM_MODEL: &str = r#"
    {
        "language":"ENGLISH",
        "ngrams":{
            "2/93616591":"ﬀ ċ ė ĩ ȼ ɔ ţ ũ ʔ ơ ả ộ ù",
            "36/93616591":"ā",
            "16/93616591":"ﬁ",
            "7/93616591":"ă ệ",
            "5/93616591":"ą ħ ś",
            "26/93616591":"ć",
            "49/93616591":"č",
            "8/93616591":"đ ě ź",
            "1/93616591":"ē ț ġ ḵ ņ ɑ ə ɛ ɦ ű ƅ ạ ƴ ặ ế ỉ ờ ủ ứ",
            "4/93616591":"ș ÿ",
            "9/93616591":"ę ż",
            "40/93616591":"ğ",
            "13/93616591":"ī ß",
            "31/93616591":"ı",
            "39/93616591":"ł",
            "25/93616591":"ń",
            "3/93616591":"ň ｍ ů ư ị",
            "10/93616591":"ō",
            "60/93616591":"œ",
            "11/93616591":"ř ì",
            "18/93616591":"ş",
            "52/93616591":"š ô",
            "7915445/93616591":"a",
            "1461095/93616591":"b",
            "3003229/93616591":"c",
            "3622548/93616591":"d",
            "11308892/93616591":"e",
            "2006896/93616591":"f",
            "1963483/93616591":"g",
            "234603/4927189":"h",
            "6800966/93616591":"i",
            "207477/93616591":"j",
            "14/93616591":"ū û",
            "760186/93616591":"k",
            "3928800/93616591":"l",
            "2358339/93616591":"m",
            "6698842/93616591":"n",
            "7137868/93616591":"o",
            "1994813/93616591":"p",
            "82818/93616591":"q",
            "5939665/93616591":"r",
            "6234570/93616591":"s",
            "8431167/93616591":"t",
            "2559048/93616591":"u",
            "1024914/93616591":"v",
            "1751793/93616591":"w",
            "172448/93616591":"x",
            "1683314/93616591":"y",
            "103267/93616591":"z",
            "20/93616591":"ž",
            "37/93616591":"º ë",
            "4/4927189":"à",
            "539/93616591":"á",
            "913/93616591":"â",
            "28/93616591":"ã",
            "118/93616591":"ä",
            "42/93616591":"å",
            "6/93616591":"æ",
            "126/93616591":"ç",
            "136/93616591":"è",
            "2259/93616591":"é",
            "45/93616591":"ê",
            "428/93616591":"í",
            "1/4927189":"î",
            "77/93616591":"ï",
            "21/93616591":"ð",
            "478/93616591":"ñ",
            "48/93616591":"ò",
            "490/93616591":"ó",
            "93/93616591":"õ",
            "200/93616591":"ö",
            "32/93616591":"ø",
            "142/93616591":"ú",
            "149/93616591":"ü",
            "23/93616591":"ý"
        }
    }
    "#;

    #[test]
    fn test_load_json() {
        // `expect` surfaces the underlying I/O error if loading fails,
        // instead of a bare failed `is_ok` assertion.
        let json = load_json(Language::English, "unigrams.json.br")
            .expect("the English unigram model should be loadable");
        assert_eq!(json, minify(EXPECTED_UNIGRAM_MODEL));
    }
}