lingua/
json.rs

1/*
2 * Copyright © 2020-present Peter M. Stahl pemistahl@gmail.com
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use std::io::{Cursor, ErrorKind, Read};
18
19use brotli::Decompressor;
20use include_dir::Dir;
21
22#[cfg(feature = "afrikaans")]
23use lingua_afrikaans_language_model::AFRIKAANS_MODELS_DIRECTORY;
24#[cfg(feature = "albanian")]
25use lingua_albanian_language_model::ALBANIAN_MODELS_DIRECTORY;
26#[cfg(feature = "arabic")]
27use lingua_arabic_language_model::ARABIC_MODELS_DIRECTORY;
28#[cfg(feature = "armenian")]
29use lingua_armenian_language_model::ARMENIAN_MODELS_DIRECTORY;
30#[cfg(feature = "azerbaijani")]
31use lingua_azerbaijani_language_model::AZERBAIJANI_MODELS_DIRECTORY;
32#[cfg(feature = "basque")]
33use lingua_basque_language_model::BASQUE_MODELS_DIRECTORY;
34#[cfg(feature = "belarusian")]
35use lingua_belarusian_language_model::BELARUSIAN_MODELS_DIRECTORY;
36#[cfg(feature = "bengali")]
37use lingua_bengali_language_model::BENGALI_MODELS_DIRECTORY;
38#[cfg(feature = "bokmal")]
39use lingua_bokmal_language_model::BOKMAL_MODELS_DIRECTORY;
40#[cfg(feature = "bosnian")]
41use lingua_bosnian_language_model::BOSNIAN_MODELS_DIRECTORY;
42#[cfg(feature = "bulgarian")]
43use lingua_bulgarian_language_model::BULGARIAN_MODELS_DIRECTORY;
44#[cfg(feature = "catalan")]
45use lingua_catalan_language_model::CATALAN_MODELS_DIRECTORY;
46#[cfg(feature = "chinese")]
47use lingua_chinese_language_model::CHINESE_MODELS_DIRECTORY;
48#[cfg(feature = "croatian")]
49use lingua_croatian_language_model::CROATIAN_MODELS_DIRECTORY;
50#[cfg(feature = "czech")]
51use lingua_czech_language_model::CZECH_MODELS_DIRECTORY;
52#[cfg(feature = "danish")]
53use lingua_danish_language_model::DANISH_MODELS_DIRECTORY;
54#[cfg(feature = "dutch")]
55use lingua_dutch_language_model::DUTCH_MODELS_DIRECTORY;
56#[cfg(feature = "english")]
57use lingua_english_language_model::ENGLISH_MODELS_DIRECTORY;
58#[cfg(feature = "esperanto")]
59use lingua_esperanto_language_model::ESPERANTO_MODELS_DIRECTORY;
60#[cfg(feature = "estonian")]
61use lingua_estonian_language_model::ESTONIAN_MODELS_DIRECTORY;
62#[cfg(feature = "finnish")]
63use lingua_finnish_language_model::FINNISH_MODELS_DIRECTORY;
64#[cfg(feature = "french")]
65use lingua_french_language_model::FRENCH_MODELS_DIRECTORY;
66#[cfg(feature = "ganda")]
67use lingua_ganda_language_model::GANDA_MODELS_DIRECTORY;
68#[cfg(feature = "georgian")]
69use lingua_georgian_language_model::GEORGIAN_MODELS_DIRECTORY;
70#[cfg(feature = "german")]
71use lingua_german_language_model::GERMAN_MODELS_DIRECTORY;
72#[cfg(feature = "greek")]
73use lingua_greek_language_model::GREEK_MODELS_DIRECTORY;
74#[cfg(feature = "gujarati")]
75use lingua_gujarati_language_model::GUJARATI_MODELS_DIRECTORY;
76#[cfg(feature = "hebrew")]
77use lingua_hebrew_language_model::HEBREW_MODELS_DIRECTORY;
78#[cfg(feature = "hindi")]
79use lingua_hindi_language_model::HINDI_MODELS_DIRECTORY;
80#[cfg(feature = "hungarian")]
81use lingua_hungarian_language_model::HUNGARIAN_MODELS_DIRECTORY;
82#[cfg(feature = "icelandic")]
83use lingua_icelandic_language_model::ICELANDIC_MODELS_DIRECTORY;
84#[cfg(feature = "indonesian")]
85use lingua_indonesian_language_model::INDONESIAN_MODELS_DIRECTORY;
86#[cfg(feature = "irish")]
87use lingua_irish_language_model::IRISH_MODELS_DIRECTORY;
88#[cfg(feature = "italian")]
89use lingua_italian_language_model::ITALIAN_MODELS_DIRECTORY;
90#[cfg(feature = "japanese")]
91use lingua_japanese_language_model::JAPANESE_MODELS_DIRECTORY;
92#[cfg(feature = "kazakh")]
93use lingua_kazakh_language_model::KAZAKH_MODELS_DIRECTORY;
94#[cfg(feature = "korean")]
95use lingua_korean_language_model::KOREAN_MODELS_DIRECTORY;
96#[cfg(feature = "latin")]
97use lingua_latin_language_model::LATIN_MODELS_DIRECTORY;
98#[cfg(feature = "latvian")]
99use lingua_latvian_language_model::LATVIAN_MODELS_DIRECTORY;
100#[cfg(feature = "lithuanian")]
101use lingua_lithuanian_language_model::LITHUANIAN_MODELS_DIRECTORY;
102#[cfg(feature = "macedonian")]
103use lingua_macedonian_language_model::MACEDONIAN_MODELS_DIRECTORY;
104#[cfg(feature = "malay")]
105use lingua_malay_language_model::MALAY_MODELS_DIRECTORY;
106#[cfg(feature = "maori")]
107use lingua_maori_language_model::MAORI_MODELS_DIRECTORY;
108#[cfg(feature = "marathi")]
109use lingua_marathi_language_model::MARATHI_MODELS_DIRECTORY;
110#[cfg(feature = "mongolian")]
111use lingua_mongolian_language_model::MONGOLIAN_MODELS_DIRECTORY;
112#[cfg(feature = "nynorsk")]
113use lingua_nynorsk_language_model::NYNORSK_MODELS_DIRECTORY;
114#[cfg(feature = "persian")]
115use lingua_persian_language_model::PERSIAN_MODELS_DIRECTORY;
116#[cfg(feature = "polish")]
117use lingua_polish_language_model::POLISH_MODELS_DIRECTORY;
118#[cfg(feature = "portuguese")]
119use lingua_portuguese_language_model::PORTUGUESE_MODELS_DIRECTORY;
120#[cfg(feature = "punjabi")]
121use lingua_punjabi_language_model::PUNJABI_MODELS_DIRECTORY;
122#[cfg(feature = "romanian")]
123use lingua_romanian_language_model::ROMANIAN_MODELS_DIRECTORY;
124#[cfg(feature = "russian")]
125use lingua_russian_language_model::RUSSIAN_MODELS_DIRECTORY;
126#[cfg(feature = "serbian")]
127use lingua_serbian_language_model::SERBIAN_MODELS_DIRECTORY;
128#[cfg(feature = "shona")]
129use lingua_shona_language_model::SHONA_MODELS_DIRECTORY;
130#[cfg(feature = "slovak")]
131use lingua_slovak_language_model::SLOVAK_MODELS_DIRECTORY;
132#[cfg(feature = "slovene")]
133use lingua_slovene_language_model::SLOVENE_MODELS_DIRECTORY;
134#[cfg(feature = "somali")]
135use lingua_somali_language_model::SOMALI_MODELS_DIRECTORY;
136#[cfg(feature = "sotho")]
137use lingua_sotho_language_model::SOTHO_MODELS_DIRECTORY;
138#[cfg(feature = "spanish")]
139use lingua_spanish_language_model::SPANISH_MODELS_DIRECTORY;
140#[cfg(feature = "swahili")]
141use lingua_swahili_language_model::SWAHILI_MODELS_DIRECTORY;
142#[cfg(feature = "swedish")]
143use lingua_swedish_language_model::SWEDISH_MODELS_DIRECTORY;
144#[cfg(feature = "tagalog")]
145use lingua_tagalog_language_model::TAGALOG_MODELS_DIRECTORY;
146#[cfg(feature = "tamil")]
147use lingua_tamil_language_model::TAMIL_MODELS_DIRECTORY;
148#[cfg(feature = "telugu")]
149use lingua_telugu_language_model::TELUGU_MODELS_DIRECTORY;
150#[cfg(feature = "thai")]
151use lingua_thai_language_model::THAI_MODELS_DIRECTORY;
152#[cfg(feature = "tsonga")]
153use lingua_tsonga_language_model::TSONGA_MODELS_DIRECTORY;
154#[cfg(feature = "tswana")]
155use lingua_tswana_language_model::TSWANA_MODELS_DIRECTORY;
156#[cfg(feature = "turkish")]
157use lingua_turkish_language_model::TURKISH_MODELS_DIRECTORY;
158#[cfg(feature = "ukrainian")]
159use lingua_ukrainian_language_model::UKRAINIAN_MODELS_DIRECTORY;
160#[cfg(feature = "urdu")]
161use lingua_urdu_language_model::URDU_MODELS_DIRECTORY;
162#[cfg(feature = "vietnamese")]
163use lingua_vietnamese_language_model::VIETNAMESE_MODELS_DIRECTORY;
164#[cfg(feature = "welsh")]
165use lingua_welsh_language_model::WELSH_MODELS_DIRECTORY;
166#[cfg(feature = "xhosa")]
167use lingua_xhosa_language_model::XHOSA_MODELS_DIRECTORY;
168#[cfg(feature = "yoruba")]
169use lingua_yoruba_language_model::YORUBA_MODELS_DIRECTORY;
170#[cfg(feature = "zulu")]
171use lingua_zulu_language_model::ZULU_MODELS_DIRECTORY;
172
173use crate::Language;
174
175pub(crate) fn load_json(language: Language, file_name: &str) -> std::io::Result<String> {
176    let directory = get_language_models_directory(language);
177    let compressed_file = directory.get_file(file_name).ok_or(ErrorKind::NotFound)?;
178    let compressed_file_reader = Cursor::new(compressed_file.contents());
179    let mut uncompressed_file = Decompressor::new(compressed_file_reader, 4096);
180    let mut uncompressed_file_content = String::new();
181    uncompressed_file.read_to_string(&mut uncompressed_file_content)?;
182    Ok(uncompressed_file_content)
183}
184
185fn get_language_models_directory(language: Language) -> Dir<'static> {
186    match language {
187        #[cfg(feature = "afrikaans")]
188        Language::Afrikaans => AFRIKAANS_MODELS_DIRECTORY,
189
190        #[cfg(feature = "albanian")]
191        Language::Albanian => ALBANIAN_MODELS_DIRECTORY,
192
193        #[cfg(feature = "arabic")]
194        Language::Arabic => ARABIC_MODELS_DIRECTORY,
195
196        #[cfg(feature = "armenian")]
197        Language::Armenian => ARMENIAN_MODELS_DIRECTORY,
198
199        #[cfg(feature = "azerbaijani")]
200        Language::Azerbaijani => AZERBAIJANI_MODELS_DIRECTORY,
201
202        #[cfg(feature = "basque")]
203        Language::Basque => BASQUE_MODELS_DIRECTORY,
204
205        #[cfg(feature = "belarusian")]
206        Language::Belarusian => BELARUSIAN_MODELS_DIRECTORY,
207
208        #[cfg(feature = "bengali")]
209        Language::Bengali => BENGALI_MODELS_DIRECTORY,
210
211        #[cfg(feature = "bokmal")]
212        Language::Bokmal => BOKMAL_MODELS_DIRECTORY,
213
214        #[cfg(feature = "bosnian")]
215        Language::Bosnian => BOSNIAN_MODELS_DIRECTORY,
216
217        #[cfg(feature = "bulgarian")]
218        Language::Bulgarian => BULGARIAN_MODELS_DIRECTORY,
219
220        #[cfg(feature = "catalan")]
221        Language::Catalan => CATALAN_MODELS_DIRECTORY,
222
223        #[cfg(feature = "chinese")]
224        Language::Chinese => CHINESE_MODELS_DIRECTORY,
225
226        #[cfg(feature = "croatian")]
227        Language::Croatian => CROATIAN_MODELS_DIRECTORY,
228
229        #[cfg(feature = "czech")]
230        Language::Czech => CZECH_MODELS_DIRECTORY,
231
232        #[cfg(feature = "danish")]
233        Language::Danish => DANISH_MODELS_DIRECTORY,
234
235        #[cfg(feature = "dutch")]
236        Language::Dutch => DUTCH_MODELS_DIRECTORY,
237
238        #[cfg(feature = "english")]
239        Language::English => ENGLISH_MODELS_DIRECTORY,
240
241        #[cfg(feature = "esperanto")]
242        Language::Esperanto => ESPERANTO_MODELS_DIRECTORY,
243
244        #[cfg(feature = "estonian")]
245        Language::Estonian => ESTONIAN_MODELS_DIRECTORY,
246
247        #[cfg(feature = "finnish")]
248        Language::Finnish => FINNISH_MODELS_DIRECTORY,
249
250        #[cfg(feature = "french")]
251        Language::French => FRENCH_MODELS_DIRECTORY,
252
253        #[cfg(feature = "ganda")]
254        Language::Ganda => GANDA_MODELS_DIRECTORY,
255
256        #[cfg(feature = "georgian")]
257        Language::Georgian => GEORGIAN_MODELS_DIRECTORY,
258
259        #[cfg(feature = "german")]
260        Language::German => GERMAN_MODELS_DIRECTORY,
261
262        #[cfg(feature = "greek")]
263        Language::Greek => GREEK_MODELS_DIRECTORY,
264
265        #[cfg(feature = "gujarati")]
266        Language::Gujarati => GUJARATI_MODELS_DIRECTORY,
267
268        #[cfg(feature = "hebrew")]
269        Language::Hebrew => HEBREW_MODELS_DIRECTORY,
270
271        #[cfg(feature = "hindi")]
272        Language::Hindi => HINDI_MODELS_DIRECTORY,
273
274        #[cfg(feature = "hungarian")]
275        Language::Hungarian => HUNGARIAN_MODELS_DIRECTORY,
276
277        #[cfg(feature = "icelandic")]
278        Language::Icelandic => ICELANDIC_MODELS_DIRECTORY,
279
280        #[cfg(feature = "indonesian")]
281        Language::Indonesian => INDONESIAN_MODELS_DIRECTORY,
282
283        #[cfg(feature = "irish")]
284        Language::Irish => IRISH_MODELS_DIRECTORY,
285
286        #[cfg(feature = "italian")]
287        Language::Italian => ITALIAN_MODELS_DIRECTORY,
288
289        #[cfg(feature = "japanese")]
290        Language::Japanese => JAPANESE_MODELS_DIRECTORY,
291
292        #[cfg(feature = "kazakh")]
293        Language::Kazakh => KAZAKH_MODELS_DIRECTORY,
294
295        #[cfg(feature = "korean")]
296        Language::Korean => KOREAN_MODELS_DIRECTORY,
297
298        #[cfg(feature = "latin")]
299        Language::Latin => LATIN_MODELS_DIRECTORY,
300
301        #[cfg(feature = "latvian")]
302        Language::Latvian => LATVIAN_MODELS_DIRECTORY,
303
304        #[cfg(feature = "lithuanian")]
305        Language::Lithuanian => LITHUANIAN_MODELS_DIRECTORY,
306
307        #[cfg(feature = "macedonian")]
308        Language::Macedonian => MACEDONIAN_MODELS_DIRECTORY,
309
310        #[cfg(feature = "malay")]
311        Language::Malay => MALAY_MODELS_DIRECTORY,
312
313        #[cfg(feature = "maori")]
314        Language::Maori => MAORI_MODELS_DIRECTORY,
315
316        #[cfg(feature = "marathi")]
317        Language::Marathi => MARATHI_MODELS_DIRECTORY,
318
319        #[cfg(feature = "mongolian")]
320        Language::Mongolian => MONGOLIAN_MODELS_DIRECTORY,
321
322        #[cfg(feature = "nynorsk")]
323        Language::Nynorsk => NYNORSK_MODELS_DIRECTORY,
324
325        #[cfg(feature = "persian")]
326        Language::Persian => PERSIAN_MODELS_DIRECTORY,
327
328        #[cfg(feature = "polish")]
329        Language::Polish => POLISH_MODELS_DIRECTORY,
330
331        #[cfg(feature = "portuguese")]
332        Language::Portuguese => PORTUGUESE_MODELS_DIRECTORY,
333
334        #[cfg(feature = "punjabi")]
335        Language::Punjabi => PUNJABI_MODELS_DIRECTORY,
336
337        #[cfg(feature = "romanian")]
338        Language::Romanian => ROMANIAN_MODELS_DIRECTORY,
339
340        #[cfg(feature = "russian")]
341        Language::Russian => RUSSIAN_MODELS_DIRECTORY,
342
343        #[cfg(feature = "serbian")]
344        Language::Serbian => SERBIAN_MODELS_DIRECTORY,
345
346        #[cfg(feature = "shona")]
347        Language::Shona => SHONA_MODELS_DIRECTORY,
348
349        #[cfg(feature = "slovak")]
350        Language::Slovak => SLOVAK_MODELS_DIRECTORY,
351
352        #[cfg(feature = "slovene")]
353        Language::Slovene => SLOVENE_MODELS_DIRECTORY,
354
355        #[cfg(feature = "somali")]
356        Language::Somali => SOMALI_MODELS_DIRECTORY,
357
358        #[cfg(feature = "sotho")]
359        Language::Sotho => SOTHO_MODELS_DIRECTORY,
360
361        #[cfg(feature = "spanish")]
362        Language::Spanish => SPANISH_MODELS_DIRECTORY,
363
364        #[cfg(feature = "swahili")]
365        Language::Swahili => SWAHILI_MODELS_DIRECTORY,
366
367        #[cfg(feature = "swedish")]
368        Language::Swedish => SWEDISH_MODELS_DIRECTORY,
369
370        #[cfg(feature = "tagalog")]
371        Language::Tagalog => TAGALOG_MODELS_DIRECTORY,
372
373        #[cfg(feature = "tamil")]
374        Language::Tamil => TAMIL_MODELS_DIRECTORY,
375
376        #[cfg(feature = "telugu")]
377        Language::Telugu => TELUGU_MODELS_DIRECTORY,
378
379        #[cfg(feature = "thai")]
380        Language::Thai => THAI_MODELS_DIRECTORY,
381
382        #[cfg(feature = "tsonga")]
383        Language::Tsonga => TSONGA_MODELS_DIRECTORY,
384
385        #[cfg(feature = "tswana")]
386        Language::Tswana => TSWANA_MODELS_DIRECTORY,
387
388        #[cfg(feature = "turkish")]
389        Language::Turkish => TURKISH_MODELS_DIRECTORY,
390
391        #[cfg(feature = "ukrainian")]
392        Language::Ukrainian => UKRAINIAN_MODELS_DIRECTORY,
393
394        #[cfg(feature = "urdu")]
395        Language::Urdu => URDU_MODELS_DIRECTORY,
396
397        #[cfg(feature = "vietnamese")]
398        Language::Vietnamese => VIETNAMESE_MODELS_DIRECTORY,
399
400        #[cfg(feature = "welsh")]
401        Language::Welsh => WELSH_MODELS_DIRECTORY,
402
403        #[cfg(feature = "xhosa")]
404        Language::Xhosa => XHOSA_MODELS_DIRECTORY,
405
406        #[cfg(feature = "yoruba")]
407        Language::Yoruba => YORUBA_MODELS_DIRECTORY,
408
409        #[cfg(feature = "zulu")]
410        Language::Zulu => ZULU_MODELS_DIRECTORY,
411    }
412}
413
// The only test in this module loads the English unigram model. In this file
// the English models directory and the `Language::English` match arm are
// compiled only with the `english` feature, so the module is gated on that
// feature as well; otherwise a test build without it would not compile.
#[cfg(all(test, feature = "english"))]
mod tests {
    use crate::minify;

    use super::*;

    /// Expected content of the English unigram model. It is written in a
    /// readable layout and passed through `minify` before being compared.
    const EXPECTED_UNIGRAM_MODEL: &str = r#"
    {
        "language":"ENGLISH",
        "ngrams":{
            "2/93616591":"ff ċ ė ĩ ȼ ɔ ţ ũ ʔ ơ ả ộ ù",
            "36/93616591":"ā",
            "16/93616591":"fi",
            "7/93616591":"ă ệ",
            "5/93616591":"ą ħ ś",
            "26/93616591":"ć",
            "49/93616591":"č",
            "8/93616591":"đ ě ź",
            "1/93616591":"ē ț ġ ḵ ņ ɑ ə ɛ ɦ ű ƅ ạ ƴ ặ ế ỉ ờ ủ ứ",
            "4/93616591":"ș ÿ",
            "9/93616591":"ę ż",
            "40/93616591":"ğ",
            "13/93616591":"ī ß",
            "31/93616591":"ı",
            "39/93616591":"ł",
            "25/93616591":"ń",
            "3/93616591":"ň m ů ư ị",
            "10/93616591":"ō",
            "60/93616591":"œ",
            "11/93616591":"ř ì",
            "18/93616591":"ş",
            "52/93616591":"š ô",
            "7915445/93616591":"a",
            "1461095/93616591":"b",
            "3003229/93616591":"c",
            "3622548/93616591":"d",
            "11308892/93616591":"e",
            "2006896/93616591":"f",
            "1963483/93616591":"g",
            "234603/4927189":"h",
            "6800966/93616591":"i",
            "207477/93616591":"j",
            "14/93616591":"ū û",
            "760186/93616591":"k",
            "3928800/93616591":"l",
            "2358339/93616591":"m",
            "6698842/93616591":"n",
            "7137868/93616591":"o",
            "1994813/93616591":"p",
            "82818/93616591":"q",
            "5939665/93616591":"r",
            "6234570/93616591":"s",
            "8431167/93616591":"t",
            "2559048/93616591":"u",
            "1024914/93616591":"v",
            "1751793/93616591":"w",
            "172448/93616591":"x",
            "1683314/93616591":"y",
            "103267/93616591":"z",
            "20/93616591":"ž",
            "37/93616591":"º ë",
            "4/4927189":"à",
            "539/93616591":"á",
            "913/93616591":"â",
            "28/93616591":"ã",
            "118/93616591":"ä",
            "42/93616591":"å",
            "6/93616591":"æ",
            "126/93616591":"ç",
            "136/93616591":"è",
            "2259/93616591":"é",
            "45/93616591":"ê",
            "428/93616591":"í",
            "1/4927189":"î",
            "77/93616591":"ï",
            "21/93616591":"ð",
            "478/93616591":"ñ",
            "48/93616591":"ò",
            "490/93616591":"ó",
            "93/93616591":"õ",
            "200/93616591":"ö",
            "32/93616591":"ø",
            "142/93616591":"ú",
            "149/93616591":"ü",
            "23/93616591":"ý"
        }
    }
    "#;

    /// Loading `unigrams.json.br` for English must yield the expected model.
    #[test]
    fn test_load_json() {
        // `expect` reports the underlying `io::Error` on failure, which a bare
        // `assert!(result.is_ok())` would hide.
        let json = load_json(Language::English, "unigrams.json.br")
            .expect("the English unigram model should load and decompress");
        assert_eq!(json, minify(EXPECTED_UNIGRAM_MODEL));
    }
}