1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#![warn(clippy::all)]
#![warn(missing_docs)]
#![warn(rustdoc::missing_doc_code_examples)]
#![warn(clippy::missing_docs_in_private_items)]
#![doc = include_str!("../README.md")]
mod language_names;
pub use language_names::LANGUAGE;
use std::fmt::Display;
#[cfg(not(feature = "nltk"))]
use serde_json;
/// Marker trait for the argument types accepted by [`get`].
///
/// It declares no methods of its own; its only requirement is `Display`,
/// which `get` uses to turn the argument into the string handed to the
/// language lookup. The trait is public but hidden from the generated docs.
#[doc(hidden)]
pub trait LanguageName: Display {}
// The crate's own `LANGUAGE` type can be passed to `get`.
impl LanguageName for LANGUAGE {}
// Borrowed string slices can be passed to `get`.
impl LanguageName for &str {}
// Owned strings can be passed to `get`.
impl LanguageName for String {}
/// Returns the word list for `input_language`.
///
/// The argument is rendered to a string through its `Display`
/// implementation, and that string is used as the lookup key by whichever
/// `get_iso` variant is compiled in.
///
/// # Panics
///
/// Panics when the rendered key does not match a supported language.
pub fn get<T>(input_language: T) -> Vec<String>
where
    T: LanguageName,
{
    // `to_string` goes through `Display`, exactly like `format!("{}", ..)`.
    let language_key = input_language.to_string();
    get_iso(&language_key)
}
/// Resolves a language key to its word list when the `nltk` feature is on.
///
/// Each supported key maps to a file under `nltk/` that is embedded into the
/// binary at compile time; the selected file is then split into entries by
/// `read_from_bytes`.
///
/// # Panics
///
/// Panics when `input_language` is not one of the keys listed below.
#[cfg(feature = "nltk")]
fn get_iso(input_language: &str) -> Vec<String> {
    // The arrays have different lengths, so every arm is coerced to a slice.
    let embedded: &[u8] = match input_language {
        "ar" => include_bytes!("nltk/arabic"),
        "az" => include_bytes!("nltk/azerbaijani"),
        "da" => include_bytes!("nltk/danish"),
        "nl" => include_bytes!("nltk/dutch"),
        "en" => include_bytes!("nltk/english"),
        "fi" => include_bytes!("nltk/finnish"),
        "fr" => include_bytes!("nltk/french"),
        "de" => include_bytes!("nltk/german"),
        "el" => include_bytes!("nltk/greek"),
        "hu" => include_bytes!("nltk/hungarian"),
        "id" => include_bytes!("nltk/indonesian"),
        "it" => include_bytes!("nltk/italian"),
        "kk" => include_bytes!("nltk/kazakh"),
        "ne" => include_bytes!("nltk/nepali"),
        "no" => include_bytes!("nltk/norwegian"),
        "pt" => include_bytes!("nltk/portuguese"),
        "ro" => include_bytes!("nltk/romanian"),
        "ru" => include_bytes!("nltk/russian"),
        "sl" => include_bytes!("nltk/slovenian"),
        "es" => include_bytes!("nltk/spanish"),
        "sv" => include_bytes!("nltk/swedish"),
        "tg" => include_bytes!("nltk/tajik"),
        "tr" => include_bytes!("nltk/turkish"),
        _ => panic!("Unfortunately, the '{}' language is not currently supported. Please make sure that the name of the language is spelled in English.", input_language),
    };
    read_from_bytes(embedded)
}
/// Resolves a language key to its word list when the `nltk` feature is off.
///
/// The embedded `iso/stopwords-iso.json` document is parsed on every call and
/// the array stored under `input_language` is copied into owned strings.
///
/// # Panics
///
/// Panics when the embedded JSON cannot be parsed, when there is no array
/// under `input_language`, or when an element of that array is not a string.
#[cfg(not(feature = "nltk"))]
fn get_iso(input_language: &str) -> Vec<String> {
    let raw = include_bytes!("iso/stopwords-iso.json");
    let table: serde_json::Value = serde_json::from_slice(raw).unwrap();
    // A missing key and a non-array value are both treated as "unsupported".
    match table
        .get(input_language)
        .and_then(serde_json::Value::as_array)
    {
        Some(entries) => entries
            .iter()
            .map(|entry| entry.as_str().unwrap().to_owned())
            .collect(),
        None => panic!(
            "Unfortunately, the '{}' language is not currently supported or nonexistent. Please make sure that the name is an appropriate 2-letter ISO code.",
            input_language
        ),
    }
}
#[cfg(feature = "nltk")]
/// Splits a newline-separated byte buffer into one owned string per entry.
///
/// The bytes are decoded as UTF-8, with invalid sequences replaced by
/// U+FFFD. Both `\n` and `\r\n` line endings are accepted, and blank lines
/// are skipped, so a trailing newline or an empty buffer does not produce an
/// empty-string entry.
pub(crate) fn read_from_bytes(bytes: &[u8]) -> Vec<String> {
    let contents = String::from_utf8_lossy(bytes);
    contents
        // `lines` strips the terminator, including the `\r` of a CRLF pair.
        .lines()
        .filter(|line| !line.is_empty())
        .map(String::from)
        .collect()
}