1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#![warn(clippy::all)]
#![warn(missing_docs)]
#![warn(rustdoc::missing_doc_code_examples)]
#![warn(clippy::missing_docs_in_private_items)]

#![doc = include_str!("../README.md")]

mod language_names;
pub use language_names::LANGUAGE;

use std::fmt::Display;

#[cfg(not(feature = "nltk"))]
use serde_json;

/// Marker trait for the argument types accepted by `get`, letting that one function be called
/// with several different types (a form of overloading).
///
/// It declares no methods of its own; the `Display` supertrait is what `get` relies on to
/// turn the argument into text.
#[doc(hidden)]
pub trait LanguageName: Display {}
// The types that may be passed to `get`: the `LANGUAGE` enum, string slices and owned strings.
impl LanguageName for LANGUAGE {}
impl LanguageName for &str {}
impl LanguageName for String {}

/// Fetch the list of stop words for a language.
///
/// `input_language` may be a member of the `LANGUAGE` enum, or a two-character ISO language
/// code given as either a `str` or a `String`. The argument is rendered through its `Display`
/// implementation and the resulting text is used as the lookup key.
///
/// # Panics
///
/// Panics if the rendered language code is not supported by the active word-list backend.
///
/// # Examples
///
/// ```
/// let first_list = stop_words::get("ar");
/// let second_list = stop_words::get(stop_words::LANGUAGE::Arabic);
/// assert_eq!(first_list, second_list)
/// ```
pub fn get<T: LanguageName>(input_language: T) -> Vec<String> {
    // `LanguageName` requires `Display`, so every accepted type can be stringified.
    let iso_code = input_language.to_string();
    get_iso(&iso_code)
}

/// Fetches the stop words for a language identified by its 2-letter ISO code, using the
/// NLTK word lists embedded into the binary at compile time.
///
/// The matching word list is selected first and then split into words by `read_from_bytes`.
///
/// # Panics
///
/// Panics if `input_language` is not one of the 2-letter codes listed in the `match` below.
#[cfg(feature = "nltk")]
fn get_iso(input_language: &str) -> Vec<String> {
    // Each `include_bytes!` yields a differently sized array reference; the annotation on
    // `bytes` coerces every arm to a plain byte slice.
    let bytes: &[u8] = match input_language {
        "ar" => include_bytes!("nltk/arabic"),
        "az" => include_bytes!("nltk/azerbaijani"),
        "da" => include_bytes!("nltk/danish"),
        "nl" => include_bytes!("nltk/dutch"),
        "en" => include_bytes!("nltk/english"),
        "fi" => include_bytes!("nltk/finnish"),
        "fr" => include_bytes!("nltk/french"),
        "de" => include_bytes!("nltk/german"),
        "el" => include_bytes!("nltk/greek"),
        "hu" => include_bytes!("nltk/hungarian"),
        "id" => include_bytes!("nltk/indonesian"),
        "it" => include_bytes!("nltk/italian"),
        "kk" => include_bytes!("nltk/kazakh"),
        "ne" => include_bytes!("nltk/nepali"),
        "no" => include_bytes!("nltk/norwegian"),
        "pt" => include_bytes!("nltk/portuguese"),
        "ro" => include_bytes!("nltk/romanian"),
        "ru" => include_bytes!("nltk/russian"),
        "sl" => include_bytes!("nltk/slovenian"),
        "es" => include_bytes!("nltk/spanish"),
        "sv" => include_bytes!("nltk/swedish"),
        "tg" => include_bytes!("nltk/tajik"),
        "tr" => include_bytes!("nltk/turkish"),
        // The lookup key is an ISO code, so the hint must ask for one; suggesting the
        // English language name would only lead to another panic.
        _ => panic!("Unfortunately, the '{}' language is not currently supported. Please make sure that the name is an appropriate 2-letter ISO code.", input_language),
    };
    read_from_bytes(bytes)
}

/// Fetches the stop words for a language identified by its 2-letter ISO code, using the
/// `stopwords-iso` JSON document embedded into the binary at compile time.
///
/// The document is parsed on every call and the entry stored under `input_language` is
/// copied into owned strings.
///
/// # Panics
///
/// Panics if the embedded JSON cannot be parsed, if there is no array stored under
/// `input_language`, or if an element of that array is not a string.
#[cfg(not(feature = "nltk"))]
fn get_iso(input_language: &str) -> Vec<String> {
    let parsed: serde_json::Value =
        serde_json::from_slice(include_bytes!("iso/stopwords-iso.json")).unwrap();
    // A missing key and a non-array value are both treated as "unsupported language".
    match parsed
        .get(input_language)
        .and_then(serde_json::Value::as_array)
    {
        Some(words) => words
            .iter()
            .map(|word| word.as_str().unwrap().to_owned())
            .collect(),
        None => panic!("Unfortunately, the '{}' language is not currently supported or nonexistent. Please make sure that the name is an appropriate 2-letter ISO code.", input_language),
    }
}

#[cfg(feature = "nltk")]
/// Converts an embedded, newline-separated word list into a vector with one word per entry.
///
/// Invalid UTF-8 sequences are replaced with U+FFFD rather than causing an error. Both `\n`
/// and `\r\n` line endings are accepted, and empty lines (including the one implied by a
/// trailing newline) are skipped so that the empty string is never reported as a stop word.
pub(crate) fn read_from_bytes(bytes: &[u8]) -> Vec<String> {
    String::from_utf8_lossy(bytes)
        .lines()
        .filter(|line| !line.is_empty())
        .map(String::from)
        .collect()
}