Skip to main content

stopwords/
lib.rs

1//! This library provides stopwords datasets from popular text processing engines.
2//!
3//!
//! This can help reproduce the results of text analysis pipelines written in different languages and with different tools.
5//!
6//! # Usage
7//! ```toml
8//! [dependencies]
9//! stopwords = "0.1.0"
10//! ```
11//!
12//! ```rust
13//! extern crate stopwords;
14//!
15//! use std::collections::HashSet;
16//! use stopwords::{Spark, Language, Stopwords};
17//!
18//! fn main() {
19//!     let stops: HashSet<_> = Spark::stopwords(Language::English).unwrap().iter().collect();
//!     let mut tokens = vec!("broccoli", "is", "good", "to", "eat");
//!     tokens.retain(|s| !stops.contains(s));
//!     assert_eq!(tokens, vec!("broccoli", "good", "eat"));
23//! }
24//! ```
25#[macro_use] extern crate lazy_static;
26#[macro_use] extern crate thiserror;
27
28use std::str::FromStr;
29
30mod nltk;
31mod spark;
32mod sklearn;
33
34pub use nltk::NLTK;
35pub use spark::Spark;
36pub use sklearn::SkLearn;
37
/// Supported languages. Each provider supports only a subset of this list.
///
/// You can parse the lowercase English name of a language to construct the
/// corresponding enum variant.
///
/// ```rust
/// use std::str::FromStr;
/// use stopwords::Language;
///
/// assert_eq!(Language::from_str("english").ok(), Some(Language::English));
/// assert_eq!(Language::from_str("nepali").ok(), Some(Language::Nepali));
/// ```
///
/// The enum is a plain fieldless `Copy` type and implements `Eq` and `Hash`,
/// so it can be used directly as a `HashMap` / `HashSet` key.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum Language {
    Arabic,
    Azerbaijani,
    Danish,
    Dutch,
    English,
    Finnish,
    French,
    German,
    Greek,
    Hungarian,
    Italian,
    Kazakh,
    Nepali,
    Norwegian,
    Portuguese,
    Romanian,
    Russian,
    Spanish,
    Swedish,
    Turkish,
}
72
73/// Language parse error.
74#[derive(Error, PartialEq, Debug)]
75#[error("Language {0:?} is not supported")]
76pub struct LanguageError(String);
77
78impl FromStr for Language {
79    type Err = LanguageError;
80
81    fn from_str(s: &str) -> Result<Self, Self::Err> {
82        match s {
83            "arabic" => Ok(Language::Arabic),
84            "azerbaijani" => Ok(Language::Azerbaijani),
85            "danish" => Ok(Language::Danish),
86            "dutch" => Ok(Language::Dutch),
87            "english" => Ok(Language::English),
88            "finnish" => Ok(Language::Finnish),
89            "french" => Ok(Language::French),
90            "german" => Ok(Language::German),
91            "greek" => Ok(Language::Greek),
92            "hungarian" => Ok(Language::Hungarian),
93            "italian" => Ok(Language::Italian),
94            "kazakh" => Ok(Language::Kazakh),
95            "nepali" => Ok(Language::Nepali),
96            "norwegian" => Ok(Language::Norwegian),
97            "portuguese" => Ok(Language::Portuguese),
98            "romanian" => Ok(Language::Romanian),
99            "russian" => Ok(Language::Russian),
100            "spanish" => Ok(Language::Spanish),
101            "swedish" => Ok(Language::Swedish),
102            "turkish" => Ok(Language::Turkish),
103            _ => Err(LanguageError(s.to_owned()))
104        }
105    }
106}
107
/// Interface for getting stopwords from different providers.
pub trait Stopwords {
    /// Looks up the stopword list for `language`.
    ///
    /// On success the result is a borrowed `'static` slice of `'static` string
    /// slices, so callers never own or free the data.
    ///
    /// NOTE(review): `None` presumably means the provider has no list for the
    /// requested language (each provider covers only a subset of `Language`);
    /// the implementations live in other modules — confirm there.
    fn stopwords(language: Language) -> Option<&'static [&'static str]>;
}
112
#[cfg(test)]
mod tests {
    use super::*;

    /// A known name parses to its variant; an unknown name is rejected with a
    /// `LanguageError` that carries the offending input.
    #[test]
    fn from_str() {
        let accepted = Language::from_str("english");
        assert_eq!(accepted.ok(), Some(Language::English));

        let rejected = Language::from_str("en");
        let expected_error = LanguageError("en".to_owned());
        assert_eq!(rejected, Err(expected_error));

        let discarded = Language::from_str("en").ok();
        assert_eq!(discarded, None);
    }
}