1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
//! This library provides stopwords datasets from popular text processing engines.
//!
//!
//! This could help reproducing results of text analysis pipelines written using different languages and tools.
//!
//! # Usage
//! ```toml
//! [dependencies]
//! stopwords = "0.1.0"
//! ```
//!
//! ```rust
//! extern crate stopwords;
//!
//! use std::collections::HashSet;
//! use stopwords::{Spark, Language, Stopwords};
//!
//! fn main() {
//!     let stops: HashSet<_> = Spark::stopwords(Language::English).unwrap().iter().collect();
//!     let mut tokens = vec!("brocolli", "is", "good", "to", "eat");
//!     tokens.retain(|s| !stops.contains(s));
//!     assert_eq!(tokens, vec!("brocolli", "good", "eat"));
//! }
//! ```
#[macro_use] extern crate lazy_static;
#[macro_use] extern crate thiserror;

use std::str::FromStr;

mod nltk;
mod spark;
mod sklearn;

pub use nltk::NLTK;
pub use spark::Spark;
pub use sklearn::SkLearn;

/// Supported languages. Each provider supports only a subset of this list.
///
/// You can parse lowercase English name of the language to construct enum variants.
///
/// ```rust
/// use std::str::FromStr;
/// use stopwords::Language;
///
/// assert_eq!(Language::from_str("english").ok(), Some(Language::English));
/// assert_eq!(Language::from_str("nepali").ok(), Some(Language::Nepali));
/// ```
#[derive(Clone, Copy, PartialEq, Debug)]
pub enum Language {
    Arabic,
    Azerbaijani,
    Danish,
    Dutch,
    English,
    Finnish,
    French,
    German,
    Greek,
    Hungarian,
    Italian,
    Kazakh,
    Nepali,
    Norwegian,
    Portuguese,
    Romanian,
    Russian,
    Spanish,
    Swedish,
    Turkish,
}

/// Language parse error.
#[derive(Error, PartialEq, Debug)]
#[error("Language {0:?} is not supported")]
pub struct LanguageError(String);

impl FromStr for Language {
    type Err = LanguageError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "arabic" => Ok(Language::Arabic),
            "azerbaijani" => Ok(Language::Azerbaijani),
            "danish" => Ok(Language::Danish),
            "dutch" => Ok(Language::Dutch),
            "english" => Ok(Language::English),
            "finnish" => Ok(Language::Finnish),
            "french" => Ok(Language::French),
            "german" => Ok(Language::German),
            "greek" => Ok(Language::Greek),
            "hungarian" => Ok(Language::Hungarian),
            "italian" => Ok(Language::Italian),
            "kazakh" => Ok(Language::Kazakh),
            "nepali" => Ok(Language::Nepali),
            "norwegian" => Ok(Language::Norwegian),
            "portuguese" => Ok(Language::Portuguese),
            "romanian" => Ok(Language::Romanian),
            "russian" => Ok(Language::Russian),
            "spanish" => Ok(Language::Spanish),
            "swedish" => Ok(Language::Swedish),
            "turkish" => Ok(Language::Turkish),
            _ => Err(LanguageError(s.to_owned()))
        }
    }
}

/// Interface for getting stopwords from different providers.
pub trait Stopwords {
    fn stopwords(language: Language) -> Option<&'static [&'static str]>;
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn from_str() {
        assert_eq!(Language::from_str("english").ok(), Some(Language::English));
        assert_eq!(Language::from_str("en"), Err(LanguageError("en".to_owned())));
        assert_eq!(Language::from_str("en").ok(), None);
    }
}