use crate::AprenderError;
use std::collections::HashSet;
/// Case-insensitive stop-word set used to drop unwanted tokens from token lists.
///
/// Membership checks lowercase the queried word, and the constructor
/// lowercases every stored word, so `"The"` and `"the"` are treated alike.
#[derive(Debug, Clone)]
pub struct StopWordsFilter {
/// Stop words, held in lowercased form.
stop_words: HashSet<String>,
}
impl StopWordsFilter {
    /// Builds a filter from an arbitrary collection of words.
    ///
    /// Each word is lowercased before being stored, which together with the
    /// lowercasing done in [`StopWordsFilter::is_stop_word`] makes lookups
    /// case-insensitive.
    pub fn new<I, S>(words: I) -> Self
    where
        I: IntoIterator<Item = S>,
        S: AsRef<str>,
    {
        let normalize = |raw: S| raw.as_ref().to_lowercase();
        Self {
            stop_words: words.into_iter().map(normalize).collect(),
        }
    }

    /// Builds a filter preloaded with every entry of [`ENGLISH_STOP_WORDS`].
    #[must_use]
    pub fn english() -> Self {
        Self::new(ENGLISH_STOP_WORDS.iter().copied())
    }

    /// Shared implementation behind [`StopWordsFilter::filter`] and
    /// [`StopWordsFilter::filter_owned`].
    ///
    /// Converts each item to a `String` with `to_string`, then keeps it only
    /// if it is not a stop word. Input order is preserved and kept tokens
    /// retain their original casing.
    ///
    /// # Errors
    ///
    /// This implementation always returns `Ok`.
    fn retain_non_stop<I, F>(&self, iter: I, to_string: F) -> Result<Vec<String>, AprenderError>
    where
        I: Iterator,
        F: Fn(I::Item) -> String,
    {
        let kept: Vec<String> = iter
            .filter_map(|item| {
                let token = to_string(item);
                if self.is_stop_word(&token) {
                    None
                } else {
                    Some(token)
                }
            })
            .collect();
        Ok(kept)
    }

    /// Returns owned copies of the tokens in `tokens` that are not stop words.
    ///
    /// The input slice is left untouched; order and casing of the surviving
    /// tokens match the input.
    ///
    /// # Errors
    ///
    /// This implementation always returns `Ok`; the `Result` is part of the
    /// method's signature.
    pub fn filter<S: AsRef<str>>(&self, tokens: &[S]) -> Result<Vec<String>, AprenderError> {
        self.retain_non_stop(tokens.iter(), |token| token.as_ref().to_owned())
    }

    /// Consumes `tokens` and returns the ones that are not stop words.
    ///
    /// Surviving strings are moved into the result rather than copied.
    ///
    /// # Errors
    ///
    /// This implementation always returns `Ok`; the `Result` is part of the
    /// method's signature.
    pub fn filter_owned(&self, tokens: Vec<String>) -> Result<Vec<String>, AprenderError> {
        self.retain_non_stop(tokens.into_iter(), std::convert::identity)
    }

    /// Reports whether `word`, compared case-insensitively, is a stop word.
    #[must_use]
    pub fn is_stop_word(&self, word: &str) -> bool {
        // Stored entries are lowercased, so the query has to be as well.
        let normalized = word.to_lowercase();
        self.stop_words.contains(normalized.as_str())
    }

    /// Number of distinct stop words held by the filter.
    #[must_use]
    pub fn len(&self) -> usize {
        self.stop_words.len()
    }

    /// Returns `true` when the filter holds no stop words.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.stop_words.is_empty()
    }
}
/// Built-in list of English stop words, all written in lowercase.
///
/// This is the word list that `StopWordsFilter::english` loads. The category
/// comments inside the list are only a reading aid; the filter treats every
/// entry the same way.
pub const ENGLISH_STOP_WORDS: &[&str] = &[
// Articles
"a",
"an",
"the",
// Personal, possessive and reflexive pronouns
"i",
"me",
"my",
"myself",
"we",
"our",
"ours",
"ourselves",
"you",
"your",
"yours",
"yourself",
"yourselves",
"he",
"him",
"his",
"himself",
"she",
"her",
"hers",
"herself",
"it",
"its",
"itself",
"they",
"them",
"their",
"theirs",
"themselves",
// Question words
"what",
"which",
"who",
"whom",
"whose",
"why",
"when",
"where",
"how",
// Prepositions
"about",
"above",
"across",
"after",
"against",
"along",
"among",
"around",
"at",
"before",
"behind",
"below",
"beneath",
"beside",
"between",
"beyond",
"by",
"down",
"during",
"for",
"from",
"in",
"inside",
"into",
"near",
"of",
"off",
"on",
"onto",
"out",
"outside",
"over",
"through",
"throughout",
"to",
"toward",
"under",
"underneath",
"until",
"up",
"upon",
"with",
"within",
"without",
// Conjunctions
"and",
"as",
"because",
"but",
"if",
"or",
"since",
"so",
"than",
"that",
"though",
"unless",
"while",
// Forms of "be", "have" and "do"
"am",
"is",
"are",
"was",
"were",
"be",
"been",
"being",
"have",
"has",
"had",
"having",
"do",
"does",
"did",
"doing",
// Modal verbs
"would",
"should",
"could",
"ought",
"can",
"may",
"might",
"must",
"will",
"shall",
// Quantifiers, determiners and intensifiers
"all",
"any",
"both",
"each",
"every",
"few",
"more",
"most",
"much",
"neither",
"no",
"none",
"not",
"one",
"other",
"same",
"several",
"some",
"such",
"very",
"too",
"only",
"own",
// Demonstratives and common adverbs
"then",
"there",
"these",
"this",
"those",
"just",
"now",
"here",
"again",
"also",
"another",
// Other high-frequency words
"back",
"even",
"ever",
"get",
"give",
"go",
"got",
"made",
"make",
"say",
"see",
"take",
"way",
];
// Test module loaded from `stopwords_tests.rs`; compiled only for test builds.
#[cfg(test)]
#[path = "stopwords_tests.rs"]
mod tests;
// Second test module, loaded from `tests_stopwords_contract.rs`; compiled only for test builds.
#[cfg(test)]
#[path = "tests_stopwords_contract.rs"]
mod tests_stopwords_contract;