pub(crate) use super::*;
/// Filters `tokens` with the English stop-word filter and asserts that the
/// result equals `expected`.
///
/// Panics if the filter call does not succeed or if the output differs; the
/// failure message includes the offending input tokens.
fn assert_english_filter(tokens: &[&str], expected: &[&str]) {
    let english = StopWordsFilter::english();
    let actual = english.filter(tokens).expect("filter should succeed");

    // Compare against owned strings so both sides have the same element type.
    let wanted: Vec<String> = expected.iter().map(|word| String::from(*word)).collect();

    assert_eq!(actual, wanted, "input: {tokens:?}");
}
/// Table-driven check of the English filter: each entry pairs an input token
/// list with the tokens expected to survive filtering.
#[test]
fn test_english_filter_basic_cases() {
    let cases: &[(&[&str], &[&str])] = &[
        // Lowercase stop word at the start of the input.
        (
            &["the", "quick", "brown", "fox"],
            &["quick", "brown", "fox"],
        ),
        // Stop words in other casings are removed; kept tokens retain their casing.
        (&["The", "Cat", "IS", "happy"], &["Cat", "happy"]),
        (
            &["Machine", "learning", "the", "FUTURE"],
            &["Machine", "learning", "FUTURE"],
        ),
        // Empty input yields empty output.
        (&[], &[]),
        // Input consisting solely of stop words yields empty output.
        (&["the", "and", "is", "a"], &[]),
        // Input without stop words is returned unchanged.
        (
            &["machine", "learning", "neural", "network"],
            &["machine", "learning", "neural", "network"],
        ),
    ];

    for &(input, survivors) in cases.iter() {
        assert_english_filter(input, survivors);
    }
}
/// A filter built from a caller-supplied word list removes exactly those words.
#[test]
fn test_custom_stop_words() {
    let stop_words = vec!["foo", "bar", "baz"];
    let custom = StopWordsFilter::new(stop_words);

    let input = vec!["foo", "test", "bar", "data", "baz"];
    let kept = custom
        .filter(input.as_slice())
        .expect("filter should succeed");

    let expected = vec!["test", "data"];
    assert_eq!(kept, expected);
}
/// `filter_owned` takes the tokens by value (as `String`s) and drops the
/// English stop words from them.
#[test]
fn test_filter_owned() {
    let english = StopWordsFilter::english();

    let owned_tokens: Vec<String> = ["the", "cat", "is", "happy"]
        .iter()
        .map(|word| word.to_string())
        .collect();

    let kept = english
        .filter_owned(owned_tokens)
        .expect("filter should succeed");

    assert_eq!(kept, vec!["cat", "happy"]);
}
/// `is_stop_word` reports English stop words in lowercase and uppercase form
/// and rejects ordinary content words.
#[test]
fn test_is_stop_word() {
    let english = StopWordsFilter::english();
    let is_stop = |word: &str| english.is_stop_word(word);

    // Recognised stop words, including an all-caps spelling.
    assert!(is_stop("the"));
    assert!(is_stop("THE"));
    assert!(is_stop("is"));

    // Content words must not be flagged.
    assert!(!is_stop("machine"));
    assert!(!is_stop("learning"));
}
/// `len` reports the number of stop words held by the built-in English filter,
/// a small custom filter, and an empty filter.
#[test]
fn test_filter_len() {
    let english_len = StopWordsFilter::english().len();
    assert_eq!(english_len, 171);

    let custom_len = StopWordsFilter::new(vec!["foo", "bar"]).len();
    assert_eq!(custom_len, 2);

    let no_words: Vec<String> = Vec::new();
    let empty_len = StopWordsFilter::new(no_words).len();
    assert_eq!(empty_len, 0);
}
/// `is_empty` is false for the English filter and true for a filter built
/// from an empty word list.
#[test]
fn test_filter_is_empty() {
    let english_is_empty = StopWordsFilter::english().is_empty();
    assert!(!english_is_empty);

    let no_words: Vec<String> = Vec::new();
    let empty_is_empty = StopWordsFilter::new(no_words).is_empty();
    assert!(empty_is_empty);
}
/// Non-ASCII tokens pass through the English filter untouched while the
/// surrounding stop words are still removed.
#[test]
fn test_unicode_tokens() {
    let english = StopWordsFilter::english();

    let input = vec!["the", "世界", "is", "beautiful"];
    let kept = english
        .filter(input.as_slice())
        .expect("filter should succeed");

    let expected = vec!["世界", "beautiful"];
    assert_eq!(kept, expected);
}
/// Tokens with punctuation attached ("the.", "is!") are expected to come back
/// unchanged: the whole input must survive filtering.
#[test]
fn test_punctuation_handling() {
    let english = StopWordsFilter::english();

    let input = vec!["the.", "cat,", "is!", "happy"];
    // The expected output is the input itself, token for token.
    let expected = input.clone();

    let kept = english
        .filter(input.as_slice())
        .expect("filter should succeed");

    assert_eq!(kept, expected);
}
/// Runs the English filter over two tokenised sentences and checks which
/// tokens remain.
#[test]
fn test_real_world_sentences() {
    let cases: &[(&[&str], &[&str])] = &[
        // "I love machine learning and neural networks because they are powerful"
        (
            &[
                "I", "love", "machine", "learning", "and", "neural", "networks", "because",
                "they", "are", "powerful",
            ],
            &[
                "love", "machine", "learning", "neural", "networks", "powerful",
            ],
        ),
        // "Natural language processing is a subfield of artificial intelligence"
        (
            &[
                "Natural", "language", "processing", "is", "a", "subfield", "of", "artificial",
                "intelligence",
            ],
            &[
                "Natural", "language", "processing", "subfield", "artificial", "intelligence",
            ],
        ),
    ];

    for &(sentence, survivors) in cases.iter() {
        assert_english_filter(sentence, survivors);
    }
}
/// Pins the size of the built-in English stop-word list.
#[test]
fn test_stop_words_list_has_expected_count() {
    let count = ENGLISH_STOP_WORDS.len();
    assert_eq!(count, 171);
}
/// Sample stop words grouped by grammatical category.
///
/// Each entry is `(category label, words)`; the label is only used to make
/// assertion failure messages easier to read.
const EXPECTED_STOP_WORDS_BY_CATEGORY: &[(&str, &[&str])] = &[
    (
        "articles",
        &["a", "an", "the"],
    ),
    (
        "pronouns",
        &["i", "you", "he", "she", "it", "we", "they"],
    ),
    (
        "prepositions",
        &["in", "on", "at", "by", "for", "with"],
    ),
    (
        "conjunctions",
        &["and", "or", "but", "if", "because"],
    ),
];
/// Every sample word in `EXPECTED_STOP_WORDS_BY_CATEGORY` must be present in
/// `ENGLISH_STOP_WORDS`; a failure names the category and the missing word.
#[test]
fn test_stop_words_contains_expected_categories() {
    for &(category, words) in EXPECTED_STOP_WORDS_BY_CATEGORY.iter() {
        for word in words.iter() {
            let present = ENGLISH_STOP_WORDS.contains(word);
            assert!(
                present,
                "{category}: expected {word:?} in ENGLISH_STOP_WORDS"
            );
        }
    }
}
/// Every entry of `ENGLISH_STOP_WORDS` must already equal its own lowercase
/// form.
#[test]
fn test_stop_words_lowercase_only() {
    for word in ENGLISH_STOP_WORDS.iter() {
        let lowered = word.to_lowercase();
        assert_eq!(*word, lowered);
    }
}