use std::collections::HashSet;
use std::sync::LazyLock;
use rust_stemmers::{Algorithm, Stemmer};
use crate::embed::Tokenizer;
/// English stopwords used to seed `DEFAULT_ENGLISH_STOPWORDS`.
///
/// All entries are lowercase. Contractions are listed both in full
/// (e.g. `aren't`) and as the fragments left once the apostrophe is removed
/// (e.g. `aren`, `t`), so they are matched whether or not punctuation has
/// already been stripped from the text.
// NOTE(review): the contents look like the NLTK English stopword list — confirm
// the provenance before adding or removing entries.
const DEFAULT_ENGLISH_STOPWORDS_ARRAY: &[&str] = &[
"a",
"about",
"above",
"after",
"again",
"against",
"ain",
"all",
"am",
"an",
"and",
"any",
"are",
"aren",
"aren't",
"as",
"at",
"be",
"because",
"been",
"before",
"being",
"below",
"between",
"both",
"but",
"by",
"can",
"couldn",
"couldn't",
"d",
"did",
"didn",
"didn't",
"do",
"does",
"doesn",
"doesn't",
"doing",
"don",
"don't",
"down",
"during",
"each",
"few",
"for",
"from",
"further",
"had",
"hadn",
"hadn't",
"has",
"hasn",
"hasn't",
"have",
"haven",
"haven't",
"having",
"he",
"her",
"here",
"hers",
"herself",
"him",
"himself",
"his",
"how",
"i",
"if",
"in",
"into",
"is",
"isn",
"isn't",
"it",
"it's",
"its",
"itself",
"just",
"ll",
"m",
"ma",
"me",
"mightn",
"mightn't",
"more",
"most",
"mustn",
"mustn't",
"my",
"myself",
"needn",
"needn't",
"no",
"nor",
"not",
"now",
"o",
"of",
"off",
"on",
"once",
"only",
"or",
"other",
"our",
"ours",
"ourselves",
"out",
"over",
"own",
"re",
"s",
"same",
"shan",
"shan't",
"she",
"she's",
"should",
"should've",
"shouldn",
"shouldn't",
"so",
"some",
"such",
"t",
"than",
"that",
"that'll",
"the",
"their",
"theirs",
"them",
"themselves",
"then",
"there",
"these",
"they",
"this",
"those",
"through",
"to",
"too",
"under",
"until",
"up",
"ve",
"very",
"was",
"wasn",
"wasn't",
"we",
"were",
"weren",
"weren't",
"what",
"when",
"where",
"which",
"while",
"who",
"whom",
"why",
"will",
"with",
"won",
"won't",
"wouldn",
"wouldn't",
"y",
"you",
"you'd",
"you'll",
"you're",
"you've",
"your",
"yours",
"yourself",
"yourselves",
];
static DEFAULT_ENGLISH_STOPWORDS: LazyLock<HashSet<&'static str>> =
LazyLock::new(|| DEFAULT_ENGLISH_STOPWORDS_ARRAY.iter().copied().collect());
/// Text tokenizer that lowercases, strips punctuation, removes stopwords and
/// over-long tokens, and stems what remains.
pub struct Bm25Tokenizer {
/// Stemmer applied to every token that survives filtering.
pub stemmer: Stemmer,
/// Lowercased tokens found in this set are dropped before stemming.
pub stopwords: HashSet<&'static str>,
/// Upper bound on token length; longer tokens are dropped before stemming.
pub token_max_length: usize,
}
impl Default for Bm25Tokenizer {
    /// Builds a tokenizer with the English stemmer, a copy of the default
    /// English stopword set, and a maximum token length of 40.
    fn default() -> Self {
        let stemmer = Stemmer::create(Algorithm::English);
        // Each tokenizer owns its own copy so callers can edit the set freely.
        let stopwords = DEFAULT_ENGLISH_STOPWORDS.clone();
        Self {
            stemmer,
            stopwords,
            token_max_length: 40,
        }
    }
}
impl Bm25Tokenizer {
    /// Returns a copy of `text` in which every character that is not
    /// alphanumeric, whitespace, or `_` has been replaced by a single space.
    fn remove_non_alphanumeric(&self, text: &str) -> String {
        let mut cleaned = String::with_capacity(text.len());
        for ch in text.chars() {
            let keep = ch.is_alphanumeric() || ch.is_whitespace() || ch == '_';
            cleaned.push(if keep { ch } else { ' ' });
        }
        cleaned
    }

    /// Lowercases `text` and splits it on runs of whitespace, returning the
    /// pieces as owned strings. Empty or all-whitespace input yields no tokens.
    fn simple_tokenize(&self, text: &str) -> Vec<String> {
        let lowered = text.to_lowercase();
        lowered.split_whitespace().map(String::from).collect()
    }
}
impl Tokenizer for Bm25Tokenizer {
    /// Splits `text` into stemmed tokens.
    ///
    /// Steps, in order:
    /// 1. every character that is not alphanumeric, whitespace, or `_` is
    ///    replaced by a space;
    /// 2. the text is lowercased and split on whitespace;
    /// 3. tokens present in `self.stopwords` are dropped;
    /// 4. tokens longer than `self.token_max_length` **characters** are dropped;
    /// 5. each remaining token is stemmed, and empty stems are discarded.
    ///
    /// Returns the stems in their order of appearance; duplicates are kept.
    fn tokenize(&self, text: &str) -> Vec<String> {
        let max_len = self.token_max_length;
        let cleaned = self.remove_non_alphanumeric(text);

        self.simple_tokenize(&cleaned)
            .into_iter()
            .filter(|token| !self.stopwords.contains(token.as_str()))
            // The limit is in characters, not UTF-8 bytes, so non-ASCII words are
            // not rejected early. The byte length is an upper bound on the
            // character count, so the cheap comparison settles most tokens.
            .filter(|token| token.len() <= max_len || token.chars().count() <= max_len)
            .map(|token| self.stemmer.stem(&token).into_owned())
            .filter(|stem| !stem.is_empty())
            .collect()
    }
}