/// Strategy for breaking a piece of text into owned string tokens.
///
/// Every implementor must also be [`Send`], [`Sync`] and
/// [`std::fmt::Debug`].
pub trait Tokenizer
where
    Self: Send + Sync + std::fmt::Debug,
{
    /// Returns the tokens extracted from `text` as owned strings.
    fn tokenize(&self, text: &str) -> Vec<String>;
}
/// Configurable tokenizer settings consumed by this type's [`Tokenizer`] impl.
///
/// Despite the name, the implementation in this file splits on every
/// non-alphanumeric character, not only on whitespace.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct WhitespaceTokenizer {
/// When `true`, each token is lowercased before any filtering happens.
pub lowercase: bool,
/// Minimum token length; tokens shorter than this are discarded.
pub min_token_len: usize,
/// Tokens to discard. Matching is exact and is performed on the token
/// after the optional lowercasing step.
pub stopwords: std::collections::BTreeSet<String>,
}
impl Default for WhitespaceTokenizer {
fn default() -> Self {
Self { lowercase: true, min_token_len: 2, stopwords: std::collections::BTreeSet::new() }
}
}
impl WhitespaceTokenizer {
    /// Creates a tokenizer with the same settings as [`Default::default`].
    #[must_use]
    pub fn new() -> Self {
        <Self as Default>::default()
    }
}
impl Tokenizer for WhitespaceTokenizer {
    /// Splits `text` on every non-alphanumeric character and returns the
    /// surviving tokens in their original order.
    ///
    /// Processing steps, applied to each piece in turn:
    /// 1. empty pieces (produced by adjacent separators) are dropped;
    /// 2. the piece is lowercased when `self.lowercase` is set, otherwise
    ///    copied unchanged;
    /// 3. tokens present in `self.stopwords` are dropped;
    /// 4. tokens with fewer than `self.min_token_len` characters are dropped.
    ///
    /// An empty `text` yields an empty vector.
    fn tokenize(&self, text: &str) -> Vec<String> {
        text.split(|c: char| !c.is_alphanumeric())
            .filter(|piece| !piece.is_empty())
            .map(|piece| {
                if self.lowercase {
                    piece.to_lowercase()
                } else {
                    piece.to_owned()
                }
            })
            .filter(|token| !self.stopwords.contains(token))
            // Measure length in characters, not UTF-8 bytes: `str::len` would
            // let a single non-ASCII character (2-4 bytes) slip past a minimum
            // that rejects a single ASCII letter.
            .filter(|token| token.chars().count() >= self.min_token_len)
            .collect()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The default settings lowercase tokens and discard one-letter tokens.
    #[test]
    fn whitespace_tokenizer_default_lowercases_and_drops_short() {
        let tokenizer: WhitespaceTokenizer = Default::default();
        let tokens = tokenizer.tokenize("Hello, World! a b c");
        assert_eq!(tokens, ["hello", "world"]);
    }

    /// With `lowercase` off, the original casing is preserved.
    #[test]
    fn whitespace_tokenizer_can_disable_lowercase() {
        let tokenizer = WhitespaceTokenizer {
            lowercase: false,
            ..WhitespaceTokenizer::default()
        };
        let tokens = tokenizer.tokenize("Hello World");
        assert_eq!(tokens, ["Hello", "World"]);
    }

    /// A stopword matches the lowercased form of an input token.
    #[test]
    fn whitespace_tokenizer_stopwords_drop_match() {
        let tokenizer = WhitespaceTokenizer {
            stopwords: std::iter::once("hello".to_string()).collect(),
            ..WhitespaceTokenizer::default()
        };
        let tokens = tokenizer.tokenize("Hello World");
        assert_eq!(tokens, ["world"]);
    }
}