use std::collections::HashSet;
/// Common interface for turning a piece of text into a list of owned tokens.
///
/// Implementors are required to be `Send + Sync`.
pub trait Tokenizer: Send + Sync {
/// Splits `text` into tokens and returns them as owned strings.
fn tokenize(&self, text: &str) -> Vec<String>;
/// Returns a static name identifying the tokenizer.
///
/// The default implementation returns `"Tokenizer"`; implementors may override it.
fn name(&self) -> &'static str {
"Tokenizer"
}
}
/// Tokenizer that splits text on whitespace and, optionally, on ASCII punctuation.
#[derive(Debug, Clone)]
pub struct SimpleTokenizer {
/// When `true`, the input is lowercased before it is split.
pub lowercase: bool,
/// When `true`, ASCII punctuation characters act as separators in addition
/// to whitespace; when `false`, only whitespace separates tokens.
pub remove_punctuation: bool,
}
impl SimpleTokenizer {
pub fn new() -> Self {
Self {
lowercase: true,
remove_punctuation: true,
}
}
pub fn with_case_preserved() -> Self {
Self {
lowercase: false,
remove_punctuation: true,
}
}
}
impl Default for SimpleTokenizer {
fn default() -> Self {
Self::new()
}
}
impl Tokenizer for SimpleTokenizer {
    /// Splits `text` into tokens according to the configured flags.
    ///
    /// * `lowercase` — the whole input is lowercased with `str::to_lowercase`
    ///   before it is split.
    /// * `remove_punctuation` — whitespace and ASCII punctuation both act as
    ///   separators and empty pieces are dropped; otherwise only whitespace
    ///   separates tokens and punctuation stays attached to them.
    ///
    /// Only ASCII punctuation is recognised as a separator. Empty input
    /// yields an empty vector.
    fn tokenize(&self, text: &str) -> Vec<String> {
        // Allocate a lowercased copy only when it is needed; the
        // case-preserving path splits the caller's string directly.
        let lowered: String;
        let source: &str = if self.lowercase {
            lowered = text.to_lowercase();
            lowered.as_str()
        } else {
            text
        };

        if self.remove_punctuation {
            source
                .split(|c: char| c.is_whitespace() || c.is_ascii_punctuation())
                // Adjacent separators produce empty pieces; drop them.
                .filter(|piece| !piece.is_empty())
                .map(String::from)
                .collect()
        } else {
            // `split_whitespace` never yields empty pieces, so no filter is needed.
            source.split_whitespace().map(String::from).collect()
        }
    }

    /// Returns `"SimpleTokenizer"`.
    fn name(&self) -> &'static str {
        "SimpleTokenizer"
    }
}
/// Tokenizer that splits text on whitespace only, leaving punctuation attached to tokens.
#[derive(Debug, Clone)]
pub struct WhitespaceTokenizer {
/// When `true`, the input is lowercased before it is split.
pub lowercase: bool,
}
impl WhitespaceTokenizer {
pub fn new() -> Self {
Self { lowercase: true }
}
}
impl Default for WhitespaceTokenizer {
fn default() -> Self {
Self::new()
}
}
impl Tokenizer for WhitespaceTokenizer {
    /// Splits `text` on whitespace and returns the pieces as owned tokens.
    ///
    /// When `lowercase` is set, the whole input is lowercased with
    /// `str::to_lowercase` first. Punctuation is never removed. Empty or
    /// whitespace-only input yields an empty vector.
    fn tokenize(&self, text: &str) -> Vec<String> {
        if self.lowercase {
            let lowered = text.to_lowercase();
            lowered.split_whitespace().map(String::from).collect()
        } else {
            // No case change requested: split the caller's string directly
            // instead of copying it first.
            text.split_whitespace().map(String::from).collect()
        }
    }

    /// Returns `"WhitespaceTokenizer"`.
    fn name(&self) -> &'static str {
        "WhitespaceTokenizer"
    }
}
/// Tokenizer that splits text like a simple word tokenizer and then drops
/// every token found in a stopword set.
#[derive(Debug, Clone)]
pub struct LanguageTokenizer {
// When `true`, the input is lowercased before it is split.
lowercase: bool,
// When `true`, ASCII punctuation acts as a separator in addition to whitespace.
remove_punctuation: bool,
// Tokens to drop from the output; membership is an exact string match
// against each token after any lowercasing.
stopwords: HashSet<String>,
}
impl LanguageTokenizer {
    /// Builds a tokenizer that lowercases input, splits on whitespace and
    /// ASCII punctuation, and drops every token found in `stopwords`.
    ///
    /// Because tokens are always lowercased before the stopword lookup, the
    /// supplied stopwords are lowercased here as well; otherwise an entry
    /// such as `"The"` could never match any token.
    pub fn new(stopwords: HashSet<String>) -> Self {
        let stopwords = stopwords
            .into_iter()
            .map(|word| word.to_lowercase())
            .collect();
        Self {
            lowercase: true,
            remove_punctuation: true,
            stopwords,
        }
    }

    /// Builds a tokenizer using the stopword set returned by `english_stopwords`.
    pub fn english() -> Self {
        Self::new(english_stopwords())
    }

    /// Builds a tokenizer with an empty stopword set, so no token is filtered out.
    pub fn no_stopwords() -> Self {
        Self::new(HashSet::new())
    }
}
impl Tokenizer for LanguageTokenizer {
    /// Splits `text` into tokens and removes every token present in the
    /// stopword set.
    ///
    /// The input is lowercased first when `lowercase` is set. With
    /// `remove_punctuation`, whitespace and ASCII punctuation both separate
    /// tokens; otherwise only whitespace does. The stopword lookup is an
    /// exact string match against each resulting token.
    fn tokenize(&self, text: &str) -> Vec<String> {
        // Allocate a lowercased copy only when it is needed.
        let lowered: String;
        let source: &str = if self.lowercase {
            lowered = text.to_lowercase();
            lowered.as_str()
        } else {
            text
        };

        // Stopwords are rejected while the pieces are still borrowed slices,
        // so only the surviving tokens are turned into owned strings.
        if self.remove_punctuation {
            source
                .split(|c: char| c.is_whitespace() || c.is_ascii_punctuation())
                .filter(|piece| !piece.is_empty() && !self.stopwords.contains(*piece))
                .map(String::from)
                .collect()
        } else {
            source
                .split_whitespace()
                .filter(|word| !self.stopwords.contains(*word))
                .map(String::from)
                .collect()
        }
    }

    /// Returns `"LanguageTokenizer"`.
    fn name(&self) -> &'static str {
        "LanguageTokenizer"
    }
}
/// Returns the built-in set of lowercase English stopwords.
///
/// The list covers articles, conjunctions, prepositions, pronouns,
/// auxiliary and modal verbs, and a handful of common adverbs and
/// question words. A fresh owned set is built on every call.
fn english_stopwords() -> HashSet<String> {
    // Kept as a constant slice so no temporary heap vector is built just to
    // be iterated once.
    const WORDS: &[&str] = &[
        "a", "an", "the", "and", "or", "but", "nor", "in", "on", "at", "to", "for", "of", "with",
        "from", "by", "about", "i", "you", "he", "she", "it", "we", "they", "me", "him", "her",
        "us", "them", "my", "your", "his", "its", "our", "their", "this", "that", "these",
        "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
        "having", "do", "does", "did", "doing", "will", "would", "shall", "should", "can",
        "could", "may", "might", "must", "not", "no", "yes", "very", "too", "so", "just", "only",
        "all", "any", "some", "more", "most", "other", "such", "what", "which", "who", "when",
        "where", "why", "how", "as", "if", "than", "then", "there", "here",
    ];
    WORDS.iter().map(|&word| word.to_owned()).collect()
}
/// Tokenizer that emits sliding-window n-grams over characters or over
/// whitespace-separated words.
#[derive(Debug, Clone)]
pub struct NGramTokenizer {
/// Window size: the number of characters or words in each n-gram.
pub n: usize,
/// `true` for character n-grams, `false` for word n-grams.
pub char_ngrams: bool,
}
impl NGramTokenizer {
pub fn char_ngrams(n: usize) -> Self {
Self {
n,
char_ngrams: true,
}
}
pub fn word_ngrams(n: usize) -> Self {
Self {
n,
char_ngrams: false,
}
}
}
impl Tokenizer for NGramTokenizer {
    /// Produces sliding-window n-grams over the lowercased `text`.
    ///
    /// * Character mode: every run of `n` consecutive characters (Unicode
    ///   scalar values, whitespace included) becomes one token.
    /// * Word mode: the text is split on whitespace and every run of `n`
    ///   consecutive words, joined by a single space, becomes one token.
    ///
    /// If the input has fewer than `n` characters/words, the whole
    /// (lowercased) input is returned as a single token. If there is nothing
    /// to tokenize — empty text, whitespace-only text in word mode, or
    /// `n == 0` — the result is an empty vector.
    fn tokenize(&self, text: &str) -> Vec<String> {
        // `slice::windows(0)` panics, and a zero-width n-gram has no meaning.
        if self.n == 0 {
            return Vec::new();
        }

        let lowered = text.to_lowercase();

        if self.char_ngrams {
            let chars: Vec<char> = lowered.chars().collect();
            if chars.is_empty() {
                // Do not emit a lone empty-string token for empty input.
                return Vec::new();
            }
            if chars.len() < self.n {
                return vec![lowered];
            }
            chars
                .windows(self.n)
                .map(|window| window.iter().collect())
                .collect()
        } else {
            let words: Vec<&str> = lowered.split_whitespace().collect();
            if words.is_empty() {
                // Do not emit a lone empty-string token when there are no words.
                return Vec::new();
            }
            if words.len() < self.n {
                return vec![words.join(" ")];
            }
            words
                .windows(self.n)
                .map(|window| window.join(" "))
                .collect()
        }
    }

    /// Returns `"CharNGramTokenizer"` in character mode and
    /// `"WordNGramTokenizer"` in word mode.
    fn name(&self) -> &'static str {
        if self.char_ngrams {
            "CharNGramTokenizer"
        } else {
            "WordNGramTokenizer"
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `tokenizer` on `input` and checks the tokens against `expected`.
    fn assert_tokens(tokenizer: &dyn Tokenizer, input: &str, expected: &[&str]) {
        assert_eq!(tokenizer.tokenize(input), expected);
    }

    // Default simple tokenizer lowercases and strips ASCII punctuation.
    #[test]
    fn test_simple_tokenizer() {
        assert_tokens(
            &SimpleTokenizer::new(),
            "Hello, world! This is a test.",
            &["hello", "world", "this", "is", "a", "test"],
        );
    }

    // The case-preserving variant leaves capitalisation untouched.
    #[test]
    fn test_simple_tokenizer_case_preserved() {
        assert_tokens(
            &SimpleTokenizer::with_case_preserved(),
            "Hello World",
            &["Hello", "World"],
        );
    }

    // Punctuation inside tokens survives whitespace-only splitting.
    #[test]
    fn test_whitespace_tokenizer() {
        assert_tokens(
            &WhitespaceTokenizer::new(),
            "hello@example.com test-data",
            &["hello@example.com", "test-data"],
        );
    }

    // The English preset removes "the".
    #[test]
    fn test_language_tokenizer_english() {
        assert_tokens(
            &LanguageTokenizer::english(),
            "The quick brown fox jumps",
            &["quick", "brown", "fox", "jumps"],
        );
    }

    // With no stopwords every (lowercased) word is kept.
    #[test]
    fn test_language_tokenizer_no_stopwords() {
        assert_tokens(
            &LanguageTokenizer::no_stopwords(),
            "The quick brown fox",
            &["the", "quick", "brown", "fox"],
        );
    }

    // Character trigrams slide one character at a time.
    #[test]
    fn test_char_ngrams() {
        assert_tokens(
            &NGramTokenizer::char_ngrams(3),
            "hello",
            &["hel", "ell", "llo"],
        );
    }

    // Word bigrams are joined with a single space.
    #[test]
    fn test_word_ngrams() {
        assert_tokens(
            &NGramTokenizer::word_ngrams(2),
            "the quick brown fox",
            &["the quick", "quick brown", "brown fox"],
        );
    }

    // Input shorter than the window comes back as one token.
    #[test]
    fn test_ngrams_short_text() {
        assert_tokens(&NGramTokenizer::char_ngrams(5), "hi", &["hi"]);
    }

    // Empty input produces no tokens.
    #[test]
    fn test_empty_text() {
        assert_tokens(&SimpleTokenizer::new(), "", &[]);
    }

    // Non-ASCII words pass through the simple tokenizer intact.
    #[test]
    fn test_unicode_text() {
        let produced = SimpleTokenizer::new().tokenize("Hello 世界 émojis 😀");
        for expected in ["hello", "世界", "émojis"].iter() {
            assert!(produced.contains(&expected.to_string()));
        }
    }
}