use crate::common::{DefaultFilter, StringTokenizer};
use crate::index::text::Languages;
use std::ops::Deref;
use std::sync::Arc;
/// Backend that turns raw text into a list of terms.
///
/// Implementors supply [`supported_languages`](TokenizerProvider::supported_languages)
/// and [`stop_words`](TokenizerProvider::stop_words); the provided
/// [`tokenize`](TokenizerProvider::tokenize) method builds on the latter.
/// The `Send + Sync` supertraits allow a provider to be shared across threads.
pub trait TokenizerProvider: Send + Sync {
    /// Returns the language value this provider reports as supported.
    fn supported_languages(&self) -> Languages;

    /// Tokenizes `text` and returns the surviving terms as owned strings.
    ///
    /// Tokens are produced by `StringTokenizer` with `DefaultFilter`; any
    /// token whose term is equal to an entry of
    /// [`stop_words`](TokenizerProvider::stop_words) is dropped. Token order
    /// is preserved.
    fn tokenize(&self, text: &str) -> Vec<String> {
        // `stop_words()` returns an owned `Vec`, so fetch it once up front
        // instead of rebuilding it for every token inside the filter.
        let stop_words = self.stop_words();
        StringTokenizer::new(DefaultFilter, text)
            .filter(|token| !stop_words.contains(&token.term()))
            .map(|token| token.term().to_string())
            .collect()
    }

    /// Returns the terms that [`tokenize`](TokenizerProvider::tokenize)
    /// removes from its output.
    fn stop_words(&self) -> Vec<&'static str>;
}
/// Shared handle to a [`TokenizerProvider`] implementation.
///
/// The provider is held behind an `Arc`, so cloning a `Tokenizer` clones the
/// pointer rather than the provider itself.
#[derive(Clone)]
pub struct Tokenizer {
    /// The type-erased provider this handle points to.
    inner: Arc<dyn TokenizerProvider>,
}
impl Tokenizer {
    /// Wraps `inner` in an `Arc` and returns a handle to it.
    ///
    /// The provider must be `'static` because it is stored as a trait object.
    pub fn new<T>(inner: T) -> Self
    where
        T: TokenizerProvider + 'static,
    {
        let inner: Arc<dyn TokenizerProvider> = Arc::new(inner);
        Self { inner }
    }
}
/// Exposes the wrapped `Arc<dyn TokenizerProvider>`, so provider methods can
/// be called directly on a `Tokenizer` through auto-deref.
impl Deref for Tokenizer {
    type Target = Arc<dyn TokenizerProvider>;

    /// Borrows the shared provider pointer.
    #[inline]
    fn deref(&self) -> &Self::Target {
        let Self { inner } = self;
        inner
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::index::text::Languages;

    /// Provider stub: reports English and a three-word stop list, and relies
    /// on the trait's default `tokenize`.
    struct MockTokenizer;

    impl TokenizerProvider for MockTokenizer {
        fn supported_languages(&self) -> Languages {
            Languages::English
        }

        fn stop_words(&self) -> Vec<&'static str> {
            vec!["a", "an", "the"]
        }
    }

    /// Builds a `Tokenizer` backed by the stub provider.
    fn mock_tokenizer() -> Tokenizer {
        Tokenizer::new(MockTokenizer)
    }

    #[test]
    fn test_tokenizer_supported_languages() {
        let languages = mock_tokenizer().supported_languages();
        assert_eq!(languages, Languages::English);
    }

    #[test]
    fn test_tokenizer_tokenize() {
        // The stop word "a" must be removed from the output.
        let terms = mock_tokenizer().tokenize("This is a test.");
        assert_eq!(terms, vec!["This", "is", "test"]);
    }

    #[test]
    fn test_tokenizer_stop_words() {
        let words = mock_tokenizer().stop_words();
        assert_eq!(words, vec!["a", "an", "the"]);
    }

    #[test]
    fn test_tokenizer_empty_text() {
        let terms = mock_tokenizer().tokenize("");
        assert!(terms.is_empty());
    }

    #[test]
    fn test_tokenizer_whitespace_text() {
        let terms = mock_tokenizer().tokenize(" ");
        assert!(terms.is_empty());
    }

    #[test]
    fn test_tokenizer_special_characters() {
        let terms = mock_tokenizer().tokenize("!@#$%^&*()");
        assert_eq!(terms, vec!["$"]);
    }
}