// ruve-db 0.1.1 - Docs.rs

/// Splits text into lowercase tokens and discards a fixed set of stop words.
///
/// The stop-word list is populated once, when the tokenizer is constructed.
#[derive(Debug, Clone)]
pub struct Tokenizer {
    /// Lowercase words that [`Tokenizer::tokenize`] removes from its output.
    /// Stored in a hash set so the per-token membership check does not scan
    /// the whole list.
    stop_words: std::collections::HashSet<String>,
}

impl Tokenizer {
    /// Built-in stop words. They are lowercase because tokens are lowercased
    /// before being compared against this list.
    const DEFAULT_STOP_WORDS: [&'static str; 10] = [
        "the", "is", "in", "and", "to", "a", "of", "that", "it", "with",
    ];

    /// Creates a tokenizer that uses the built-in stop-word list.
    pub fn new() -> Self {
        Tokenizer {
            // TODO: load the stop-word list from configuration instead of
            // hard-coding it.
            stop_words: Self::DEFAULT_STOP_WORDS
                .iter()
                .map(|&word| String::from(word))
                .collect(),
        }
    }

    /// Breaks `text` into lowercase tokens, dropping stop words.
    ///
    /// The input is split on every whitespace character and every ASCII
    /// punctuation character; the delimiters themselves are discarded. Runs
    /// of adjacent delimiters do not produce empty tokens. Each remaining
    /// piece is lowercased, and pieces that match a stop word after
    /// lowercasing are removed.
    ///
    /// Non-ASCII punctuation is not treated as a delimiter and stays inside
    /// the surrounding token.
    ///
    /// Tokens are returned in the order they appear in `text`; repeated
    /// tokens are kept.
    pub fn tokenize(&self, text: &str) -> Vec<String> {
        text.split(|c: char| c.is_whitespace() || c.is_ascii_punctuation())
            .filter(|token| !token.is_empty())
            // Lowercase before the stop-word check so that "The" and "the"
            // are filtered alike.
            .map(str::to_lowercase)
            .filter(|token| !self.stop_words.contains(token.as_str()))
            .collect()
    }
}

impl Default for Tokenizer {
    /// Equivalent to [`Tokenizer::new`].
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `text` through a freshly constructed tokenizer.
    fn run(text: &str) -> Vec<String> {
        Tokenizer::new().tokenize(text)
    }

    /// Reports whether `tokens` holds an entry equal to `word`.
    fn has(tokens: &[String], word: &str) -> bool {
        tokens.iter().any(|tok| tok == word)
    }

    #[test]
    fn splits_on_whitespace() {
        let produced = run("hello world");
        assert_eq!(produced, ["hello", "world"]);
    }

    #[test]
    fn splits_on_punctuation() {
        let produced = run("hello, world!");
        assert!(has(&produced, "hello"));
        assert!(has(&produced, "world"));
    }

    #[test]
    fn lowercases_tokens() {
        let produced = run("Hello WORLD");
        assert_eq!(produced, ["hello", "world"]);
    }

    #[test]
    fn filters_stop_words() {
        let produced = run("the quick brown fox");
        // The stop word must be gone while every other word survives.
        assert!(!has(&produced, "the"));
        for kept in ["quick", "brown", "fox"].iter() {
            assert!(has(&produced, kept));
        }
    }

    #[test]
    fn stop_words_are_case_insensitive() {
        // "The" is lowercased to "the" first, so the stop-word filter
        // still removes it.
        let produced = run("The dog");
        assert!(!has(&produced, "the"));
    }

    #[test]
    fn empty_string_returns_empty() {
        assert_eq!(run(""), Vec::<String>::new());
    }

    #[test]
    fn only_stop_words_returns_empty() {
        assert_eq!(run("the and to a"), Vec::<String>::new());
    }

    #[test]
    fn consecutive_delimiters_produce_no_empty_tokens() {
        let produced = run("hello   world");
        assert_eq!(produced, ["hello", "world"]);
    }
}