ruve-db 0.1.0

A hybrid vector and full-text search database with HNSW approximate nearest-neighbour indexing and BM25
Documentation
/// Splits raw text into normalized search terms.
///
/// A tokenizer owns a list of stop words; any token equal to one of them is
/// dropped from the output of [`Tokenizer::tokenize`].
#[derive(Debug, Clone)]
pub struct Tokenizer {
    /// Words removed from tokenized output. Compared against tokens *after*
    /// lowercasing, so entries are expected to be lowercase.
    stop_words: Vec<String>,
}

impl Tokenizer {
    /// Built-in stop words installed by [`Tokenizer::new`].
    // TODO: load from config
    const DEFAULT_STOP_WORDS: [&'static str; 10] = [
        "the", "is", "in", "and", "to", "a", "of", "that", "it", "with",
    ];

    /// Creates a tokenizer that uses the built-in stop-word list.
    pub fn new() -> Self {
        Tokenizer {
            stop_words: Self::DEFAULT_STOP_WORDS
                .iter()
                .map(|&word| word.to_owned())
                .collect(),
        }
    }

    /// Breaks `text` into lowercase tokens with stop words removed.
    ///
    /// Processing steps, in order:
    /// 1. Split on every whitespace character and every ASCII punctuation
    ///    character (non-ASCII punctuation is *not* a separator and stays
    ///    inside the token).
    /// 2. Discard the empty pieces produced by adjacent separators.
    /// 3. Lowercase each piece.
    /// 4. Discard pieces that exactly match a stop word.
    ///
    /// Returns the surviving tokens in their original order; the result is
    /// empty when `text` contains no tokens or only stop words.
    pub fn tokenize(&self, text: &str) -> Vec<String> {
        text.split(|c: char| c.is_whitespace() || c.is_ascii_punctuation())
            .filter(|token| !token.is_empty())
            .map(str::to_lowercase)
            // Stop words are checked after lowercasing so "The" and "the"
            // are filtered alike.
            .filter(|token| !self.stop_words.contains(token))
            .collect()
    }
}

impl Default for Tokenizer {
    /// Equivalent to [`Tokenizer::new`].
    fn default() -> Self {
        Self::new()
    }
}