Skip to main content

agentic_memory/engine/
tokenizer.rs

1//! Shared tokenizer for BM25 text search and index building.
2
3use std::collections::{HashMap, HashSet};
4
/// English stop words excluded from the token stream.
/// Kept private: callers interact only through [`Tokenizer`].
const STOP_WORDS: &[&str] = &[
    "the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "do", "does", "did", "will", "would", "could", "should", "may", "might", "shall", "can",
    "need", "must", "to", "of", "in", "for", "on", "with", "at", "by", "from", "as", "into",
    "about", "but", "not", "or", "and", "if", "it", "its", "this", "that", "which", "who", "what",
    "when", "where", "how", "all", "each", "both", "few", "more", "most", "other", "some", "such",
    "no", "than", "too", "very", "just", "also",
];

/// Deterministic tokenizer shared by BM25 scoring and index building.
///
/// Lowercases input, splits on non-alphanumeric characters, and drops
/// stop words and tokens shorter than two bytes.
pub struct Tokenizer {
    // Stop words hashed once at construction for O(1) membership tests.
    stop_words: HashSet<&'static str>,
}

impl Tokenizer {
    /// Create a new tokenizer seeded with the default stop word list.
    pub fn new() -> Self {
        let mut stop_words = HashSet::new();
        for word in STOP_WORDS {
            stop_words.insert(*word);
        }
        Self { stop_words }
    }

    /// Tokenize `text` into lowercase terms.
    ///
    /// Splits on any non-alphanumeric character; stop words and tokens
    /// shorter than 2 bytes are discarded. Returns terms in document order
    /// (duplicates preserved).
    pub fn tokenize(&self, text: &str) -> Vec<String> {
        // Lowercase once up front so the borrow outlives the split iterator.
        let lowered = text.to_lowercase();
        let mut terms = Vec::new();
        for raw in lowered.split(|c: char| !c.is_alphanumeric()) {
            // NOTE: len() is a byte count, so a single multi-byte char
            // (e.g. 'é') passes the >= 2 filter — matches original behavior.
            if raw.len() >= 2 && !self.stop_words.contains(raw) {
                terms.push(raw.to_string());
            }
        }
        terms
    }

    /// Tokenize `text` and count occurrences of each distinct term.
    pub fn term_frequencies(&self, text: &str) -> HashMap<String, u32> {
        self.tokenize(text)
            .into_iter()
            .fold(HashMap::new(), |mut counts, term| {
                // or_default() starts new terms at 0 before the increment.
                *counts.entry(term).or_default() += 1;
                counts
            })
    }
}
47
/// `Default` delegates to [`Tokenizer::new`], so `Tokenizer::default()`
/// produces the standard stop-word configuration.
impl Default for Tokenizer {
    fn default() -> Self {
        Self::new()
    }
}