// agentic_memory/engine/tokenizer.rs
use std::collections::{HashMap, HashSet};
4
/// English stop words stripped from tokenizer output.
const STOP_WORDS: &[&str] = &[
    // Articles, copulas, and auxiliary/modal verbs.
    "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "shall", "can", "need", "must",
    // Prepositions and conjunctions.
    "to", "of", "in", "for", "on", "with", "at", "by", "from", "as",
    "into", "about", "but", "not", "or", "and", "if",
    // Pronouns, determiners, and quantifiers.
    "it", "its", "this", "that", "which", "who", "what", "when", "where",
    "how", "all", "each", "both", "few", "more", "most", "other", "some",
    "such", "no", "than", "too", "very", "just", "also",
];
14
/// Simple word tokenizer: lowercases input, splits on non-alphanumeric
/// characters, and filters out short tokens and common English stop words.
pub struct Tokenizer {
    // Stop-word lookup set; built once in `new()` from the STOP_WORDS slice.
    stop_words: HashSet<&'static str>,
}
19
20impl Tokenizer {
21 pub fn new() -> Self {
23 Self {
24 stop_words: STOP_WORDS.iter().copied().collect(),
25 }
26 }
27
28 pub fn tokenize(&self, text: &str) -> Vec<String> {
30 text.to_lowercase()
31 .split(|c: char| !c.is_alphanumeric())
32 .filter(|token| token.len() >= 2)
33 .filter(|token| !self.stop_words.contains(token))
34 .map(|s| s.to_string())
35 .collect()
36 }
37
38 pub fn term_frequencies(&self, text: &str) -> HashMap<String, u32> {
40 let mut freqs = HashMap::new();
41 for token in self.tokenize(text) {
42 *freqs.entry(token).or_insert(0) += 1;
43 }
44 freqs
45 }
46}
47
48impl Default for Tokenizer {
49 fn default() -> Self {
50 Self::new()
51 }
52}