use alloc::string::{String, ToString};
use alloc::vec;
use alloc::vec::Vec;
use super::types::Token;
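// English stop words skipped during indexing when `remove_stop_words`
// is set. Keep this list sorted: `is_stop_word` binary-searches it.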
const STOP_WORDS: &[&str] = &[
"a", "after", "all", "also", "an", "and", "are", "as", "at", "be", "been", "before", "but",
"by", "can", "did", "do", "does", "down", "for", "from", "had", "has", "have", "he", "how",
"if", "in", "into", "is", "it", "its", "just", "more", "no", "not", "of", "on", "only", "or",
"other", "out", "over", "so", "some", "such", "than", "that", "the", "their", "then", "there",
"they", "this", "to", "up", "was", "were", "what", "when", "which", "who", "will", "with",
"would", "you", "your",
];
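/// Splits text on non-alphanumeric boundaries, optionally lowercasing
/// terms, removing stop words, and enforcing per-term length bounds.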
pub struct Tokenizer {
lowercase: bool,
remove_stop_words: bool,
min_length: usize,
max_length: usize,
}
impl Default for Tokenizer {
fn default() -> Self {
Self::new()
}
}
impl Tokenizer {
pub fn new() -> Self {
Self {
lowercase: true,
remove_stop_words: true,
min_length: 2,
max_length: 64,
}
}
pub fn with_lowercase(mut self, lowercase: bool) -> Self {
self.lowercase = lowercase;
self
}
pub fn with_stop_words(mut self, remove: bool) -> Self {
self.remove_stop_words = remove;
self
}
pub fn with_min_length(mut self, min: usize) -> Self {
self.min_length = min;
self
}
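    /// Sets the maximum term length in bytes; mirrors `with_min_length`.
    pub fn with_max_length(mut self, max: usize) -> Self {
        self.max_length = max;
        self
    }
    /// Splits `content` on non-alphanumeric characters and returns the
    /// surviving tokens with their positions and byte offsets. Invalid
    /// UTF-8 is replaced lossily, so offsets refer to the decoded text.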
pub fn tokenize(&self, content: &[u8]) -> Vec<Token> {
let mut tokens = Vec::new();
let mut position = 0u32;
let mut current_word = String::new();
let mut word_start = 0usize;
        // Decode bytes as UTF-8, replacing invalid sequences with U+FFFD.
        let text = String::from_utf8_lossy(content);
for (i, ch) in text.char_indices() {
if ch.is_alphanumeric() {
if current_word.is_empty() {
word_start = i;
}
current_word.push(ch);
} else if !current_word.is_empty() {
                if let Some(token) = self.process_word(&current_word, position, word_start) {
tokens.push(token);
position += 1;
}
current_word.clear();
}
}
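        // Flush a final word that ran to the end of the input.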
if !current_word.is_empty() {
            if let Some(token) = self.process_word(&current_word, position, word_start) {
tokens.push(token);
}
}
tokens
}
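    /// Applies length bounds (measured in bytes, before lowercasing),
    /// normalization, and the stop-word filter to a single word.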
fn process_word(&self, word: &str, position: u32, byte_offset: usize) -> Option<Token> {
if word.len() < self.min_length || word.len() > self.max_length {
return None;
}
let normalized = if self.lowercase {
word.to_lowercase()
} else {
word.to_string()
};
if self.remove_stop_words && is_stop_word(&normalized) {
return None;
}
Some(Token {
term: normalized,
position,
byte_offset,
})
}
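    /// Tokenizes a query string into normalized terms, applying the same
    /// lowercasing and length bounds as `tokenize`. Note that stop words
    /// are not filtered here, unlike `tokenize`.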
pub fn tokenize_query(&self, query: &str) -> Vec<String> {
let mut terms = Vec::new();
let mut current_word = String::new();
for ch in query.chars() {
if ch.is_alphanumeric() {
current_word.push(ch);
} else if !current_word.is_empty() {
let term = if self.lowercase {
current_word.to_lowercase()
} else {
current_word.clone()
};
if term.len() >= self.min_length && term.len() <= self.max_length {
terms.push(term);
}
current_word.clear();
}
}
if !current_word.is_empty() {
let term = if self.lowercase {
current_word.to_lowercase()
} else {
current_word
};
if term.len() >= self.min_length && term.len() <= self.max_length {
terms.push(term);
}
}
terms
}
}
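// Relies on STOP_WORDS being sorted (see the const above).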
fn is_stop_word(word: &str) -> bool {
STOP_WORDS.binary_search(&word).is_ok()
}
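/// Computes the Levenshtein edit distance between `s1` and `s2` using the
/// two-row dynamic-programming scheme: O(len1 * len2) time, O(len2) space.
/// Distances are measured over Unicode scalar values, not bytes.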
pub fn levenshtein_distance(s1: &str, s2: &str) -> usize {
let len1 = s1.chars().count();
let len2 = s2.chars().count();
if len1 == 0 {
return len2;
}
if len2 == 0 {
return len1;
}
let mut prev_row: Vec<usize> = (0..=len2).collect();
let mut curr_row = vec![0; len2 + 1];
for (i, c1) in s1.chars().enumerate() {
curr_row[0] = i + 1;
for (j, c2) in s2.chars().enumerate() {
let cost = if c1 == c2 { 0 } else { 1 };
            curr_row[j + 1] = (prev_row[j + 1] + 1)
                .min(curr_row[j] + 1)
                .min(prev_row[j] + cost);
        }
core::mem::swap(&mut prev_row, &mut curr_row);
}
prev_row[len2]
}
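/// Returns the candidates whose edit distance from `term` is at most
/// `max_distance`, in iteration order.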
pub fn fuzzy_match(
term: &str,
candidates: impl Iterator<Item = impl AsRef<str>>,
max_distance: usize,
) -> Vec<String> {
candidates
.filter_map(|candidate| {
let candidate = candidate.as_ref();
let distance = levenshtein_distance(term, candidate);
if distance <= max_distance {
Some(candidate.to_string())
} else {
None
}
})
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_tokenize_simple() {
let tokenizer = Tokenizer::new();
let tokens = tokenizer.tokenize(b"Hello World");
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].term, "hello");
assert_eq!(tokens[1].term, "world");
}
#[test]
fn test_stop_words() {
let tokenizer = Tokenizer::new();
let tokens = tokenizer.tokenize(b"the quick brown fox");
assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0].term, "quick");
}
#[test]
fn test_levenshtein() {
assert_eq!(levenshtein_distance("hello", "hello"), 0);
assert_eq!(levenshtein_distance("hello", "hallo"), 1);
assert_eq!(levenshtein_distance("hello", "world"), 4);
assert_eq!(levenshtein_distance("", "hello"), 5);
}
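    // Illustrative checks for the query path and fuzzy matching.
    #[test]
    fn test_tokenize_query() {
        let tokenizer = Tokenizer::new();
        let terms = tokenizer.tokenize_query("Hello, World!");
        assert_eq!(terms, vec!["hello", "world"]);
    }
    #[test]
    fn test_fuzzy_match() {
        let candidates = ["hello", "hallo", "world"];
        let matches = fuzzy_match("hello", candidates.iter().copied(), 1);
        assert_eq!(matches, vec!["hello", "hallo"]);
    }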
}