use lazy_static::lazy_static;
use regex::Regex;
use std::collections::{HashMap, HashSet};

lazy_static! {
    /// Groups of interchangeable words; the first entry in each group is
    /// the canonical form that all other members map to.
    static ref SYNONYM_GROUPS: Vec<Vec<&'static str>> = vec![
        vec!["prefer", "like", "love", "enjoy", "favor"],
        vec!["theme", "mode", "style", "layout"],
        vec!["meeting", "meet", "session", "call", "sync"],
        vec!["dark", "night", "black"],
        vec!["light", "bright", "day"],
        vec!["user", "person", "people", "customer"],
        vec!["task", "todo", "job"],
        vec!["note", "memo", "reminder"],
        vec!["time", "schedule", "when", "date"],
        vec!["project", "initiative", "plan"],
        vec!["issue", "problem", "bug"],
        vec!["document", "doc", "file"],
        vec!["question", "query", "ask"],
    ];

    /// Maps every synonym to its group's canonical form (e.g. "love" -> "prefer").
    static ref CANONICAL_MAP: HashMap<&'static str, &'static str> = {
        let mut map = HashMap::new();
        for grp in SYNONYM_GROUPS.iter() {
            let canonical = grp[0];
            for &word in grp {
                map.insert(word, canonical);
            }
        }
        map
    };

    /// Maps each canonical form to the full set of words in its group.
    static ref SYNONYM_LOOKUP: HashMap<&'static str, HashSet<&'static str>> = {
        let mut map = HashMap::new();
        for grp in SYNONYM_GROUPS.iter() {
            let canonical = grp[0];
            let set: HashSet<&'static str> = grp.iter().copied().collect();
            map.insert(canonical, set);
        }
        map
    };

    static ref TOKEN_PATTERN: Regex = Regex::new(r"[a-zA-Z0-9]+").unwrap();

    /// Naive suffix-stripping rules, tried in order; the first rule whose
    /// result keeps at least three characters wins, so "ies$" must precede "s$".
    static ref STEM_RULES: Vec<(Regex, &'static str)> = vec![
        (Regex::new(r"ies$").unwrap(), "y"),
        (Regex::new(r"ing$").unwrap(), ""),
        (Regex::new(r"ers?$").unwrap(), "er"),
        (Regex::new(r"ed$").unwrap(), ""),
        (Regex::new(r"s$").unwrap(), ""),
    ];
}

/// Splits `text` into lowercase alphanumeric tokens.
pub fn tokenize(text: &str) -> Vec<String> {
    TOKEN_PATTERN
        .find_iter(text)
        .map(|m| m.as_str().to_lowercase())
        .collect()
}

/// Applies the first matching stem rule whose result keeps at least three
/// characters; tokens of three characters or fewer pass through unchanged.
fn stem(tok: &str) -> String {
    if tok.len() <= 3 {
        return tok.to_string();
    }
    for (pat, rep) in STEM_RULES.iter() {
        if pat.is_match(tok) {
            let stemmed = pat.replace(tok, *rep).to_string();
            if stemmed.len() >= 3 {
                return stemmed;
            }
        }
    }
    tok.to_string()
}

/// Lowercases a token and maps it to its canonical synonym, trying the raw
/// form first and the stemmed form second; unknown tokens come back stemmed.
pub fn canonicalize_token(tok: &str) -> String {
    if tok.is_empty() {
        return String::new();
    }
    let low = tok.to_lowercase();
    if let Some(&canonical) = CANONICAL_MAP.get(low.as_str()) {
        return canonical.to_string();
    }
    let stemmed = stem(&low);
    if let Some(&canonical) = CANONICAL_MAP.get(stemmed.as_str()) {
        return canonical.to_string();
    }
    stemmed
}

pub fn canonical_tokens_from_text(text: &str) -> Vec<String> {
    tokenize(text)
        .into_iter()
        .map(|tok| canonicalize_token(&tok))
        .filter(|tok| tok.len() > 1)
        .collect()
}

/// Returns the full synonym group for `tok`, or a singleton set containing
/// its canonical form when the token belongs to no group.
pub fn synonyms_for(tok: &str) -> HashSet<String> {
    let canonical = canonicalize_token(tok);
    if let Some(syns) = SYNONYM_LOOKUP.get(canonical.as_str()) {
        syns.iter().map(|&s| s.to_string()).collect()
    } else {
        let mut set = HashSet::new();
        set.insert(canonical);
        set
    }
}

/// Builds a deduplicated, sorted, space-joined search document from `text`,
/// expanding each canonical token with its full synonym group.
pub fn build_search_doc(text: &str) -> String {
    let canonical = canonical_tokens_from_text(text);
    let mut expanded = HashSet::new();
    for tok in canonical {
        expanded.insert(tok.clone());
        if let Some(syns) = SYNONYM_LOOKUP.get(tok.as_str()) {
            for syn in syns {
                expanded.insert(syn.to_string());
            }
        }
    }
    let mut result: Vec<_> = expanded.into_iter().collect();
    result.sort();
    result.join(" ")
}

/// Builds an OR-joined full-text-search query of quoted canonical terms,
/// e.g. `"dark" OR "prefer" OR "theme"`; returns an empty string when no
/// usable tokens remain.
pub fn build_fts_query(text: &str) -> String {
    // canonical_tokens_from_text already drops single-character tokens,
    // so only deduplication is needed here.
    let unique: HashSet<_> = canonical_tokens_from_text(text).into_iter().collect();
    if unique.is_empty() {
        return String::new();
    }
    let mut terms: Vec<_> = unique.into_iter().collect();
    terms.sort();
    terms
        .iter()
        .map(|t| format!("\"{}\"", t))
        .collect::<Vec<_>>()
        .join(" OR ")
}

pub fn canonical_token_set(text: &str) -> HashSet<String> {
    canonical_tokens_from_text(text).into_iter().collect()
}

/// Canonicalizes each input token and expands it with every word in its
/// synonym group, so queries can match any surface form.
pub fn add_synonym_tokens<'a, I>(tokens: I) -> HashSet<String>
where
    I: IntoIterator<Item = &'a str>,
{
    let mut result = HashSet::new();
    for tok in tokens {
        let canonical = canonicalize_token(tok);
        result.insert(canonical.clone());
        if let Some(syns) = SYNONYM_LOOKUP.get(canonical.as_str()) {
            for syn in syns {
                // Insert the synonym itself; canonicalizing it here would map
                // every group member back to `canonical`, making the whole
                // expansion a no-op.
                result.insert(syn.to_string());
            }
        }
    }
    result
}

/// Lowercases `text`, strips non-alphanumeric characters, and collapses
/// runs of whitespace to single spaces.
pub fn normalize(text: &str) -> String {
    text.to_lowercase()
        .chars()
        .filter(|c| c.is_alphanumeric() || c.is_whitespace())
        .collect::<String>()
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
}

/// Jaccard similarity over canonical token sets: |A ∩ B| / |A ∪ B|.
/// Two empty texts are considered identical (1.0).
pub fn token_overlap(text1: &str, text2: &str) -> f64 {
    let set1 = canonical_token_set(text1);
    let set2 = canonical_token_set(text2);
    if set1.is_empty() && set2.is_empty() {
        return 1.0;
    }
    // At least one set is non-empty past this point, so the union is never zero.
    let intersection = set1.intersection(&set2).count();
    let union = set1.union(&set2).count();
    intersection as f64 / union as f64
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        let tokens = tokenize("Hello World! Testing 123.");
        assert_eq!(tokens, vec!["hello", "world", "testing", "123"]);
    }

    #[test]
    fn test_tokenize_empty() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_stem() {
        // The naive "ing$" rule strips only the suffix, so "running" becomes
        // "runn" rather than "run".
        assert_eq!(stem("running"), "runn");
        assert_eq!(stem("tries"), "try");
        assert_eq!(stem("jumped"), "jump");
        assert_eq!(stem("cats"), "cat");
        assert_eq!(stem("go"), "go");
    }

    #[test]
    fn test_canonicalize_token() {
        assert_eq!(canonicalize_token("love"), "prefer");
        assert_eq!(canonicalize_token("enjoy"), "prefer");
        assert_eq!(canonicalize_token("hello"), "hello");
        assert_eq!(canonicalize_token("meetings"), "meeting");
    }

    #[test]
    fn test_synonyms_for() {
        let syns = synonyms_for("love");
        assert!(syns.contains("prefer"));
        assert!(syns.contains("love"));
        assert!(syns.contains("like"));
    }
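
    // Added coverage: a token outside every synonym group should come back
    // as a singleton set holding its (unchanged) canonical form.
    #[test]
    fn test_synonyms_for_unknown() {
        let syns = synonyms_for("zebra");
        assert_eq!(syns.len(), 1);
        assert!(syns.contains("zebra"));
    }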

    #[test]
    fn test_canonical_token_set() {
        let set = canonical_token_set("I love dark themes");
        assert!(set.contains("prefer"));
        assert!(set.contains("dark"));
        assert!(set.contains("theme"));
    }

    #[test]
    fn test_build_search_doc() {
        let doc = build_search_doc("I like dark mode");
        assert!(doc.contains("prefer"));
        assert!(doc.contains("dark"));
        assert!(doc.contains("theme"));
    }
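
    // Added coverage for build_fts_query: per the synonym table, "love"
    // canonicalizes to "prefer" and "mode" to "theme", "i" is dropped as a
    // single-character token, and the terms are sorted and OR-joined.
    #[test]
    fn test_build_fts_query() {
        let query = build_fts_query("I love dark mode");
        assert_eq!(query, "\"dark\" OR \"prefer\" OR \"theme\"");
        assert_eq!(build_fts_query(""), "");
    }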

    #[test]
    fn test_token_overlap() {
        let overlap = token_overlap("hello world", "hello there");
        assert!(overlap > 0.0 && overlap < 1.0);
        let same = token_overlap("hello world", "hello world");
        assert!((same - 1.0).abs() < 1e-6);
        let different = token_overlap("hello", "world");
        assert!(different < 1e-6);
    }
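
    // Added coverage for add_synonym_tokens: the expansion should include
    // every surface form in the synonym group, not only the canonical token.
    #[test]
    fn test_add_synonym_tokens() {
        let expanded = add_synonym_tokens(["love"]);
        assert!(expanded.contains("prefer"));
        assert!(expanded.contains("like"));
        assert!(expanded.contains("favor"));
    }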

    #[test]
    fn test_normalize() {
        assert_eq!(normalize("Hello World!"), "hello world");
        assert_eq!(normalize("Test123"), "test123");
    }
}