//! Text processing utilities
//!
//! This module provides text tokenization, normalization, stemming,
//! and synonym expansion functionality.

use lazy_static::lazy_static;
use regex::Regex;
use std::collections::{HashMap, HashSet};

lazy_static! {
    /// Synonym groups - first word is canonical form
    static ref SYNONYM_GROUPS: Vec<Vec<&'static str>> = vec![
        vec!["prefer", "like", "love", "enjoy", "favor"],
        vec!["theme", "mode", "style", "layout"],
        vec!["meeting", "meet", "session", "call", "sync"],
        vec!["dark", "night", "black"],
        vec!["light", "bright", "day"],
        vec!["user", "person", "people", "customer"],
        vec!["task", "todo", "job"],
        vec!["note", "memo", "reminder"],
        vec!["time", "schedule", "when", "date"],
        vec!["project", "initiative", "plan"],
        vec!["issue", "problem", "bug"],
        vec!["document", "doc", "file"],
        vec!["question", "query", "ask"],
    ];

    /// Map from any word to its canonical form
    static ref CANONICAL_MAP: HashMap<&'static str, &'static str> = {
        let mut map = HashMap::new();
        for grp in SYNONYM_GROUPS.iter() {
            let canonical = grp[0];
            for &word in grp {
                map.insert(word, canonical);
            }
        }
        map
    };

    /// Map from canonical form to all synonyms
    static ref SYNONYM_LOOKUP: HashMap<&'static str, HashSet<&'static str>> = {
        let mut map = HashMap::new();
        for grp in SYNONYM_GROUPS.iter() {
            let canonical = grp[0];
            let set: HashSet<&'static str> = grp.iter().copied().collect();
            map.insert(canonical, set);
        }
        map
    };

    /// Token pattern for extraction
    static ref TOKEN_PATTERN: Regex = Regex::new(r"[a-zA-Z0-9]+").unwrap();

    /// Stemming rules (pattern, replacement)
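    ///
    /// Rules are tried in order: `ies$` must precede the bare `s$` rule
    /// so that "tries" stems to "try" rather than "trie".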
    static ref STEM_RULES: Vec<(Regex, &'static str)> = vec![
        (Regex::new(r"ies$").unwrap(), "y"),
        (Regex::new(r"ing$").unwrap(), ""),
        (Regex::new(r"ers?$").unwrap(), "er"),
        (Regex::new(r"ed$").unwrap(), ""),
        (Regex::new(r"s$").unwrap(), ""),
    ];
}

/// Tokenize text into lowercase words
///
/// Extracts all alphanumeric sequences and converts to lowercase.
///
/// # Example
/// ```
/// use openmemory::utils::text::tokenize;
///
/// let tokens = tokenize("Hello World! Testing 123.");
/// assert_eq!(tokens, vec!["hello", "world", "testing", "123"]);
/// ```
pub fn tokenize(text: &str) -> Vec<String> {
    TOKEN_PATTERN
        .find_iter(text)
        .map(|m| m.as_str().to_lowercase())
        .collect()
}

/// Apply simple stemming rules to a token
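///
/// Rules are tried in order; the first rewrite that leaves at least three
/// characters wins (e.g. "tries" -> "try", "jumped" -> "jump"). Tokens of
/// three characters or fewer are returned unchanged.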
fn stem(tok: &str) -> String {
    if tok.len() <= 3 {
        return tok.to_string();
    }

    for (pat, rep) in STEM_RULES.iter() {
        if pat.is_match(tok) {
            let stemmed = pat.replace(tok, *rep).to_string();
            if stemmed.len() >= 3 {
                return stemmed;
            }
        }
    }

    tok.to_string()
}

/// Canonicalize a token by applying stemming and synonym mapping
///
/// # Example
/// ```
/// use openmemory::utils::text::canonicalize_token;
///
/// assert_eq!(canonicalize_token("love"), "prefer");  // synonym mapping
/// assert_eq!(canonicalize_token("meetings"), "meeting"); // stem only
/// ```
pub fn canonicalize_token(tok: &str) -> String {
    if tok.is_empty() {
        return String::new();
    }

    let low = tok.to_lowercase();

    // Check if already in canonical map
    if let Some(&canonical) = CANONICAL_MAP.get(low.as_str()) {
        return canonical.to_string();
    }

    // Apply stemming
    let stemmed = stem(&low);

    // Check stemmed form in canonical map
    if let Some(&canonical) = CANONICAL_MAP.get(stemmed.as_str()) {
        return canonical.to_string();
    }

    stemmed
}

/// Extract canonical tokens from text
///
/// Tokenizes, canonicalizes, and filters out single-character tokens.
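///
/// # Example
/// ```
/// use openmemory::utils::text::canonical_tokens_from_text;
///
/// // "love" maps to its canonical synonym "prefer", "themes" stems to
/// // "theme", and the single-character token "I" is dropped.
/// let toks = canonical_tokens_from_text("I love dark themes");
/// assert_eq!(toks, vec!["prefer", "dark", "theme"]);
/// ```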
pub fn canonical_tokens_from_text(text: &str) -> Vec<String> {
    tokenize(text)
        .into_iter()
        .map(|tok| canonicalize_token(&tok))
        .filter(|tok| tok.len() > 1)
        .collect()
}

/// Get synonyms for a token
///
/// Returns a set containing the canonical form and all its synonyms.
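///
/// # Example
/// ```
/// use openmemory::utils::text::synonyms_for;
///
/// let syns = synonyms_for("enjoy"); // canonicalizes to "prefer"
/// assert!(syns.contains("prefer"));
/// assert!(syns.contains("like"));
///
/// // Tokens outside the synonym groups yield a singleton set.
/// assert_eq!(synonyms_for("hello").len(), 1);
/// ```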
pub fn synonyms_for(tok: &str) -> HashSet<String> {
    let canonical = canonicalize_token(tok);

    if let Some(syns) = SYNONYM_LOOKUP.get(canonical.as_str()) {
        syns.iter().map(|&s| s.to_string()).collect()
    } else {
        let mut set = HashSet::new();
        set.insert(canonical);
        set
    }
}

/// Build a search document by expanding tokens with synonyms
///
/// Returns a sorted, space-separated string of the canonical tokens and
/// all of their synonyms.
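///
/// # Example
/// ```
/// use openmemory::utils::text::build_search_doc;
///
/// // "dark" expands to its whole synonym group, sorted for determinism.
/// assert_eq!(build_search_doc("dark"), "black dark night");
/// ```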
pub fn build_search_doc(text: &str) -> String {
    let canonical = canonical_tokens_from_text(text);
    let mut expanded = HashSet::new();

    for tok in canonical {
        expanded.insert(tok.clone());
        if let Some(syns) = SYNONYM_LOOKUP.get(tok.as_str()) {
            for syn in syns {
                expanded.insert(syn.to_string());
            }
        }
    }

    let mut result: Vec<_> = expanded.into_iter().collect();
    result.sort(); // For deterministic output
    result.join(" ")
}

/// Build an FTS (Full-Text Search) query from text
///
/// Returns canonical tokens quoted and joined with `OR`, suitable for a
/// SQLite FTS `MATCH` expression.
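///
/// # Example
/// ```
/// use openmemory::utils::text::build_fts_query;
///
/// assert_eq!(build_fts_query("dark theme"), r#""dark" OR "theme""#);
/// assert_eq!(build_fts_query(""), "");
/// ```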
pub fn build_fts_query(text: &str) -> String {
    let canonical = canonical_tokens_from_text(text);

    // Single-character tokens are already filtered by canonical_tokens_from_text.
    let unique: HashSet<_> = canonical.into_iter().collect();

    if unique.is_empty() {
        return String::new();
    }

    let mut terms: Vec<_> = unique.into_iter().collect();
    terms.sort(); // For deterministic output
    terms
        .iter()
        .map(|t| format!("\"{}\"", t))
        .collect::<Vec<_>>()
        .join(" OR ")
}

/// Get the set of canonical tokens from text
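///
/// # Example
/// ```
/// use openmemory::utils::text::canonical_token_set;
///
/// // Duplicate tokens collapse in the set.
/// assert_eq!(canonical_token_set("dark dark theme").len(), 2);
/// ```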
pub fn canonical_token_set(text: &str) -> HashSet<String> {
    canonical_tokens_from_text(text).into_iter().collect()
}

/// Add synonym tokens to a set of tokens
///
/// For each token, adds all known synonyms to the result set.
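///
/// # Example
/// ```
/// use openmemory::utils::text::add_synonym_tokens;
///
/// let expanded = add_synonym_tokens(["love", "task"]);
/// assert!(expanded.contains("prefer")); // canonical form of "love"
/// assert!(expanded.contains("favor"));  // fellow synonym
/// assert!(expanded.contains("todo"));   // synonym of "task"
/// ```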
pub fn add_synonym_tokens<'a, I>(tokens: I) -> HashSet<String>
where
    I: IntoIterator<Item = &'a str>,
{
    let mut result = HashSet::new();

    for tok in tokens {
        let canonical = canonicalize_token(tok);
        result.insert(canonical.clone());

        if let Some(syns) = SYNONYM_LOOKUP.get(canonical.as_str()) {
            for syn in syns {
                result.insert(syn.to_string()); // keep the raw synonym, not its canonical form
            }
        }
    }

    result
}

/// Normalize text for comparison
///
/// Converts to lowercase, removes non-alphanumeric chars, normalizes whitespace.
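///
/// # Example
/// ```
/// use openmemory::utils::text::normalize;
///
/// assert_eq!(normalize("  Hello,   World! "), "hello world");
/// ```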
pub fn normalize(text: &str) -> String {
    text.to_lowercase()
        .chars()
        .filter(|c| c.is_alphanumeric() || c.is_whitespace())
        .collect::<String>()
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
}

/// Calculate token overlap ratio between two texts
///
/// Returns the Jaccard similarity (intersection / union) of their token sets.
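///
/// # Example
/// ```
/// use openmemory::utils::text::token_overlap;
///
/// assert!((token_overlap("dark theme", "dark theme") - 1.0).abs() < 1e-9);
/// assert!(token_overlap("hello", "world") < 1e-9);
/// ```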
pub fn token_overlap(text1: &str, text2: &str) -> f64 {
    let set1 = canonical_token_set(text1);
    let set2 = canonical_token_set(text2);

    if set1.is_empty() && set2.is_empty() {
        return 1.0;
    }

    let intersection = set1.intersection(&set2).count();
    let union = set1.union(&set2).count();

    if union == 0 {
        0.0
    } else {
        intersection as f64 / union as f64
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        let tokens = tokenize("Hello World! Testing 123.");
        assert_eq!(tokens, vec!["hello", "world", "testing", "123"]);
    }

    #[test]
    fn test_tokenize_empty() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_stem() {
        assert_eq!(stem("running"), "runn");
        assert_eq!(stem("tries"), "try");
        assert_eq!(stem("jumped"), "jump");
        assert_eq!(stem("cats"), "cat");
        assert_eq!(stem("go"), "go"); // Too short
    }

    #[test]
    fn test_canonicalize_token() {
        // Direct synonym match
        assert_eq!(canonicalize_token("love"), "prefer");
        assert_eq!(canonicalize_token("enjoy"), "prefer");

        // Not in synonym groups
        assert_eq!(canonicalize_token("hello"), "hello");

        // Stemmed
        assert_eq!(canonicalize_token("meetings"), "meeting");
    }

    #[test]
    fn test_synonyms_for() {
        let syns = synonyms_for("love");
        assert!(syns.contains("prefer"));
        assert!(syns.contains("love"));
        assert!(syns.contains("like"));
    }
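
    #[test]
    fn test_add_synonym_tokens() {
        // Synonym expansion should keep the raw synonym forms,
        // not just their shared canonical token.
        let expanded = add_synonym_tokens(["love"]);
        assert!(expanded.contains("prefer"));
        assert!(expanded.contains("like"));
        assert!(expanded.contains("favor"));
    }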

    #[test]
    fn test_canonical_token_set() {
        let set = canonical_token_set("I love dark themes");
        assert!(set.contains("prefer")); // love -> prefer
        assert!(set.contains("dark"));
        assert!(set.contains("theme")); // themes -> theme
    }

    #[test]
    fn test_build_search_doc() {
        let doc = build_search_doc("I like dark mode");
        assert!(doc.contains("prefer")); // like -> prefer
        assert!(doc.contains("dark"));
        assert!(doc.contains("theme")); // mode -> theme
    }

    #[test]
    fn test_token_overlap() {
        let overlap = token_overlap("hello world", "hello there");
        assert!(overlap > 0.0 && overlap < 1.0);

        let same = token_overlap("hello world", "hello world");
        assert!((same - 1.0).abs() < 1e-6);

        let different = token_overlap("hello", "world");
        assert!(different < 1e-6);
    }

    #[test]
    fn test_normalize() {
        assert_eq!(normalize("Hello  World!"), "hello world");
        assert_eq!(normalize("Test123"), "test123");
    }
}