Skip to main content

cloakpipe_core/
resolver.rs

//! Fuzzy entity resolution — merge variant spellings of the same entity.
//!
//! Resolves "Rishikesh", "Rishi", and "Rishiksh" (a typo) to the same vault token
//! using Jaro-Winkler similarity, prefix matching, and user-defined alias groups.
5
6use crate::EntityCategory;
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
10/// Configuration for the entity resolver.
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct ResolverConfig {
13    /// Enable fuzzy entity resolution (default: false).
14    #[serde(default)]
15    pub enabled: bool,
16    /// Minimum similarity score to merge entities (0.0–1.0, default: 0.90).
17    #[serde(default = "default_threshold")]
18    pub threshold: f64,
19    /// Minimum string length for prefix matching (default: 4).
20    #[serde(default = "default_min_prefix_len")]
21    pub min_prefix_len: usize,
22    /// User-defined alias groups — each group shares a single token.
23    #[serde(default)]
24    pub aliases: Vec<AliasGroup>,
25}
26
27/// A group of strings that should all resolve to the same entity.
28#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct AliasGroup {
30    pub group: Vec<String>,
31}
32
/// Fallback similarity threshold used when the config omits `threshold`.
fn default_threshold() -> f64 {
    0.90
}
36
/// Fallback minimum length for prefix matching used when the config omits
/// `min_prefix_len`.
fn default_min_prefix_len() -> usize {
    4
}
40
41impl Default for ResolverConfig {
42    fn default() -> Self {
43        Self {
44            enabled: false,
45            threshold: default_threshold(),
46            min_prefix_len: default_min_prefix_len(),
47            aliases: Vec::new(),
48        }
49    }
50}
51
52/// Fuzzy entity resolver that maps variant spellings to canonical forms.
53pub struct EntityResolver {
54    config: ResolverConfig,
55    /// Alias lookup: normalized variant → canonical form (first entry in group).
56    alias_map: HashMap<String, String>,
57}
58
59impl EntityResolver {
60    pub fn new(config: ResolverConfig) -> Self {
61        let mut alias_map = HashMap::new();
62        for group in &config.aliases {
63            if group.group.len() < 2 {
64                continue;
65            }
66            let canonical = group.group[0].clone();
67            for variant in &group.group {
68                alias_map.insert(variant.to_lowercase().trim().to_string(), canonical.clone());
69            }
70        }
71        Self { config, alias_map }
72    }
73
74    /// Resolve an entity's original text to a canonical form.
75    ///
76    /// Checks (in order):
77    /// 1. User-defined alias groups (exact match, case-insensitive)
78    /// 2. Fuzzy match against existing vault entries (same category only)
79    ///
80    /// Returns the canonical form if a match is found, or None (use original).
81    pub fn resolve(
82        &self,
83        original: &str,
84        category: &EntityCategory,
85        existing_entries: &HashMap<String, EntityCategory>,
86    ) -> Option<String> {
87        if !self.config.enabled {
88            return None;
89        }
90
91        // 1. Check alias groups
92        let normalized = original.to_lowercase().trim().to_string();
93        if let Some(canonical) = self.alias_map.get(&normalized) {
94            return Some(canonical.clone());
95        }
96
97        // 2. Fuzzy match against existing entries in the same category
98        let mut best_match: Option<String> = None;
99        let mut best_score: f64 = 0.0;
100
101        for (existing_original, existing_category) in existing_entries {
102            // Only match within the same category
103            if existing_category != category {
104                continue;
105            }
106
107            // Skip exact match (vault already handles that)
108            if existing_original == original {
109                continue;
110            }
111
112            let score = self.similarity(original, existing_original);
113            if score > best_score && score >= self.config.threshold {
114                best_score = score;
115                best_match = Some(existing_original.clone());
116            }
117        }
118
119        best_match
120    }
121
122    /// Compute a combined similarity score between two strings.
123    ///
124    /// Uses Jaro-Winkler as the base metric, with bonuses for:
125    /// - One string being a prefix of the other (if long enough)
126    /// - Case-insensitive match
127    fn similarity(&self, a: &str, b: &str) -> f64 {
128        let a_lower = a.to_lowercase();
129        let b_lower = b.to_lowercase();
130
131        // Base: Jaro-Winkler distance (0.0–1.0, 1.0 = identical)
132        let mut score = strsim::jaro_winkler(&a_lower, &b_lower);
133
134        // Bonus: prefix matching (one is prefix of the other)
135        let shorter = a_lower.len().min(b_lower.len());
136        if shorter >= self.config.min_prefix_len
137            && (a_lower.starts_with(&b_lower) || b_lower.starts_with(&a_lower))
138        {
139            score += 0.08;
140        }
141
142        // Cap at 1.0
143        score.min(1.0)
144    }
145
146    /// Check if the resolver is enabled.
147    pub fn is_enabled(&self) -> bool {
148        self.config.enabled
149    }
150}
151
/// Unit tests for alias resolution, fuzzy matching, and similarity scoring.
#[cfg(test)]
mod tests {
    use super::*;

    /// Enabled resolver config with default thresholds and no alias groups.
    fn enabled_config() -> ResolverConfig {
        ResolverConfig {
            enabled: true,
            threshold: 0.90,
            min_prefix_len: 4,
            aliases: Vec::new(),
        }
    }

    /// Enabled resolver config with one alias group whose canonical form is
    /// "Rishikesh Kumar".
    fn config_with_aliases() -> ResolverConfig {
        ResolverConfig {
            enabled: true,
            threshold: 0.90,
            min_prefix_len: 4,
            aliases: vec![AliasGroup {
                group: vec![
                    "Rishikesh Kumar".into(),
                    "Rishi".into(),
                    "Rishi kesh".into(),
                ],
            }],
        }
    }

    #[test]
    fn test_alias_resolution() {
        let resolver = EntityResolver::new(config_with_aliases());
        let existing = HashMap::new();

        assert_eq!(
            resolver.resolve("Rishi", &EntityCategory::Person, &existing),
            Some("Rishikesh Kumar".into())
        );
        assert_eq!(
            resolver.resolve("Rishi kesh", &EntityCategory::Person, &existing),
            Some("Rishikesh Kumar".into())
        );
        // Case insensitive
        assert_eq!(
            resolver.resolve("rishi", &EntityCategory::Person, &existing),
            Some("Rishikesh Kumar".into())
        );
    }

    #[test]
    fn test_alias_canonical_self_resolves() {
        let resolver = EntityResolver::new(config_with_aliases());
        let existing = HashMap::new();

        // The canonical form is itself a member of the group.
        assert_eq!(
            resolver.resolve("Rishikesh Kumar", &EntityCategory::Person, &existing),
            Some("Rishikesh Kumar".into())
        );
    }

    #[test]
    fn test_fuzzy_match_misspelling() {
        let resolver = EntityResolver::new(enabled_config());
        let mut existing = HashMap::new();
        existing.insert("Rishikesh".to_string(), EntityCategory::Person);

        // Typo: "Rishiksh" is close to "Rishikesh"
        let result = resolver.resolve("Rishiksh", &EntityCategory::Person, &existing);
        assert_eq!(result, Some("Rishikesh".into()));
    }

    #[test]
    fn test_fuzzy_match_prefix() {
        let resolver = EntityResolver::new(enabled_config());
        let mut existing = HashMap::new();
        existing.insert("Rishikesh".to_string(), EntityCategory::Person);

        // "Rishi" is a prefix of "Rishikesh" — gets prefix bonus
        let result = resolver.resolve("Rishi", &EntityCategory::Person, &existing);
        // The 0.08 prefix bonus is added on top of the Jaro-Winkler score,
        // and the combined score clears the 0.90 threshold.
        assert_eq!(result, Some("Rishikesh".into()));
    }

    #[test]
    fn test_no_cross_category_match() {
        let resolver = EntityResolver::new(enabled_config());
        let mut existing = HashMap::new();
        existing.insert("Rishikesh".to_string(), EntityCategory::Location);

        // Near-identical spelling but different category — should NOT match
        let result = resolver.resolve("Rishiksh", &EntityCategory::Person, &existing);
        assert_eq!(result, None);
    }

    #[test]
    fn test_no_match_different_entities() {
        let resolver = EntityResolver::new(enabled_config());
        let mut existing = HashMap::new();
        existing.insert("John".to_string(), EntityCategory::Person);

        // "Alice" is nothing like "John"
        let result = resolver.resolve("Alice", &EntityCategory::Person, &existing);
        assert_eq!(result, None);
    }

    #[test]
    fn test_disabled_resolver() {
        // The default config leaves the resolver disabled.
        let resolver = EntityResolver::new(ResolverConfig::default());
        let mut existing = HashMap::new();
        existing.insert("Rishikesh".to_string(), EntityCategory::Person);

        let result = resolver.resolve("Rishiksh", &EntityCategory::Person, &existing);
        assert_eq!(result, None);
    }

    #[test]
    fn test_short_prefix_rejected() {
        let resolver = EntityResolver::new(enabled_config());
        let mut existing = HashMap::new();
        existing.insert("Alice".to_string(), EntityCategory::Person);

        // "Al" is too short for prefix matching (< min_prefix_len of 4)
        let result = resolver.resolve("Al", &EntityCategory::Person, &existing);
        // Jaro-Winkler("al", "alice") is too low without prefix bonus
        assert_eq!(result, None);
    }

    #[test]
    fn test_similarity_scores() {
        let resolver = EntityResolver::new(enabled_config());

        // Misspelling: very high
        assert!(resolver.similarity("Rishikesh", "Rishiksh") > 0.90);
        // Prefix: high with bonus
        assert!(resolver.similarity("Rishi", "Rishikesh") > 0.90);
        // Completely different: low
        assert!(resolver.similarity("John", "Mumbai") < 0.60);
        // Similar but dangerous: "John" vs "Joan"
        let john_joan = resolver.similarity("John", "Joan");
        assert!(john_joan < 0.92); // Should be below threshold or borderline
    }
}