Skip to main content

cognee_ontology/
matching.rs

1//! Entity matching strategies for ontology resolution.
2//!
3//! Provides fuzzy string matching to find ontology entities that
4//! closely match LLM-extracted entity names.
5
6/// Recursive Ratcliff/Obershelp matching-block count.
7///
8/// Counts the total number of matched characters by finding the longest common
9/// substring recursively — matches Python's `difflib.SequenceMatcher` internals
10/// for ASCII input (URI fragment names).
11fn count_matching_chars(a: &[u8], b: &[u8]) -> usize {
12    if a.is_empty() || b.is_empty() {
13        return 0;
14    }
15    let (mut best_ai, mut best_bi, mut best_len) = (0usize, 0usize, 0usize);
16    for i in 0..a.len() {
17        for j in 0..b.len() {
18            let mut k = 0usize;
19            while i + k < a.len() && j + k < b.len() && a[i + k] == b[j + k] {
20                k += 1;
21            }
22            if k > best_len {
23                best_ai = i;
24                best_bi = j;
25                best_len = k;
26            }
27        }
28    }
29    if best_len == 0 {
30        return 0;
31    }
32    count_matching_chars(&a[..best_ai], &b[..best_bi])
33        + best_len
34        + count_matching_chars(&a[best_ai + best_len..], &b[best_bi + best_len..])
35}
36
37/// Gestalt string similarity: `2.0 * M / T`.
38///
39/// `M` = matched characters from [`count_matching_chars`], `T` = total characters
40/// in both strings. Matches Python `difflib.SequenceMatcher.ratio()`, which is
41/// the algorithm underlying `difflib.get_close_matches()`.
42fn gestalt_ratio(a: &str, b: &str) -> f64 {
43    let total = a.len() + b.len();
44    if total == 0 {
45        return 1.0;
46    }
47    2.0 * count_matching_chars(a.as_bytes(), b.as_bytes()) as f64 / total as f64
48}
49
50/// Strategy for matching entity names against ontology.
51///
52/// Allows pluggable matching algorithms (exact, fuzzy, learned, etc.).
53pub trait MatchingStrategy: Send + Sync {
54    /// Find the best matching candidate for a query string.
55    ///
56    /// Returns the matched candidate name if found, or None if no
57    /// suitable match exists above the matching threshold.
58    fn find_match(&self, query: &str, candidates: &[&str]) -> Option<String>;
59}
60
61/// Fuzzy matching strategy using Ratcliff/Obershelp (gestalt) similarity.
62///
63/// Matches Python's `difflib.get_close_matches()` behavior: uses
64/// `SequenceMatcher.ratio()` — `2.0 * M / T` where M is the total matched
65/// characters via recursive longest-common-substring and T is total characters
66/// in both strings — with a configurable similarity threshold (default 0.8).
67///
68/// # Algorithm
69///
70/// 1. Check for exact match first (case-insensitive)
71/// 2. Compute gestalt ratio for all candidates
72/// 3. Filter by cutoff threshold
73/// 4. Return candidate with highest similarity score
74///
75/// # Example
76///
77/// ```
78/// use cognee_ontology::matching::{MatchingStrategy, FuzzyMatchingStrategy};
79///
80/// let matcher = FuzzyMatchingStrategy::new(0.8);
81/// let candidates = vec!["car", "truck", "vehicle"];
82///
83/// // Exact match
84/// assert_eq!(matcher.find_match("car", &candidates), Some("car".to_string()));
85///
86/// // Fuzzy match (typo)
87/// assert_eq!(matcher.find_match("veicle", &candidates), Some("vehicle".to_string()));
88///
89/// // No match below threshold
90/// assert_eq!(matcher.find_match("xyz", &candidates), None);
91/// ```
92#[derive(Debug, Clone)]
93pub struct FuzzyMatchingStrategy {
94    /// Similarity threshold (0.0 - 1.0). Matches with score below this are rejected.
95    cutoff: f64,
96}
97
98impl FuzzyMatchingStrategy {
99    /// Create a new fuzzy matcher with custom threshold.
100    ///
101    /// # Panics
102    ///
103    /// Panics if `cutoff` is not in range [0.0, 1.0].
104    pub fn new(cutoff: f64) -> Self {
105        assert!(
106            (0.0..=1.0).contains(&cutoff),
107            "Cutoff must be between 0.0 and 1.0"
108        );
109        Self { cutoff }
110    }
111
112    /// Get the current cutoff threshold.
113    pub fn cutoff(&self) -> f64 {
114        self.cutoff
115    }
116}
117
118impl Default for FuzzyMatchingStrategy {
119    /// Create matcher with default threshold of 0.8 (matches Python's difflib default).
120    fn default() -> Self {
121        Self::new(0.8)
122    }
123}
124
125impl MatchingStrategy for FuzzyMatchingStrategy {
126    fn find_match(&self, query: &str, candidates: &[&str]) -> Option<String> {
127        if candidates.is_empty() {
128            return None;
129        }
130
131        // Check for exact match first (case-insensitive)
132        let query_lower = query.to_lowercase();
133        for candidate in candidates {
134            if candidate.to_lowercase() == query_lower {
135                return Some(candidate.to_string());
136            }
137        }
138
139        // Fuzzy match using Ratcliff/Obershelp (gestalt) similarity
140        let mut best_match: Option<(&str, f64)> = None;
141
142        for candidate in candidates {
143            let similarity = gestalt_ratio(&query_lower, &candidate.to_lowercase());
144
145            if similarity >= self.cutoff {
146                match best_match {
147                    None => best_match = Some((candidate, similarity)),
148                    Some((_, best_score)) if similarity > best_score => {
149                        best_match = Some((candidate, similarity));
150                    }
151                    _ => {}
152                }
153            }
154        }
155
156        best_match.map(|(name, _)| name.to_string())
157    }
158}
159
160#[cfg(test)]
161mod tests {
162    use super::*;
163
164    #[test]
165    fn test_exact_match() {
166        let matcher = FuzzyMatchingStrategy::default();
167        let candidates = vec!["car", "truck", "vehicle"];
168
169        assert_eq!(
170            matcher.find_match("car", &candidates),
171            Some("car".to_string())
172        );
173    }
174
175    #[test]
176    fn test_exact_match_case_insensitive() {
177        let matcher = FuzzyMatchingStrategy::default();
178        let candidates = vec!["Car", "Truck", "Vehicle"];
179
180        assert_eq!(
181            matcher.find_match("car", &candidates),
182            Some("Car".to_string())
183        );
184        assert_eq!(
185            matcher.find_match("TRUCK", &candidates),
186            Some("Truck".to_string())
187        );
188    }
189
190    #[test]
191    fn test_fuzzy_match_typo() {
192        let matcher = FuzzyMatchingStrategy::default();
193        let candidates = vec!["car", "truck", "vehicle"];
194
195        // "veicle" should match "vehicle" (transposed letters)
196        let result = matcher.find_match("veicle", &candidates);
197        assert_eq!(result, Some("vehicle".to_string()));
198    }
199
200    #[test]
201    fn test_no_match_below_threshold() {
202        let matcher = FuzzyMatchingStrategy::new(0.9);
203        let candidates = vec!["car", "truck", "vehicle"];
204
205        // "xyz" should not match anything
206        assert_eq!(matcher.find_match("xyz", &candidates), None);
207    }
208
209    #[test]
210    fn test_empty_candidates() {
211        let matcher = FuzzyMatchingStrategy::default();
212        let candidates: Vec<&str> = vec![];
213
214        assert_eq!(matcher.find_match("car", &candidates), None);
215    }
216
217    #[test]
218    fn test_best_match_selected() {
219        let matcher = FuzzyMatchingStrategy::new(0.5);
220        let candidates = vec!["car", "cart", "cardiac"];
221
222        // "car" is exact match, should be returned even though others are similar
223        assert_eq!(
224            matcher.find_match("car", &candidates),
225            Some("car".to_string())
226        );
227
228        // "carr" should match "car" better than "cart"
229        let result = matcher.find_match("carr", &candidates);
230        assert!(result.is_some());
231    }
232
233    #[test]
234    fn test_default_cutoff() {
235        let matcher = FuzzyMatchingStrategy::default();
236        assert_eq!(matcher.cutoff(), 0.8);
237    }
238
239    #[test]
240    fn test_custom_cutoff() {
241        let matcher = FuzzyMatchingStrategy::new(0.6);
242        assert_eq!(matcher.cutoff(), 0.6);
243    }
244
245    #[test]
246    #[should_panic(expected = "Cutoff must be between 0.0 and 1.0")]
247    fn test_invalid_cutoff_above_one() {
248        FuzzyMatchingStrategy::new(1.5);
249    }
250
251    #[test]
252    #[should_panic(expected = "Cutoff must be between 0.0 and 1.0")]
253    fn test_invalid_cutoff_negative() {
254        FuzzyMatchingStrategy::new(-0.5);
255    }
256}