Skip to main content

arbor_graph/
search_index.rs

1//! Search index for fast substring matching.
2//!
3//! This module provides an inverted index that enables O(k) substring
4//! search where k is the number of matches, instead of O(n) linear scan.
5
6use crate::graph::NodeId;
7use std::collections::{HashMap, HashSet};
8
9/// Minimum n-gram length for indexing.
10const MIN_NGRAM_LEN: usize = 2;
11
12/// Maximum n-gram length for indexing.
13const MAX_NGRAM_LEN: usize = 4;
14
15/// An inverted index for fast substring search.
16///
17/// Uses n-gram indexing to support substring matching. When a name is added,
18/// we break it into overlapping n-grams and index each one. During search,
19/// we look up the query's n-grams and intersect the results.
20#[derive(Debug, Default, Clone)]
21pub struct SearchIndex {
22    /// Maps lowercased full names to NodeIds for exact match lookup.
23    exact_index: HashMap<String, Vec<NodeId>>,
24    /// Maps lowercased n-grams to NodeIds for substring search.
25    ngram_index: HashMap<String, HashSet<NodeId>>,
26}
27
28impl SearchIndex {
29    /// Creates a new empty search index.
30    pub fn new() -> Self {
31        Self::default()
32    }
33
34    /// Inserts a name into the index.
35    pub fn insert(&mut self, name: &str, id: NodeId) {
36        let lower = name.to_lowercase();
37
38        // Add to exact index
39        self.exact_index.entry(lower.clone()).or_default().push(id);
40
41        // Add to n-gram index
42        for ngram in self.generate_ngrams(&lower) {
43            self.ngram_index.entry(ngram).or_default().insert(id);
44        }
45    }
46
47    /// Removes a name from the index.
48    pub fn remove(&mut self, name: &str, id: NodeId) {
49        let lower = name.to_lowercase();
50
51        // Remove from exact index
52        if let Some(ids) = self.exact_index.get_mut(&lower) {
53            ids.retain(|&x| x != id);
54            if ids.is_empty() {
55                self.exact_index.remove(&lower);
56            }
57        }
58
59        // Remove from n-gram index
60        for ngram in self.generate_ngrams(&lower) {
61            if let Some(ids) = self.ngram_index.get_mut(&ngram) {
62                ids.remove(&id);
63                if ids.is_empty() {
64                    self.ngram_index.remove(&ngram);
65                }
66            }
67        }
68    }
69
70    /// Searches for nodes whose names contain the query substring.
71    ///
72    /// Returns matching NodeIds sorted for deterministic output.
73    pub fn search(&self, query: &str) -> Vec<NodeId> {
74        if query.is_empty() {
75            return Vec::new();
76        }
77
78        let query_lower = query.to_lowercase();
79
80        // For very short queries, fall back to prefix matching on exact index
81        if query_lower.len() < MIN_NGRAM_LEN {
82            let mut results: Vec<NodeId> = self
83                .exact_index
84                .iter()
85                .filter(|(name, _)| name.starts_with(&query_lower))
86                .flat_map(|(_, ids)| ids.iter().copied())
87                .collect();
88            results.sort();
89            results.dedup();
90            return results;
91        }
92
93        // Generate n-grams for the query
94        let query_ngrams: Vec<String> = self.generate_ngrams(&query_lower);
95
96        if query_ngrams.is_empty() {
97            return Vec::new();
98        }
99
100        // Find candidate nodes by intersecting n-gram matches
101        let mut candidates: Option<HashSet<NodeId>> = None;
102
103        for ngram in &query_ngrams {
104            if let Some(ids) = self.ngram_index.get(ngram) {
105                match &mut candidates {
106                    None => candidates = Some(ids.clone()),
107                    Some(c) => {
108                        c.retain(|id| ids.contains(id));
109                    }
110                }
111            } else {
112                // If any n-gram has no matches, the query has no results
113                return Vec::new();
114            }
115        }
116
117        // Filter candidates to ensure full substring match
118        // (n-gram intersection can have false positives)
119        let mut results: Vec<NodeId> = candidates
120            .unwrap_or_default()
121            .into_iter()
122            .filter(|id| {
123                self.exact_index
124                    .iter()
125                    .any(|(name, ids)| ids.contains(id) && name.contains(&query_lower))
126            })
127            .collect();
128
129        results.sort();
130        results
131    }
132
133    /// Generates n-grams for a lowercased string.
134    fn generate_ngrams(&self, s: &str) -> Vec<String> {
135        let chars: Vec<char> = s.chars().collect();
136        let mut ngrams = Vec::new();
137
138        for n in MIN_NGRAM_LEN..=MAX_NGRAM_LEN {
139            if chars.len() >= n {
140                for i in 0..=(chars.len() - n) {
141                    ngrams.push(chars[i..i + n].iter().collect());
142                }
143            }
144        }
145
146        ngrams
147    }
148
149    /// Returns the number of unique names indexed.
150    pub fn len(&self) -> usize {
151        self.exact_index.len()
152    }
153
154    /// Returns true if the index is empty.
155    pub fn is_empty(&self) -> bool {
156        self.exact_index.is_empty()
157    }
158}
159
160#[cfg(test)]
161mod tests {
162    use super::*;
163    use petgraph::graph::NodeIndex;
164
165    fn node_id(n: u32) -> NodeId {
166        NodeIndex::new(n as usize)
167    }
168
169    #[test]
170    fn test_insert_and_search_exact() {
171        let mut index = SearchIndex::new();
172        index.insert("validate_user", node_id(0));
173        index.insert("validate_email", node_id(1));
174        index.insert("send_email", node_id(2));
175
176        let results = index.search("validate_user");
177        assert_eq!(results, vec![node_id(0)]);
178    }
179
180    #[test]
181    fn test_search_substring() {
182        let mut index = SearchIndex::new();
183        index.insert("validate_user", node_id(0));
184        index.insert("validate_email", node_id(1));
185        index.insert("send_email", node_id(2));
186
187        let results = index.search("validate");
188        assert!(results.contains(&node_id(0)));
189        assert!(results.contains(&node_id(1)));
190        assert!(!results.contains(&node_id(2)));
191    }
192
193    #[test]
194    fn test_search_case_insensitive() {
195        let mut index = SearchIndex::new();
196        index.insert("ValidateUser", node_id(0));
197
198        let results = index.search("validateuser");
199        assert_eq!(results, vec![node_id(0)]);
200
201        let results = index.search("VALIDATEUSER");
202        assert_eq!(results, vec![node_id(0)]);
203    }
204
205    #[test]
206    fn test_search_middle_substring() {
207        let mut index = SearchIndex::new();
208        index.insert("get_user_profile", node_id(0));
209
210        let results = index.search("user");
211        assert_eq!(results, vec![node_id(0)]);
212
213        let results = index.search("_user_");
214        assert_eq!(results, vec![node_id(0)]);
215    }
216
217    #[test]
218    fn test_remove_from_index() {
219        let mut index = SearchIndex::new();
220        index.insert("foo", node_id(0));
221        index.insert("foobar", node_id(1));
222
223        index.remove("foo", node_id(0));
224
225        let results = index.search("foo");
226        assert!(!results.contains(&node_id(0)));
227        assert!(results.contains(&node_id(1)));
228    }
229
230    #[test]
231    fn test_search_no_match() {
232        let mut index = SearchIndex::new();
233        index.insert("hello", node_id(0));
234
235        let results = index.search("world");
236        assert!(results.is_empty());
237    }
238
239    #[test]
240    fn test_short_query() {
241        let mut index = SearchIndex::new();
242        index.insert("ab", node_id(0));
243        index.insert("abc", node_id(1));
244        index.insert("xyz", node_id(2));
245
246        // Query shorter than MIN_NGRAM_LEN uses prefix matching
247        let results = index.search("a");
248        assert!(results.contains(&node_id(0)));
249        assert!(results.contains(&node_id(1)));
250        assert!(!results.contains(&node_id(2)));
251    }
252
253    #[test]
254    fn test_index_len_and_is_empty() {
255        let mut index = SearchIndex::new();
256        assert!(index.is_empty());
257        assert_eq!(index.len(), 0);
258
259        index.insert("foo", node_id(0));
260        assert!(!index.is_empty());
261        assert_eq!(index.len(), 1);
262
263        index.insert("bar", node_id(1));
264        assert_eq!(index.len(), 2);
265    }
266
267    #[test]
268    fn test_duplicate_name_different_ids() {
269        let mut index = SearchIndex::new();
270        // Two different nodes with the same name (overloads, different files)
271        index.insert("process", node_id(0));
272        index.insert("process", node_id(1));
273
274        let results = index.search("process");
275        assert!(results.contains(&node_id(0)));
276        assert!(results.contains(&node_id(1)));
277        // len() should still be 1 since it's the same name
278        assert_eq!(index.len(), 1);
279    }
280
281    #[test]
282    fn test_remove_nonexistent_does_not_panic() {
283        let mut index = SearchIndex::new();
284        // Removing from empty index should not panic
285        index.remove("nonexistent", node_id(99));
286        assert!(index.is_empty());
287    }
288
289    #[test]
290    fn test_search_empty_query() {
291        let mut index = SearchIndex::new();
292        index.insert("hello", node_id(0));
293
294        // Empty query should return empty (no prefix matches)
295        let results = index.search("");
296        assert!(results.is_empty());
297    }
298}