1use crate::intern::StrId;
2
3pub struct SearchIndex {
11 entries: Vec<(StrId, u32)>,
13 lower_buf: Vec<u8>,
14}
15
16impl SearchIndex {
17 pub fn new() -> Self {
18 Self {
19 entries: Vec::new(),
20 lower_buf: Vec::with_capacity(256),
21 }
22 }
23
24 pub fn clear(&mut self) {
25 self.entries.clear();
26 }
27
28 pub const fn len(&self) -> usize {
29 self.entries.len()
30 }
31
32 pub const fn is_empty(&self) -> bool {
33 self.entries.is_empty()
34 }
35
36 pub fn index_entity(
40 &mut self,
41 interner: &mut crate::intern::StringInterner,
42 entity_idx: u32,
43 name: StrId,
44 entity_type: StrId,
45 observations: &[StrId],
46 ) {
47 self.tokenize_and_insert(interner, entity_idx, name);
48 self.tokenize_and_insert(interner, entity_idx, entity_type);
49 for &obs in observations {
50 self.tokenize_and_insert(interner, entity_idx, obs);
51 }
52 }
53
54 pub fn remove_entity(&mut self, entity_idx: u32) {
56 self.entries.retain(|&(_, idx)| idx != entity_idx);
57 }
58
59 pub fn search(&self, query: &str, interner: &crate::intern::StringInterner) -> Vec<u32> {
63 if query.is_empty() || self.entries.is_empty() {
64 return Vec::new();
65 }
66
67 let lower_query: String = query.to_ascii_lowercase();
68
69 let mut matched = if let Some(token_id) = interner.get_optional(&lower_query) {
71 let range_begin = self.entries.binary_search_by(|(t, _)| t.cmp(&token_id));
72 let range_end = self.entries.binary_search_by(|(t, _)| {
73 if *t <= token_id { std::cmp::Ordering::Less } else { std::cmp::Ordering::Greater }
74 });
75 if let (Ok(begin), Err(end)) = (range_begin, range_end) {
76 self.entries[begin..end].iter().map(|&(_, idx)| idx).collect()
77 } else {
78 Vec::new()
79 }
80 } else {
81 Vec::new()
82 };
83
84 for &(token_id, entity_idx) in &self.entries {
86 if matched.last().is_none_or(|&last| last != entity_idx) {
87 let token = interner.lookup(token_id);
88 if token.len() >= lower_query.len()
89 && token.as_bytes().starts_with(lower_query.as_bytes())
90 {
91 matched.push(entity_idx);
92 }
93 }
94 }
95
96 matched.sort_unstable();
97 matched.dedup();
98 matched
99 }
100
101 fn tokenize_and_insert(
102 &mut self,
103 interner: &mut crate::intern::StringInterner,
104 entity_idx: u32,
105 text: StrId,
106 ) {
107 let s = interner.lookup(text);
108 if s.is_empty() {
109 return;
110 }
111
112 self.lower_buf.clear();
113 self.lower_buf.extend(s.bytes().map(|b| b.to_ascii_lowercase()));
114 let lowered = unsafe { std::str::from_utf8_unchecked(&self.lower_buf) };
115
116 let tokens: Vec<&str> = lowered.split_whitespace().filter(|t| !t.is_empty()).collect();
117 let interned: Vec<StrId> = tokens.iter().map(|t| interner.intern(t)).collect();
118
119 for token_id in &interned {
120 self.insert_entry(*token_id, entity_idx);
121 }
122 }
123
124 fn insert_entry(&mut self, token: StrId, entity_idx: u32) {
125 let entry = (token, entity_idx);
126 let pos = match self.entries.binary_search(&entry) {
127 Ok(_) => return,
128 Err(pos) => pos,
129 };
130 self.entries.insert(pos, entry);
131 }
132}
133
134impl Default for SearchIndex {
135 fn default() -> Self {
136 Self::new()
137 }
138}
139
140#[cfg(test)]
141mod tests {
142 use super::*;
143 use crate::intern::StringInterner;
144
145 #[test]
146 fn test_index_and_search() {
147 let mut interner = StringInterner::new();
148 let mut index = SearchIndex::new();
149
150 let alice_name = interner.intern("Alice");
151 let alice_type = interner.intern("person");
152 let alice_obs = interner.intern("likes coffee");
153
154 index.index_entity(&mut interner, 0, alice_name, alice_type, &[alice_obs]);
155
156 let bob_name = interner.intern("Bob");
157 let bob_type = interner.intern("person");
158 let bob_obs = interner.intern("drinks tea");
159
160 index.index_entity(&mut interner, 1, bob_name, bob_type, &[bob_obs]);
161
162 let results = index.search("coffee", &interner);
163 assert_eq!(results, vec![0]);
164
165 let results = index.search("person", &interner);
166 assert_eq!(results.len(), 2);
167 }
168
169 #[test]
170 fn test_remove_entity() {
171 let mut interner = StringInterner::new();
172 let mut index = SearchIndex::new();
173
174 let name = interner.intern("Test");
175 let typ = interner.intern("type");
176
177 index.index_entity(&mut interner, 0, name, typ, &[]);
178 assert!(!index.is_empty());
179
180 index.remove_entity(0);
181 assert!(index.entries.iter().all(|&(_, idx)| idx != 0));
182 }
183
184 #[test]
185 fn test_search_empty_query() {
186 let interner = StringInterner::new();
187 let index = SearchIndex::new();
188 assert!(index.search("", &interner).is_empty());
189 }
190
191 #[test]
192 fn test_search_no_match() {
193 let mut interner = StringInterner::new();
194 let mut index = SearchIndex::new();
195 let name = interner.intern("Alice");
196 let typ = interner.intern("person");
197 index.index_entity(&mut interner, 0, name, typ, &[]);
198 assert!(index.search("zzzzzz", &interner).is_empty());
199 }
200
201 #[test]
202 fn test_search_case_insensitive() {
203 let mut interner = StringInterner::new();
204 let mut index = SearchIndex::new();
205 let name = interner.intern("Alice");
206 let typ = interner.intern("person");
207 index.index_entity(&mut interner, 0, name, typ, &[]);
208 let results = index.search("ALICE", &interner);
209 assert_eq!(results, vec![0]);
210 }
211
212 #[test]
213 fn test_search_partial_substring() {
214 let mut interner = StringInterner::new();
215 let mut index = SearchIndex::new();
216 let name = interner.intern("Alice");
217 let typ = interner.intern("person");
218 index.index_entity(&mut interner, 0, name, typ, &[]);
219 let results = index.search("Ali", &interner);
220 assert_eq!(results, vec![0]);
221 }
222
223 #[test]
224 fn test_multi_token_search() {
225 let mut interner = StringInterner::new();
226 let mut index = SearchIndex::new();
227 let obs = interner.intern("likes drinking coffee");
228 let alice = interner.intern("Alice");
229 let person = interner.intern("person");
230 index.index_entity(
231 &mut interner,
232 0,
233 alice,
234 person,
235 &[obs],
236 );
237 assert_eq!(index.search("likes", &interner), vec![0]);
238 assert_eq!(index.search("drinking", &interner), vec![0]);
239 assert_eq!(index.search("coffee", &interner), vec![0]);
240 }
241
242 #[test]
243 fn test_remove_then_reindex() {
244 let mut interner = StringInterner::new();
245 let mut index = SearchIndex::new();
246 let name = interner.intern("Alice");
247 let typ = interner.intern("person");
248 index.index_entity(&mut interner, 0, name, typ, &[]);
249
250 assert_eq!(index.search("Alice", &interner).len(), 1);
251 index.remove_entity(0);
252 assert!(index.search("Alice", &interner).is_empty());
253
254 index.index_entity(&mut interner, 0, name, typ, &[]);
255 assert_eq!(index.search("Alice", &interner).len(), 1);
256 }
257
258 #[test]
259 fn test_query_longer_than_token() {
260 let mut interner = StringInterner::new();
261 let mut index = SearchIndex::new();
262 let name = interner.intern("Alice");
263 let person = interner.intern("person");
264 index.index_entity(&mut interner, 0, name, person, &[]);
265 assert!(index.search("AliceInWonderland", &interner).is_empty());
266 }
267
268 #[test]
269 fn test_empty_index() {
270 let interner = StringInterner::new();
271 let index = SearchIndex::new();
272 assert!(index.search("anything", &interner).is_empty());
273 }
274
275 #[test]
276 fn test_clear_index() {
277 let mut interner = StringInterner::new();
278 let mut index = SearchIndex::new();
279 let name = interner.intern("Alice");
280 let person = interner.intern("person");
281 index.index_entity(&mut interner, 0, name, person, &[]);
282 assert!(!index.is_empty());
283 index.clear();
284 assert!(index.is_empty());
285 assert!(index.search("Alice", &interner).is_empty());
286 }
287}