dyntext/index.rs
1//! Top-level [`TextIndex`] type combining trigram extraction,
2//! the inverted postings index, and the per-document bloom
3//! filter into a single insert-and-search facility.
4//!
5//! # Search algorithm
6//!
7//! The substring query path is the four-tier filter funnel from
8//! the design doc:
9//!
10//! 1. Extract the query's trigrams.
11//! 2. Intersect their postings lists into a candidate doc-id
12//! set (tier 2).
13//! 3. For each candidate doc, check the per-document bloom
14//! filter for the query's trigrams. This is a cheap
15//! membership test that usually agrees with the postings
16//! intersection but acts as defence in depth (tier 3).
17//! 4. For each survivor, run a real substring match against
18//! the doc's stored bytes (tier 4 / final recheck).
19//!
20//! Results are returned in insertion order so callers can rely
21//! on a deterministic ranking.
22//!
23//! # Short queries
24//!
25//! Queries shorter than [`MIN_TRIGRAM_QUERY_LEN`] cannot be
26//! resolved through the trigram index and fall back to a full
27//! scan of the stored corpus. Callers that want to avoid the
28//! full-scan cost should reject short queries themselves.
29
30use std::collections::BTreeMap;
31
32use serde::{Deserialize, Serialize};
33
34use crate::bloom::BloomFilter;
35use crate::postings::Postings;
36use crate::prefix_extract;
37use crate::regex_ast::{self, RegexError};
38use crate::tre::{TreCompiledPattern, TreError, TreMatchOpts};
39use crate::trigram;
40
41/// Minimum query length in bytes that the trigram index can
42/// directly serve. Shorter queries fall back to a full scan
43/// over the stored corpus.
44pub const MIN_TRIGRAM_QUERY_LEN: usize = 3;
45
46/// Default expected trigram count per document for sizing the
47/// per-document bloom filter. Most short text fields produce
48/// far fewer; a tighter sizing keeps the per-doc memory
49/// footprint reasonable.
50const DEFAULT_BLOOM_N: usize = 256;
51
52/// Default false positive rate for the per-document bloom.
53const DEFAULT_BLOOM_FP: f64 = 0.01;
54
55/// One indexed document: its raw text (for tier-4 substring
56/// recheck) and its trigram bloom filter (for tier-3 cheap
57/// recheck).
58#[derive(Debug, Clone, Serialize, Deserialize)]
59pub struct IndexedDoc {
60 /// Original byte sequence as inserted.
61 pub text: Vec<u8>,
62 /// Bloom filter over the doc's trigram set.
63 pub bloom: BloomFilter,
64}
65
66impl IndexedDoc {
67 /// Construct an indexed doc by computing its trigram set
68 /// and bloom filter from `text`.
69 fn new(text: Vec<u8>) -> Self {
70 let tris = trigram::extract_trigram_set(&text);
71 let mut bloom =
72 BloomFilter::with_size_and_fp_rate(DEFAULT_BLOOM_N.max(tris.len()), DEFAULT_BLOOM_FP);
73 for t in &tris {
74 bloom.insert(&t.to_le_bytes());
75 }
76 Self { text, bloom }
77 }
78}
79
80/// Trigram-based text index supporting incremental insert,
81/// remove, and exact-substring search.
82#[derive(Debug, Clone, Serialize, Deserialize)]
83pub struct TextIndex {
84 /// Inverted index of trigram hash to doc-id bitmap.
85 postings: Postings,
86 /// Per-doc state. The map is keyed by doc id; iteration
87 /// order is therefore insertion order because doc ids are
88 /// monotonically assigned.
89 docs: BTreeMap<u32, IndexedDoc>,
90 /// Next doc id to hand out. Starts at 0; never recycled
91 /// (a removed doc id is gone forever).
92 next_doc_id: u32,
93}
94
95impl Default for TextIndex {
96 fn default() -> Self {
97 Self::new()
98 }
99}
100
101impl TextIndex {
102 /// Construct an empty index.
103 #[must_use]
104 pub fn new() -> Self {
105 Self {
106 postings: Postings::new(),
107 docs: BTreeMap::new(),
108 next_doc_id: 0,
109 }
110 }
111
112 /// Number of documents currently in the index.
113 #[must_use]
114 pub fn doc_count(&self) -> usize {
115 self.docs.len()
116 }
117
118 /// Borrow the inverted postings index.
119 #[must_use]
120 pub fn postings(&self) -> &Postings {
121 &self.postings
122 }
123
124 /// Borrow the document store.
125 #[must_use]
126 pub fn docs(&self) -> &BTreeMap<u32, IndexedDoc> {
127 &self.docs
128 }
129
130 /// Insert `text` and return the assigned doc id.
131 ///
132 /// The doc id is assigned monotonically; doc ids are not
133 /// recycled, so a removed id is not handed out again.
134 pub fn insert(&mut self, text: Vec<u8>) -> u32 {
135 let doc_id = self.next_doc_id;
136 self.next_doc_id = self
137 .next_doc_id
138 .checked_add(1)
139 .expect("invariant: doc ids fit in u32; saturate at 2^32-1 by removing old docs");
140
141 let tris = trigram::extract_trigram_set(&text);
142 for t in &tris {
143 self.postings.insert(*t, doc_id);
144 }
145 let doc = IndexedDoc::new(text);
146 self.docs.insert(doc_id, doc);
147 doc_id
148 }
149
150 /// Remove the document at `doc_id` and return its raw
151 /// bytes, if any.
152 ///
153 /// All trigram entries for the doc are pulled from the
154 /// postings index. A trigram whose postings list becomes
155 /// empty after removal is garbage collected.
156 pub fn remove(&mut self, doc_id: u32) -> Option<Vec<u8>> {
157 let doc = self.docs.remove(&doc_id)?;
158 let tris = trigram::extract_trigram_set(&doc.text);
159 for t in &tris {
160 self.postings.remove(*t, doc_id);
161 }
162 Some(doc.text)
163 }
164
165 /// Search for documents whose text contains `query` as a
166 /// contiguous byte substring.
167 ///
168 /// Results are returned in insertion order. A document that
169 /// has been removed is not returned even if its postings
170 /// entries were missed by a buggy remove (we always
171 /// re-verify against the doc store).
172 ///
173 /// Queries shorter than [`MIN_TRIGRAM_QUERY_LEN`] cannot be
174 /// resolved through the trigram index and fall back to a
175 /// full scan.
176 #[must_use]
177 pub fn search_substring(&self, query: &[u8]) -> Vec<u32> {
178 if query.is_empty() {
179 // An empty substring matches every doc by
180 // definition; preserve insertion order.
181 return self.docs.keys().copied().collect();
182 }
183
184 if query.len() < MIN_TRIGRAM_QUERY_LEN {
185 return self.full_scan(query);
186 }
187
188 let qtris = trigram::extract_query_trigram_set(query);
189 if qtris.is_empty() {
190 return self.full_scan(query);
191 }
192
193 // Tier 2: postings intersection.
194 let candidates = self.postings.intersect(&qtris);
195 if candidates.is_empty() {
196 return Vec::new();
197 }
198
199 let mut hits: Vec<u32> = Vec::new();
200 for doc_id in &candidates {
201 let Some(doc) = self.docs.get(&doc_id) else {
202 continue;
203 };
204 // Tier 3: per-doc bloom filter.
205 if !qtris.iter().all(|t| doc.bloom.contains(&t.to_le_bytes())) {
206 continue;
207 }
208 // Tier 4: real substring recheck.
209 if Self::contains_substring(&doc.text, query) {
210 hits.push(doc_id);
211 }
212 }
213 // The Roaring bitmap iterates ascending, which is also
214 // insertion order because doc ids are monotonic. Sort
215 // anyway to make the contract explicit.
216 hits.sort_unstable();
217 hits
218 }
219
220 /// Full scan over the document store; used for queries too
221 /// short for the trigram index.
222 fn full_scan(&self, query: &[u8]) -> Vec<u32> {
223 let mut out = Vec::new();
224 for (id, doc) in &self.docs {
225 if Self::contains_substring(&doc.text, query) {
226 out.push(*id);
227 }
228 }
229 out
230 }
231
232 /// Search for documents whose text matches `pattern` as a
233 /// regular expression.
234 ///
235 /// The query path is the same four-tier filter funnel as
236 /// [`Self::search_substring`], plus a Phase-2 prefix
237 /// extraction step:
238 ///
239 /// 1. Parse `pattern` into the internal AST and extract
240 /// the trigrams that any matching string MUST contain
241 /// (see [`crate::prefix_extract`]).
242 /// 2. Intersect those trigrams' postings lists into a
243 /// candidate doc-id set. If the AST cannot be lowered
244 /// (named capture group, etc.) or yields no required
245 /// trigrams, fall back to scanning every doc.
246 /// 3. Per-doc bloom filter recheck (skipped on full scan).
247 /// 4. Compile the pattern with [`regex::bytes::Regex`] and
248 /// re-run it against each candidate's stored bytes.
249 ///
250 /// Results are returned in insertion order.
251 ///
252 /// # Errors
253 ///
254 /// Returns [`RegexError::Parse`] if the pattern is
255 /// syntactically invalid or uses a regex feature that the
256 /// underlying `regex` crate does not support (lookarounds,
257 /// backreferences, ...). A pattern that parses cleanly but
258 /// trips the prefix extractor's unsupported-feature path
259 /// (named capture groups) does NOT surface as an error:
260 /// the search still runs, just via the slower full-scan +
261 /// recheck path.
262 pub fn search_regex(&self, pattern: &str) -> Result<Vec<u32>, RegexError> {
263 // Compile the matcher first; a pattern the matcher
264 // cannot handle is a hard error to the caller. We use
265 // `regex::bytes::Regex` because the corpus is byte
266 // slices, not UTF-8.
267 let re = regex::bytes::Regex::new(pattern).map_err(|e| RegexError::Parse(e.to_string()))?;
268
269 // Required trigrams from the AST. If extraction fails
270 // (PrefixUnsupported) or yields no constraint, we have
271 // to scan every doc; the recheck still runs correctly.
272 let trigram_hashes: Vec<u64> = match regex_ast::parse(pattern) {
273 Ok(ast) => prefix_extract::required_trigram_hashes(&ast),
274 Err(_) => Vec::new(),
275 };
276
277 let candidates: Vec<u32> = if trigram_hashes.is_empty() {
278 self.docs.keys().copied().collect()
279 } else {
280 self.postings.intersect(&trigram_hashes).iter().collect()
281 };
282
283 let mut hits: Vec<u32> = Vec::new();
284 for doc_id in candidates {
285 let Some(doc) = self.docs.get(&doc_id) else {
286 continue;
287 };
288 // Tier 3: per-doc bloom filter -- only meaningful
289 // when we have required trigrams. On a full scan
290 // it would be a tautology because we have no
291 // membership query to make.
292 if !trigram_hashes.is_empty()
293 && !trigram_hashes
294 .iter()
295 .all(|t| doc.bloom.contains(&t.to_le_bytes()))
296 {
297 continue;
298 }
299 // Tier 4: real regex recheck.
300 if re.is_match(&doc.text) {
301 hits.push(doc_id);
302 }
303 }
304 hits.sort_unstable();
305 Ok(hits)
306 }
307
308 /// Byte-level substring match.
309 fn contains_substring(haystack: &[u8], needle: &[u8]) -> bool {
310 if needle.is_empty() {
311 return true;
312 }
313 if needle.len() > haystack.len() {
314 return false;
315 }
316 haystack.windows(needle.len()).any(|w| w == needle)
317 }
318
319 /// Search for documents that match `pattern` as an
320 /// approximate POSIX extended regular expression with up
321 /// to `max_errors` edit operations.
322 ///
323 /// This is the Phase 3 entry point for the TRE-backed
324 /// recheck. The current implementation does a full scan
325 /// over the document store: every doc is fed to a single
326 /// compiled `TreCompiledPattern`. Phase 2 will add a
327 /// regex prefix extractor that lets us restrict the scan
328 /// to a trigram-postings-derived candidate set; the
329 /// signature here is forward-compatible with that change.
330 ///
331 /// Results are returned in ascending document-id order,
332 /// which equals insertion order because doc ids are
333 /// monotonic.
334 pub fn search_regex_approx(
335 &self,
336 pattern: &str,
337 max_errors: u16,
338 ) -> Result<Vec<u32>, TreError> {
339 let opts = TreMatchOpts {
340 max_errors,
341 ..TreMatchOpts::default()
342 };
343 let pat = TreCompiledPattern::compile(pattern.as_bytes(), opts)?;
344
345 let mut hits = Vec::new();
346 for (id, doc) in &self.docs {
347 if pat.is_match(&doc.text) {
348 hits.push(*id);
349 }
350 }
351 Ok(hits)
352 }
353}
354
355#[cfg(test)]
356mod tests {
357 use super::*;
358
359 #[test]
360 fn insert_then_search_finds_the_doc() {
361 let mut idx = TextIndex::new();
362 let id = idx.insert(b"hello world".to_vec());
363 let hits = idx.search_substring(b"hello");
364 assert_eq!(hits, vec![id]);
365 }
366
367 #[test]
368 fn search_substring_returns_only_true_positives() {
369 let mut idx = TextIndex::new();
370 let a = idx.insert(b"the quick brown fox".to_vec());
371 let _b = idx.insert(b"jumped over a lazy dog".to_vec());
372 let c = idx.insert(b"a brown fox is quick".to_vec());
373 let hits = idx.search_substring(b"brown fox");
374 // Only docs containing the literal byte sequence
375 // "brown fox" qualify.
376 assert!(hits.contains(&a));
377 assert!(hits.contains(&c));
378 assert_eq!(hits.len(), 2);
379 }
380
381 #[test]
382 fn search_substring_no_false_negatives_on_corpus() {
383 let mut store = TextIndex::new();
384 let corpus: &[&[u8]] = &[
385 b"alpha beta gamma",
386 b"beta cake",
387 b"the alphabet starts with alpha",
388 b"omega only",
389 ];
390 let ids: Vec<u32> = corpus.iter().map(|t| store.insert(t.to_vec())).collect();
391 for q in [b"alpha".as_slice(), b"beta", b"omega", b"the"] {
392 let hits = store.search_substring(q);
393 for (i, doc) in corpus.iter().enumerate() {
394 if doc.windows(q.len()).any(|w| w == q) {
395 assert!(
396 hits.contains(&ids[i]),
397 "false negative: query {q:?} should hit doc {i} {doc:?}",
398 );
399 }
400 }
401 }
402 }
403
404 #[test]
405 fn search_returns_results_in_insertion_order() {
406 let mut idx = TextIndex::new();
407 let id_a = idx.insert(b"hello a".to_vec());
408 let id_b = idx.insert(b"hello b".to_vec());
409 let id_c = idx.insert(b"hello c".to_vec());
410 let hits = idx.search_substring(b"hello");
411 assert_eq!(hits, vec![id_a, id_b, id_c]);
412 }
413
414 #[test]
415 fn remove_excludes_doc_from_subsequent_searches() {
416 let mut idx = TextIndex::new();
417 let a = idx.insert(b"the quick brown fox".to_vec());
418 let b = idx.insert(b"another brown fox here".to_vec());
419 let removed = idx.remove(a).expect("doc a present");
420 assert_eq!(removed, b"the quick brown fox");
421 let hits = idx.search_substring(b"brown fox");
422 assert_eq!(hits, vec![b]);
423 }
424
425 #[test]
426 fn remove_garbage_collects_unique_trigrams() {
427 let mut idx = TextIndex::new();
428 let a = idx.insert(b"unique-string-only-here".to_vec());
429 let postings_before = idx.postings().len();
430 assert!(postings_before > 0);
431 idx.remove(a);
432 // After removing the only doc, every postings entry
433 // should be empty and gone.
434 assert_eq!(idx.postings().len(), 0);
435 assert_eq!(idx.doc_count(), 0);
436 }
437
438 #[test]
439 fn remove_missing_doc_id_returns_none() {
440 let mut idx = TextIndex::new();
441 idx.insert(b"abc".to_vec());
442 assert!(idx.remove(9999).is_none());
443 }
444
445 #[test]
446 fn query_shorter_than_three_chars_uses_full_scan() {
447 let mut idx = TextIndex::new();
448 let a = idx.insert(b"abcdef".to_vec());
449 let _b = idx.insert(b"xyz".to_vec());
450 let c = idx.insert(b"ab".to_vec());
451 let hits = idx.search_substring(b"ab");
452 assert!(hits.contains(&a));
453 assert!(hits.contains(&c));
454 assert_eq!(hits.len(), 2);
455 }
456
457 #[test]
458 fn empty_query_matches_every_doc() {
459 let mut idx = TextIndex::new();
460 let a = idx.insert(b"x".to_vec());
461 let b = idx.insert(b"y".to_vec());
462 let hits = idx.search_substring(b"");
463 assert_eq!(hits, vec![a, b]);
464 }
465
466 #[test]
467 fn unicode_query_byte_level_works() {
468 let mut idx = TextIndex::new();
469 // "cafe" with combining e-acute (UTF-8 bytes
470 // 0xC3 0xA9). Insert one doc with the e-acute form,
471 // one with plain ASCII, and search for the e-acute
472 // suffix.
473 let a = idx.insert(b"caf\xc3\xa9 noir".to_vec());
474 let b = idx.insert(b"cafe noir".to_vec());
475 let hits = idx.search_substring(b"\xc3\xa9");
476 assert_eq!(hits, vec![a]);
477 let hits = idx.search_substring(b"noir");
478 // Both contain the literal bytes "noir".
479 assert!(hits.contains(&a));
480 assert!(hits.contains(&b));
481 assert_eq!(hits.len(), 2);
482 }
483
484 #[test]
485 fn search_for_nonexistent_substring_returns_empty() {
486 let mut idx = TextIndex::new();
487 idx.insert(b"hello world".to_vec());
488 idx.insert(b"another doc".to_vec());
489 assert!(idx.search_substring(b"completely-absent").is_empty());
490 }
491
492 #[test]
493 fn search_on_empty_index_returns_empty() {
494 let idx = TextIndex::new();
495 assert!(idx.search_substring(b"anything").is_empty());
496 // The empty-query special case still returns an empty
497 // vec when there are no docs to enumerate.
498 assert!(idx.search_substring(b"").is_empty());
499 }
500}