dyntext/
index.rs

1//! Top-level [`TextIndex`] type combining trigram extraction,
2//! the inverted postings index, and the per-document bloom
3//! filter into a single insert-and-search facility.
4//!
5//! # Search algorithm
6//!
7//! The substring query path is the four-tier filter funnel from
8//! the design doc:
9//!
10//! 1. Extract the query's trigrams.
11//! 2. Intersect their postings lists into a candidate doc-id
12//!    set (tier 2).
13//! 3. For each candidate doc, check the per-document bloom
14//!    filter for the query's trigrams. This is a cheap
15//!    membership test that usually agrees with the postings
16//!    intersection but acts as defence in depth (tier 3).
17//! 4. For each survivor, run a real substring match against
18//!    the doc's stored bytes (tier 4 / final recheck).
19//!
20//! Results are returned in insertion order so callers can rely
21//! on a deterministic ranking.
22//!
23//! # Short queries
24//!
25//! Queries shorter than [`MIN_TRIGRAM_QUERY_LEN`] cannot be
26//! resolved through the trigram index and fall back to a full
27//! scan of the stored corpus. Callers that want to avoid the
28//! full-scan cost should reject short queries themselves.
29
30use std::collections::BTreeMap;
31
32use serde::{Deserialize, Serialize};
33
34use crate::bloom::BloomFilter;
35use crate::postings::Postings;
36use crate::prefix_extract;
37use crate::regex_ast::{self, RegexError};
38use crate::tre::{TreCompiledPattern, TreError, TreMatchOpts};
39use crate::trigram;
40
41/// Minimum query length in bytes that the trigram index can
42/// directly serve. Shorter queries fall back to a full scan
43/// over the stored corpus.
44pub const MIN_TRIGRAM_QUERY_LEN: usize = 3;
45
46/// Default expected trigram count per document for sizing the
47/// per-document bloom filter. Most short text fields produce
48/// far fewer; a tighter sizing keeps the per-doc memory
49/// footprint reasonable.
50const DEFAULT_BLOOM_N: usize = 256;
51
52/// Default false positive rate for the per-document bloom.
53const DEFAULT_BLOOM_FP: f64 = 0.01;
54
55/// One indexed document: its raw text (for tier-4 substring
56/// recheck) and its trigram bloom filter (for tier-3 cheap
57/// recheck).
58#[derive(Debug, Clone, Serialize, Deserialize)]
59pub struct IndexedDoc {
60    /// Original byte sequence as inserted.
61    pub text: Vec<u8>,
62    /// Bloom filter over the doc's trigram set.
63    pub bloom: BloomFilter,
64}
65
66impl IndexedDoc {
67    /// Construct an indexed doc by computing its trigram set
68    /// and bloom filter from `text`.
69    fn new(text: Vec<u8>) -> Self {
70        let tris = trigram::extract_trigram_set(&text);
71        let mut bloom =
72            BloomFilter::with_size_and_fp_rate(DEFAULT_BLOOM_N.max(tris.len()), DEFAULT_BLOOM_FP);
73        for t in &tris {
74            bloom.insert(&t.to_le_bytes());
75        }
76        Self { text, bloom }
77    }
78}
79
80/// Trigram-based text index supporting incremental insert,
81/// remove, and exact-substring search.
82#[derive(Debug, Clone, Serialize, Deserialize)]
83pub struct TextIndex {
84    /// Inverted index of trigram hash to doc-id bitmap.
85    postings: Postings,
86    /// Per-doc state. The map is keyed by doc id; iteration
87    /// order is therefore insertion order because doc ids are
88    /// monotonically assigned.
89    docs: BTreeMap<u32, IndexedDoc>,
90    /// Next doc id to hand out. Starts at 0; never recycled
91    /// (a removed doc id is gone forever).
92    next_doc_id: u32,
93}
94
95impl Default for TextIndex {
96    fn default() -> Self {
97        Self::new()
98    }
99}
100
101impl TextIndex {
102    /// Construct an empty index.
103    #[must_use]
104    pub fn new() -> Self {
105        Self {
106            postings: Postings::new(),
107            docs: BTreeMap::new(),
108            next_doc_id: 0,
109        }
110    }
111
112    /// Number of documents currently in the index.
113    #[must_use]
114    pub fn doc_count(&self) -> usize {
115        self.docs.len()
116    }
117
118    /// Borrow the inverted postings index.
119    #[must_use]
120    pub fn postings(&self) -> &Postings {
121        &self.postings
122    }
123
124    /// Borrow the document store.
125    #[must_use]
126    pub fn docs(&self) -> &BTreeMap<u32, IndexedDoc> {
127        &self.docs
128    }
129
130    /// Insert `text` and return the assigned doc id.
131    ///
132    /// The doc id is assigned monotonically; doc ids are not
133    /// recycled, so a removed id is not handed out again.
134    pub fn insert(&mut self, text: Vec<u8>) -> u32 {
135        let doc_id = self.next_doc_id;
136        self.next_doc_id = self
137            .next_doc_id
138            .checked_add(1)
139            .expect("invariant: doc ids fit in u32; saturate at 2^32-1 by removing old docs");
140
141        let tris = trigram::extract_trigram_set(&text);
142        for t in &tris {
143            self.postings.insert(*t, doc_id);
144        }
145        let doc = IndexedDoc::new(text);
146        self.docs.insert(doc_id, doc);
147        doc_id
148    }
149
150    /// Remove the document at `doc_id` and return its raw
151    /// bytes, if any.
152    ///
153    /// All trigram entries for the doc are pulled from the
154    /// postings index. A trigram whose postings list becomes
155    /// empty after removal is garbage collected.
156    pub fn remove(&mut self, doc_id: u32) -> Option<Vec<u8>> {
157        let doc = self.docs.remove(&doc_id)?;
158        let tris = trigram::extract_trigram_set(&doc.text);
159        for t in &tris {
160            self.postings.remove(*t, doc_id);
161        }
162        Some(doc.text)
163    }
164
165    /// Search for documents whose text contains `query` as a
166    /// contiguous byte substring.
167    ///
168    /// Results are returned in insertion order. A document that
169    /// has been removed is not returned even if its postings
170    /// entries were missed by a buggy remove (we always
171    /// re-verify against the doc store).
172    ///
173    /// Queries shorter than [`MIN_TRIGRAM_QUERY_LEN`] cannot be
174    /// resolved through the trigram index and fall back to a
175    /// full scan.
176    #[must_use]
177    pub fn search_substring(&self, query: &[u8]) -> Vec<u32> {
178        if query.is_empty() {
179            // An empty substring matches every doc by
180            // definition; preserve insertion order.
181            return self.docs.keys().copied().collect();
182        }
183
184        if query.len() < MIN_TRIGRAM_QUERY_LEN {
185            return self.full_scan(query);
186        }
187
188        let qtris = trigram::extract_query_trigram_set(query);
189        if qtris.is_empty() {
190            return self.full_scan(query);
191        }
192
193        // Tier 2: postings intersection.
194        let candidates = self.postings.intersect(&qtris);
195        if candidates.is_empty() {
196            return Vec::new();
197        }
198
199        let mut hits: Vec<u32> = Vec::new();
200        for doc_id in &candidates {
201            let Some(doc) = self.docs.get(&doc_id) else {
202                continue;
203            };
204            // Tier 3: per-doc bloom filter.
205            if !qtris.iter().all(|t| doc.bloom.contains(&t.to_le_bytes())) {
206                continue;
207            }
208            // Tier 4: real substring recheck.
209            if Self::contains_substring(&doc.text, query) {
210                hits.push(doc_id);
211            }
212        }
213        // The Roaring bitmap iterates ascending, which is also
214        // insertion order because doc ids are monotonic. Sort
215        // anyway to make the contract explicit.
216        hits.sort_unstable();
217        hits
218    }
219
220    /// Full scan over the document store; used for queries too
221    /// short for the trigram index.
222    fn full_scan(&self, query: &[u8]) -> Vec<u32> {
223        let mut out = Vec::new();
224        for (id, doc) in &self.docs {
225            if Self::contains_substring(&doc.text, query) {
226                out.push(*id);
227            }
228        }
229        out
230    }
231
232    /// Search for documents whose text matches `pattern` as a
233    /// regular expression.
234    ///
235    /// The query path is the same four-tier filter funnel as
236    /// [`Self::search_substring`], plus a Phase-2 prefix
237    /// extraction step:
238    ///
239    /// 1. Parse `pattern` into the internal AST and extract
240    ///    the trigrams that any matching string MUST contain
241    ///    (see [`crate::prefix_extract`]).
242    /// 2. Intersect those trigrams' postings lists into a
243    ///    candidate doc-id set. If the AST cannot be lowered
244    ///    (named capture group, etc.) or yields no required
245    ///    trigrams, fall back to scanning every doc.
246    /// 3. Per-doc bloom filter recheck (skipped on full scan).
247    /// 4. Compile the pattern with [`regex::bytes::Regex`] and
248    ///    re-run it against each candidate's stored bytes.
249    ///
250    /// Results are returned in insertion order.
251    ///
252    /// # Errors
253    ///
254    /// Returns [`RegexError::Parse`] if the pattern is
255    /// syntactically invalid or uses a regex feature that the
256    /// underlying `regex` crate does not support (lookarounds,
257    /// backreferences, ...). A pattern that parses cleanly but
258    /// trips the prefix extractor's unsupported-feature path
259    /// (named capture groups) does NOT surface as an error:
260    /// the search still runs, just via the slower full-scan +
261    /// recheck path.
262    pub fn search_regex(&self, pattern: &str) -> Result<Vec<u32>, RegexError> {
263        // Compile the matcher first; a pattern the matcher
264        // cannot handle is a hard error to the caller. We use
265        // `regex::bytes::Regex` because the corpus is byte
266        // slices, not UTF-8.
267        let re = regex::bytes::Regex::new(pattern).map_err(|e| RegexError::Parse(e.to_string()))?;
268
269        // Required trigrams from the AST. If extraction fails
270        // (PrefixUnsupported) or yields no constraint, we have
271        // to scan every doc; the recheck still runs correctly.
272        let trigram_hashes: Vec<u64> = match regex_ast::parse(pattern) {
273            Ok(ast) => prefix_extract::required_trigram_hashes(&ast),
274            Err(_) => Vec::new(),
275        };
276
277        let candidates: Vec<u32> = if trigram_hashes.is_empty() {
278            self.docs.keys().copied().collect()
279        } else {
280            self.postings.intersect(&trigram_hashes).iter().collect()
281        };
282
283        let mut hits: Vec<u32> = Vec::new();
284        for doc_id in candidates {
285            let Some(doc) = self.docs.get(&doc_id) else {
286                continue;
287            };
288            // Tier 3: per-doc bloom filter -- only meaningful
289            // when we have required trigrams. On a full scan
290            // it would be a tautology because we have no
291            // membership query to make.
292            if !trigram_hashes.is_empty()
293                && !trigram_hashes
294                    .iter()
295                    .all(|t| doc.bloom.contains(&t.to_le_bytes()))
296            {
297                continue;
298            }
299            // Tier 4: real regex recheck.
300            if re.is_match(&doc.text) {
301                hits.push(doc_id);
302            }
303        }
304        hits.sort_unstable();
305        Ok(hits)
306    }
307
308    /// Byte-level substring match.
309    fn contains_substring(haystack: &[u8], needle: &[u8]) -> bool {
310        if needle.is_empty() {
311            return true;
312        }
313        if needle.len() > haystack.len() {
314            return false;
315        }
316        haystack.windows(needle.len()).any(|w| w == needle)
317    }
318
319    /// Search for documents that match `pattern` as an
320    /// approximate POSIX extended regular expression with up
321    /// to `max_errors` edit operations.
322    ///
323    /// This is the Phase 3 entry point for the TRE-backed
324    /// recheck. The current implementation does a full scan
325    /// over the document store: every doc is fed to a single
326    /// compiled `TreCompiledPattern`. Phase 2 will add a
327    /// regex prefix extractor that lets us restrict the scan
328    /// to a trigram-postings-derived candidate set; the
329    /// signature here is forward-compatible with that change.
330    ///
331    /// Results are returned in ascending document-id order,
332    /// which equals insertion order because doc ids are
333    /// monotonic.
334    pub fn search_regex_approx(
335        &self,
336        pattern: &str,
337        max_errors: u16,
338    ) -> Result<Vec<u32>, TreError> {
339        let opts = TreMatchOpts {
340            max_errors,
341            ..TreMatchOpts::default()
342        };
343        let pat = TreCompiledPattern::compile(pattern.as_bytes(), opts)?;
344
345        let mut hits = Vec::new();
346        for (id, doc) in &self.docs {
347            if pat.is_match(&doc.text) {
348                hits.push(*id);
349            }
350        }
351        Ok(hits)
352    }
353}
354
355#[cfg(test)]
356mod tests {
357    use super::*;
358
359    #[test]
360    fn insert_then_search_finds_the_doc() {
361        let mut idx = TextIndex::new();
362        let id = idx.insert(b"hello world".to_vec());
363        let hits = idx.search_substring(b"hello");
364        assert_eq!(hits, vec![id]);
365    }
366
367    #[test]
368    fn search_substring_returns_only_true_positives() {
369        let mut idx = TextIndex::new();
370        let a = idx.insert(b"the quick brown fox".to_vec());
371        let _b = idx.insert(b"jumped over a lazy dog".to_vec());
372        let c = idx.insert(b"a brown fox is quick".to_vec());
373        let hits = idx.search_substring(b"brown fox");
374        // Only docs containing the literal byte sequence
375        // "brown fox" qualify.
376        assert!(hits.contains(&a));
377        assert!(hits.contains(&c));
378        assert_eq!(hits.len(), 2);
379    }
380
381    #[test]
382    fn search_substring_no_false_negatives_on_corpus() {
383        let mut store = TextIndex::new();
384        let corpus: &[&[u8]] = &[
385            b"alpha beta gamma",
386            b"beta cake",
387            b"the alphabet starts with alpha",
388            b"omega only",
389        ];
390        let ids: Vec<u32> = corpus.iter().map(|t| store.insert(t.to_vec())).collect();
391        for q in [b"alpha".as_slice(), b"beta", b"omega", b"the"] {
392            let hits = store.search_substring(q);
393            for (i, doc) in corpus.iter().enumerate() {
394                if doc.windows(q.len()).any(|w| w == q) {
395                    assert!(
396                        hits.contains(&ids[i]),
397                        "false negative: query {q:?} should hit doc {i} {doc:?}",
398                    );
399                }
400            }
401        }
402    }
403
404    #[test]
405    fn search_returns_results_in_insertion_order() {
406        let mut idx = TextIndex::new();
407        let id_a = idx.insert(b"hello a".to_vec());
408        let id_b = idx.insert(b"hello b".to_vec());
409        let id_c = idx.insert(b"hello c".to_vec());
410        let hits = idx.search_substring(b"hello");
411        assert_eq!(hits, vec![id_a, id_b, id_c]);
412    }
413
414    #[test]
415    fn remove_excludes_doc_from_subsequent_searches() {
416        let mut idx = TextIndex::new();
417        let a = idx.insert(b"the quick brown fox".to_vec());
418        let b = idx.insert(b"another brown fox here".to_vec());
419        let removed = idx.remove(a).expect("doc a present");
420        assert_eq!(removed, b"the quick brown fox");
421        let hits = idx.search_substring(b"brown fox");
422        assert_eq!(hits, vec![b]);
423    }
424
425    #[test]
426    fn remove_garbage_collects_unique_trigrams() {
427        let mut idx = TextIndex::new();
428        let a = idx.insert(b"unique-string-only-here".to_vec());
429        let postings_before = idx.postings().len();
430        assert!(postings_before > 0);
431        idx.remove(a);
432        // After removing the only doc, every postings entry
433        // should be empty and gone.
434        assert_eq!(idx.postings().len(), 0);
435        assert_eq!(idx.doc_count(), 0);
436    }
437
438    #[test]
439    fn remove_missing_doc_id_returns_none() {
440        let mut idx = TextIndex::new();
441        idx.insert(b"abc".to_vec());
442        assert!(idx.remove(9999).is_none());
443    }
444
445    #[test]
446    fn query_shorter_than_three_chars_uses_full_scan() {
447        let mut idx = TextIndex::new();
448        let a = idx.insert(b"abcdef".to_vec());
449        let _b = idx.insert(b"xyz".to_vec());
450        let c = idx.insert(b"ab".to_vec());
451        let hits = idx.search_substring(b"ab");
452        assert!(hits.contains(&a));
453        assert!(hits.contains(&c));
454        assert_eq!(hits.len(), 2);
455    }
456
457    #[test]
458    fn empty_query_matches_every_doc() {
459        let mut idx = TextIndex::new();
460        let a = idx.insert(b"x".to_vec());
461        let b = idx.insert(b"y".to_vec());
462        let hits = idx.search_substring(b"");
463        assert_eq!(hits, vec![a, b]);
464    }
465
466    #[test]
467    fn unicode_query_byte_level_works() {
468        let mut idx = TextIndex::new();
469        // "cafe" with combining e-acute (UTF-8 bytes
470        // 0xC3 0xA9). Insert one doc with the e-acute form,
471        // one with plain ASCII, and search for the e-acute
472        // suffix.
473        let a = idx.insert(b"caf\xc3\xa9 noir".to_vec());
474        let b = idx.insert(b"cafe noir".to_vec());
475        let hits = idx.search_substring(b"\xc3\xa9");
476        assert_eq!(hits, vec![a]);
477        let hits = idx.search_substring(b"noir");
478        // Both contain the literal bytes "noir".
479        assert!(hits.contains(&a));
480        assert!(hits.contains(&b));
481        assert_eq!(hits.len(), 2);
482    }
483
484    #[test]
485    fn search_for_nonexistent_substring_returns_empty() {
486        let mut idx = TextIndex::new();
487        idx.insert(b"hello world".to_vec());
488        idx.insert(b"another doc".to_vec());
489        assert!(idx.search_substring(b"completely-absent").is_empty());
490    }
491
492    #[test]
493    fn search_on_empty_index_returns_empty() {
494        let idx = TextIndex::new();
495        assert!(idx.search_substring(b"anything").is_empty());
496        // The empty-query special case still returns an empty
497        // vec when there are no docs to enumerate.
498        assert!(idx.search_substring(b"").is_empty());
499    }
500}
dyntext/index.rs

dyntext/
index.rs