Skip to main content

dyntext/
index.rs

1//! Top-level [`TextIndex`] type combining trigram extraction,
2//! the inverted postings index, and the per-document bloom
3//! filter into a single insert-and-search facility.
4//!
5//! # Search algorithm
6//!
7//! The substring query path is the four-tier filter funnel from
8//! the design doc:
9//!
10//! 1. Extract the query's trigrams.
11//! 2. Intersect their postings lists into a candidate doc-id
12//!    set (tier 2).
13//! 3. For each candidate doc, check the per-document bloom
14//!    filter for the query's trigrams. This is a cheap
15//!    membership test that usually agrees with the postings
16//!    intersection but acts as defence in depth (tier 3).
17//! 4. For each survivor, run a real substring match against
18//!    the doc's stored bytes (tier 4 / final recheck).
19//!
20//! Results are returned in insertion order so callers can rely
21//! on a deterministic ranking.
22//!
23//! # Short queries
24//!
25//! Queries shorter than [`MIN_TRIGRAM_QUERY_LEN`] cannot be
26//! resolved through the trigram index and fall back to a full
27//! scan of the stored corpus. Callers that want to avoid the
28//! full-scan cost should reject short queries themselves.
29
30use std::collections::BTreeMap;
31
32use rayon::prelude::*;
33use serde::{Deserialize, Serialize};
34
35use crate::bloom::BloomFilter;
36use crate::postings::Postings;
37use crate::prefix_extract;
38use crate::regex_ast::{self, RegexError};
39use crate::tiling::ApproxFilter;
40use crate::tre::{TreCompiledPattern, TreError, TreMatchOpts};
41use crate::trigram;
42
43/// Minimum query length in bytes that the trigram index can
44/// directly serve. Shorter queries fall back to a full scan
45/// over the stored corpus.
46pub const MIN_TRIGRAM_QUERY_LEN: usize = 3;
47
48/// Default expected trigram count per document for sizing the
49/// per-document bloom filter. Most short text fields produce
50/// far fewer; a tighter sizing keeps the per-doc memory
51/// footprint reasonable.
52const DEFAULT_BLOOM_N: usize = 256;
53
54/// Default false positive rate for the per-document bloom.
55const DEFAULT_BLOOM_FP: f64 = 0.01;
56
57/// Threshold above which `search_regex_approx` parallelises
58/// the per-doc TRE recheck. Below this size the rayon
59/// scheduling overhead is comparable to the savings, so the
60/// sequential path is faster. Tuned empirically against the
61/// 256-byte alnum bench corpus.
62const PARALLEL_RECHECK_THRESHOLD: usize = 1024;
63
64/// Chunk size used by the parallel-recheck path. Enough work
65/// per task to amortise rayon's task-spawn overhead, small
66/// enough to balance load across cores. Empirically 256 docs
67/// per chunk hits the sweet spot for the 100k-doc corpus on
68/// an 8-core box.
69const PARALLEL_RECHECK_CHUNK_SIZE: usize = 256;
70
71/// One indexed document: its raw text (for tier-4 substring
72/// recheck) and its trigram bloom filter (for tier-3 cheap
73/// recheck).
74#[derive(Debug, Clone, Serialize, Deserialize)]
75pub struct IndexedDoc {
76    /// Original byte sequence as inserted.
77    pub text: Vec<u8>,
78    /// Bloom filter over the doc's trigram set.
79    pub bloom: BloomFilter,
80}
81
82impl IndexedDoc {
83    /// Construct an indexed doc by computing its trigram set
84    /// and bloom filter from `text`.
85    fn new(text: Vec<u8>) -> Self {
86        let tris = trigram::extract_trigram_set(&text);
87        let mut bloom =
88            BloomFilter::with_size_and_fp_rate(DEFAULT_BLOOM_N.max(tris.len()), DEFAULT_BLOOM_FP);
89        for t in &tris {
90            bloom.insert(&t.to_le_bytes());
91        }
92        Self { text, bloom }
93    }
94}
95
96/// Trigram-based text index supporting incremental insert,
97/// remove, and exact-substring search.
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct TextIndex {
100    /// Inverted index of trigram hash to doc-id bitmap.
101    postings: Postings,
102    /// Per-doc state. The map is keyed by doc id; iteration
103    /// order is therefore insertion order because doc ids are
104    /// monotonically assigned.
105    docs: BTreeMap<u32, IndexedDoc>,
106    /// Next doc id to hand out. Starts at 0; never recycled
107    /// (a removed doc id is gone forever).
108    next_doc_id: u32,
109}
110
111impl Default for TextIndex {
112    fn default() -> Self {
113        Self::new()
114    }
115}
116
117impl TextIndex {
118    /// Construct an empty index.
119    #[must_use]
120    pub fn new() -> Self {
121        Self {
122            postings: Postings::new(),
123            docs: BTreeMap::new(),
124            next_doc_id: 0,
125        }
126    }
127
128    /// Number of documents currently in the index.
129    #[must_use]
130    pub fn doc_count(&self) -> usize {
131        self.docs.len()
132    }
133
134    /// Borrow the inverted postings index.
135    #[must_use]
136    pub fn postings(&self) -> &Postings {
137        &self.postings
138    }
139
140    /// Borrow the document store.
141    #[must_use]
142    pub fn docs(&self) -> &BTreeMap<u32, IndexedDoc> {
143        &self.docs
144    }
145
146    /// Insert `text` and return the assigned doc id.
147    ///
148    /// The doc id is assigned monotonically; doc ids are not
149    /// recycled, so a removed id is not handed out again.
150    pub fn insert(&mut self, text: Vec<u8>) -> u32 {
151        let doc_id = self.next_doc_id;
152        self.next_doc_id = self
153            .next_doc_id
154            .checked_add(1)
155            .expect("invariant: doc ids fit in u32; saturate at 2^32-1 by removing old docs");
156
157        let tris = trigram::extract_trigram_set(&text);
158        for t in &tris {
159            self.postings.insert(*t, doc_id);
160        }
161        let doc = IndexedDoc::new(text);
162        self.docs.insert(doc_id, doc);
163        doc_id
164    }
165
166    /// Remove the document at `doc_id` and return its raw
167    /// bytes, if any.
168    ///
169    /// All trigram entries for the doc are pulled from the
170    /// postings index. A trigram whose postings list becomes
171    /// empty after removal is garbage collected.
172    pub fn remove(&mut self, doc_id: u32) -> Option<Vec<u8>> {
173        let doc = self.docs.remove(&doc_id)?;
174        let tris = trigram::extract_trigram_set(&doc.text);
175        for t in &tris {
176            self.postings.remove(*t, doc_id);
177        }
178        Some(doc.text)
179    }
180
181    /// Search for documents whose text contains `query` as a
182    /// contiguous byte substring.
183    ///
184    /// Results are returned in insertion order. A document that
185    /// has been removed is not returned even if its postings
186    /// entries were missed by a buggy remove (we always
187    /// re-verify against the doc store).
188    ///
189    /// Queries shorter than [`MIN_TRIGRAM_QUERY_LEN`] cannot be
190    /// resolved through the trigram index and fall back to a
191    /// full scan.
192    #[must_use]
193    pub fn search_substring(&self, query: &[u8]) -> Vec<u32> {
194        if query.is_empty() {
195            // An empty substring matches every doc by
196            // definition; preserve insertion order.
197            return self.docs.keys().copied().collect();
198        }
199
200        if query.len() < MIN_TRIGRAM_QUERY_LEN {
201            return self.full_scan(query);
202        }
203
204        let qtris = trigram::extract_query_trigram_set(query);
205        if qtris.is_empty() {
206            return self.full_scan(query);
207        }
208
209        // Tier 2: postings intersection.
210        let candidates = self.postings.intersect(&qtris);
211        if candidates.is_empty() {
212            return Vec::new();
213        }
214
215        let mut hits: Vec<u32> = Vec::new();
216        for doc_id in &candidates {
217            let Some(doc) = self.docs.get(&doc_id) else {
218                continue;
219            };
220            // Tier 3: per-doc bloom filter.
221            if !qtris.iter().all(|t| doc.bloom.contains(&t.to_le_bytes())) {
222                continue;
223            }
224            // Tier 4: real substring recheck.
225            if Self::contains_substring(&doc.text, query) {
226                hits.push(doc_id);
227            }
228        }
229        // The Roaring bitmap iterates ascending, which is also
230        // insertion order because doc ids are monotonic. Sort
231        // anyway to make the contract explicit.
232        hits.sort_unstable();
233        hits
234    }
235
236    /// Full scan over the document store; used for queries too
237    /// short for the trigram index.
238    fn full_scan(&self, query: &[u8]) -> Vec<u32> {
239        let mut out = Vec::new();
240        for (id, doc) in &self.docs {
241            if Self::contains_substring(&doc.text, query) {
242                out.push(*id);
243            }
244        }
245        out
246    }
247
248    /// Search for documents whose text matches `pattern` as a
249    /// regular expression.
250    ///
251    /// The query path is the same four-tier filter funnel as
252    /// [`Self::search_substring`], plus a Phase-2 prefix
253    /// extraction step:
254    ///
255    /// 1. Parse `pattern` into the internal AST and extract
256    ///    the trigrams that any matching string MUST contain
257    ///    (see [`crate::prefix_extract`]).
258    /// 2. Intersect those trigrams' postings lists into a
259    ///    candidate doc-id set. If the AST cannot be lowered
260    ///    (named capture group, etc.) or yields no required
261    ///    trigrams, fall back to scanning every doc.
262    /// 3. Per-doc bloom filter recheck (skipped on full scan).
263    /// 4. If the AST starts with `^literal`, prune candidates
264    ///    whose first bytes do not equal the literal prefix.
265    /// 5. Compile the pattern with [`regex::bytes::Regex`] and
266    ///    re-run it against each candidate's stored bytes.
267    ///
268    /// Results are returned in insertion order.
269    ///
270    /// # Errors
271    ///
272    /// Returns [`RegexError::Parse`] if the pattern is
273    /// syntactically invalid or uses a regex feature that the
274    /// underlying `regex` crate does not support (lookarounds,
275    /// backreferences, ...). A pattern that parses cleanly but
276    /// trips the prefix extractor's unsupported-feature path
277    /// (named capture groups) does NOT surface as an error:
278    /// the search still runs, just via the slower full-scan +
279    /// recheck path.
280    pub fn search_regex(&self, pattern: &str) -> Result<Vec<u32>, RegexError> {
281        // Compile the matcher first; a pattern the matcher
282        // cannot handle is a hard error to the caller. We use
283        // `regex::bytes::Regex` because the corpus is byte
284        // slices, not UTF-8.
285        let re = regex::bytes::Regex::new(pattern).map_err(|e| RegexError::Parse(e.to_string()))?;
286
287        // Required trigrams from the AST. If extraction fails
288        // (PrefixUnsupported) or yields no constraint, we have
289        // to scan every doc; the recheck still runs correctly.
290        let ast = regex_ast::parse(pattern).ok();
291        let trigram_hashes: Vec<u64> = ast
292            .as_ref()
293            .map(prefix_extract::required_trigram_hashes)
294            .unwrap_or_default();
295        let anchored_prefix = ast.as_ref().and_then(prefix_extract::anchored_prefix);
296
297        let candidates: Vec<u32> = if trigram_hashes.is_empty() {
298            self.docs.keys().copied().collect()
299        } else {
300            self.postings.intersect(&trigram_hashes).iter().collect()
301        };
302
303        let mut hits: Vec<u32> = Vec::new();
304        for doc_id in candidates {
305            let Some(doc) = self.docs.get(&doc_id) else {
306                continue;
307            };
308            // Tier 3: per-doc bloom filter -- only meaningful
309            // when we have required trigrams. On a full scan
310            // it would be a tautology because we have no
311            // membership query to make.
312            if !trigram_hashes.is_empty()
313                && !trigram_hashes
314                    .iter()
315                    .all(|t| doc.bloom.contains(&t.to_le_bytes()))
316            {
317                continue;
318            }
319            // Anchor fast-path: if the pattern is anchored and
320            // followed by a literal prefix, reject docs whose
321            // first bytes do not equal the prefix. This is
322            // cheap (a single byte-compare), sound for K=0,
323            // and lets the much heavier `regex` matcher off
324            // the hook for the common `^literal\w+...` shape.
325            if let Some(prefix) = anchored_prefix.as_ref() {
326                if doc.text.len() < prefix.len() || &doc.text[..prefix.len()] != prefix.as_slice() {
327                    continue;
328                }
329            }
330            // Tier 4: real regex recheck.
331            if re.is_match(&doc.text) {
332                hits.push(doc_id);
333            }
334        }
335        hits.sort_unstable();
336        Ok(hits)
337    }
338
339    /// Byte-level substring match.
340    fn contains_substring(haystack: &[u8], needle: &[u8]) -> bool {
341        if needle.is_empty() {
342            return true;
343        }
344        if needle.len() > haystack.len() {
345            return false;
346        }
347        haystack.windows(needle.len()).any(|w| w == needle)
348    }
349
350    /// Search for documents that match `pattern` as an
351    /// approximate POSIX extended regular expression with up
352    /// to `max_errors` edit operations.
353    ///
354    /// The path mirrors [`Self::search_regex`] but the tier-4
355    /// recheck delegates to [`TreCompiledPattern`] instead of
356    /// the std `regex` matcher because TRE is the only engine
357    /// in the workspace that implements approximate match
358    /// semantics. The pre-recheck filter funnel uses the
359    /// pigeonhole bound `surviving_trigrams >= T - 3k`
360    /// (see [`ApproxFilter`]) to stay sound under the edit
361    /// budget.
362    ///
363    /// For a `^literal...` pattern the anchor fast-path
364    /// rejects every candidate whose first bytes are too far
365    /// (in Hamming distance) from the literal prefix. For
366    /// `max_errors == 0` this is a byte-equality check; for
367    /// `max_errors >= 1` it is a Hamming-distance check
368    /// against the prefix length, which is sound for the
369    /// substitution-only case TRE optimises and conservative
370    /// (always returns `true` for prefixes whose Hamming
371    /// distance is within `max_errors`) for the
372    /// insert/delete cases.
373    ///
374    /// When the surviving candidate set after filtering is
375    /// large (`>= PARALLEL_RECHECK_THRESHOLD`), the per-doc
376    /// TRE recheck is dispatched to a Rayon parallel iterator
377    /// to fan the cost across CPU cores. TRE's compiled
378    /// `regex_t` is `!Send`, so each parallel worker compiles
379    /// its own copy from the original pattern bytes.
380    ///
381    /// Results are returned in ascending document-id order.
382    ///
383    /// # Errors
384    ///
385    /// Returns [`TreError::Compile`] if the pattern fails to
386    /// compile under the given options.
387    pub fn search_regex_approx(
388        &self,
389        pattern: &str,
390        max_errors: u16,
391    ) -> Result<Vec<u32>, TreError> {
392        let opts = TreMatchOpts {
393            max_errors,
394            ..TreMatchOpts::default()
395        };
396        let pat = TreCompiledPattern::compile(pattern.as_bytes(), opts)?;
397
398        // Build the trigram filter and the anchor fast-path
399        // from the AST. A pattern that does not lower into the
400        // internal AST (named captures, regex-syntax features
401        // we don't model) falls back to the no-filter scan.
402        let ast = regex_ast::parse(pattern).ok();
403        let filter = ast.as_ref().map_or_else(
404            || ApproxFilter {
405                trigrams: Vec::new(),
406                min_required: 0,
407            },
408            |a| ApproxFilter::build(a, max_errors),
409        );
410        let anchored_prefix = ast.as_ref().and_then(prefix_extract::anchored_prefix);
411
412        // Tier 3 + anchor fast-path: cheap rejections before
413        // we hand the doc to TRE.
414        let prefilter = |doc: &IndexedDoc| -> bool {
415            if filter.is_active() && !filter.passes(&doc.bloom) {
416                return false;
417            }
418            if let Some(prefix) = anchored_prefix.as_ref() {
419                if !anchor_prefix_compatible(&doc.text, prefix, max_errors) {
420                    return false;
421                }
422            }
423            true
424        };
425
426        // First sweep: build the (doc_id, &text) survivor list.
427        // When the filter is active we walk the candidate vec
428        // returned by the postings union; when it is inactive
429        // we iterate `self.docs` directly to avoid a chain of
430        // 100k BTreeMap lookups that would dominate the cost
431        // of the per-query setup.
432        let survivors: Vec<(u32, &[u8])> = if filter.is_active() {
433            let candidates = filter.candidates(&self.postings);
434            candidates
435                .into_iter()
436                .filter_map(|doc_id| {
437                    let doc = self.docs.get(&doc_id)?;
438                    if !prefilter(doc) {
439                        return None;
440                    }
441                    Some((doc_id, doc.text.as_slice()))
442                })
443                .collect()
444        } else {
445            self.docs
446                .iter()
447                .filter_map(|(id, doc)| {
448                    if !prefilter(doc) {
449                        return None;
450                    }
451                    Some((*id, doc.text.as_slice()))
452                })
453                .collect()
454        };
455
456        let hits: Vec<u32> = if survivors.len() >= PARALLEL_RECHECK_THRESHOLD {
457            run_parallel_recheck(pattern, opts, &survivors)?
458        } else {
459            survivors
460                .into_iter()
461                .filter_map(|(id, text)| if pat.is_match(text) { Some(id) } else { None })
462                .collect()
463        };
464
465        let mut out = hits;
466        out.sort_unstable();
467        Ok(out)
468    }
469}
470
471/// Whether `doc` could plausibly be the start of an
472/// approximate match for the anchored literal `prefix` under
473/// an edit budget of `max_errors`.
474///
475/// The check is sound (never returns `false` when the doc
476/// could match) and cheap (compares at most
477/// `prefix.len() + max_errors` bytes per shift).
478///
479/// Strategy: iterate every alignment offset in
480/// `[0, max_errors]` and compute the edit distance between
481/// `prefix` and the corresponding doc prefix using a banded
482/// dynamic-programming table of width `2*max_errors + 1`. If
483/// any alignment is within `max_errors` edits, accept.
484///
485/// For `max_errors == 0` this collapses to a byte-equality
486/// check on the prefix length. For small `max_errors` (the
487/// common case in dyntext queries) the check costs
488/// O(prefix.len() * (2 * max_errors + 1)) byte comparisons.
489fn anchor_prefix_compatible(doc: &[u8], prefix: &[u8], max_errors: u16) -> bool {
490    if max_errors == 0 {
491        return doc.len() >= prefix.len() && &doc[..prefix.len()] == prefix;
492    }
493    let k = usize::from(max_errors);
494    // Restrict the doc window to the first prefix.len() + k
495    // bytes; an anchored match cannot reach further. If the
496    // doc is shorter than prefix.len() - k we still try -- the
497    // edit distance computation handles the length mismatch.
498    let window_end = (prefix.len() + k).min(doc.len());
499    let window = &doc[..window_end];
500    let dist = bounded_edit_distance(prefix, window, k);
501    dist <= k
502}
503
504/// Compute the prefix-edit distance between `pat` and `txt`,
505/// returning the smallest number of edits that turns `pat` into
506/// some prefix of `txt`. The result is capped at `k + 1` so
507/// the caller can reject early when the budget is exceeded.
508///
509/// This is the standard banded edit-distance recurrence,
510/// O(|pat| * (2 * k + 1)). It is symmetric in the
511/// substitution / insertion / deletion costs (all 1).
512fn bounded_edit_distance(pat: &[u8], txt: &[u8], k: usize) -> usize {
513    let m = pat.len();
514    let n = txt.len();
515    if m == 0 {
516        return 0;
517    }
518    // dp[i][j] = edits to turn pat[..i] into some prefix of
519    // txt[..j]. Only cells with |i - j| <= k matter; we keep
520    // two rows to avoid the |pat| x |txt| matrix.
521    let mut prev = vec![usize::MAX; n + 1];
522    let mut curr = vec![usize::MAX; n + 1];
523    prev[0] = 0;
524    for cell in prev.iter_mut().take(n.min(k) + 1).skip(1) {
525        *cell = 0; // free leading text characters (prefix-match)
526    }
527    for i in 1..=m {
528        curr[0] = i;
529        let lo = i.saturating_sub(k);
530        let hi = (i + k).min(n);
531        for j in 1..=n {
532            if j < lo || j > hi {
533                curr[j] = usize::MAX;
534                continue;
535            }
536            let cost = usize::from(pat[i - 1] != txt[j - 1]);
537            let sub = prev[j - 1].saturating_add(cost);
538            let del = prev[j].saturating_add(1);
539            let ins = curr[j - 1].saturating_add(1);
540            curr[j] = sub.min(del).min(ins);
541        }
542        std::mem::swap(&mut prev, &mut curr);
543    }
544    // Best alignment: the smallest dp[m][j] over j in
545    // [m - k, m + k] intersected with [0, n].
546    let lo = m.saturating_sub(k);
547    let hi = (m + k).min(n);
548    let mut best = usize::MAX;
549    for cell in prev.iter().take(hi + 1).skip(lo) {
550        best = best.min(*cell);
551    }
552    best.min(k + 1)
553}
554
555/// Run the TRE recheck across `survivors` in parallel.
556///
557/// `TreCompiledPattern` is `!Send`, so the parallel iterator
558/// compiles a fresh pattern in each worker via
559/// [`rayon::iter::ParallelIterator::map_init`]. The compile
560/// cost is amortised across the worker's chunk of survivors,
561/// which is much smaller than the per-doc TRE match cost.
562fn run_parallel_recheck(
563    pattern: &str,
564    opts: TreMatchOpts,
565    survivors: &[(u32, &[u8])],
566) -> Result<Vec<u32>, TreError> {
567    use std::sync::atomic::{AtomicBool, Ordering};
568    let pattern_bytes = pattern.as_bytes();
569    let compile_failed = AtomicBool::new(false);
570    let hits: Vec<u32> = survivors
571        .par_chunks(PARALLEL_RECHECK_CHUNK_SIZE)
572        .map_init(
573            || TreCompiledPattern::compile(pattern_bytes, opts).ok(),
574            |worker_pat: &mut Option<TreCompiledPattern>, chunk: &[(u32, &[u8])]| -> Vec<u32> {
575                let Some(pat) = worker_pat.as_ref() else {
576                    compile_failed.store(true, Ordering::Relaxed);
577                    return Vec::new();
578                };
579                let mut local = Vec::new();
580                for &(id, text) in chunk {
581                    if pat.is_match(text) {
582                        local.push(id);
583                    }
584                }
585                local
586            },
587        )
588        .flatten()
589        .collect();
590    if compile_failed.load(Ordering::Relaxed) {
591        return Err(TreError::Internal(
592            "parallel worker failed to compile pattern".into(),
593        ));
594    }
595    Ok(hits)
596}
597
598#[cfg(test)]
599mod tests {
600    use super::*;
601
602    #[test]
603    fn insert_then_search_finds_the_doc() {
604        let mut idx = TextIndex::new();
605        let id = idx.insert(b"hello world".to_vec());
606        let hits = idx.search_substring(b"hello");
607        assert_eq!(hits, vec![id]);
608    }
609
610    #[test]
611    fn search_substring_returns_only_true_positives() {
612        let mut idx = TextIndex::new();
613        let a = idx.insert(b"the quick brown fox".to_vec());
614        let _b = idx.insert(b"jumped over a lazy dog".to_vec());
615        let c = idx.insert(b"a brown fox is quick".to_vec());
616        let hits = idx.search_substring(b"brown fox");
617        // Only docs containing the literal byte sequence
618        // "brown fox" qualify.
619        assert!(hits.contains(&a));
620        assert!(hits.contains(&c));
621        assert_eq!(hits.len(), 2);
622    }
623
624    #[test]
625    fn search_substring_no_false_negatives_on_corpus() {
626        let mut store = TextIndex::new();
627        let corpus: &[&[u8]] = &[
628            b"alpha beta gamma",
629            b"beta cake",
630            b"the alphabet starts with alpha",
631            b"omega only",
632        ];
633        let ids: Vec<u32> = corpus.iter().map(|t| store.insert(t.to_vec())).collect();
634        for q in [b"alpha".as_slice(), b"beta", b"omega", b"the"] {
635            let hits = store.search_substring(q);
636            for (i, doc) in corpus.iter().enumerate() {
637                if doc.windows(q.len()).any(|w| w == q) {
638                    assert!(
639                        hits.contains(&ids[i]),
640                        "false negative: query {q:?} should hit doc {i} {doc:?}",
641                    );
642                }
643            }
644        }
645    }
646
647    #[test]
648    fn search_returns_results_in_insertion_order() {
649        let mut idx = TextIndex::new();
650        let id_a = idx.insert(b"hello a".to_vec());
651        let id_b = idx.insert(b"hello b".to_vec());
652        let id_c = idx.insert(b"hello c".to_vec());
653        let hits = idx.search_substring(b"hello");
654        assert_eq!(hits, vec![id_a, id_b, id_c]);
655    }
656
657    #[test]
658    fn remove_excludes_doc_from_subsequent_searches() {
659        let mut idx = TextIndex::new();
660        let a = idx.insert(b"the quick brown fox".to_vec());
661        let b = idx.insert(b"another brown fox here".to_vec());
662        let removed = idx.remove(a).expect("doc a present");
663        assert_eq!(removed, b"the quick brown fox");
664        let hits = idx.search_substring(b"brown fox");
665        assert_eq!(hits, vec![b]);
666    }
667
668    #[test]
669    fn remove_garbage_collects_unique_trigrams() {
670        let mut idx = TextIndex::new();
671        let a = idx.insert(b"unique-string-only-here".to_vec());
672        let postings_before = idx.postings().len();
673        assert!(postings_before > 0);
674        idx.remove(a);
675        // After removing the only doc, every postings entry
676        // should be empty and gone.
677        assert_eq!(idx.postings().len(), 0);
678        assert_eq!(idx.doc_count(), 0);
679    }
680
681    #[test]
682    fn remove_missing_doc_id_returns_none() {
683        let mut idx = TextIndex::new();
684        idx.insert(b"abc".to_vec());
685        assert!(idx.remove(9999).is_none());
686    }
687
688    #[test]
689    fn query_shorter_than_three_chars_uses_full_scan() {
690        let mut idx = TextIndex::new();
691        let a = idx.insert(b"abcdef".to_vec());
692        let _b = idx.insert(b"xyz".to_vec());
693        let c = idx.insert(b"ab".to_vec());
694        let hits = idx.search_substring(b"ab");
695        assert!(hits.contains(&a));
696        assert!(hits.contains(&c));
697        assert_eq!(hits.len(), 2);
698    }
699
700    #[test]
701    fn empty_query_matches_every_doc() {
702        let mut idx = TextIndex::new();
703        let a = idx.insert(b"x".to_vec());
704        let b = idx.insert(b"y".to_vec());
705        let hits = idx.search_substring(b"");
706        assert_eq!(hits, vec![a, b]);
707    }
708
709    #[test]
710    fn unicode_query_byte_level_works() {
711        let mut idx = TextIndex::new();
712        // "cafe" with combining e-acute (UTF-8 bytes
713        // 0xC3 0xA9). Insert one doc with the e-acute form,
714        // one with plain ASCII, and search for the e-acute
715        // suffix.
716        let a = idx.insert(b"caf\xc3\xa9 noir".to_vec());
717        let b = idx.insert(b"cafe noir".to_vec());
718        let hits = idx.search_substring(b"\xc3\xa9");
719        assert_eq!(hits, vec![a]);
720        let hits = idx.search_substring(b"noir");
721        // Both contain the literal bytes "noir".
722        assert!(hits.contains(&a));
723        assert!(hits.contains(&b));
724        assert_eq!(hits.len(), 2);
725    }
726
727    #[test]
728    fn search_for_nonexistent_substring_returns_empty() {
729        let mut idx = TextIndex::new();
730        idx.insert(b"hello world".to_vec());
731        idx.insert(b"another doc".to_vec());
732        assert!(idx.search_substring(b"completely-absent").is_empty());
733    }
734
735    #[test]
736    fn search_on_empty_index_returns_empty() {
737        let idx = TextIndex::new();
738        assert!(idx.search_substring(b"anything").is_empty());
739        // The empty-query special case still returns an empty
740        // vec when there are no docs to enumerate.
741        assert!(idx.search_substring(b"").is_empty());
742    }
743}