dyntext/index.rs
1//! Top-level [`TextIndex`] type combining trigram extraction,
2//! the inverted postings index, and the per-document bloom
3//! filter into a single insert-and-search facility.
4//!
5//! # Search algorithm
6//!
7//! The substring query path is the four-tier filter funnel from
8//! the design doc:
9//!
10//! 1. Extract the query's trigrams.
11//! 2. Intersect their postings lists into a candidate doc-id
12//! set (tier 2).
13//! 3. For each candidate doc, check the per-document bloom
14//! filter for the query's trigrams. This is a cheap
15//! membership test that usually agrees with the postings
16//! intersection but acts as defence in depth (tier 3).
17//! 4. For each survivor, run a real substring match against
18//! the doc's stored bytes (tier 4 / final recheck).
19//!
20//! Results are returned in insertion order so callers can rely
21//! on a deterministic ranking.
22//!
23//! # Short queries
24//!
25//! Queries shorter than [`MIN_TRIGRAM_QUERY_LEN`] cannot be
26//! resolved through the trigram index and fall back to a full
27//! scan of the stored corpus. Callers that want to avoid the
28//! full-scan cost should reject short queries themselves.
29
30use std::collections::BTreeMap;
31
32use rayon::prelude::*;
33use serde::{Deserialize, Serialize};
34
35use crate::bloom::BloomFilter;
36use crate::postings::Postings;
37use crate::prefix_extract;
38use crate::regex_ast::{self, RegexError};
39use crate::tiling::ApproxFilter;
40use crate::tre::{TreCompiledPattern, TreError, TreMatchOpts};
41use crate::trigram;
42
43/// Minimum query length in bytes that the trigram index can
44/// directly serve. Shorter queries fall back to a full scan
45/// over the stored corpus.
46pub const MIN_TRIGRAM_QUERY_LEN: usize = 3;
47
48/// Default expected trigram count per document for sizing the
49/// per-document bloom filter. Most short text fields produce
50/// far fewer; a tighter sizing keeps the per-doc memory
51/// footprint reasonable.
52const DEFAULT_BLOOM_N: usize = 256;
53
54/// Default false positive rate for the per-document bloom.
55const DEFAULT_BLOOM_FP: f64 = 0.01;
56
57/// Threshold above which `search_regex_approx` parallelises
58/// the per-doc TRE recheck. Below this size the rayon
59/// scheduling overhead is comparable to the savings, so the
60/// sequential path is faster. Tuned empirically against the
61/// 256-byte alnum bench corpus.
62const PARALLEL_RECHECK_THRESHOLD: usize = 1024;
63
64/// Chunk size used by the parallel-recheck path. Enough work
65/// per task to amortise rayon's task-spawn overhead, small
66/// enough to balance load across cores. Empirically 256 docs
67/// per chunk hits the sweet spot for the 100k-doc corpus on
68/// an 8-core box.
69const PARALLEL_RECHECK_CHUNK_SIZE: usize = 256;
70
71/// One indexed document: its raw text (for tier-4 substring
72/// recheck) and its trigram bloom filter (for tier-3 cheap
73/// recheck).
74#[derive(Debug, Clone, Serialize, Deserialize)]
75pub struct IndexedDoc {
76 /// Original byte sequence as inserted.
77 pub text: Vec<u8>,
78 /// Bloom filter over the doc's trigram set.
79 pub bloom: BloomFilter,
80}
81
82impl IndexedDoc {
83 /// Construct an indexed doc by computing its trigram set
84 /// and bloom filter from `text`.
85 fn new(text: Vec<u8>) -> Self {
86 let tris = trigram::extract_trigram_set(&text);
87 let mut bloom =
88 BloomFilter::with_size_and_fp_rate(DEFAULT_BLOOM_N.max(tris.len()), DEFAULT_BLOOM_FP);
89 for t in &tris {
90 bloom.insert(&t.to_le_bytes());
91 }
92 Self { text, bloom }
93 }
94}
95
96/// Trigram-based text index supporting incremental insert,
97/// remove, and exact-substring search.
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct TextIndex {
100 /// Inverted index of trigram hash to doc-id bitmap.
101 postings: Postings,
102 /// Per-doc state. The map is keyed by doc id; iteration
103 /// order is therefore insertion order because doc ids are
104 /// monotonically assigned.
105 docs: BTreeMap<u32, IndexedDoc>,
106 /// Next doc id to hand out. Starts at 0; never recycled
107 /// (a removed doc id is gone forever).
108 next_doc_id: u32,
109}
110
111impl Default for TextIndex {
112 fn default() -> Self {
113 Self::new()
114 }
115}
116
117impl TextIndex {
118 /// Construct an empty index.
119 #[must_use]
120 pub fn new() -> Self {
121 Self {
122 postings: Postings::new(),
123 docs: BTreeMap::new(),
124 next_doc_id: 0,
125 }
126 }
127
128 /// Number of documents currently in the index.
129 #[must_use]
130 pub fn doc_count(&self) -> usize {
131 self.docs.len()
132 }
133
134 /// Borrow the inverted postings index.
135 #[must_use]
136 pub fn postings(&self) -> &Postings {
137 &self.postings
138 }
139
140 /// Borrow the document store.
141 #[must_use]
142 pub fn docs(&self) -> &BTreeMap<u32, IndexedDoc> {
143 &self.docs
144 }
145
146 /// Insert `text` and return the assigned doc id.
147 ///
148 /// The doc id is assigned monotonically; doc ids are not
149 /// recycled, so a removed id is not handed out again.
150 pub fn insert(&mut self, text: Vec<u8>) -> u32 {
151 let doc_id = self.next_doc_id;
152 self.next_doc_id = self
153 .next_doc_id
154 .checked_add(1)
155 .expect("invariant: doc ids fit in u32; saturate at 2^32-1 by removing old docs");
156
157 let tris = trigram::extract_trigram_set(&text);
158 for t in &tris {
159 self.postings.insert(*t, doc_id);
160 }
161 let doc = IndexedDoc::new(text);
162 self.docs.insert(doc_id, doc);
163 doc_id
164 }
165
166 /// Remove the document at `doc_id` and return its raw
167 /// bytes, if any.
168 ///
169 /// All trigram entries for the doc are pulled from the
170 /// postings index. A trigram whose postings list becomes
171 /// empty after removal is garbage collected.
172 pub fn remove(&mut self, doc_id: u32) -> Option<Vec<u8>> {
173 let doc = self.docs.remove(&doc_id)?;
174 let tris = trigram::extract_trigram_set(&doc.text);
175 for t in &tris {
176 self.postings.remove(*t, doc_id);
177 }
178 Some(doc.text)
179 }
180
181 /// Search for documents whose text contains `query` as a
182 /// contiguous byte substring.
183 ///
184 /// Results are returned in insertion order. A document that
185 /// has been removed is not returned even if its postings
186 /// entries were missed by a buggy remove (we always
187 /// re-verify against the doc store).
188 ///
189 /// Queries shorter than [`MIN_TRIGRAM_QUERY_LEN`] cannot be
190 /// resolved through the trigram index and fall back to a
191 /// full scan.
192 #[must_use]
193 pub fn search_substring(&self, query: &[u8]) -> Vec<u32> {
194 if query.is_empty() {
195 // An empty substring matches every doc by
196 // definition; preserve insertion order.
197 return self.docs.keys().copied().collect();
198 }
199
200 if query.len() < MIN_TRIGRAM_QUERY_LEN {
201 return self.full_scan(query);
202 }
203
204 let qtris = trigram::extract_query_trigram_set(query);
205 if qtris.is_empty() {
206 return self.full_scan(query);
207 }
208
209 // Tier 2: postings intersection.
210 let candidates = self.postings.intersect(&qtris);
211 if candidates.is_empty() {
212 return Vec::new();
213 }
214
215 let mut hits: Vec<u32> = Vec::new();
216 for doc_id in &candidates {
217 let Some(doc) = self.docs.get(&doc_id) else {
218 continue;
219 };
220 // Tier 3: per-doc bloom filter.
221 if !qtris.iter().all(|t| doc.bloom.contains(&t.to_le_bytes())) {
222 continue;
223 }
224 // Tier 4: real substring recheck.
225 if Self::contains_substring(&doc.text, query) {
226 hits.push(doc_id);
227 }
228 }
229 // The Roaring bitmap iterates ascending, which is also
230 // insertion order because doc ids are monotonic. Sort
231 // anyway to make the contract explicit.
232 hits.sort_unstable();
233 hits
234 }
235
236 /// Full scan over the document store; used for queries too
237 /// short for the trigram index.
238 fn full_scan(&self, query: &[u8]) -> Vec<u32> {
239 let mut out = Vec::new();
240 for (id, doc) in &self.docs {
241 if Self::contains_substring(&doc.text, query) {
242 out.push(*id);
243 }
244 }
245 out
246 }
247
248 /// Search for documents whose text matches `pattern` as a
249 /// regular expression.
250 ///
251 /// The query path is the same four-tier filter funnel as
252 /// [`Self::search_substring`], plus a Phase-2 prefix
253 /// extraction step:
254 ///
255 /// 1. Parse `pattern` into the internal AST and extract
256 /// the trigrams that any matching string MUST contain
257 /// (see [`crate::prefix_extract`]).
258 /// 2. Intersect those trigrams' postings lists into a
259 /// candidate doc-id set. If the AST cannot be lowered
260 /// (named capture group, etc.) or yields no required
261 /// trigrams, fall back to scanning every doc.
262 /// 3. Per-doc bloom filter recheck (skipped on full scan).
263 /// 4. If the AST starts with `^literal`, prune candidates
264 /// whose first bytes do not equal the literal prefix.
265 /// 5. Compile the pattern with [`regex::bytes::Regex`] and
266 /// re-run it against each candidate's stored bytes.
267 ///
268 /// Results are returned in insertion order.
269 ///
270 /// # Errors
271 ///
272 /// Returns [`RegexError::Parse`] if the pattern is
273 /// syntactically invalid or uses a regex feature that the
274 /// underlying `regex` crate does not support (lookarounds,
275 /// backreferences, ...). A pattern that parses cleanly but
276 /// trips the prefix extractor's unsupported-feature path
277 /// (named capture groups) does NOT surface as an error:
278 /// the search still runs, just via the slower full-scan +
279 /// recheck path.
280 pub fn search_regex(&self, pattern: &str) -> Result<Vec<u32>, RegexError> {
281 // Compile the matcher first; a pattern the matcher
282 // cannot handle is a hard error to the caller. We use
283 // `regex::bytes::Regex` because the corpus is byte
284 // slices, not UTF-8.
285 let re = regex::bytes::Regex::new(pattern).map_err(|e| RegexError::Parse(e.to_string()))?;
286
287 // Required trigrams from the AST. If extraction fails
288 // (PrefixUnsupported) or yields no constraint, we have
289 // to scan every doc; the recheck still runs correctly.
290 let ast = regex_ast::parse(pattern).ok();
291 let trigram_hashes: Vec<u64> = ast
292 .as_ref()
293 .map(prefix_extract::required_trigram_hashes)
294 .unwrap_or_default();
295 let anchored_prefix = ast.as_ref().and_then(prefix_extract::anchored_prefix);
296
297 let candidates: Vec<u32> = if trigram_hashes.is_empty() {
298 self.docs.keys().copied().collect()
299 } else {
300 self.postings.intersect(&trigram_hashes).iter().collect()
301 };
302
303 let mut hits: Vec<u32> = Vec::new();
304 for doc_id in candidates {
305 let Some(doc) = self.docs.get(&doc_id) else {
306 continue;
307 };
308 // Tier 3: per-doc bloom filter -- only meaningful
309 // when we have required trigrams. On a full scan
310 // it would be a tautology because we have no
311 // membership query to make.
312 if !trigram_hashes.is_empty()
313 && !trigram_hashes
314 .iter()
315 .all(|t| doc.bloom.contains(&t.to_le_bytes()))
316 {
317 continue;
318 }
319 // Anchor fast-path: if the pattern is anchored and
320 // followed by a literal prefix, reject docs whose
321 // first bytes do not equal the prefix. This is
322 // cheap (a single byte-compare), sound for K=0,
323 // and lets the much heavier `regex` matcher off
324 // the hook for the common `^literal\w+...` shape.
325 if let Some(prefix) = anchored_prefix.as_ref() {
326 if doc.text.len() < prefix.len() || &doc.text[..prefix.len()] != prefix.as_slice() {
327 continue;
328 }
329 }
330 // Tier 4: real regex recheck.
331 if re.is_match(&doc.text) {
332 hits.push(doc_id);
333 }
334 }
335 hits.sort_unstable();
336 Ok(hits)
337 }
338
339 /// Byte-level substring match.
340 fn contains_substring(haystack: &[u8], needle: &[u8]) -> bool {
341 if needle.is_empty() {
342 return true;
343 }
344 if needle.len() > haystack.len() {
345 return false;
346 }
347 haystack.windows(needle.len()).any(|w| w == needle)
348 }
349
350 /// Search for documents that match `pattern` as an
351 /// approximate POSIX extended regular expression with up
352 /// to `max_errors` edit operations.
353 ///
354 /// The path mirrors [`Self::search_regex`] but the tier-4
355 /// recheck delegates to [`TreCompiledPattern`] instead of
356 /// the std `regex` matcher because TRE is the only engine
357 /// in the workspace that implements approximate match
358 /// semantics. The pre-recheck filter funnel uses the
359 /// pigeonhole bound `surviving_trigrams >= T - 3k`
360 /// (see [`ApproxFilter`]) to stay sound under the edit
361 /// budget.
362 ///
363 /// For a `^literal...` pattern the anchor fast-path
364 /// rejects every candidate whose first bytes are too far
365 /// (in Hamming distance) from the literal prefix. For
366 /// `max_errors == 0` this is a byte-equality check; for
367 /// `max_errors >= 1` it is a Hamming-distance check
368 /// against the prefix length, which is sound for the
369 /// substitution-only case TRE optimises and conservative
370 /// (always returns `true` for prefixes whose Hamming
371 /// distance is within `max_errors`) for the
372 /// insert/delete cases.
373 ///
374 /// When the surviving candidate set after filtering is
375 /// large (`>= PARALLEL_RECHECK_THRESHOLD`), the per-doc
376 /// TRE recheck is dispatched to a Rayon parallel iterator
377 /// to fan the cost across CPU cores. TRE's compiled
378 /// `regex_t` is `!Send`, so each parallel worker compiles
379 /// its own copy from the original pattern bytes.
380 ///
381 /// Results are returned in ascending document-id order.
382 ///
383 /// # Errors
384 ///
385 /// Returns [`TreError::Compile`] if the pattern fails to
386 /// compile under the given options.
387 pub fn search_regex_approx(
388 &self,
389 pattern: &str,
390 max_errors: u16,
391 ) -> Result<Vec<u32>, TreError> {
392 let opts = TreMatchOpts {
393 max_errors,
394 ..TreMatchOpts::default()
395 };
396 let pat = TreCompiledPattern::compile(pattern.as_bytes(), opts)?;
397
398 // Build the trigram filter and the anchor fast-path
399 // from the AST. A pattern that does not lower into the
400 // internal AST (named captures, regex-syntax features
401 // we don't model) falls back to the no-filter scan.
402 let ast = regex_ast::parse(pattern).ok();
403 let filter = ast.as_ref().map_or_else(
404 || ApproxFilter {
405 trigrams: Vec::new(),
406 min_required: 0,
407 },
408 |a| ApproxFilter::build(a, max_errors),
409 );
410 let anchored_prefix = ast.as_ref().and_then(prefix_extract::anchored_prefix);
411
412 // Tier 3 + anchor fast-path: cheap rejections before
413 // we hand the doc to TRE.
414 let prefilter = |doc: &IndexedDoc| -> bool {
415 if filter.is_active() && !filter.passes(&doc.bloom) {
416 return false;
417 }
418 if let Some(prefix) = anchored_prefix.as_ref() {
419 if !anchor_prefix_compatible(&doc.text, prefix, max_errors) {
420 return false;
421 }
422 }
423 true
424 };
425
426 // First sweep: build the (doc_id, &text) survivor list.
427 // When the filter is active we walk the candidate vec
428 // returned by the postings union; when it is inactive
429 // we iterate `self.docs` directly to avoid a chain of
430 // 100k BTreeMap lookups that would dominate the cost
431 // of the per-query setup.
432 let survivors: Vec<(u32, &[u8])> = if filter.is_active() {
433 let candidates = filter.candidates(&self.postings);
434 candidates
435 .into_iter()
436 .filter_map(|doc_id| {
437 let doc = self.docs.get(&doc_id)?;
438 if !prefilter(doc) {
439 return None;
440 }
441 Some((doc_id, doc.text.as_slice()))
442 })
443 .collect()
444 } else {
445 self.docs
446 .iter()
447 .filter_map(|(id, doc)| {
448 if !prefilter(doc) {
449 return None;
450 }
451 Some((*id, doc.text.as_slice()))
452 })
453 .collect()
454 };
455
456 let hits: Vec<u32> = if survivors.len() >= PARALLEL_RECHECK_THRESHOLD {
457 run_parallel_recheck(pattern, opts, &survivors)?
458 } else {
459 survivors
460 .into_iter()
461 .filter_map(|(id, text)| if pat.is_match(text) { Some(id) } else { None })
462 .collect()
463 };
464
465 let mut out = hits;
466 out.sort_unstable();
467 Ok(out)
468 }
469}
470
471/// Whether `doc` could plausibly be the start of an
472/// approximate match for the anchored literal `prefix` under
473/// an edit budget of `max_errors`.
474///
475/// The check is sound (never returns `false` when the doc
476/// could match) and cheap (compares at most
477/// `prefix.len() + max_errors` bytes per shift).
478///
479/// Strategy: iterate every alignment offset in
480/// `[0, max_errors]` and compute the edit distance between
481/// `prefix` and the corresponding doc prefix using a banded
482/// dynamic-programming table of width `2*max_errors + 1`. If
483/// any alignment is within `max_errors` edits, accept.
484///
485/// For `max_errors == 0` this collapses to a byte-equality
486/// check on the prefix length. For small `max_errors` (the
487/// common case in dyntext queries) the check costs
488/// O(prefix.len() * (2 * max_errors + 1)) byte comparisons.
489fn anchor_prefix_compatible(doc: &[u8], prefix: &[u8], max_errors: u16) -> bool {
490 if max_errors == 0 {
491 return doc.len() >= prefix.len() && &doc[..prefix.len()] == prefix;
492 }
493 let k = usize::from(max_errors);
494 // Restrict the doc window to the first prefix.len() + k
495 // bytes; an anchored match cannot reach further. If the
496 // doc is shorter than prefix.len() - k we still try -- the
497 // edit distance computation handles the length mismatch.
498 let window_end = (prefix.len() + k).min(doc.len());
499 let window = &doc[..window_end];
500 let dist = bounded_edit_distance(prefix, window, k);
501 dist <= k
502}
503
504/// Compute the prefix-edit distance between `pat` and `txt`,
505/// returning the smallest number of edits that turns `pat` into
506/// some prefix of `txt`. The result is capped at `k + 1` so
507/// the caller can reject early when the budget is exceeded.
508///
509/// This is the standard banded edit-distance recurrence,
510/// O(|pat| * (2 * k + 1)). It is symmetric in the
511/// substitution / insertion / deletion costs (all 1).
512fn bounded_edit_distance(pat: &[u8], txt: &[u8], k: usize) -> usize {
513 let m = pat.len();
514 let n = txt.len();
515 if m == 0 {
516 return 0;
517 }
518 // dp[i][j] = edits to turn pat[..i] into some prefix of
519 // txt[..j]. Only cells with |i - j| <= k matter; we keep
520 // two rows to avoid the |pat| x |txt| matrix.
521 let mut prev = vec![usize::MAX; n + 1];
522 let mut curr = vec![usize::MAX; n + 1];
523 prev[0] = 0;
524 for cell in prev.iter_mut().take(n.min(k) + 1).skip(1) {
525 *cell = 0; // free leading text characters (prefix-match)
526 }
527 for i in 1..=m {
528 curr[0] = i;
529 let lo = i.saturating_sub(k);
530 let hi = (i + k).min(n);
531 for j in 1..=n {
532 if j < lo || j > hi {
533 curr[j] = usize::MAX;
534 continue;
535 }
536 let cost = usize::from(pat[i - 1] != txt[j - 1]);
537 let sub = prev[j - 1].saturating_add(cost);
538 let del = prev[j].saturating_add(1);
539 let ins = curr[j - 1].saturating_add(1);
540 curr[j] = sub.min(del).min(ins);
541 }
542 std::mem::swap(&mut prev, &mut curr);
543 }
544 // Best alignment: the smallest dp[m][j] over j in
545 // [m - k, m + k] intersected with [0, n].
546 let lo = m.saturating_sub(k);
547 let hi = (m + k).min(n);
548 let mut best = usize::MAX;
549 for cell in prev.iter().take(hi + 1).skip(lo) {
550 best = best.min(*cell);
551 }
552 best.min(k + 1)
553}
554
555/// Run the TRE recheck across `survivors` in parallel.
556///
557/// `TreCompiledPattern` is `!Send`, so the parallel iterator
558/// compiles a fresh pattern in each worker via
559/// [`rayon::iter::ParallelIterator::map_init`]. The compile
560/// cost is amortised across the worker's chunk of survivors,
561/// which is much smaller than the per-doc TRE match cost.
562fn run_parallel_recheck(
563 pattern: &str,
564 opts: TreMatchOpts,
565 survivors: &[(u32, &[u8])],
566) -> Result<Vec<u32>, TreError> {
567 use std::sync::atomic::{AtomicBool, Ordering};
568 let pattern_bytes = pattern.as_bytes();
569 let compile_failed = AtomicBool::new(false);
570 let hits: Vec<u32> = survivors
571 .par_chunks(PARALLEL_RECHECK_CHUNK_SIZE)
572 .map_init(
573 || TreCompiledPattern::compile(pattern_bytes, opts).ok(),
574 |worker_pat: &mut Option<TreCompiledPattern>, chunk: &[(u32, &[u8])]| -> Vec<u32> {
575 let Some(pat) = worker_pat.as_ref() else {
576 compile_failed.store(true, Ordering::Relaxed);
577 return Vec::new();
578 };
579 let mut local = Vec::new();
580 for &(id, text) in chunk {
581 if pat.is_match(text) {
582 local.push(id);
583 }
584 }
585 local
586 },
587 )
588 .flatten()
589 .collect();
590 if compile_failed.load(Ordering::Relaxed) {
591 return Err(TreError::Internal(
592 "parallel worker failed to compile pattern".into(),
593 ));
594 }
595 Ok(hits)
596}
597
598#[cfg(test)]
599mod tests {
600 use super::*;
601
602 #[test]
603 fn insert_then_search_finds_the_doc() {
604 let mut idx = TextIndex::new();
605 let id = idx.insert(b"hello world".to_vec());
606 let hits = idx.search_substring(b"hello");
607 assert_eq!(hits, vec![id]);
608 }
609
610 #[test]
611 fn search_substring_returns_only_true_positives() {
612 let mut idx = TextIndex::new();
613 let a = idx.insert(b"the quick brown fox".to_vec());
614 let _b = idx.insert(b"jumped over a lazy dog".to_vec());
615 let c = idx.insert(b"a brown fox is quick".to_vec());
616 let hits = idx.search_substring(b"brown fox");
617 // Only docs containing the literal byte sequence
618 // "brown fox" qualify.
619 assert!(hits.contains(&a));
620 assert!(hits.contains(&c));
621 assert_eq!(hits.len(), 2);
622 }
623
624 #[test]
625 fn search_substring_no_false_negatives_on_corpus() {
626 let mut store = TextIndex::new();
627 let corpus: &[&[u8]] = &[
628 b"alpha beta gamma",
629 b"beta cake",
630 b"the alphabet starts with alpha",
631 b"omega only",
632 ];
633 let ids: Vec<u32> = corpus.iter().map(|t| store.insert(t.to_vec())).collect();
634 for q in [b"alpha".as_slice(), b"beta", b"omega", b"the"] {
635 let hits = store.search_substring(q);
636 for (i, doc) in corpus.iter().enumerate() {
637 if doc.windows(q.len()).any(|w| w == q) {
638 assert!(
639 hits.contains(&ids[i]),
640 "false negative: query {q:?} should hit doc {i} {doc:?}",
641 );
642 }
643 }
644 }
645 }
646
647 #[test]
648 fn search_returns_results_in_insertion_order() {
649 let mut idx = TextIndex::new();
650 let id_a = idx.insert(b"hello a".to_vec());
651 let id_b = idx.insert(b"hello b".to_vec());
652 let id_c = idx.insert(b"hello c".to_vec());
653 let hits = idx.search_substring(b"hello");
654 assert_eq!(hits, vec![id_a, id_b, id_c]);
655 }
656
657 #[test]
658 fn remove_excludes_doc_from_subsequent_searches() {
659 let mut idx = TextIndex::new();
660 let a = idx.insert(b"the quick brown fox".to_vec());
661 let b = idx.insert(b"another brown fox here".to_vec());
662 let removed = idx.remove(a).expect("doc a present");
663 assert_eq!(removed, b"the quick brown fox");
664 let hits = idx.search_substring(b"brown fox");
665 assert_eq!(hits, vec![b]);
666 }
667
668 #[test]
669 fn remove_garbage_collects_unique_trigrams() {
670 let mut idx = TextIndex::new();
671 let a = idx.insert(b"unique-string-only-here".to_vec());
672 let postings_before = idx.postings().len();
673 assert!(postings_before > 0);
674 idx.remove(a);
675 // After removing the only doc, every postings entry
676 // should be empty and gone.
677 assert_eq!(idx.postings().len(), 0);
678 assert_eq!(idx.doc_count(), 0);
679 }
680
681 #[test]
682 fn remove_missing_doc_id_returns_none() {
683 let mut idx = TextIndex::new();
684 idx.insert(b"abc".to_vec());
685 assert!(idx.remove(9999).is_none());
686 }
687
688 #[test]
689 fn query_shorter_than_three_chars_uses_full_scan() {
690 let mut idx = TextIndex::new();
691 let a = idx.insert(b"abcdef".to_vec());
692 let _b = idx.insert(b"xyz".to_vec());
693 let c = idx.insert(b"ab".to_vec());
694 let hits = idx.search_substring(b"ab");
695 assert!(hits.contains(&a));
696 assert!(hits.contains(&c));
697 assert_eq!(hits.len(), 2);
698 }
699
700 #[test]
701 fn empty_query_matches_every_doc() {
702 let mut idx = TextIndex::new();
703 let a = idx.insert(b"x".to_vec());
704 let b = idx.insert(b"y".to_vec());
705 let hits = idx.search_substring(b"");
706 assert_eq!(hits, vec![a, b]);
707 }
708
709 #[test]
710 fn unicode_query_byte_level_works() {
711 let mut idx = TextIndex::new();
712 // "cafe" with combining e-acute (UTF-8 bytes
713 // 0xC3 0xA9). Insert one doc with the e-acute form,
714 // one with plain ASCII, and search for the e-acute
715 // suffix.
716 let a = idx.insert(b"caf\xc3\xa9 noir".to_vec());
717 let b = idx.insert(b"cafe noir".to_vec());
718 let hits = idx.search_substring(b"\xc3\xa9");
719 assert_eq!(hits, vec![a]);
720 let hits = idx.search_substring(b"noir");
721 // Both contain the literal bytes "noir".
722 assert!(hits.contains(&a));
723 assert!(hits.contains(&b));
724 assert_eq!(hits.len(), 2);
725 }
726
727 #[test]
728 fn search_for_nonexistent_substring_returns_empty() {
729 let mut idx = TextIndex::new();
730 idx.insert(b"hello world".to_vec());
731 idx.insert(b"another doc".to_vec());
732 assert!(idx.search_substring(b"completely-absent").is_empty());
733 }
734
735 #[test]
736 fn search_on_empty_index_returns_empty() {
737 let idx = TextIndex::new();
738 assert!(idx.search_substring(b"anything").is_empty());
739 // The empty-query special case still returns an empty
740 // vec when there are no docs to enumerate.
741 assert!(idx.search_substring(b"").is_empty());
742 }
743}