dyntext/tiling.rs
1//! Sound trigram filter for approximate-regex matching.
2//!
3//! Given a regex AST and a global edit budget `k`, produce a
4//! [`ApproxFilter`] that lets the index narrow the candidate
5//! set before invoking the (expensive) TRE matcher on each
6//! survivor. The filter is *sound*: every doc that
7//! approximately matches the pattern within `k` edits passes
8//! the filter, so applying it never produces a false negative.
9//!
10//! # Construction
11//!
12//! The filter is parameterised by the pattern's *literal runs*
13//! -- maximal contiguous sequences of literal bytes from the
14//! AST (see [`crate::prefix_extract::extract_literal_runs`]).
15//! For each run of length `L_i`, the run contributes
16//! `T_i = max(0, L_i - 2)` candidate trigrams; the total
17//! across runs is `T = sum T_i`.
18//!
19//! With `k` edits permitted, each error can destroy at most
20//! three pattern trigrams (the up-to-three windows that
21//! contain the edited byte). So at least
22//! `max(0, T - 3k)` of the pattern's trigrams must appear in
23//! any matching doc, giving the soundness invariant
24//!
25//! ```text
26//! #(pattern_trigrams in doc) >= T - 3k
27//! ```
28//!
29//! The filter therefore has two fields:
30//!
31//! * `trigrams` -- the pattern trigrams (deduplicated).
32//! * `min_required` -- `max(0, T - 3k)` clamped to
33//! `trigrams.len()`.
34//!
35//! When `min_required == 0` the filter degenerates and the
36//! caller has to fall back to a full scan.
37//!
38//! # Application
39//!
40//! [`ApproxFilter::candidates`] is the entry point used by
41//! [`crate::index::TextIndex::search_regex_approx`]. It first
42//! computes the postings UNION of all pattern trigrams (an
43//! upper bound on the candidate set: any doc that contains
44//! `min_required >= 1` of them must be in the union of all of
45//! them), then for each survivor counts how many pattern
46//! trigrams the per-doc bloom filter accepts and rejects docs
47//! whose count falls below `min_required`.
48//!
49//! # Why not Navarro tiling?
50//!
51//! Navarro's `(k+1)`-tiling argument partitions the pattern
52//! into `k+1` disjoint contiguous tiles, each of which must
53//! match exactly somewhere because at least one of them is
54//! error-free by pigeonhole. For sufficiently long patterns
55//! this gives a tighter selectivity than the per-trigram
56//! bound. In practice the approximate-regex queries dyntext
57//! sees today have short literal runs (typically 3-5 bytes),
58//! and the `(k+1)`-tiling either degenerates or matches the
59//! per-trigram bound. We use the simpler bound here and leave
60//! the tiling refinement for a follow-up.
61
62use crate::bloom::BloomFilter;
63use crate::postings::Postings;
64use crate::prefix_extract;
65use crate::regex_ast::Ast;
66use crate::trigram;
67/// Sound trigram filter for an approximate-regex query.
68#[derive(Debug, Clone)]
69pub struct ApproxFilter {
70 /// Distinct pattern trigram hashes, sorted ascending.
71 /// Empty means the pattern has no extractable literal runs
72 /// (no filter possible).
73 pub trigrams: Vec<u64>,
74 /// Minimum number of [`Self::trigrams`] that must appear in
75 /// any matching doc. Always `<= trigrams.len()`. When zero
76 /// the filter degenerates to "no constraint" and the
77 /// caller must full-scan.
78 pub min_required: usize,
79}
80
81impl ApproxFilter {
82 /// Build the filter for a regex AST allowing up to `k`
83 /// edit operations.
84 #[must_use]
85 pub fn build(ast: &Ast, k: u16) -> Self {
86 let runs = prefix_extract::extract_literal_runs(ast);
87 let mut trigrams: Vec<u64> = Vec::new();
88 let mut total_trigram_count: usize = 0;
89 for run in &runs {
90 if run.len() < trigram::TRIGRAM_LEN {
91 continue;
92 }
93 for w in run.windows(trigram::TRIGRAM_LEN) {
94 let h = trigram::hash_trigram(w);
95 trigrams.push(h);
96 total_trigram_count += 1;
97 }
98 }
99 trigrams.sort_unstable();
100 trigrams.dedup();
101
102 // Soundness: each edit destroys at most 3 trigrams of
103 // the pattern (the up-to-3 windows that overlap the
104 // edited byte). With k edits the surviving count is
105 // at least `total_trigram_count - 3k` (with overlap
106 // counted, since a single physical edit destroys all
107 // three overlapping windows at once). We use the
108 // conservative formula: surviving >= T - 3k where T
109 // counts trigrams with multiplicity, but we then clamp
110 // to the deduplicated `trigrams.len()` because the
111 // filter operates on distinct trigrams.
112 let edits = usize::from(k);
113 let destroyed = edits.saturating_mul(3);
114 let surviving = total_trigram_count.saturating_sub(destroyed);
115 let min_required = surviving.min(trigrams.len());
116
117 Self {
118 trigrams,
119 min_required,
120 }
121 }
122
123 /// Whether the filter imposes any constraint. A degenerate
124 /// filter (no required trigrams or `min_required == 0`)
125 /// passes every doc and the caller should full-scan.
126 #[must_use]
127 pub fn is_active(&self) -> bool {
128 self.min_required > 0 && !self.trigrams.is_empty()
129 }
130
131 /// Compute the candidate doc-id set for the filter.
132 ///
133 /// The strategy is the postings UNION of every pattern
134 /// trigram. Any doc that contains at least one pattern
135 /// trigram is in the union; any doc with
136 /// `min_required >= 1` must be in the union; this is
137 /// therefore a sound upper bound. Per-doc bloom filtering
138 /// is applied separately by the index in a tighter inner
139 /// loop that already has the doc record in hand.
140 ///
141 /// The output is the surviving doc ids in ascending
142 /// order.
143 #[must_use]
144 pub fn candidates(&self, postings: &Postings) -> Vec<u32> {
145 let union = postings.union(&self.trigrams);
146 union.iter().collect()
147 }
148
149 /// Test the filter against a per-doc bloom filter.
150 ///
151 /// Returns `true` if the doc may match the pattern under
152 /// the configured edit budget; `false` if the pattern
153 /// imposes more required-trigram constraints than the doc
154 /// can plausibly satisfy.
155 #[must_use]
156 pub fn passes(&self, bloom: &BloomFilter) -> bool {
157 if !self.is_active() {
158 return true;
159 }
160 let mut hits = 0_usize;
161 let mut remaining = self.trigrams.len();
162 for t in &self.trigrams {
163 if bloom.contains(&t.to_le_bytes()) {
164 hits += 1;
165 if hits >= self.min_required {
166 return true;
167 }
168 }
169 remaining -= 1;
170 if hits + remaining < self.min_required {
171 return false;
172 }
173 }
174 hits >= self.min_required
175 }
176}
177
178#[cfg(test)]
179mod tests {
180 use super::*;
181 use crate::regex_ast::parse;
182
183 fn build(pattern: &str, k: u16) -> ApproxFilter {
184 let ast = parse(pattern).expect("parses");
185 ApproxFilter::build(&ast, k)
186 }
187
188 #[test]
189 fn k0_filter_requires_all_trigrams() {
190 let f = build("hello", 0);
191 // "hello" has 3 distinct trigrams (hel, ell, llo).
192 assert_eq!(f.trigrams.len(), 3);
193 assert_eq!(f.min_required, 3);
194 }
195
196 #[test]
197 fn k1_filter_loosens_min_required() {
198 // T = 3 trigrams, k=1, surviving >= max(0, 3-3) = 0.
199 let f = build("hello", 1);
200 assert_eq!(f.trigrams.len(), 3);
201 assert_eq!(f.min_required, 0);
202 assert!(!f.is_active());
203 }
204
205 #[test]
206 fn long_pattern_under_k2_keeps_filter_active() {
207 // T = 9 trigrams (length 11), k=2, surviving >= 9-6=3.
208 let f = build("hello world", 2);
209 assert!(
210 f.min_required >= 3,
211 "expected min_required >= 3, got {}",
212 f.min_required
213 );
214 assert!(f.is_active());
215 }
216
217 #[test]
218 fn k0_filter_for_unsupported_pattern_is_inactive() {
219 // `.*` has no required literal runs.
220 let f = build(".*", 0);
221 assert!(f.trigrams.is_empty());
222 assert!(!f.is_active());
223 }
224
225 #[test]
226 fn passes_returns_true_when_inactive() {
227 let f = build(".*", 0);
228 let bloom = BloomFilter::with_size_and_fp_rate(64, 0.01);
229 assert!(f.passes(&bloom));
230 }
231
232 #[test]
233 fn passes_rejects_doc_below_threshold() {
234 let f = build("hello", 0);
235 // Empty bloom: no trigrams present. min_required = 3.
236 let bloom = BloomFilter::with_size_and_fp_rate(64, 0.01);
237 assert!(!f.passes(&bloom));
238 }
239
240 #[test]
241 fn passes_accepts_doc_at_threshold() {
242 let f = build("hello", 0);
243 let mut bloom = BloomFilter::with_size_and_fp_rate(256, 0.001);
244 for t in &f.trigrams {
245 bloom.insert(&t.to_le_bytes());
246 }
247 assert!(f.passes(&bloom));
248 }
249
250 #[test]
251 fn min_required_never_exceeds_unique_trigrams() {
252 // `aaaaa` has 1 distinct trigram; T (with multiplicity)
253 // is 3. min_required must clamp to 1.
254 let f = build("aaaaa", 0);
255 assert_eq!(f.trigrams.len(), 1);
256 assert_eq!(f.min_required, 1);
257 }
258}