Skip to main content

dyntext/
tiling.rs

1//! Sound trigram filter for approximate-regex matching.
2//!
3//! Given a regex AST and a global edit budget `k`, produce a
4//! [`ApproxFilter`] that lets the index narrow the candidate
5//! set before invoking the (expensive) TRE matcher on each
6//! survivor. The filter is *sound*: every doc that
7//! approximately matches the pattern within `k` edits passes
8//! the filter, so applying it never produces a false negative.
9//!
10//! # Construction
11//!
12//! The filter is parameterised by the pattern's *literal runs*
13//! -- maximal contiguous sequences of literal bytes from the
14//! AST (see [`crate::prefix_extract::extract_literal_runs`]).
15//! For each run of length `L_i`, the run contributes
16//! `T_i = max(0, L_i - 2)` candidate trigrams; the total
17//! across runs is `T = sum T_i`.
18//!
19//! With `k` edits permitted, each error can destroy at most
20//! three pattern trigrams (the up-to-three windows that
21//! contain the edited byte). So at least
22//! `max(0, T - 3k)` of the pattern's trigrams must appear in
23//! any matching doc, giving the soundness invariant
24//!
25//! ```text
26//!     #(pattern_trigrams in doc) >= T - 3k
27//! ```
28//!
29//! The filter therefore has two fields:
30//!
31//! * `trigrams` -- the pattern trigrams (deduplicated).
32//! * `min_required` -- `max(0, T - 3k)` clamped to
33//!   `trigrams.len()`.
34//!
35//! When `min_required == 0` the filter degenerates and the
36//! caller has to fall back to a full scan.
37//!
38//! # Application
39//!
40//! [`ApproxFilter::candidates`] is the entry point used by
41//! [`crate::index::TextIndex::search_regex_approx`]. It first
42//! computes the postings UNION of all pattern trigrams (an
43//! upper bound on the candidate set: any doc that contains
44//! `min_required >= 1` of them must be in the union of all of
45//! them), then for each survivor counts how many pattern
46//! trigrams the per-doc bloom filter accepts and rejects docs
47//! whose count falls below `min_required`.
48//!
49//! # Why not Navarro tiling?
50//!
51//! Navarro's `(k+1)`-tiling argument partitions the pattern
52//! into `k+1` disjoint contiguous tiles, each of which must
53//! match exactly somewhere because at least one of them is
54//! error-free by pigeonhole. For sufficiently long patterns
55//! this gives a tighter selectivity than the per-trigram
56//! bound. In practice the approximate-regex queries dyntext
57//! sees today have short literal runs (typically 3-5 bytes),
58//! and the `(k+1)`-tiling either degenerates or matches the
59//! per-trigram bound. We use the simpler bound here and leave
60//! the tiling refinement for a follow-up.
61
62use crate::bloom::BloomFilter;
63use crate::postings::Postings;
64use crate::prefix_extract;
65use crate::regex_ast::Ast;
66use crate::trigram;
67/// Sound trigram filter for an approximate-regex query.
68#[derive(Debug, Clone)]
69pub struct ApproxFilter {
70    /// Distinct pattern trigram hashes, sorted ascending.
71    /// Empty means the pattern has no extractable literal runs
72    /// (no filter possible).
73    pub trigrams: Vec<u64>,
74    /// Minimum number of [`Self::trigrams`] that must appear in
75    /// any matching doc. Always `<= trigrams.len()`. When zero
76    /// the filter degenerates to "no constraint" and the
77    /// caller must full-scan.
78    pub min_required: usize,
79}
80
81impl ApproxFilter {
82    /// Build the filter for a regex AST allowing up to `k`
83    /// edit operations.
84    #[must_use]
85    pub fn build(ast: &Ast, k: u16) -> Self {
86        let runs = prefix_extract::extract_literal_runs(ast);
87        let mut trigrams: Vec<u64> = Vec::new();
88        let mut total_trigram_count: usize = 0;
89        for run in &runs {
90            if run.len() < trigram::TRIGRAM_LEN {
91                continue;
92            }
93            for w in run.windows(trigram::TRIGRAM_LEN) {
94                let h = trigram::hash_trigram(w);
95                trigrams.push(h);
96                total_trigram_count += 1;
97            }
98        }
99        trigrams.sort_unstable();
100        trigrams.dedup();
101
102        // Soundness: each edit destroys at most 3 trigrams of
103        // the pattern (the up-to-3 windows that overlap the
104        // edited byte). With k edits the surviving count is
105        // at least `total_trigram_count - 3k` (with overlap
106        // counted, since a single physical edit destroys all
107        // three overlapping windows at once). We use the
108        // conservative formula: surviving >= T - 3k where T
109        // counts trigrams with multiplicity, but we then clamp
110        // to the deduplicated `trigrams.len()` because the
111        // filter operates on distinct trigrams.
112        let edits = usize::from(k);
113        let destroyed = edits.saturating_mul(3);
114        let surviving = total_trigram_count.saturating_sub(destroyed);
115        let min_required = surviving.min(trigrams.len());
116
117        Self {
118            trigrams,
119            min_required,
120        }
121    }
122
123    /// Whether the filter imposes any constraint. A degenerate
124    /// filter (no required trigrams or `min_required == 0`)
125    /// passes every doc and the caller should full-scan.
126    #[must_use]
127    pub fn is_active(&self) -> bool {
128        self.min_required > 0 && !self.trigrams.is_empty()
129    }
130
131    /// Compute the candidate doc-id set for the filter.
132    ///
133    /// The strategy is the postings UNION of every pattern
134    /// trigram. Any doc that contains at least one pattern
135    /// trigram is in the union; any doc with
136    /// `min_required >= 1` must be in the union; this is
137    /// therefore a sound upper bound. Per-doc bloom filtering
138    /// is applied separately by the index in a tighter inner
139    /// loop that already has the doc record in hand.
140    ///
141    /// The output is the surviving doc ids in ascending
142    /// order.
143    #[must_use]
144    pub fn candidates(&self, postings: &Postings) -> Vec<u32> {
145        let union = postings.union(&self.trigrams);
146        union.iter().collect()
147    }
148
149    /// Test the filter against a per-doc bloom filter.
150    ///
151    /// Returns `true` if the doc may match the pattern under
152    /// the configured edit budget; `false` if the pattern
153    /// imposes more required-trigram constraints than the doc
154    /// can plausibly satisfy.
155    #[must_use]
156    pub fn passes(&self, bloom: &BloomFilter) -> bool {
157        if !self.is_active() {
158            return true;
159        }
160        let mut hits = 0_usize;
161        let mut remaining = self.trigrams.len();
162        for t in &self.trigrams {
163            if bloom.contains(&t.to_le_bytes()) {
164                hits += 1;
165                if hits >= self.min_required {
166                    return true;
167                }
168            }
169            remaining -= 1;
170            if hits + remaining < self.min_required {
171                return false;
172            }
173        }
174        hits >= self.min_required
175    }
176}
177
178#[cfg(test)]
179mod tests {
180    use super::*;
181    use crate::regex_ast::parse;
182
183    fn build(pattern: &str, k: u16) -> ApproxFilter {
184        let ast = parse(pattern).expect("parses");
185        ApproxFilter::build(&ast, k)
186    }
187
188    #[test]
189    fn k0_filter_requires_all_trigrams() {
190        let f = build("hello", 0);
191        // "hello" has 3 distinct trigrams (hel, ell, llo).
192        assert_eq!(f.trigrams.len(), 3);
193        assert_eq!(f.min_required, 3);
194    }
195
196    #[test]
197    fn k1_filter_loosens_min_required() {
198        // T = 3 trigrams, k=1, surviving >= max(0, 3-3) = 0.
199        let f = build("hello", 1);
200        assert_eq!(f.trigrams.len(), 3);
201        assert_eq!(f.min_required, 0);
202        assert!(!f.is_active());
203    }
204
205    #[test]
206    fn long_pattern_under_k2_keeps_filter_active() {
207        // T = 9 trigrams (length 11), k=2, surviving >= 9-6=3.
208        let f = build("hello world", 2);
209        assert!(
210            f.min_required >= 3,
211            "expected min_required >= 3, got {}",
212            f.min_required
213        );
214        assert!(f.is_active());
215    }
216
217    #[test]
218    fn k0_filter_for_unsupported_pattern_is_inactive() {
219        // `.*` has no required literal runs.
220        let f = build(".*", 0);
221        assert!(f.trigrams.is_empty());
222        assert!(!f.is_active());
223    }
224
225    #[test]
226    fn passes_returns_true_when_inactive() {
227        let f = build(".*", 0);
228        let bloom = BloomFilter::with_size_and_fp_rate(64, 0.01);
229        assert!(f.passes(&bloom));
230    }
231
232    #[test]
233    fn passes_rejects_doc_below_threshold() {
234        let f = build("hello", 0);
235        // Empty bloom: no trigrams present. min_required = 3.
236        let bloom = BloomFilter::with_size_and_fp_rate(64, 0.01);
237        assert!(!f.passes(&bloom));
238    }
239
240    #[test]
241    fn passes_accepts_doc_at_threshold() {
242        let f = build("hello", 0);
243        let mut bloom = BloomFilter::with_size_and_fp_rate(256, 0.001);
244        for t in &f.trigrams {
245            bloom.insert(&t.to_le_bytes());
246        }
247        assert!(f.passes(&bloom));
248    }
249
250    #[test]
251    fn min_required_never_exceeds_unique_trigrams() {
252        // `aaaaa` has 1 distinct trigram; T (with multiplicity)
253        // is 3. min_required must clamp to 1.
254        let f = build("aaaaa", 0);
255        assert_eq!(f.trigrams.len(), 1);
256        assert_eq!(f.min_required, 1);
257    }
258}