Skip to main content

fff_search/
bigram_query.rs

1//! Regex → bigram decomposition for the inverted bigram index.
2//!
3//! Parses a regex pattern with `regex-syntax`, walks the HIR to extract
4//! guaranteed bigram keys (u16), and evaluates them as an AND/OR query tree
5//! against [`BigramFilter`]'s inverted posting lists.
6//!
7//! Two bigram types are extracted:
8//! - **Consecutive** (gap=0): adjacent byte pairs `(pattern[i], pattern[i+1])`
9//! - **Sparse-1** (gap=1): pairs across a single-byte wildcard, e.g. `a.b → (a,b)`
10//!
11//! The sparse-1 extraction is the key feature: regex patterns like `foo.bar`
12//! yield the cross-boundary sparse-1 bigram `(o,b)` that provides strong
13//! filtering even when the `.` prevents any consecutive cross-boundary bigram.
14
15use crate::bigram_filter::BigramFilter;
16use regex_syntax::hir::{Class, Hir, HirKind};
17use smallvec::SmallVec;
18use std::borrow::Cow;
19
20/// Maximum byte values to enumerate from a character class.
21/// Larger classes are treated as unknown (no bigram extractable).
22const MAX_CLASS_EXPAND: usize = 16;
23
24#[inline]
25fn consec_key(a: u8, b: u8) -> Option<u16> {
26    let al = a.to_ascii_lowercase();
27    let bl = b.to_ascii_lowercase();
28    if (32..=126).contains(&al) && (32..=126).contains(&bl) {
29        Some((al as u16) << 8 | bl as u16)
30    } else {
31        None
32    }
33}
34
35#[derive(Debug, Clone)]
36pub enum BigramQuery {
37    Any,
38    /// A consecutive bigram key to look up in the main index.
39    Consec(u16),
40    /// A skip-1 bigram key to look up in the skip sub-index.
41    Skip1(u16),
42    /// All children must match (intersect posting lists).
43    And(Vec<BigramQuery>),
44    /// At least one child must match (union posting lists).
45    Or(Vec<BigramQuery>),
46}
47
48/// SIMD-friendly bitwise OR of two equal-length bitsets.
49#[inline]
50fn bitset_or(a: &mut [u64], b: &[u64]) {
51    a.iter_mut().zip(b.iter()).for_each(|(x, y)| *x |= *y);
52}
53
54/// SIMD-friendly bitwise AND of two equal-length bitsets.
55#[inline]
56fn bitset_and(a: &mut [u64], b: &[u64]) {
57    a.iter_mut().zip(b.iter()).for_each(|(x, y)| *x &= *y);
58}
59
60impl BigramQuery {
61    pub fn is_any(&self) -> bool {
62        matches!(self, BigramQuery::Any)
63    }
64
65    pub(crate) fn evaluate(&self, index: &BigramFilter) -> Option<Vec<u64>> {
66        self.evaluate_cow(index).map(Cow::into_owned)
67    }
68
69    fn evaluate_cow<'a>(&self, index: &'a BigramFilter) -> Option<Cow<'a, [u64]>> {
70        match self {
71            BigramQuery::Any => None,
72
73            BigramQuery::Consec(key) => {
74                let col = index.lookup()[*key as usize];
75                if col == u16::MAX {
76                    return None;
77                }
78                let words = index.words();
79                let offset = col as usize * words;
80                let data = index.dense_data();
81                if offset + words > data.len() {
82                    return None;
83                }
84                Some(Cow::Borrowed(&data[offset..offset + words]))
85            }
86
87            BigramQuery::Skip1(key) => {
88                let skip = index.skip_index()?;
89                let col = skip.lookup()[*key as usize];
90                if col == u16::MAX {
91                    return None;
92                }
93                let words = skip.words();
94                let offset = col as usize * words;
95                let data = skip.dense_data();
96                if offset + words > data.len() {
97                    return None;
98                }
99                Some(Cow::Borrowed(&data[offset..offset + words]))
100            }
101
102            BigramQuery::And(children) => {
103                let mut result: Option<Vec<u64>> = None;
104                for child in children {
105                    if let Some(child_bits) = child.evaluate_cow(index) {
106                        result = Some(match result {
107                            None => child_bits.into_owned(),
108                            Some(mut r) => {
109                                bitset_and(&mut r, &child_bits);
110                                r
111                            }
112                        });
113                    }
114                }
115                result.map(Cow::Owned)
116            }
117
118            BigramQuery::Or(children) => {
119                if children.is_empty() {
120                    return None;
121                }
122                let mut result: Option<Vec<u64>> = None;
123                for child in children {
124                    match child.evaluate_cow(index) {
125                        // Any branch can't be filtered → whole OR can't be filtered
126                        None => return None,
127                        Some(child_bits) => {
128                            result = Some(match result {
129                                None => child_bits.into_owned(),
130                                Some(mut r) => {
131                                    bitset_or(&mut r, &child_bits);
132                                    r
133                                }
134                            });
135                        }
136                    }
137                }
138                result.map(Cow::Owned)
139            }
140        }
141    }
142}
143
144/// Intermediate state tracked during HIR traversal for bigram extraction.
145struct HirInfo {
146    query: BigramQuery,
147    /// Possible first bytes (lowercased, printable ASCII) when this node matches.
148    first: Option<SmallVec<[u8; MAX_CLASS_EXPAND]>>,
149    /// Possible last bytes.
150    last: Option<SmallVec<[u8; MAX_CLASS_EXPAND]>>,
151    /// Whether this node can match the empty string.
152    can_be_empty: bool,
153}
154
155impl HirInfo {
156    fn empty() -> Self {
157        Self {
158            query: BigramQuery::Any,
159            first: None,
160            last: None,
161            can_be_empty: true,
162        }
163    }
164}
165
166/// Prefilter fuzzy query. The algorithm is the following:
167/// we allow max_typos = min(len/3,2) every typo destroys at most 2 consecutive bigrams
168/// So out of N bigrams at least N - 2 * max_typos have to present in the matching fil
169pub(crate) fn fuzzy_to_bigram_query(query: &str, num_probes: usize) -> BigramQuery {
170    let lower: Vec<u8> = query.bytes().map(|b| b.to_ascii_lowercase()).collect();
171
172    if lower.len() < 2 {
173        return BigramQuery::Any;
174    }
175
176    let max_typos = (lower.len() / 3).min(2);
177
178    // Extract all consecutive bigram keys.
179    let bigram_keys: Vec<u16> = lower
180        .windows(2)
181        .filter_map(|w| consec_key(w[0], w[1]))
182        .collect();
183
184    if bigram_keys.is_empty() {
185        return BigramQuery::Any;
186    }
187
188    // For very short queries (0 typos), AND all bigrams — exact subsequence.
189    if max_typos == 0 {
190        return simplify_and(
191            bigram_keys
192                .iter()
193                .map(|&k| BigramQuery::Consec(k))
194                .collect(),
195        );
196    }
197
198    // Pick evenly-spaced probe bigrams.
199    let n = num_probes.min(bigram_keys.len());
200    if n <= max_typos {
201        // Too few probes to require anything useful.
202        return simplify_or(
203            bigram_keys
204                .iter()
205                .map(|&k| BigramQuery::Consec(k))
206                .collect(),
207        );
208    }
209
210    let probes: Vec<u16> = if n == bigram_keys.len() {
211        bigram_keys
212    } else {
213        (0..n)
214            .map(|i| {
215                let idx = i * (bigram_keys.len() - 1) / (n - 1);
216                bigram_keys[idx]
217            })
218            .collect()
219    };
220
221    let required = n - max_typos;
222
223    // If required == n, just AND all probes.
224    if required >= n {
225        return simplify_and(probes.iter().map(|&k| BigramQuery::Consec(k)).collect());
226    }
227
228    // Generate all C(n, required) subsets → OR(AND(subset), ...)
229    let mut branches = Vec::new();
230    let mut combo = vec![0u16; required];
231    combine(&probes, required, 0, 0, &mut combo, &mut branches);
232
233    simplify_or(branches)
234}
235
236/// Build C(n, k) combination branches in-place on a fixed-size slice.
237fn combine(
238    items: &[u16],
239    k: usize,
240    start: usize,
241    depth: usize,
242    combo: &mut [u16],
243    branches: &mut Vec<BigramQuery>,
244) {
245    if depth == k {
246        branches.push(simplify_and(
247            combo.iter().map(|&key| BigramQuery::Consec(key)).collect(),
248        ));
249        return;
250    }
251    let remaining = k - depth;
252    for i in start..=items.len() - remaining {
253        combo[depth] = items[i];
254        combine(items, k, i + 1, depth + 1, combo, branches);
255    }
256}
257
258pub(crate) fn regex_to_bigram_query(pattern: &str) -> BigramQuery {
259    let mut parser = regex_syntax::ParserBuilder::new()
260        .unicode(false)
261        .utf8(false)
262        .build();
263
264    let hir = match parser.parse(pattern) {
265        Ok(h) => h,
266        Err(_) => return BigramQuery::Any,
267    };
268
269    decompose(&hir).query
270}
271
272fn decompose(hir: &Hir) -> HirInfo {
273    let can_be_empty = hir.properties().minimum_len().is_none_or(|n| n == 0);
274
275    match hir.kind() {
276        HirKind::Empty => HirInfo::empty(),
277
278        HirKind::Literal(lit) => decompose_literal(lit.0.as_ref()),
279
280        HirKind::Class(class) => {
281            let bytes = expand_class(class);
282            match bytes {
283                Some(b) if !b.is_empty() => HirInfo {
284                    query: BigramQuery::Any,
285                    first: Some(b.clone()),
286                    last: Some(b),
287                    can_be_empty,
288                },
289                _ => HirInfo {
290                    query: BigramQuery::Any,
291                    first: None,
292                    last: None,
293                    can_be_empty,
294                },
295            }
296        }
297
298        HirKind::Look(_) => HirInfo::empty(),
299
300        HirKind::Repetition(rep) => {
301            let inner = decompose(&rep.sub);
302            if rep.min == 0 {
303                HirInfo {
304                    query: BigramQuery::Any,
305                    first: inner.first,
306                    last: inner.last,
307                    can_be_empty: true,
308                }
309            } else {
310                // min >= 1: inner bigrams guaranteed
311                let mut qs = Vec::new();
312                if !inner.query.is_any() {
313                    qs.push(inner.query.clone());
314                }
315                // min >= 2: cross-boundary between consecutive occurrences
316                if rep.min >= 2 {
317                    push_cross_consec(&mut qs, inner.last.as_deref(), inner.first.as_deref());
318                }
319                HirInfo {
320                    query: simplify_and(qs),
321                    first: inner.first,
322                    last: inner.last,
323                    can_be_empty,
324                }
325            }
326        }
327
328        HirKind::Capture(cap) => decompose(&cap.sub),
329
330        HirKind::Concat(parts) => decompose_concat(parts),
331
332        HirKind::Alternation(alts) => decompose_alternation(alts),
333    }
334}
335
336/// Extract bigrams from a literal byte sequence.
337fn decompose_literal(bytes: &[u8]) -> HirInfo {
338    if bytes.is_empty() {
339        return HirInfo::empty();
340    }
341
342    let lower: SmallVec<[u8; 64]> = bytes.iter().map(|b| b.to_ascii_lowercase()).collect();
343
344    if lower.len() == 1 {
345        let b = lower[0];
346        let first = if (32..=126).contains(&b) {
347            Some(SmallVec::from_slice(&[b]))
348        } else {
349            None
350        };
351        return HirInfo {
352            query: BigramQuery::Any,
353            first: first.clone(),
354            last: first,
355            can_be_empty: false,
356        };
357    }
358
359    let mut qs: Vec<BigramQuery> = Vec::new();
360
361    // Consecutive bigrams
362    for w in lower.windows(2) {
363        if let Some(k) = consec_key(w[0], w[1]) {
364            qs.push(BigramQuery::Consec(k));
365        }
366    }
367
368    // Skip-1 bigrams from the literal itself
369    if lower.len() >= 3 {
370        for i in 0..lower.len() - 2 {
371            if let Some(k) = consec_key(lower[i], lower[i + 2]) {
372                qs.push(BigramQuery::Skip1(k));
373            }
374        }
375    }
376
377    let first_byte = lower[0];
378    let last_byte = *lower.last().unwrap();
379
380    HirInfo {
381        query: simplify_and(qs),
382        first: if (32..=126).contains(&first_byte) {
383            Some(SmallVec::from_slice(&[first_byte]))
384        } else {
385            None
386        },
387        last: if (32..=126).contains(&last_byte) {
388            Some(SmallVec::from_slice(&[last_byte]))
389        } else {
390            None
391        },
392        can_be_empty: false,
393    }
394}
395
396fn decompose_concat(parts: &[Hir]) -> HirInfo {
397    if parts.is_empty() {
398        return HirInfo::empty();
399    }
400
401    let infos: Vec<HirInfo> = parts.iter().map(decompose).collect();
402    let mut qs: Vec<BigramQuery> = Vec::new();
403
404    // 1. Collect child bigrams
405    for info in &infos {
406        if !info.query.is_any() {
407            qs.push(info.query.clone());
408        }
409    }
410
411    // 2. Dense cross-boundary between adjacent mandatory parts
412    for pair in infos.windows(2) {
413        if !pair[0].can_be_empty && !pair[1].can_be_empty {
414            push_cross_consec(&mut qs, pair[0].last.as_deref(), pair[1].first.as_deref());
415        }
416    }
417
418    // 3. Sparse-1 cross-boundary: across a single 1-byte-wide middle part.
419    //    Catches `foo.bar` → sparse-1 `(o,b)` across the dot.
420    if parts.len() >= 3 {
421        for i in 0..parts.len() - 2 {
422            let left = &infos[i];
423            let mid = &parts[i + 1];
424            let right = &infos[i + 2];
425
426            let min_len = mid.properties().minimum_len();
427            let max_len = mid.properties().maximum_len();
428            let is_1byte = min_len == Some(1) && max_len == Some(1);
429
430            if is_1byte && !left.can_be_empty && !right.can_be_empty {
431                push_cross_skip1(&mut qs, left.last.as_deref(), right.first.as_deref());
432            }
433        }
434    }
435
436    let first = collect_first(&infos);
437    let last = collect_last(&infos);
438    let can_be_empty = infos.iter().all(|i| i.can_be_empty);
439
440    HirInfo {
441        query: simplify_and(qs),
442        first,
443        last,
444        can_be_empty,
445    }
446}
447
448fn decompose_alternation(alts: &[Hir]) -> HirInfo {
449    if alts.is_empty() {
450        return HirInfo::empty();
451    }
452
453    let infos: Vec<HirInfo> = alts.iter().map(decompose).collect();
454    let query = simplify_or(infos.iter().map(|i| i.query.clone()).collect());
455    let first = merge_byte_sets(infos.iter().map(|i| &i.first));
456    let last = merge_byte_sets(infos.iter().map(|i| &i.last));
457    let can_be_empty = infos.iter().any(|i| i.can_be_empty);
458
459    HirInfo {
460        query,
461        first,
462        last,
463        can_be_empty,
464    }
465}
466
467fn expand_class(class: &Class) -> Option<SmallVec<[u8; MAX_CLASS_EXPAND]>> {
468    let mut bytes: SmallVec<[u8; MAX_CLASS_EXPAND]> = SmallVec::new();
469    match class {
470        Class::Bytes(bc) => {
471            for range in bc.ranges() {
472                let count = (range.end() as usize) - (range.start() as usize) + 1;
473                if bytes.len() + count > MAX_CLASS_EXPAND {
474                    return None;
475                }
476                for b in range.start()..=range.end() {
477                    if (32..=126).contains(&b) {
478                        let lower = b.to_ascii_lowercase();
479                        if !bytes.contains(&lower) {
480                            bytes.push(lower);
481                        }
482                    }
483                }
484            }
485        }
486        Class::Unicode(uc) => {
487            for range in uc.ranges() {
488                let start = range.start() as u32;
489                let end = range.end() as u32;
490                if start > 127 {
491                    continue;
492                }
493                let ascii_end = end.min(126) as u8;
494                let ascii_start = start.max(32) as u8;
495                if ascii_start > ascii_end {
496                    continue;
497                }
498                let count = (ascii_end - ascii_start) as usize + 1;
499                if bytes.len() + count > MAX_CLASS_EXPAND {
500                    return None;
501                }
502                for b in ascii_start..=ascii_end {
503                    let lower = b.to_ascii_lowercase();
504                    if !bytes.contains(&lower) {
505                        bytes.push(lower);
506                    }
507                }
508            }
509        }
510    }
511    if bytes.is_empty() { None } else { Some(bytes) }
512}
513
514/// Push consecutive cross-product bigrams into `qs`.
515fn push_cross_consec(qs: &mut Vec<BigramQuery>, last: Option<&[u8]>, first: Option<&[u8]>) {
516    if let Some(q) = cross_product(last, first, false) {
517        qs.push(q);
518    }
519}
520
521/// Push skip-1 cross-product bigrams into `qs`.
522fn push_cross_skip1(qs: &mut Vec<BigramQuery>, last: Option<&[u8]>, first: Option<&[u8]>) {
523    if let Some(q) = cross_product(last, first, true) {
524        qs.push(q);
525    }
526}
527
528fn cross_product(last: Option<&[u8]>, first: Option<&[u8]>, skip: bool) -> Option<BigramQuery> {
529    let last = last?;
530    let first = first?;
531    let n = last.len() * first.len();
532    if n == 0 || n > MAX_CLASS_EXPAND * MAX_CLASS_EXPAND {
533        return None;
534    }
535
536    let mut bigrams: Vec<BigramQuery> = Vec::with_capacity(n);
537    for &l in last {
538        for &f in first {
539            if let Some(k) = consec_key(l, f) {
540                let node = if skip {
541                    BigramQuery::Skip1(k)
542                } else {
543                    BigramQuery::Consec(k)
544                };
545                bigrams.push(node);
546            }
547        }
548    }
549
550    match bigrams.len() {
551        0 => None,
552        1 => Some(bigrams.into_iter().next().unwrap()),
553        _ => Some(simplify_or(bigrams)),
554    }
555}
556
557fn collect_first(infos: &[HirInfo]) -> Option<SmallVec<[u8; MAX_CLASS_EXPAND]>> {
558    let mut result: SmallVec<[u8; MAX_CLASS_EXPAND]> = SmallVec::new();
559    for info in infos {
560        if let Some(ref bytes) = info.first {
561            for &b in bytes {
562                if !result.contains(&b) {
563                    if result.len() >= MAX_CLASS_EXPAND {
564                        return None;
565                    }
566                    result.push(b);
567                }
568            }
569        } else if !info.can_be_empty {
570            return None;
571        }
572        if !info.can_be_empty {
573            break;
574        }
575    }
576    if result.is_empty() {
577        None
578    } else {
579        Some(result)
580    }
581}
582
583fn collect_last(infos: &[HirInfo]) -> Option<SmallVec<[u8; MAX_CLASS_EXPAND]>> {
584    let mut result: SmallVec<[u8; MAX_CLASS_EXPAND]> = SmallVec::new();
585    for info in infos.iter().rev() {
586        if let Some(ref bytes) = info.last {
587            for &b in bytes {
588                if !result.contains(&b) {
589                    if result.len() >= MAX_CLASS_EXPAND {
590                        return None;
591                    }
592                    result.push(b);
593                }
594            }
595        } else if !info.can_be_empty {
596            return None;
597        }
598        if !info.can_be_empty {
599            break;
600        }
601    }
602    if result.is_empty() {
603        None
604    } else {
605        Some(result)
606    }
607}
608
609fn merge_byte_sets<'a>(
610    iter: impl Iterator<Item = &'a Option<SmallVec<[u8; MAX_CLASS_EXPAND]>>>,
611) -> Option<SmallVec<[u8; MAX_CLASS_EXPAND]>> {
612    let mut result: SmallVec<[u8; MAX_CLASS_EXPAND]> = SmallVec::new();
613    for opt in iter {
614        match opt {
615            None => return None,
616            Some(bytes) => {
617                for &b in bytes {
618                    if !result.contains(&b) {
619                        if result.len() >= MAX_CLASS_EXPAND {
620                            return None;
621                        }
622                        result.push(b);
623                    }
624                }
625            }
626        }
627    }
628    if result.is_empty() {
629        None
630    } else {
631        Some(result)
632    }
633}
634
635fn simplify_and(children: Vec<BigramQuery>) -> BigramQuery {
636    let mut flat: Vec<BigramQuery> = Vec::new();
637    for child in children {
638        match child {
639            BigramQuery::Any => {}
640            BigramQuery::And(inner) => flat.extend(inner),
641            other => flat.push(other),
642        }
643    }
644    match flat.len() {
645        0 => BigramQuery::Any,
646        1 => flat.into_iter().next().unwrap(),
647        _ => BigramQuery::And(flat),
648    }
649}
650
651fn simplify_or(children: Vec<BigramQuery>) -> BigramQuery {
652    if children.iter().any(|c| c.is_any()) {
653        return BigramQuery::Any;
654    }
655    let mut flat: Vec<BigramQuery> = Vec::new();
656    for child in children {
657        match child {
658            BigramQuery::Or(inner) => flat.extend(inner),
659            other => flat.push(other),
660        }
661    }
662    match flat.len() {
663        0 => BigramQuery::Any,
664        1 => flat.into_iter().next().unwrap(),
665        _ => BigramQuery::Or(flat),
666    }
667}
668
669#[cfg(test)]
670mod tests {
671    use super::*;
672    use crate::bigram_filter::BigramIndexBuilder;
673
674    /// Build a tiny index from the given file contents for testing.
675    fn build_test_index(files: &[&[u8]]) -> BigramFilter {
676        let n = files.len();
677        let consec_builder = BigramIndexBuilder::new(n);
678        let skip_builder = BigramIndexBuilder::new(n);
679        for (i, content) in files.iter().enumerate() {
680            consec_builder.add_file_content(&skip_builder, i, content);
681        }
682        let mut idx = consec_builder.compress(Some(0));
683        idx.set_skip_index(skip_builder.compress(Some(0)));
684        idx
685    }
686
687    #[test]
688    fn literal_pattern() {
689        let idx = build_test_index(&[
690            b"hello world",     // 0: contains "hello"
691            b"goodbye world",   // 1: no "hello"
692            b"say hello there", // 2: contains "hello"
693        ]);
694
695        let q = regex_to_bigram_query("hello");
696        assert!(!q.is_any());
697
698        let candidates = q.evaluate(&idx).unwrap();
699        assert!(BigramFilter::is_candidate(&candidates, 0));
700        assert!(!BigramFilter::is_candidate(&candidates, 1));
701        assert!(BigramFilter::is_candidate(&candidates, 2));
702    }
703
704    #[test]
705    fn alternation() {
706        let idx = build_test_index(&[
707            b"has foo in it", // 0
708            b"has bar in it", // 1
709            b"has xyz in it", // 2
710        ]);
711
712        let q = regex_to_bigram_query("foo|bar");
713        assert!(!q.is_any());
714
715        let candidates = q.evaluate(&idx).unwrap();
716        assert!(BigramFilter::is_candidate(&candidates, 0));
717        assert!(BigramFilter::is_candidate(&candidates, 1));
718        // xyz doesn't contain foo or bar bigrams
719        assert!(!BigramFilter::is_candidate(&candidates, 2));
720    }
721
722    #[test]
723    fn wildcard_concat() {
724        let idx = build_test_index(&[
725            b"foo something bar", // 0
726            b"foo only",          // 1: has foo but not bar
727            b"only bar",          // 2: has bar but not foo
728        ]);
729
730        let q = regex_to_bigram_query("foo.*bar");
731        assert!(!q.is_any());
732
733        let candidates = q.evaluate(&idx).unwrap();
734        assert!(BigramFilter::is_candidate(&candidates, 0));
735        // file 1 and 2 should be filtered (missing bigrams from "bar" / "foo")
736        assert!(!BigramFilter::is_candidate(&candidates, 1));
737        assert!(!BigramFilter::is_candidate(&candidates, 2));
738    }
739
740    #[test]
741    fn sparse1_across_dot() {
742        // "a.b" should produce a skip-1 bigram (a,b)
743        let idx = build_test_index(&[
744            b"axb", // 0: has sparse-1 (a,b)
745            b"ayb", // 1: has sparse-1 (a,b)
746            b"xyz", // 2: no (a,b) at all
747        ]);
748
749        let q = regex_to_bigram_query("a.b");
750        assert!(!q.is_any());
751
752        let candidates = q.evaluate(&idx).unwrap();
753        assert!(BigramFilter::is_candidate(&candidates, 0));
754        assert!(BigramFilter::is_candidate(&candidates, 1));
755        assert!(!BigramFilter::is_candidate(&candidates, 2));
756    }
757
758    #[test]
759    fn sparse1_across_digit() {
760        // "foo\dbar" → sparse-1 (o,b) across \d
761        let idx = build_test_index(&[
762            b"foo3bar baz", // 0: has all bigrams
763            b"foobar baz",  // 1: has consecutive (o,b) but pattern needs sparse-1
764            b"xyz only",    // 2: no relevant bigrams
765        ]);
766
767        let q = regex_to_bigram_query(r"foo\dbar");
768        assert!(!q.is_any());
769
770        let candidates = q.evaluate(&idx).unwrap();
771        assert!(BigramFilter::is_candidate(&candidates, 0));
772        // file 1 may or may not match depending on what bigrams are in the index
773        // (it has all the literal bigrams and also o,b as both consec and skip-1)
774        // The important thing is file 2 is excluded:
775        assert!(!BigramFilter::is_candidate(&candidates, 2));
776    }
777
778    #[test]
779    fn pure_wildcard_is_any() {
780        let q = regex_to_bigram_query(".*");
781        assert!(q.is_any());
782    }
783
784    #[test]
785    fn single_char_is_any() {
786        let q = regex_to_bigram_query("a");
787        assert!(q.is_any());
788    }
789
790    #[test]
791    fn invalid_regex_is_any() {
792        let q = regex_to_bigram_query("[invalid");
793        assert!(q.is_any());
794    }
795
796    #[test]
797    fn optional_group_excluded() {
798        // (bar)? is optional — its bigrams are not required
799        let q = regex_to_bigram_query("foo(bar)?baz");
800        assert!(!q.is_any());
801
802        let idx = build_test_index(&[
803            b"foobaz content",    // 0: has foo+baz bigrams (bar absent)
804            b"foobarbaz content", // 1: has everything
805            b"xyz only",          // 2: nothing
806        ]);
807
808        let candidates = q.evaluate(&idx).unwrap();
809        assert!(BigramFilter::is_candidate(&candidates, 0));
810        assert!(BigramFilter::is_candidate(&candidates, 1));
811        assert!(!BigramFilter::is_candidate(&candidates, 2));
812    }
813
814    #[test]
815    fn repetition_min2_cross_boundary() {
816        // (ab){2,} → bigram "ab" + cross-boundary "b","a"
817        let q = regex_to_bigram_query("(ab){2,}");
818        assert!(!q.is_any());
819
820        let idx = build_test_index(&[
821            b"ababab", // 0: has "ab" and "b"->"a"
822            b"abonly", // 1: has "ab" but not "b"->"a"
823            b"xyz",    // 2: nothing
824        ]);
825
826        let candidates = q.evaluate(&idx).unwrap();
827        assert!(BigramFilter::is_candidate(&candidates, 0));
828        assert!(!BigramFilter::is_candidate(&candidates, 2));
829    }
830
831    #[test]
832    fn two_dots_no_sparse1() {
833        // "a..b" — two 1-byte parts between a and b, not a single 1-byte part
834        // No sparse-1 (a,b) should be extracted
835        let q = regex_to_bigram_query("a..b");
836        // Single-char literals with 2 unknown bytes between → Any
837        assert!(q.is_any());
838    }
839
840    #[test]
841    fn character_class_cross_boundary() {
842        // [abc]de → cross-boundary OR(ad,bd,cd) + bigram de
843        // All three class variants must appear in the corpus so the OR
844        // branches are tracked in the index (untracked bigrams make the
845        // OR conservatively return None, which is correct but untestable).
846        let idx = build_test_index(&[
847            b"ade content", // 0: has ad
848            b"bde content", // 1: has bd
849            b"cde content", // 2: has cd
850            b"xde content", // 3: has de but not ad/bd/cd
851        ]);
852
853        let q = regex_to_bigram_query("[abc]de");
854        assert!(!q.is_any());
855
856        let candidates = q.evaluate(&idx).unwrap();
857        assert!(BigramFilter::is_candidate(&candidates, 0));
858        assert!(BigramFilter::is_candidate(&candidates, 1));
859        assert!(BigramFilter::is_candidate(&candidates, 2));
860        // file 3 doesn't have ad/bd/cd so should be filtered
861        assert!(!BigramFilter::is_candidate(&candidates, 3));
862    }
863
864    // ── Helpers for inspecting query trees ──────────────────────────
865
866    fn has_consec(q: &BigramQuery, a: u8, b: u8) -> bool {
867        let Some(key) = consec_key(a, b) else {
868            return false;
869        };
870        match q {
871            BigramQuery::Consec(k) => *k == key,
872            BigramQuery::And(cs) | BigramQuery::Or(cs) => cs.iter().any(|c| has_consec(c, a, b)),
873            _ => false,
874        }
875    }
876
877    fn has_skip1(q: &BigramQuery, a: u8, b: u8) -> bool {
878        let Some(key) = consec_key(a, b) else {
879            return false;
880        };
881        match q {
882            BigramQuery::Skip1(k) => *k == key,
883            BigramQuery::And(cs) | BigramQuery::Or(cs) => cs.iter().any(|c| has_skip1(c, a, b)),
884            _ => false,
885        }
886    }
887
888    /// Bigram expectation: `("ab", is_skip1)`.
889    /// The 2-char str is the byte pair; C = consecutive, S = skip-1.
890    type Bg = (&'static str, bool);
891    const C: bool = false;
892    const S: bool = true;
893
894    /// Top 15+ commonly used regex patterns from
895    /// https://digitalfortress.tech/tips/top-15-commonly-used-regex/
896    /// plus typical grep patterns used by agentic tools.
897    ///
898    /// Each entry: `(regex, Option<&[Bg]>)`.
899    ///   - `None`        → pure classes / unsupported syntax, Any is acceptable.
900    ///   - `Some(&[..])` → must be non-Any, and every listed bigram must appear.
901    #[test]
902    fn common_regex_patterns() {
903        #[rustfmt::skip]
904        let cases: &[(&str, Option<&[Bg]>)] = &[
905            // ── Pure-class / anchor / unsupported → Any is fine ──────
906            (r"^\d+$",                                                      None), // 1.  whole numbers
907            (r"^\d*\.\d+$",                                                 None), // 2.  decimals
908            (r"^\d*(\.\d+)?$",                                              None), // 3.  whole + decimal
909            (r"^-?\d*(\.\d+)?$",                                            None), // 4.  neg/pos decimal
910            (r"[-]?[0-9]+[,.]?[0-9]*([/][0-9]+[,.]?[0-9]*)*",             None), // 5.  fractions
911            (r"^[a-zA-Z0-9]*$",                                             None), // 6.  alphanumeric
912            (r"^[a-zA-Z0-9 ]*$",                                            None), // 7.  alphanum + space
913            (r"^([a-zA-Z0-9._%-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})*$",      None), // 8.  email
914            (r"^([a-z0-9_\.\+-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$",         None), // 9.  email v2
915            (r"(?=(.*[0-9]))(?=.*[!@#$%^&*()\[\]{}\-_+=~`|:;<>,./?\x5c])(?=.*[a-z])(?=(.*[A-Z]))(?=(.*)).{8,}", None), // 10. complex pw
916            (r"(?=(.*[0-9]))((?=.*[A-Za-z0-9])(?=.*[A-Z])(?=.*[a-z]))^.{8,}$", None), // 11. moderate pw
917            (r"^[a-z0-9_-]{3,16}$",                                        None), // 12. username
918            (r"(https?://)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", None), // 14. URL optional
919            (r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$", None), // 15. IPv4
920            (r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))", None), // 16. IPv6
921            (r"[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])",          None), // 17. date
922            (r"^(0?[1-9]|1[0-2]):[0-5][0-9]$",                            None), // 18. time 12h
923            (r"((1[0-2]|0?[1-9]):([0-5][0-9]) ?([AaPp][Mm]))",            None), // 19. time AM/PM
924            (r"^(0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]$",                     None), // 20. time 24h
925            (r"^([0-9]|0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]$",               None), // 21. time 24h v2
926            (r"(?:[01]\d|2[0123]):(?:[012345]\d):(?:[012345]\d)",          None), // 22. time+sec
927            (r"</?[\w\s]*>|<.+[\W]>",                                      None), // 23. HTML tag
928            (r"\bon\w+=\S+(?=.*>)",                                        None), // 24. inline JS
929            (r"^[a-z0-9]+(?:-[a-z0-9]+)*$",                               None), // 25. slug
930            (r"(\b\w+\b)(?=.*\b\1\b)",                                    None), // 26. dup words
931            (r"^[\w,\s-]+\.[A-Za-z]{3}$",                                 None), // 27. filename
932            (r"^[A-PR-WY][1-9]\d\s?\d{4}[1-9]$",                         None), // 28. HK ID
933
934            // ── Patterns with extractable literal bigrams ────────────
935
936            // 13. URL with required protocol
937            (r"https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", Some(&[
938                ("ht", C), ("tt", C), ("tp", C),   // from "http"
939                ("ht", S), ("tp", S),               // from "http" skip-1
940                (":/", C), ("//", C),               // from "://"
941            ])),
942
943            // 29. fn\s+\w+
944            (r"fn\s+\w+", Some(&[
945                ("fn", C),                          // from "fn"
946                ("n ", C),                          // cross-boundary: 'n' → \s starts ' '
947            ])),
948
949            // 30. use\s+crate::
950            (r"use\s+crate::", Some(&[
951                ("us", C), ("se", C), ("ue", S),   // from "use"
952                ("cr", C), ("ra", C), ("at", C),   // from "crate"
953                ("te", C), ("::", C),
954                ("ca", S), ("rt", S), ("ae", S),   // "crate" skip-1
955            ])),
956
957            // 31. unwrap\(\)|expect\(
958            (r"unwrap\(\)|expect\(", Some(&[
959                ("nw", C), ("wr", C), ("ra", C),   // "unwrap("
960                ("ap", C), ("p(", C),
961                ("xp", C), ("pe", C), ("ec", C),   // "expect("
962                ("ct", C), ("t(", C),
963            ])),
964
965            // 32. TODO|FIXME|HACK
966            (r"TODO|FIXME|HACK", Some(&[
967                ("to", C), ("od", C), ("do", C),   // "TODO"
968                ("fi", C), ("ix", C), ("xm", C),   // "FIXME"
969                ("me", C),
970                ("ha", C), ("ac", C), ("ck", C),   // "HACK"
971                ("hc", S), ("ak", S),               // "HACK" skip-1
972            ])),
973        ];
974
975        for (i, &(pattern, expected)) in cases.iter().enumerate() {
976            let q = regex_to_bigram_query(pattern);
977
978            if let Some(bigrams) = expected {
979                assert!(
980                    !q.is_any(),
981                    "#{i} {pattern:?}: expected bigrams but got Any"
982                );
983
984                for &(pair, skip) in bigrams {
985                    let b = pair.as_bytes();
986                    debug_assert_eq!(b.len(), 2, "bigram must be 2 chars: {pair:?}");
987                    let found = if skip {
988                        has_skip1(&q, b[0], b[1])
989                    } else {
990                        has_consec(&q, b[0], b[1])
991                    };
992                    let kind = if skip { "skip-1" } else { "consec" };
993                    assert!(found, "#{i} {pattern:?}: missing {kind} bigram {pair:?}");
994                }
995            }
996        }
997    }
998}