//! Regex → bigram decomposition for the inverted bigram index.
//!
//! Parses a regex pattern with `regex-syntax`, walks the HIR to extract
//! guaranteed bigram keys (u16), and evaluates them as an AND/OR query tree
//! against [`BigramFilter`]'s inverted posting lists.
//!
//! Two bigram types are extracted:
//! - **Consecutive** (gap=0): adjacent byte pairs `(pattern[i], pattern[i+1])`
//! - **Sparse-1** (gap=1): pairs across a single-byte wildcard, e.g. `a.b → (a,b)`
//!
//! The sparse-1 extraction is the key feature: regex patterns like `foo.bar`
//! yield the cross-boundary sparse-1 bigram `(o,b)` that provides strong
//! filtering even when the `.` prevents any consecutive cross-boundary bigram.
use crate::bigram_filter::BigramFilter;
use regex_syntax::hir::{Class, Hir, HirKind};
use smallvec::SmallVec;
use std::borrow::Cow;
/// Maximum byte values to enumerate from a character class.
/// Larger classes are treated as unknown (no bigram extractable).
const MAX_CLASS_EXPAND: usize = 16;
#[inline]
fn consec_key(a: u8, b: u8) -> Option<u16> {
let al = a.to_ascii_lowercase();
let bl = b.to_ascii_lowercase();
if (32..=126).contains(&al) && (32..=126).contains(&bl) {
Some((al as u16) << 8 | bl as u16)
} else {
None
}
}
#[derive(Debug, Clone)]
pub enum BigramQuery {
Any,
/// A consecutive bigram key to look up in the main index.
Consec(u16),
/// A skip-1 bigram key to look up in the skip sub-index.
Skip1(u16),
/// All children must match (intersect posting lists).
And(Vec<BigramQuery>),
/// At least one child must match (union posting lists).
Or(Vec<BigramQuery>),
}
/// SIMD-friendly bitwise OR of two equal-length bitsets.
#[inline]
fn bitset_or(a: &mut [u64], b: &[u64]) {
a.iter_mut().zip(b.iter()).for_each(|(x, y)| *x |= *y);
}
/// SIMD-friendly bitwise AND of two equal-length bitsets.
#[inline]
fn bitset_and(a: &mut [u64], b: &[u64]) {
a.iter_mut().zip(b.iter()).for_each(|(x, y)| *x &= *y);
}
impl BigramQuery {
pub fn is_any(&self) -> bool {
matches!(self, BigramQuery::Any)
}
pub(crate) fn evaluate(&self, index: &BigramFilter) -> Option<Vec<u64>> {
self.evaluate_cow(index).map(Cow::into_owned)
}
fn evaluate_cow<'a>(&self, index: &'a BigramFilter) -> Option<Cow<'a, [u64]>> {
match self {
BigramQuery::Any => None,
BigramQuery::Consec(key) => {
let col = index.lookup()[*key as usize];
if col == u16::MAX {
return None;
}
let words = index.words();
let offset = col as usize * words;
let data = index.dense_data();
if offset + words > data.len() {
return None;
}
Some(Cow::Borrowed(&data[offset..offset + words]))
}
BigramQuery::Skip1(key) => {
let skip = index.skip_index()?;
let col = skip.lookup()[*key as usize];
if col == u16::MAX {
return None;
}
let words = skip.words();
let offset = col as usize * words;
let data = skip.dense_data();
if offset + words > data.len() {
return None;
}
Some(Cow::Borrowed(&data[offset..offset + words]))
}
BigramQuery::And(children) => {
let mut result: Option<Vec<u64>> = None;
for child in children {
if let Some(child_bits) = child.evaluate_cow(index) {
result = Some(match result {
None => child_bits.into_owned(),
Some(mut r) => {
bitset_and(&mut r, &child_bits);
r
}
});
}
}
result.map(Cow::Owned)
}
BigramQuery::Or(children) => {
if children.is_empty() {
return None;
}
let mut result: Option<Vec<u64>> = None;
for child in children {
match child.evaluate_cow(index) {
// Any branch can't be filtered → whole OR can't be filtered
None => return None,
Some(child_bits) => {
result = Some(match result {
None => child_bits.into_owned(),
Some(mut r) => {
bitset_or(&mut r, &child_bits);
r
}
});
}
}
}
result.map(Cow::Owned)
}
}
}
}
/// Intermediate state tracked during HIR traversal for bigram extraction.
struct HirInfo {
query: BigramQuery,
/// Possible first bytes (lowercased, printable ASCII) when this node matches.
first: Option<SmallVec<[u8; MAX_CLASS_EXPAND]>>,
/// Possible last bytes.
last: Option<SmallVec<[u8; MAX_CLASS_EXPAND]>>,
/// Whether this node can match the empty string.
can_be_empty: bool,
}
impl HirInfo {
fn empty() -> Self {
Self {
query: BigramQuery::Any,
first: None,
last: None,
can_be_empty: true,
}
}
}
/// Prefilter fuzzy query. The algorithm is the following:
/// we allow max_typos = min(len/3,2) every typo destroys at most 2 consecutive bigrams
/// So out of N bigrams at least N - 2 * max_typos have to present in the matching fil
pub(crate) fn fuzzy_to_bigram_query(query: &str, num_probes: usize) -> BigramQuery {
let lower: Vec<u8> = query.bytes().map(|b| b.to_ascii_lowercase()).collect();
if lower.len() < 2 {
return BigramQuery::Any;
}
let max_typos = (lower.len() / 3).min(2);
// Extract all consecutive bigram keys.
let bigram_keys: Vec<u16> = lower
.windows(2)
.filter_map(|w| consec_key(w[0], w[1]))
.collect();
if bigram_keys.is_empty() {
return BigramQuery::Any;
}
// For very short queries (0 typos), AND all bigrams — exact subsequence.
if max_typos == 0 {
return simplify_and(
bigram_keys
.iter()
.map(|&k| BigramQuery::Consec(k))
.collect(),
);
}
// Pick evenly-spaced probe bigrams.
let n = num_probes.min(bigram_keys.len());
if n <= max_typos {
// Too few probes to require anything useful.
return simplify_or(
bigram_keys
.iter()
.map(|&k| BigramQuery::Consec(k))
.collect(),
);
}
let probes: Vec<u16> = if n == bigram_keys.len() {
bigram_keys
} else {
(0..n)
.map(|i| {
let idx = i * (bigram_keys.len() - 1) / (n - 1);
bigram_keys[idx]
})
.collect()
};
let required = n - max_typos;
// If required == n, just AND all probes.
if required >= n {
return simplify_and(probes.iter().map(|&k| BigramQuery::Consec(k)).collect());
}
// Generate all C(n, required) subsets → OR(AND(subset), ...)
let mut branches = Vec::new();
let mut combo = vec![0u16; required];
combine(&probes, required, 0, 0, &mut combo, &mut branches);
simplify_or(branches)
}
/// Build C(n, k) combination branches in-place on a fixed-size slice.
fn combine(
items: &[u16],
k: usize,
start: usize,
depth: usize,
combo: &mut [u16],
branches: &mut Vec<BigramQuery>,
) {
if depth == k {
branches.push(simplify_and(
combo.iter().map(|&key| BigramQuery::Consec(key)).collect(),
));
return;
}
let remaining = k - depth;
for i in start..=items.len() - remaining {
combo[depth] = items[i];
combine(items, k, i + 1, depth + 1, combo, branches);
}
}
pub(crate) fn regex_to_bigram_query(pattern: &str) -> BigramQuery {
let mut parser = regex_syntax::ParserBuilder::new()
.unicode(false)
.utf8(false)
.build();
let hir = match parser.parse(pattern) {
Ok(h) => h,
Err(_) => return BigramQuery::Any,
};
decompose(&hir).query
}
fn decompose(hir: &Hir) -> HirInfo {
let can_be_empty = hir.properties().minimum_len().is_none_or(|n| n == 0);
match hir.kind() {
HirKind::Empty => HirInfo::empty(),
HirKind::Literal(lit) => decompose_literal(lit.0.as_ref()),
HirKind::Class(class) => {
let bytes = expand_class(class);
match bytes {
Some(b) if !b.is_empty() => HirInfo {
query: BigramQuery::Any,
first: Some(b.clone()),
last: Some(b),
can_be_empty,
},
_ => HirInfo {
query: BigramQuery::Any,
first: None,
last: None,
can_be_empty,
},
}
}
HirKind::Look(_) => HirInfo::empty(),
HirKind::Repetition(rep) => {
let inner = decompose(&rep.sub);
if rep.min == 0 {
HirInfo {
query: BigramQuery::Any,
first: inner.first,
last: inner.last,
can_be_empty: true,
}
} else {
// min >= 1: inner bigrams guaranteed
let mut qs = Vec::new();
if !inner.query.is_any() {
qs.push(inner.query.clone());
}
// min >= 2: cross-boundary between consecutive occurrences
if rep.min >= 2 {
push_cross_consec(&mut qs, inner.last.as_deref(), inner.first.as_deref());
}
HirInfo {
query: simplify_and(qs),
first: inner.first,
last: inner.last,
can_be_empty,
}
}
}
HirKind::Capture(cap) => decompose(&cap.sub),
HirKind::Concat(parts) => decompose_concat(parts),
HirKind::Alternation(alts) => decompose_alternation(alts),
}
}
/// Extract bigrams from a literal byte sequence.
fn decompose_literal(bytes: &[u8]) -> HirInfo {
if bytes.is_empty() {
return HirInfo::empty();
}
let lower: SmallVec<[u8; 64]> = bytes.iter().map(|b| b.to_ascii_lowercase()).collect();
if lower.len() == 1 {
let b = lower[0];
let first = if (32..=126).contains(&b) {
Some(SmallVec::from_slice(&[b]))
} else {
None
};
return HirInfo {
query: BigramQuery::Any,
first: first.clone(),
last: first,
can_be_empty: false,
};
}
let mut qs: Vec<BigramQuery> = Vec::new();
// Consecutive bigrams
for w in lower.windows(2) {
if let Some(k) = consec_key(w[0], w[1]) {
qs.push(BigramQuery::Consec(k));
}
}
// Skip-1 bigrams from the literal itself
if lower.len() >= 3 {
for i in 0..lower.len() - 2 {
if let Some(k) = consec_key(lower[i], lower[i + 2]) {
qs.push(BigramQuery::Skip1(k));
}
}
}
let first_byte = lower[0];
let last_byte = *lower.last().unwrap();
HirInfo {
query: simplify_and(qs),
first: if (32..=126).contains(&first_byte) {
Some(SmallVec::from_slice(&[first_byte]))
} else {
None
},
last: if (32..=126).contains(&last_byte) {
Some(SmallVec::from_slice(&[last_byte]))
} else {
None
},
can_be_empty: false,
}
}
fn decompose_concat(parts: &[Hir]) -> HirInfo {
if parts.is_empty() {
return HirInfo::empty();
}
let infos: Vec<HirInfo> = parts.iter().map(decompose).collect();
let mut qs: Vec<BigramQuery> = Vec::new();
// 1. Collect child bigrams
for info in &infos {
if !info.query.is_any() {
qs.push(info.query.clone());
}
}
// 2. Dense cross-boundary between adjacent mandatory parts
for pair in infos.windows(2) {
if !pair[0].can_be_empty && !pair[1].can_be_empty {
push_cross_consec(&mut qs, pair[0].last.as_deref(), pair[1].first.as_deref());
}
}
// 3. Sparse-1 cross-boundary: across a single 1-byte-wide middle part.
// Catches `foo.bar` → sparse-1 `(o,b)` across the dot.
if parts.len() >= 3 {
for i in 0..parts.len() - 2 {
let left = &infos[i];
let mid = &parts[i + 1];
let right = &infos[i + 2];
let min_len = mid.properties().minimum_len();
let max_len = mid.properties().maximum_len();
let is_1byte = min_len == Some(1) && max_len == Some(1);
if is_1byte && !left.can_be_empty && !right.can_be_empty {
push_cross_skip1(&mut qs, left.last.as_deref(), right.first.as_deref());
}
}
}
let first = collect_first(&infos);
let last = collect_last(&infos);
let can_be_empty = infos.iter().all(|i| i.can_be_empty);
HirInfo {
query: simplify_and(qs),
first,
last,
can_be_empty,
}
}
fn decompose_alternation(alts: &[Hir]) -> HirInfo {
if alts.is_empty() {
return HirInfo::empty();
}
let infos: Vec<HirInfo> = alts.iter().map(decompose).collect();
let query = simplify_or(infos.iter().map(|i| i.query.clone()).collect());
let first = merge_byte_sets(infos.iter().map(|i| &i.first));
let last = merge_byte_sets(infos.iter().map(|i| &i.last));
let can_be_empty = infos.iter().any(|i| i.can_be_empty);
HirInfo {
query,
first,
last,
can_be_empty,
}
}
fn expand_class(class: &Class) -> Option<SmallVec<[u8; MAX_CLASS_EXPAND]>> {
let mut bytes: SmallVec<[u8; MAX_CLASS_EXPAND]> = SmallVec::new();
match class {
Class::Bytes(bc) => {
for range in bc.ranges() {
let count = (range.end() as usize) - (range.start() as usize) + 1;
if bytes.len() + count > MAX_CLASS_EXPAND {
return None;
}
for b in range.start()..=range.end() {
if (32..=126).contains(&b) {
let lower = b.to_ascii_lowercase();
if !bytes.contains(&lower) {
bytes.push(lower);
}
}
}
}
}
Class::Unicode(uc) => {
for range in uc.ranges() {
let start = range.start() as u32;
let end = range.end() as u32;
if start > 127 {
continue;
}
let ascii_end = end.min(126) as u8;
let ascii_start = start.max(32) as u8;
if ascii_start > ascii_end {
continue;
}
let count = (ascii_end - ascii_start) as usize + 1;
if bytes.len() + count > MAX_CLASS_EXPAND {
return None;
}
for b in ascii_start..=ascii_end {
let lower = b.to_ascii_lowercase();
if !bytes.contains(&lower) {
bytes.push(lower);
}
}
}
}
}
if bytes.is_empty() { None } else { Some(bytes) }
}
/// Push consecutive cross-product bigrams into `qs`.
fn push_cross_consec(qs: &mut Vec<BigramQuery>, last: Option<&[u8]>, first: Option<&[u8]>) {
if let Some(q) = cross_product(last, first, false) {
qs.push(q);
}
}
/// Push skip-1 cross-product bigrams into `qs`.
fn push_cross_skip1(qs: &mut Vec<BigramQuery>, last: Option<&[u8]>, first: Option<&[u8]>) {
if let Some(q) = cross_product(last, first, true) {
qs.push(q);
}
}
fn cross_product(last: Option<&[u8]>, first: Option<&[u8]>, skip: bool) -> Option<BigramQuery> {
let last = last?;
let first = first?;
let n = last.len() * first.len();
if n == 0 || n > MAX_CLASS_EXPAND * MAX_CLASS_EXPAND {
return None;
}
let mut bigrams: Vec<BigramQuery> = Vec::with_capacity(n);
for &l in last {
for &f in first {
if let Some(k) = consec_key(l, f) {
let node = if skip {
BigramQuery::Skip1(k)
} else {
BigramQuery::Consec(k)
};
bigrams.push(node);
}
}
}
match bigrams.len() {
0 => None,
1 => Some(bigrams.into_iter().next().unwrap()),
_ => Some(simplify_or(bigrams)),
}
}
fn collect_first(infos: &[HirInfo]) -> Option<SmallVec<[u8; MAX_CLASS_EXPAND]>> {
let mut result: SmallVec<[u8; MAX_CLASS_EXPAND]> = SmallVec::new();
for info in infos {
if let Some(ref bytes) = info.first {
for &b in bytes {
if !result.contains(&b) {
if result.len() >= MAX_CLASS_EXPAND {
return None;
}
result.push(b);
}
}
} else if !info.can_be_empty {
return None;
}
if !info.can_be_empty {
break;
}
}
if result.is_empty() {
None
} else {
Some(result)
}
}
fn collect_last(infos: &[HirInfo]) -> Option<SmallVec<[u8; MAX_CLASS_EXPAND]>> {
let mut result: SmallVec<[u8; MAX_CLASS_EXPAND]> = SmallVec::new();
for info in infos.iter().rev() {
if let Some(ref bytes) = info.last {
for &b in bytes {
if !result.contains(&b) {
if result.len() >= MAX_CLASS_EXPAND {
return None;
}
result.push(b);
}
}
} else if !info.can_be_empty {
return None;
}
if !info.can_be_empty {
break;
}
}
if result.is_empty() {
None
} else {
Some(result)
}
}
fn merge_byte_sets<'a>(
iter: impl Iterator<Item = &'a Option<SmallVec<[u8; MAX_CLASS_EXPAND]>>>,
) -> Option<SmallVec<[u8; MAX_CLASS_EXPAND]>> {
let mut result: SmallVec<[u8; MAX_CLASS_EXPAND]> = SmallVec::new();
for opt in iter {
match opt {
None => return None,
Some(bytes) => {
for &b in bytes {
if !result.contains(&b) {
if result.len() >= MAX_CLASS_EXPAND {
return None;
}
result.push(b);
}
}
}
}
}
if result.is_empty() {
None
} else {
Some(result)
}
}
fn simplify_and(children: Vec<BigramQuery>) -> BigramQuery {
let mut flat: Vec<BigramQuery> = Vec::new();
for child in children {
match child {
BigramQuery::Any => {}
BigramQuery::And(inner) => flat.extend(inner),
other => flat.push(other),
}
}
match flat.len() {
0 => BigramQuery::Any,
1 => flat.into_iter().next().unwrap(),
_ => BigramQuery::And(flat),
}
}
fn simplify_or(children: Vec<BigramQuery>) -> BigramQuery {
if children.iter().any(|c| c.is_any()) {
return BigramQuery::Any;
}
let mut flat: Vec<BigramQuery> = Vec::new();
for child in children {
match child {
BigramQuery::Or(inner) => flat.extend(inner),
other => flat.push(other),
}
}
match flat.len() {
0 => BigramQuery::Any,
1 => flat.into_iter().next().unwrap(),
_ => BigramQuery::Or(flat),
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::bigram_filter::BigramIndexBuilder;
/// Build a tiny index from the given file contents for testing.
fn build_test_index(files: &[&[u8]]) -> BigramFilter {
let n = files.len();
let consec_builder = BigramIndexBuilder::new(n);
let skip_builder = BigramIndexBuilder::new(n);
for (i, content) in files.iter().enumerate() {
consec_builder.add_file_content(&skip_builder, i, content);
}
let mut idx = consec_builder.compress(Some(0));
idx.set_skip_index(skip_builder.compress(Some(0)));
idx
}
#[test]
fn literal_pattern() {
let idx = build_test_index(&[
b"hello world", // 0: contains "hello"
b"goodbye world", // 1: no "hello"
b"say hello there", // 2: contains "hello"
]);
let q = regex_to_bigram_query("hello");
assert!(!q.is_any());
let candidates = q.evaluate(&idx).unwrap();
assert!(BigramFilter::is_candidate(&candidates, 0));
assert!(!BigramFilter::is_candidate(&candidates, 1));
assert!(BigramFilter::is_candidate(&candidates, 2));
}
#[test]
fn alternation() {
let idx = build_test_index(&[
b"has foo in it", // 0
b"has bar in it", // 1
b"has xyz in it", // 2
]);
let q = regex_to_bigram_query("foo|bar");
assert!(!q.is_any());
let candidates = q.evaluate(&idx).unwrap();
assert!(BigramFilter::is_candidate(&candidates, 0));
assert!(BigramFilter::is_candidate(&candidates, 1));
// xyz doesn't contain foo or bar bigrams
assert!(!BigramFilter::is_candidate(&candidates, 2));
}
#[test]
fn wildcard_concat() {
let idx = build_test_index(&[
b"foo something bar", // 0
b"foo only", // 1: has foo but not bar
b"only bar", // 2: has bar but not foo
]);
let q = regex_to_bigram_query("foo.*bar");
assert!(!q.is_any());
let candidates = q.evaluate(&idx).unwrap();
assert!(BigramFilter::is_candidate(&candidates, 0));
// file 1 and 2 should be filtered (missing bigrams from "bar" / "foo")
assert!(!BigramFilter::is_candidate(&candidates, 1));
assert!(!BigramFilter::is_candidate(&candidates, 2));
}
#[test]
fn sparse1_across_dot() {
// "a.b" should produce a skip-1 bigram (a,b)
let idx = build_test_index(&[
b"axb", // 0: has sparse-1 (a,b)
b"ayb", // 1: has sparse-1 (a,b)
b"xyz", // 2: no (a,b) at all
]);
let q = regex_to_bigram_query("a.b");
assert!(!q.is_any());
let candidates = q.evaluate(&idx).unwrap();
assert!(BigramFilter::is_candidate(&candidates, 0));
assert!(BigramFilter::is_candidate(&candidates, 1));
assert!(!BigramFilter::is_candidate(&candidates, 2));
}
#[test]
fn sparse1_across_digit() {
// "foo\dbar" → sparse-1 (o,b) across \d
let idx = build_test_index(&[
b"foo3bar baz", // 0: has all bigrams
b"foobar baz", // 1: has consecutive (o,b) but pattern needs sparse-1
b"xyz only", // 2: no relevant bigrams
]);
let q = regex_to_bigram_query(r"foo\dbar");
assert!(!q.is_any());
let candidates = q.evaluate(&idx).unwrap();
assert!(BigramFilter::is_candidate(&candidates, 0));
// file 1 may or may not match depending on what bigrams are in the index
// (it has all the literal bigrams and also o,b as both consec and skip-1)
// The important thing is file 2 is excluded:
assert!(!BigramFilter::is_candidate(&candidates, 2));
}
#[test]
fn pure_wildcard_is_any() {
let q = regex_to_bigram_query(".*");
assert!(q.is_any());
}
#[test]
fn single_char_is_any() {
let q = regex_to_bigram_query("a");
assert!(q.is_any());
}
#[test]
fn invalid_regex_is_any() {
let q = regex_to_bigram_query("[invalid");
assert!(q.is_any());
}
#[test]
fn optional_group_excluded() {
// (bar)? is optional — its bigrams are not required
let q = regex_to_bigram_query("foo(bar)?baz");
assert!(!q.is_any());
let idx = build_test_index(&[
b"foobaz content", // 0: has foo+baz bigrams (bar absent)
b"foobarbaz content", // 1: has everything
b"xyz only", // 2: nothing
]);
let candidates = q.evaluate(&idx).unwrap();
assert!(BigramFilter::is_candidate(&candidates, 0));
assert!(BigramFilter::is_candidate(&candidates, 1));
assert!(!BigramFilter::is_candidate(&candidates, 2));
}
#[test]
fn repetition_min2_cross_boundary() {
// (ab){2,} → bigram "ab" + cross-boundary "b","a"
let q = regex_to_bigram_query("(ab){2,}");
assert!(!q.is_any());
let idx = build_test_index(&[
b"ababab", // 0: has "ab" and "b"->"a"
b"abonly", // 1: has "ab" but not "b"->"a"
b"xyz", // 2: nothing
]);
let candidates = q.evaluate(&idx).unwrap();
assert!(BigramFilter::is_candidate(&candidates, 0));
assert!(!BigramFilter::is_candidate(&candidates, 2));
}
#[test]
fn two_dots_no_sparse1() {
// "a..b" — two 1-byte parts between a and b, not a single 1-byte part
// No sparse-1 (a,b) should be extracted
let q = regex_to_bigram_query("a..b");
// Single-char literals with 2 unknown bytes between → Any
assert!(q.is_any());
}
#[test]
fn character_class_cross_boundary() {
// [abc]de → cross-boundary OR(ad,bd,cd) + bigram de
// All three class variants must appear in the corpus so the OR
// branches are tracked in the index (untracked bigrams make the
// OR conservatively return None, which is correct but untestable).
let idx = build_test_index(&[
b"ade content", // 0: has ad
b"bde content", // 1: has bd
b"cde content", // 2: has cd
b"xde content", // 3: has de but not ad/bd/cd
]);
let q = regex_to_bigram_query("[abc]de");
assert!(!q.is_any());
let candidates = q.evaluate(&idx).unwrap();
assert!(BigramFilter::is_candidate(&candidates, 0));
assert!(BigramFilter::is_candidate(&candidates, 1));
assert!(BigramFilter::is_candidate(&candidates, 2));
// file 3 doesn't have ad/bd/cd so should be filtered
assert!(!BigramFilter::is_candidate(&candidates, 3));
}
// ── Helpers for inspecting query trees ──────────────────────────
fn has_consec(q: &BigramQuery, a: u8, b: u8) -> bool {
let Some(key) = consec_key(a, b) else {
return false;
};
match q {
BigramQuery::Consec(k) => *k == key,
BigramQuery::And(cs) | BigramQuery::Or(cs) => cs.iter().any(|c| has_consec(c, a, b)),
_ => false,
}
}
fn has_skip1(q: &BigramQuery, a: u8, b: u8) -> bool {
let Some(key) = consec_key(a, b) else {
return false;
};
match q {
BigramQuery::Skip1(k) => *k == key,
BigramQuery::And(cs) | BigramQuery::Or(cs) => cs.iter().any(|c| has_skip1(c, a, b)),
_ => false,
}
}
/// Bigram expectation: `("ab", is_skip1)`.
/// The 2-char str is the byte pair; C = consecutive, S = skip-1.
type Bg = (&'static str, bool);
const C: bool = false;
const S: bool = true;
/// Top 15+ commonly used regex patterns from
/// https://digitalfortress.tech/tips/top-15-commonly-used-regex/
/// plus typical grep patterns used by agentic tools.
///
/// Each entry: `(regex, Option<&[Bg]>)`.
/// - `None` → pure classes / unsupported syntax, Any is acceptable.
/// - `Some(&[..])` → must be non-Any, and every listed bigram must appear.
#[test]
fn common_regex_patterns() {
#[rustfmt::skip]
let cases: &[(&str, Option<&[Bg]>)] = &[
// ── Pure-class / anchor / unsupported → Any is fine ──────
(r"^\d+$", None), // 1. whole numbers
(r"^\d*\.\d+$", None), // 2. decimals
(r"^\d*(\.\d+)?$", None), // 3. whole + decimal
(r"^-?\d*(\.\d+)?$", None), // 4. neg/pos decimal
(r"[-]?[0-9]+[,.]?[0-9]*([/][0-9]+[,.]?[0-9]*)*", None), // 5. fractions
(r"^[a-zA-Z0-9]*$", None), // 6. alphanumeric
(r"^[a-zA-Z0-9 ]*$", None), // 7. alphanum + space
(r"^([a-zA-Z0-9._%-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})*$", None), // 8. email
(r"^([a-z0-9_\.\+-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$", None), // 9. email v2
(r"(?=(.*[0-9]))(?=.*[!@#$%^&*()\[\]{}\-_+=~`|:;<>,./?\x5c])(?=.*[a-z])(?=(.*[A-Z]))(?=(.*)).{8,}", None), // 10. complex pw
(r"(?=(.*[0-9]))((?=.*[A-Za-z0-9])(?=.*[A-Z])(?=.*[a-z]))^.{8,}$", None), // 11. moderate pw
(r"^[a-z0-9_-]{3,16}$", None), // 12. username
(r"(https?://)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", None), // 14. URL optional
(r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$", None), // 15. IPv4
(r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))", None), // 16. IPv6
(r"[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])", None), // 17. date
(r"^(0?[1-9]|1[0-2]):[0-5][0-9]$", None), // 18. time 12h
(r"((1[0-2]|0?[1-9]):([0-5][0-9]) ?([AaPp][Mm]))", None), // 19. time AM/PM
(r"^(0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]$", None), // 20. time 24h
(r"^([0-9]|0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]$", None), // 21. time 24h v2
(r"(?:[01]\d|2[0123]):(?:[012345]\d):(?:[012345]\d)", None), // 22. time+sec
(r"</?[\w\s]*>|<.+[\W]>", None), // 23. HTML tag
(r"\bon\w+=\S+(?=.*>)", None), // 24. inline JS
(r"^[a-z0-9]+(?:-[a-z0-9]+)*$", None), // 25. slug
(r"(\b\w+\b)(?=.*\b\1\b)", None), // 26. dup words
(r"^[\w,\s-]+\.[A-Za-z]{3}$", None), // 27. filename
(r"^[A-PR-WY][1-9]\d\s?\d{4}[1-9]$", None), // 28. HK ID
// ── Patterns with extractable literal bigrams ────────────
// 13. URL with required protocol
(r"https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", Some(&[
("ht", C), ("tt", C), ("tp", C), // from "http"
("ht", S), ("tp", S), // from "http" skip-1
(":/", C), ("//", C), // from "://"
])),
// 29. fn\s+\w+
(r"fn\s+\w+", Some(&[
("fn", C), // from "fn"
("n ", C), // cross-boundary: 'n' → \s starts ' '
])),
// 30. use\s+crate::
(r"use\s+crate::", Some(&[
("us", C), ("se", C), ("ue", S), // from "use"
("cr", C), ("ra", C), ("at", C), // from "crate"
("te", C), ("::", C),
("ca", S), ("rt", S), ("ae", S), // "crate" skip-1
])),
// 31. unwrap\(\)|expect\(
(r"unwrap\(\)|expect\(", Some(&[
("nw", C), ("wr", C), ("ra", C), // "unwrap("
("ap", C), ("p(", C),
("xp", C), ("pe", C), ("ec", C), // "expect("
("ct", C), ("t(", C),
])),
// 32. TODO|FIXME|HACK
(r"TODO|FIXME|HACK", Some(&[
("to", C), ("od", C), ("do", C), // "TODO"
("fi", C), ("ix", C), ("xm", C), // "FIXME"
("me", C),
("ha", C), ("ac", C), ("ck", C), // "HACK"
("hc", S), ("ak", S), // "HACK" skip-1
])),
];
for (i, &(pattern, expected)) in cases.iter().enumerate() {
let q = regex_to_bigram_query(pattern);
if let Some(bigrams) = expected {
assert!(
!q.is_any(),
"#{i} {pattern:?}: expected bigrams but got Any"
);
for &(pair, skip) in bigrams {
let b = pair.as_bytes();
debug_assert_eq!(b.len(), 2, "bigram must be 2 chars: {pair:?}");
let found = if skip {
has_skip1(&q, b[0], b[1])
} else {
has_consec(&q, b[0], b[1])
};
let kind = if skip { "skip-1" } else { "consec" };
assert!(found, "#{i} {pattern:?}: missing {kind} bigram {pair:?}");
}
}
}
}
}