#![cfg_attr(docsrs, feature(doc_auto_cfg))]
use holys3_core::{grams_query, Strategy};
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Query {
All,
None,
And(Vec<Query>),
Or(Vec<Query>),
Gram(Vec<u8>),
}
fn lit_query(lit: &[u8], s: Strategy) -> Query {
let grams = grams_query(lit, s);
if grams.is_empty() {
Query::All
} else {
Query::And(grams.into_iter().map(Query::Gram).collect())
}
}
fn side_query(seq: ®ex_syntax::hir::literal::Seq, strategy: Strategy) -> Option<Query> {
let lits = seq.literals()?;
if lits.is_empty() {
return None;
}
let branches: Vec<Query> = lits
.iter()
.map(|l| lit_query(l.as_bytes(), strategy))
.collect();
if branches.contains(&Query::All) {
None
} else {
Some(Query::Or(branches))
}
}
fn hir_query(hir: ®ex_syntax::hir::Hir, strategy: Strategy) -> Query {
use regex_syntax::hir::literal::{ExtractKind, Extractor};
use regex_syntax::hir::HirKind;
let mut parts: Vec<Query> = [
Extractor::new().extract(hir),
Extractor::new().kind(ExtractKind::Suffix).extract(hir),
]
.iter()
.filter_map(|seq| side_query(seq, strategy))
.collect();
match hir.kind() {
HirKind::Concat(children) => {
parts.extend(children.iter().map(|child| hir_query(child, strategy)));
}
HirKind::Alternation(alternatives) => {
let branches: Vec<Query> = alternatives
.iter()
.map(|alt| hir_query(alt, strategy))
.collect();
if !branches.contains(&Query::All) {
parts.push(Query::Or(branches));
}
}
HirKind::Capture(capture) => parts.push(hir_query(&capture.sub, strategy)),
HirKind::Repetition(rep) if rep.min >= 1 => {
parts.push(hir_query(&rep.sub, strategy));
}
_ => {}
}
let mut unique: Vec<Query> = Vec::new();
for part in parts {
if part != Query::All && !unique.contains(&part) {
unique.push(part);
}
}
match unique.len() {
0 => Query::All,
1 => unique.swap_remove(0),
_ => Query::And(unique),
}
}
pub fn plan(pattern: &str, strategy: Strategy) -> anyhow::Result<Query> {
let hir = regex_syntax::ParserBuilder::new()
.utf8(false)
.build()
.parse(pattern)?;
Ok(hir_query(&hir, strategy))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn wildcard_is_all() {
assert_eq!(plan(".*", Strategy::Sparse).unwrap(), Query::All);
}
#[test]
fn single_char_literal_is_all() {
assert_eq!(plan("a", Strategy::Sparse).unwrap(), Query::All);
}
#[test]
fn literal_yields_gram_conjunction() {
let q = plan("handleClick", Strategy::Sparse).unwrap();
assert!(matches!(q, Query::Or(_)));
assert_ne!(q, Query::All);
}
fn grams_of(q: &Query) -> Vec<Vec<u8>> {
match q {
Query::Gram(g) => vec![g.clone()],
Query::And(children) | Query::Or(children) => {
children.iter().flat_map(grams_of).collect()
}
Query::All | Query::None => Vec::new(),
}
}
#[test]
fn dot_star_gap_constrains_both_sides() {
let q = plan("needle.*haystack", Strategy::Trigram).unwrap();
let Query::And(sides) = &q else {
panic!("expected And of prefix+suffix sides, got {q:?}")
};
assert_eq!(sides.len(), 2);
let grams = grams_of(&q);
assert!(grams.contains(&b"nee".to_vec()), "missing prefix grams");
assert!(grams.contains(&b"hay".to_vec()), "missing suffix grams");
}
#[test]
fn unbounded_suffix_still_uses_prefix() {
let q = plan("needle.*", Strategy::Trigram).unwrap();
assert!(matches!(q, Query::Or(_)));
assert!(grams_of(&q).contains(&b"nee".to_vec()));
}
#[test]
fn alternation_with_common_suffix() {
let q = plan("(http|grpc)_requests_total", Strategy::Trigram).unwrap();
let grams = grams_of(&q);
assert!(
grams.contains(&b"tot".to_vec()),
"suffix constraint dropped"
);
}
#[test]
fn inner_literal_constrains_unanchored_pattern() {
let q = plan(".*ERROR.*", Strategy::Trigram).unwrap();
assert_ne!(q, Query::All, ".*ERROR.* must not scan everything");
assert!(grams_of(&q).contains(&b"ERR".to_vec()));
}
#[test]
fn inner_alternation_constrains() {
let q = plan(".*(quick|lazy).*", Strategy::Trigram).unwrap();
assert_ne!(q, Query::All);
let grams = grams_of(&q);
assert!(grams.contains(&b"qui".to_vec()));
assert!(grams.contains(&b"laz".to_vec()));
}
#[test]
fn optional_inner_literal_stays_all() {
assert_eq!(plan(".*(ERROR)?.*", Strategy::Trigram).unwrap(), Query::All);
}
#[test]
fn repeated_inner_literal_constrains() {
let q = plan(".*(ERROR)+.*", Strategy::Trigram).unwrap();
assert!(grams_of(&q).contains(&b"ERR".to_vec()));
}
}