Skip to main content

rgx/
query.rs

1//! Turn a regex pattern into a boolean trigram query that every matching file must satisfy.
2//!
3//! Soundness is the whole game: the query may be satisfied by files that don't actually match
4//! (ripgrep filters those out), but it must **never** exclude a file that does match. We get this
5//! from `regex-syntax`'s literal extraction, which returns a *complete* over-approximation of a
6//! regex's prefixes (or suffixes) or else reports that it cannot — in which case we fall back to
7//! "match everything" (`Query::All`).
8//!
9//! Construction: every match starts with one of the extracted prefix literals AND ends with one of
10//! the suffix literals. Each is an independently necessary condition, so their conjunction is
11//! necessary. For a literal of length >= 3 we require all its trigrams (an AND); across the
12//! alternatives we OR; a literal shorter than 3 (or an unbounded literal set) carries no trigram
13//! constraint, collapsing that side to `All`. See `docs/index-and-storage.md` (section 2.3).
14
15use crate::trigram::{self, Trigram};
16use regex_syntax::ParserBuilder;
17use regex_syntax::hir::literal::{ExtractKind, Extractor, Seq};
18
19/// Options that affect how a pattern is parsed (mirroring the ripgrep flags that matter for
20/// literal extraction).
21#[derive(Debug, Clone, Copy, Default)]
22pub struct Options {
23    pub case_insensitive: bool,
24    pub multi_line: bool,
25    pub dot_matches_new_line: bool,
26}
27
28/// A boolean formula over trigrams. `All` means "no constraint" — scan everything (the safe
29/// fallback). Leaves are individual trigrams that must be present.
30#[derive(Debug, Clone, PartialEq, Eq)]
31pub enum Query {
32    All,
33    Tri(Trigram),
34    And(Vec<Query>),
35    Or(Vec<Query>),
36}
37
38impl Query {
39    /// Build a trigram query for `pattern`. Any condition we can't reason about soundly yields
40    /// `Query::All` (full scan) rather than risking a missed match.
41    pub fn for_pattern(pattern: &str, opts: Options) -> Query {
42        let hir = match ParserBuilder::new()
43            .case_insensitive(opts.case_insensitive)
44            .multi_line(opts.multi_line)
45            .dot_matches_new_line(opts.dot_matches_new_line)
46            .build()
47            .parse(pattern)
48        {
49            Ok(h) => h,
50            Err(_) => return Query::All, // unsupported syntax (lookaround, backrefs, ...) -> scan
51        };
52        let prefix = seq_query(&Extractor::new().kind(ExtractKind::Prefix).extract(&hir));
53        let suffix = seq_query(&Extractor::new().kind(ExtractKind::Suffix).extract(&hir));
54        Query::and(vec![prefix, suffix])
55    }
56
57    /// True if this query imposes no constraint (the engine must fall back to a full scan).
58    pub fn is_fallback(&self) -> bool {
59        matches!(self, Query::All)
60    }
61
62    /// Evaluate against a predicate telling whether a given trigram is present in a file.
63    pub fn eval(&self, present: &impl Fn(Trigram) -> bool) -> bool {
64        match self {
65            Query::All => true,
66            Query::Tri(t) => present(*t),
67            Query::And(qs) => qs.iter().all(|q| q.eval(present)),
68            Query::Or(qs) => qs.iter().any(|q| q.eval(present)),
69        }
70    }
71
72    /// Collect the distinct trigrams referenced anywhere in the query (for index lookups).
73    pub fn trigrams(&self, out: &mut Vec<Trigram>) {
74        match self {
75            Query::All => {}
76            Query::Tri(t) => out.push(*t),
77            Query::And(qs) | Query::Or(qs) => qs.iter().for_each(|q| q.trigrams(out)),
78        }
79    }
80
81    /// AND with `All`-simplification: `All` is the identity, so it's dropped; if nothing remains,
82    /// the result is `All`; a single term collapses to itself.
83    fn and(parts: Vec<Query>) -> Query {
84        let mut kept: Vec<Query> = parts.into_iter().filter(|q| !q.is_fallback()).collect();
85        match kept.len() {
86            0 => Query::All,
87            1 => kept.pop().unwrap(),
88            _ => Query::And(kept),
89        }
90    }
91
92    /// OR with `All`-absorption: if any branch is `All`, the whole disjunction is `All`.
93    fn or(parts: Vec<Query>) -> Query {
94        if parts.is_empty() || parts.iter().any(Query::is_fallback) {
95            return Query::All;
96        }
97        if parts.len() == 1 {
98            return parts.into_iter().next().unwrap();
99        }
100        Query::Or(parts)
101    }
102}
103
104/// Turn an extracted prefix/suffix literal sequence into a trigram query.
105///
106/// A `None` (infinite) sequence means the extractor couldn't bound the literal set — no constraint.
107/// Otherwise every match must begin (or end) with one of the listed literals, so we OR them; each
108/// literal of length >= 3 contributes the AND of its trigrams, and any shorter literal collapses
109/// the disjunction to `All`.
110fn seq_query(seq: &Seq) -> Query {
111    let Some(lits) = seq.literals() else {
112        return Query::All;
113    };
114    if lits.is_empty() {
115        return Query::All;
116    }
117    let mut branches = Vec::with_capacity(lits.len());
118    for lit in lits {
119        let tris = trigram::of_literal(lit.as_bytes());
120        if tris.is_empty() {
121            return Query::All; // a literal shorter than a trigram imposes no constraint
122        }
123        let conj: Vec<Query> = tris.into_iter().map(Query::Tri).collect();
124        branches.push(Query::and(conj));
125    }
126    Query::or(branches)
127}
128
129#[cfg(test)]
130mod tests {
131    use super::*;
132    use crate::trigram;
133
134    fn q(p: &str) -> Query {
135        Query::for_pattern(p, Options::default())
136    }
137
138    #[test]
139    fn literal_requires_all_its_trigrams() {
140        let query = q("IndexWriter");
141        let mut tris = Vec::new();
142        query.trigrams(&mut tris);
143        assert!(!query.is_fallback());
144        // "IndexWriter" has 9 trigrams; prefix==suffix==exact so AND dedups to those.
145        assert!(tris.contains(b"Ind"));
146        assert!(tris.contains(b"ter"));
147    }
148
149    #[test]
150    fn short_and_wildcard_patterns_fall_back() {
151        assert!(q("ab").is_fallback()); // < 3 chars
152        assert!(q(".").is_fallback());
153        assert!(q("\\w+").is_fallback());
154        assert!(q(".*").is_fallback());
155    }
156
157    #[test]
158    fn alternation_with_an_unconstrained_branch_falls_back() {
159        // "foo|.|bar": the "." branch matches anything -> whole query must scan.
160        assert!(q("foo|.|bar").is_fallback());
161    }
162
163    #[test]
164    fn alternation_of_literals_is_an_or() {
165        let query = q("alpha|bravo|gamma");
166        assert!(!query.is_fallback());
167        // alpha present, bravo/gamma absent -> still matches (OR).
168        let set = trigram::distinct(b"xx alpha xx");
169        assert!(query.eval(&|t| set.contains(&t)));
170        let none = trigram::distinct(b"nothing here");
171        assert!(!query.eval(&|t| none.contains(&t)));
172    }
173}