Skip to main content

luci/query/
regexp.rs

1//! Regexp query: match terms against a regular expression.
2//!
3//! Compiles the pattern to a `RegexAutomaton` (DFA implementing
4//! `fst::Automaton`) and uses `reader.automaton_search` for FST/DFA
5//! intersection — the FST walk prunes non-matching subtrees via
6//! `can_match`, replacing the previous "decode every term, then
7//! `regex.is_match`" full-scan loop with O(matching subtrees) work.
8//! Same primitive `levenshtein_automata::DFA` uses for fuzzy.
9//!
10//! Uses **per-segment rewrite**: term enumeration happens inside
11//! `scorer_supplier(reader)` for each segment, not globally at `bind()`.
12//! Matched terms are unioned via [`ConstantScoreMultiTermSupplier`]
13//! (shared `FilterScorer` + `BufferedUnionScorer`), matching Lucene's
14//! `MultiTermQueryConstantScoreBlendedWrapper`. Every matching doc
15//! receives a constant score of 1.0.
16//!
17//! See [[fix-wildcard-fuzzy-quadratic-dedup]],
18//! [[optimization-multiterm-constant-score-rewrite]], and
19//! [[optimization-regexp-wildcard-fst-automaton]].
20
21use crate::core::{Result, ScoreMode};
22
23use crate::query::multi_term::ConstantScoreMultiTermSupplier;
24use crate::query::regex_automaton::RegexAutomaton;
25use crate::query::{BoundQuery, Query, ScorerSupplier};
26use crate::search::searcher::Searcher;
27use crate::segment::reader::SegmentReader;
28
29/// Regexp query on a field's term dictionary.
30pub struct RegexpQuery {
31    pub field: String,
32    pub pattern: String,
33}
34
35impl Query for RegexpQuery {
36    fn bind(&self, _searcher: &Searcher, _score_mode: ScoreMode) -> Result<Box<dyn BoundQuery>> {
37        // Strip explicit ^...$ anchors from the user pattern — the
38        // RegexAutomaton is anchored implicitly via DFA configuration.
39        let pattern = strip_anchors(&self.pattern);
40        let automaton = RegexAutomaton::new(pattern)?;
41        Ok(Box::new(BoundRegexpQuery {
42            field: self.field.clone(),
43            automaton,
44        }))
45    }
46}
47
48fn strip_anchors(pattern: &str) -> &str {
49    let mut p = pattern;
50    if let Some(stripped) = p.strip_prefix('^') {
51        p = stripped;
52    }
53    if let Some(stripped) = p.strip_suffix('$') {
54        p = stripped;
55    }
56    p
57}
58
59/// Bound regexp query — defers term enumeration to per-segment
60/// `scorer_supplier(reader)` calls via `automaton_search`.
61struct BoundRegexpQuery {
62    field: String,
63    automaton: RegexAutomaton,
64}
65
66impl BoundQuery for BoundRegexpQuery {
67    fn scorer_supplier(&self, reader: &SegmentReader) -> Result<Option<Box<dyn ScorerSupplier>>> {
68        let field_id = match reader
69            .header()
70            .fields
71            .iter()
72            .find(|f| f.field_name == self.field)
73            .map(|f| f.field_id)
74        {
75            Some(id) => id,
76            None => return Ok(None),
77        };
78
79        // FST/DFA intersection — prunes non-matching subtrees via the
80        // automaton's can_match. O(matching subtrees), not O(all terms).
81        let terms: Vec<(String, u32)> = reader.automaton_search(field_id, &self.automaton);
82        if terms.is_empty() {
83            return Ok(None);
84        }
85
86        Ok(Some(Box::new(ConstantScoreMultiTermSupplier::new(
87            reader, field_id, terms,
88        ))))
89    }
90}
91
92#[cfg(test)]
93mod tests {
94    use super::*;
95    use crate::analysis::Token;
96    use crate::core::{FieldId, SegmentId};
97    use crate::mapping::{FieldType, Mapping};
98    use crate::segment::builder::SegmentBuilder;
99    use crate::segment::reader::SegmentReader;
100
101    #[test]
102    fn regexp_basic() {
103        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
104        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
105        for tag in &["technology", "technical", "tennis", "science"] {
106            builder.add_document(
107                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
108                b"{}",
109            );
110        }
111        let reader = SegmentReader::open(builder.build()).unwrap();
112        let store = crate::search::segment_store::SegmentStore::new(
113            vec![reader],
114            crate::analysis::AnalyzerRegistry::new(),
115            None,
116            None,
117        );
118        let searcher = Searcher::new(&store);
119
120        let results = searcher
121            .search_query(
122                &RegexpQuery {
123                    field: "tag".into(),
124                    pattern: "tech.*".into(),
125                },
126                10,
127                0,
128            )
129            .unwrap();
130        assert_eq!(results.total_hits.value, 2); // technology, technical
131    }
132
133    #[test]
134    fn regexp_character_class() {
135        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
136        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
137        for tag in &["cat", "cut", "cot", "cart", "cit"] {
138            builder.add_document(
139                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
140                b"{}",
141            );
142        }
143        let reader = SegmentReader::open(builder.build()).unwrap();
144        let store = crate::search::segment_store::SegmentStore::new(
145            vec![reader],
146            crate::analysis::AnalyzerRegistry::new(),
147            None,
148            None,
149        );
150        let searcher = Searcher::new(&store);
151
152        let results = searcher
153            .search_query(
154                &RegexpQuery {
155                    field: "tag".into(),
156                    pattern: "c[aou]t".into(),
157                },
158                10,
159                0,
160            )
161            .unwrap();
162        assert_eq!(results.total_hits.value, 3); // cat, cut, cot
163    }
164
165    #[test]
166    fn regexp_alternation() {
167        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
168        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
169        for tag in &["red", "blue", "green"] {
170            builder.add_document(
171                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
172                b"{}",
173            );
174        }
175        let reader = SegmentReader::open(builder.build()).unwrap();
176        let store = crate::search::segment_store::SegmentStore::new(
177            vec![reader],
178            crate::analysis::AnalyzerRegistry::new(),
179            None,
180            None,
181        );
182        let searcher = Searcher::new(&store);
183
184        let results = searcher
185            .search_query(
186                &RegexpQuery {
187                    field: "tag".into(),
188                    pattern: "red|blue".into(),
189                },
190                10,
191                0,
192            )
193            .unwrap();
194        assert_eq!(results.total_hits.value, 2);
195    }
196
197    #[test]
198    fn regexp_constant_score_all_ones() {
199        // Multi-term regexp match: every hit should score exactly 1.0,
200        // matching ES's CONSTANT_SCORE_BLENDED_REWRITE behavior.
201        // See [[optimization-multiterm-constant-score-rewrite]].
202        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
203        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
204        for tag in &["cat", "cut", "cot", "cart", "cit"] {
205            builder.add_document(
206                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
207                b"{}",
208            );
209        }
210        let reader = SegmentReader::open(builder.build()).unwrap();
211        let store = crate::search::segment_store::SegmentStore::new(
212            vec![reader],
213            crate::analysis::AnalyzerRegistry::new(),
214            None,
215            None,
216        );
217        let searcher = Searcher::new(&store);
218
219        let results = searcher
220            .search_query(
221                &RegexpQuery {
222                    field: "tag".into(),
223                    pattern: "c[aou]t".into(),
224                },
225                10,
226                0,
227            )
228            .unwrap();
229        // cat, cut, cot — 3 matches
230        assert_eq!(results.total_hits.value, 3);
231        for hit in &results.hits {
232            assert_eq!(
233                hit.score, 1.0,
234                "regexp hit should have constant score 1.0, got {}",
235                hit.score
236            );
237        }
238    }
239
240    #[test]
241    fn regexp_no_matches() {
242        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
243        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
244        builder.add_document(
245            &[(FieldId::new(0), vec![Token::new("hello", 0, 5, 0)])],
246            b"{}",
247        );
248        let reader = SegmentReader::open(builder.build()).unwrap();
249        let store = crate::search::segment_store::SegmentStore::new(
250            vec![reader],
251            crate::analysis::AnalyzerRegistry::new(),
252            None,
253            None,
254        );
255        let searcher = Searcher::new(&store);
256
257        let results = searcher
258            .search_query(
259                &RegexpQuery {
260                    field: "tag".into(),
261                    pattern: "xyz.*".into(),
262                },
263                10,
264                0,
265            )
266            .unwrap();
267        assert_eq!(results.total_hits.value, 0);
268    }
269}