Skip to main content

luci/query/
wildcard.rs

1//! Wildcard query: match terms using * (any chars) and ? (single char).
2//!
3//! Compiles the wildcard pattern to an unanchored regex string, then to
4//! a `RegexAutomaton` (DFA implementing `fst::Automaton`). Term
5//! enumeration happens via `reader.automaton_search` — FST/DFA
6//! intersection that prunes non-matching subtrees via `can_match`.
7//! Same primitive `levenshtein_automata::DFA` uses for fuzzy and
8//! `RegexpQuery` uses for general regex.
9//!
10//! For trailing-wildcard patterns like `tech*` the DFA's `can_match`
11//! naturally prunes the FST to the literal-prefix subtree (no separate
12//! fast path needed). For non-trailing patterns like `*ology` or
13//! `t?ch` the DFA still drives O(matching subtrees) work — this is
14//! the fix that closes the leading-wildcard performance gap.
15//!
16//! Uses **per-segment rewrite**: term enumeration happens inside
17//! `scorer_supplier(reader)` for each segment, not globally at `bind()`.
18//! Matched terms are unioned via [`ConstantScoreMultiTermSupplier`]
19//! (shared `FilterScorer` + `BufferedUnionScorer`), matching Lucene's
20//! `MultiTermQueryConstantScoreBlendedWrapper`. Every matching doc
21//! receives a constant score of 1.0.
22//!
23//! See [[fix-wildcard-fuzzy-quadratic-dedup]],
24//! [[optimization-multiterm-constant-score-rewrite]], and
25//! [[optimization-regexp-wildcard-fst-automaton]].
26
27use crate::core::{Result, ScoreMode};
28use regex::Regex;
29
30use crate::query::multi_term::ConstantScoreMultiTermSupplier;
31use crate::query::regex_automaton::RegexAutomaton;
32use crate::query::{BoundQuery, Query, ScorerSupplier};
33use crate::search::searcher::Searcher;
34use crate::segment::reader::SegmentReader;
35
36/// Wildcard query: * matches any sequence, ? matches single character.
37pub struct WildcardQuery {
38    pub field: String,
39    pub pattern: String,
40}
41
42impl WildcardQuery {
43    /// Check if a term matches the wildcard pattern (for tests).
44    pub fn matches(pattern: &str, term: &str) -> bool {
45        let regex_pattern = wildcard_to_regex_anchored(pattern);
46        Regex::new(&regex_pattern)
47            .map(|re| re.is_match(term))
48            .unwrap_or(false)
49    }
50}
51
52/// Convert a wildcard pattern to an unanchored regex pattern (for the
53/// `RegexAutomaton` path — anchoring is handled by the DFA's
54/// `StartKind::Anchored` configuration).
55/// `*` → `.*`, `?` → `.`, other regex meta chars are escaped.
56fn wildcard_to_regex_unanchored(pattern: &str) -> String {
57    let mut regex = String::with_capacity(pattern.len() * 2);
58    for ch in pattern.chars() {
59        match ch {
60            '*' => regex.push_str(".*"),
61            '?' => regex.push('.'),
62            '.' | '+' | '(' | ')' | '[' | ']' | '{' | '}' | '\\' | '^' | '$' | '|' => {
63                regex.push('\\');
64                regex.push(ch);
65            }
66            _ => regex.push(ch),
67        }
68    }
69    regex
70}
71
72/// Anchored variant for the legacy `WildcardQuery::matches` test helper.
73fn wildcard_to_regex_anchored(pattern: &str) -> String {
74    let mut s = String::with_capacity(pattern.len() * 2 + 2);
75    s.push('^');
76    s.push_str(&wildcard_to_regex_unanchored(pattern));
77    s.push('$');
78    s
79}
80
81impl Query for WildcardQuery {
82    fn bind(&self, _searcher: &Searcher, _score_mode: ScoreMode) -> Result<Box<dyn BoundQuery>> {
83        let regex_pattern = wildcard_to_regex_unanchored(&self.pattern);
84        let automaton = RegexAutomaton::new(&regex_pattern)?;
85        Ok(Box::new(BoundWildcardQuery {
86            field: self.field.clone(),
87            automaton,
88        }))
89    }
90}
91
92/// Bound wildcard query — defers term enumeration to per-segment
93/// `scorer_supplier(reader)` calls via `automaton_search`.
94struct BoundWildcardQuery {
95    field: String,
96    automaton: RegexAutomaton,
97}
98
99impl BoundQuery for BoundWildcardQuery {
100    fn scorer_supplier(&self, reader: &SegmentReader) -> Result<Option<Box<dyn ScorerSupplier>>> {
101        let field_id = match reader
102            .header()
103            .fields
104            .iter()
105            .find(|f| f.field_name == self.field)
106            .map(|f| f.field_id)
107        {
108            Some(id) => id,
109            None => return Ok(None),
110        };
111
112        // FST/DFA intersection — prunes non-matching subtrees via the
113        // automaton's can_match. Trailing wildcards (tech*) naturally
114        // narrow to the literal-prefix subtree; leading and middle
115        // wildcards (*ology, t?ch) still drive O(matching subtrees)
116        // work instead of O(all terms in field).
117        let terms: Vec<(String, u32)> = reader.automaton_search(field_id, &self.automaton);
118        if terms.is_empty() {
119            return Ok(None);
120        }
121
122        Ok(Some(Box::new(ConstantScoreMultiTermSupplier::new(
123            reader, field_id, terms,
124        ))))
125    }
126}
127
128#[cfg(test)]
129mod tests {
130    use super::*;
131    use crate::analysis::Token;
132    use crate::core::{FieldId, SegmentId};
133    use crate::mapping::{FieldType, Mapping};
134    use crate::segment::builder::SegmentBuilder;
135    use crate::segment::reader::SegmentReader;
136
137    #[test]
138    fn wildcard_matching() {
139        assert!(WildcardQuery::matches("tech*", "technology"));
140        assert!(WildcardQuery::matches("tech*", "tech"));
141        assert!(!WildcardQuery::matches("tech*", "tec"));
142        assert!(WildcardQuery::matches("t?ch", "tech"));
143        assert!(!WildcardQuery::matches("t?ch", "touch"));
144        assert!(WildcardQuery::matches("*fox*", "quick fox jumps"));
145        assert!(WildcardQuery::matches("*", "anything"));
146        assert!(WildcardQuery::matches("?", "x"));
147        assert!(!WildcardQuery::matches("?", ""));
148        assert!(WildcardQuery::matches("a*b", "ab"));
149        assert!(WildcardQuery::matches("a*b", "aXYZb"));
150        assert!(!WildcardQuery::matches("a*b", "aXYZc"));
151    }
152
153    #[test]
154    fn wildcard_query_star() {
155        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
156        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
157        for tag in &["technology", "technical", "tennis", "science"] {
158            builder.add_document(
159                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
160                b"{}",
161            );
162        }
163        let reader = SegmentReader::open(builder.build()).unwrap();
164        let store = crate::search::segment_store::SegmentStore::new(
165            vec![reader],
166            crate::analysis::AnalyzerRegistry::new(),
167            None,
168            None,
169        );
170        let searcher = Searcher::new(&store);
171
172        let results = searcher
173            .search_query(
174                &WildcardQuery {
175                    field: "tag".into(),
176                    pattern: "tech*".into(),
177                },
178                10,
179                0,
180            )
181            .unwrap();
182        assert_eq!(results.total_hits.value, 2); // technology, technical
183    }
184
185    #[test]
186    fn wildcard_query_question_mark() {
187        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
188        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
189        for tag in &["cat", "cut", "cot", "cart"] {
190            builder.add_document(
191                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
192                b"{}",
193            );
194        }
195        let reader = SegmentReader::open(builder.build()).unwrap();
196        let store = crate::search::segment_store::SegmentStore::new(
197            vec![reader],
198            crate::analysis::AnalyzerRegistry::new(),
199            None,
200            None,
201        );
202        let searcher = Searcher::new(&store);
203
204        let results = searcher
205            .search_query(
206                &WildcardQuery {
207                    field: "tag".into(),
208                    pattern: "c?t".into(),
209                },
210                10,
211                0,
212            )
213            .unwrap();
214        assert_eq!(results.total_hits.value, 3); // cat, cut, cot (not cart — 4 chars)
215    }
216
217    #[test]
218    fn wildcard_constant_score_all_ones() {
219        // Multi-term wildcard match: every hit should score exactly 1.0,
220        // matching ES's CONSTANT_SCORE_BLENDED_REWRITE behavior.
221        // See [[optimization-multiterm-constant-score-rewrite]].
222        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
223        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
224        for tag in &["technology", "technical", "tennis", "science"] {
225            builder.add_document(
226                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
227                b"{}",
228            );
229        }
230        let reader = SegmentReader::open(builder.build()).unwrap();
231        let store = crate::search::segment_store::SegmentStore::new(
232            vec![reader],
233            crate::analysis::AnalyzerRegistry::new(),
234            None,
235            None,
236        );
237        let searcher = Searcher::new(&store);
238
239        let results = searcher
240            .search_query(
241                &WildcardQuery {
242                    field: "tag".into(),
243                    pattern: "tech*".into(),
244                },
245                10,
246                0,
247            )
248            .unwrap();
249        assert_eq!(results.total_hits.value, 2);
250        for hit in &results.hits {
251            assert_eq!(
252                hit.score, 1.0,
253                "wildcard hit should have constant score 1.0, got {}",
254                hit.score
255            );
256        }
257    }
258
259    #[test]
260    fn wildcard_high_cardinality_constant_score() {
261        // Stress the multi-term BufferedUnionScorer path (>>1 sub-scorer)
262        // with constant scoring. This is the unit-test analogue of the
263        // UMLS wildcard benchmark — exercises window refilling and
264        // sub-scorer exhaustion logic at unit-test speed.
265        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
266        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
267        for i in 0..1000 {
268            let tag = format!("tag_{i:04}");
269            builder.add_document(
270                &[(FieldId::new(0), vec![Token::new(&tag, 0, tag.len(), 0)])],
271                b"{}",
272            );
273        }
274        let reader = SegmentReader::open(builder.build()).unwrap();
275        let store = crate::search::segment_store::SegmentStore::new(
276            vec![reader],
277            crate::analysis::AnalyzerRegistry::new(),
278            None,
279            None,
280        );
281        let searcher = Searcher::new(&store);
282
283        let results = searcher
284            .search_query(
285                &WildcardQuery {
286                    field: "tag".into(),
287                    pattern: "tag_*".into(),
288                },
289                1000,
290                0,
291            )
292            .unwrap();
293        assert_eq!(results.total_hits.value, 1000);
294        assert_eq!(results.hits.len(), 1000);
295        for hit in &results.hits {
296            assert_eq!(
297                hit.score, 1.0,
298                "high-cardinality wildcard hit should score 1.0, got {}",
299                hit.score
300            );
301        }
302    }
303
304    #[test]
305    fn wildcard_no_matches() {
306        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
307        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
308        builder.add_document(
309            &[(FieldId::new(0), vec![Token::new("hello", 0, 5, 0)])],
310            b"{}",
311        );
312        let reader = SegmentReader::open(builder.build()).unwrap();
313        let store = crate::search::segment_store::SegmentStore::new(
314            vec![reader],
315            crate::analysis::AnalyzerRegistry::new(),
316            None,
317            None,
318        );
319        let searcher = Searcher::new(&store);
320
321        let results = searcher
322            .search_query(
323                &WildcardQuery {
324                    field: "tag".into(),
325                    pattern: "xyz*".into(),
326                },
327                10,
328                0,
329            )
330            .unwrap();
331        assert_eq!(results.total_hits.value, 0);
332    }
333
334    #[test]
335    fn wildcard_leading_wildcard_correctness() {
336        // Regression sentinel: leading-wildcard hit set must be
337        // identical before and after the FST automaton intersection
338        // change. See [[optimization-regexp-wildcard-fst-automaton]].
339        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
340        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
341        for tag in &["technology", "ecology", "biology", "tennis"] {
342            builder.add_document(
343                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
344                b"{}",
345            );
346        }
347        let reader = SegmentReader::open(builder.build()).unwrap();
348        let store = crate::search::segment_store::SegmentStore::new(
349            vec![reader],
350            crate::analysis::AnalyzerRegistry::new(),
351            None,
352            None,
353        );
354        let searcher = Searcher::new(&store);
355
356        let results = searcher
357            .search_query(
358                &WildcardQuery {
359                    field: "tag".into(),
360                    pattern: "*ology".into(),
361                },
362                10,
363                0,
364            )
365            .unwrap();
366        // technology, ecology, biology — all end with "ology"
367        assert_eq!(results.total_hits.value, 3);
368        for hit in &results.hits {
369            assert_eq!(hit.score, 1.0);
370        }
371    }
372
373    #[test]
374    fn wildcard_middle_wildcard_correctness() {
375        // Regression sentinel: middle-wildcard hit set must be
376        // identical before and after the FST automaton intersection
377        // change. See [[optimization-regexp-wildcard-fst-automaton]].
378        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
379        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
380        for tag in &["bat1", "bet1", "bit1", "but2", "bat2"] {
381            builder.add_document(
382                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
383                b"{}",
384            );
385        }
386        let reader = SegmentReader::open(builder.build()).unwrap();
387        let store = crate::search::segment_store::SegmentStore::new(
388            vec![reader],
389            crate::analysis::AnalyzerRegistry::new(),
390            None,
391            None,
392        );
393        let searcher = Searcher::new(&store);
394
395        let results = searcher
396            .search_query(
397                &WildcardQuery {
398                    field: "tag".into(),
399                    pattern: "b?t1".into(),
400                },
401                10,
402                0,
403            )
404            .unwrap();
405        // bat1, bet1, bit1 — all match b?t1; but2 and bat2 don't (end in 2)
406        assert_eq!(results.total_hits.value, 3);
407    }
408}