Skip to main content

luci/query/
fuzzy.rs

1//! Fuzzy query: match terms within edit distance (Levenshtein).
2//!
3//! Uses the `levenshtein_automata` crate to build a DFA that accepts
4//! all terms within the specified edit distance. Term enumeration is
5//! done via FST/DFA intersection (`reader.automaton_search`) which
6//! prunes non-matching subtrees.
7//!
8//! Uses **per-segment rewrite**: term enumeration happens inside
9//! `scorer_supplier(reader)` for each segment, not globally at `bind()`.
10//! Matched terms are unioned via [`ConstantScoreMultiTermSupplier`]
11//! (shared `FilterScorer` + `BufferedUnionScorer`), matching Lucene's
12//! `MultiTermQueryConstantScoreBlendedWrapper`. Every matching doc
13//! receives a constant score of 1.0.
14//!
15//! See [[fix-wildcard-fuzzy-quadratic-dedup]],
16//! [[optimization-wildcard-fuzzy-queries]], and
17//! [[optimization-multiterm-constant-score-rewrite]].
18
19use crate::core::{Result, ScoreMode};
20use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder};
21
22use crate::query::multi_term::ConstantScoreMultiTermSupplier;
23use crate::query::{BoundQuery, Query, ScorerSupplier};
24use crate::search::searcher::Searcher;
25use crate::segment::reader::SegmentReader;
26
27/// Fuzzy query: find terms within Levenshtein edit distance.
28pub struct FuzzyQuery {
29    pub field: String,
30    pub value: String,
31    pub fuzziness: u32,
32}
33
34impl Query for FuzzyQuery {
35    fn bind(&self, _searcher: &Searcher, _score_mode: ScoreMode) -> Result<Box<dyn BoundQuery>> {
36        // Compile the Levenshtein DFA once. No segment iteration —
37        // term enumeration happens per-segment in scorer_supplier(reader).
38        let max_dist = self.fuzziness.min(2) as u8;
39        let builder = LevenshteinAutomatonBuilder::new(max_dist, true);
40        let dfa = builder.build_dfa(&self.value);
41
42        Ok(Box::new(BoundFuzzyQuery {
43            field: self.field.clone(),
44            dfa,
45        }))
46    }
47}
48
49/// Bound fuzzy query — defers term enumeration to per-segment
50/// `scorer_supplier(reader)` calls. Matches Lucene's MultiTermQuery model.
51struct BoundFuzzyQuery {
52    field: String,
53    dfa: DFA,
54}
55
56impl BoundQuery for BoundFuzzyQuery {
57    fn scorer_supplier(&self, reader: &SegmentReader) -> Result<Option<Box<dyn ScorerSupplier>>> {
58        let field_id = match reader
59            .header()
60            .fields
61            .iter()
62            .find(|f| f.field_name == self.field)
63            .map(|f| f.field_id)
64        {
65            Some(id) => id,
66            None => return Ok(None),
67        };
68
69        // Enumerate THIS segment's matching terms via FST/DFA
70        // intersection (prunes non-matching subtrees).
71        let terms: Vec<(String, u32)> = reader.automaton_search(field_id, &self.dfa);
72        if terms.is_empty() {
73            return Ok(None);
74        }
75
76        Ok(Some(Box::new(ConstantScoreMultiTermSupplier::new(
77            reader, field_id, terms,
78        ))))
79    }
80}
81
82#[cfg(test)]
83mod tests {
84    use super::*;
85    use crate::analysis::Token;
86    use crate::core::{FieldId, SegmentId};
87    use crate::mapping::{FieldType, Mapping};
88    use crate::segment::builder::SegmentBuilder;
89    use crate::segment::reader::SegmentReader;
90
91    #[test]
92    fn fuzzy_query_basic() {
93        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
94        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
95        for tag in &["technology", "technlogy", "techology", "science", "tennis"] {
96            builder.add_document(
97                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
98                b"{}",
99            );
100        }
101        let reader = SegmentReader::open(builder.build()).unwrap();
102        let store = crate::search::segment_store::SegmentStore::new(
103            vec![reader],
104            crate::analysis::AnalyzerRegistry::new(),
105            None,
106            None,
107        );
108        let searcher = Searcher::new(&store);
109
110        let results = searcher
111            .search_query(
112                &FuzzyQuery {
113                    field: "tag".into(),
114                    value: "technology".into(),
115                    fuzziness: 1,
116                },
117                10,
118                0,
119            )
120            .unwrap();
121        assert_eq!(results.total_hits.value, 3); // technology (0), technlogy (1), techology (1)
122    }
123
124    #[test]
125    fn fuzzy_query_fuzziness_2() {
126        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
127        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
128        for tag in &["apple", "aple", "apples", "apply", "orange"] {
129            builder.add_document(
130                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
131                b"{}",
132            );
133        }
134        let reader = SegmentReader::open(builder.build()).unwrap();
135        let store = crate::search::segment_store::SegmentStore::new(
136            vec![reader],
137            crate::analysis::AnalyzerRegistry::new(),
138            None,
139            None,
140        );
141        let searcher = Searcher::new(&store);
142
143        let results = searcher
144            .search_query(
145                &FuzzyQuery {
146                    field: "tag".into(),
147                    value: "apple".into(),
148                    fuzziness: 2,
149                },
150                10,
151                0,
152            )
153            .unwrap();
154        // apple(0), aple(1), apples(1), apply(2)
155        assert_eq!(results.total_hits.value, 4);
156    }
157
158    #[test]
159    fn fuzzy_constant_score_all_ones() {
160        // Multi-term fuzzy match: every hit should score exactly 1.0,
161        // matching ES's CONSTANT_SCORE_BLENDED_REWRITE behavior.
162        // See [[optimization-multiterm-constant-score-rewrite]].
163        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
164        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
165        for tag in &["cat", "bat", "hat", "dog"] {
166            builder.add_document(
167                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
168                b"{}",
169            );
170        }
171        let reader = SegmentReader::open(builder.build()).unwrap();
172        let store = crate::search::segment_store::SegmentStore::new(
173            vec![reader],
174            crate::analysis::AnalyzerRegistry::new(),
175            None,
176            None,
177        );
178        let searcher = Searcher::new(&store);
179
180        let results = searcher
181            .search_query(
182                &FuzzyQuery {
183                    field: "tag".into(),
184                    value: "cat".into(),
185                    fuzziness: 1,
186                },
187                10,
188                0,
189            )
190            .unwrap();
191        // cat(0), bat(1), hat(1) — 3 matches within edit distance 1
192        assert_eq!(results.total_hits.value, 3);
193        for hit in &results.hits {
194            assert_eq!(
195                hit.score, 1.0,
196                "fuzzy hit should have constant score 1.0, got {}",
197                hit.score
198            );
199        }
200    }
201
202    #[test]
203    fn fuzzy_no_matches() {
204        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
205        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
206        builder.add_document(
207            &[(FieldId::new(0), vec![Token::new("hello", 0, 5, 0)])],
208            b"{}",
209        );
210        let reader = SegmentReader::open(builder.build()).unwrap();
211        let store = crate::search::segment_store::SegmentStore::new(
212            vec![reader],
213            crate::analysis::AnalyzerRegistry::new(),
214            None,
215            None,
216        );
217        let searcher = Searcher::new(&store);
218
219        let results = searcher
220            .search_query(
221                &FuzzyQuery {
222                    field: "tag".into(),
223                    value: "xyz".into(),
224                    fuzziness: 1,
225                },
226                10,
227                0,
228            )
229            .unwrap();
230        assert_eq!(results.total_hits.value, 0);
231    }
232
233    #[test]
234    fn fuzzy_first_char_substitution() {
235        // Test that fuzzy matches work even when the first character differs
236        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
237        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
238        for tag in &["cat", "bat", "hat", "dog"] {
239            builder.add_document(
240                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
241                b"{}",
242            );
243        }
244        let reader = SegmentReader::open(builder.build()).unwrap();
245        let store = crate::search::segment_store::SegmentStore::new(
246            vec![reader],
247            crate::analysis::AnalyzerRegistry::new(),
248            None,
249            None,
250        );
251        let searcher = Searcher::new(&store);
252
253        let results = searcher
254            .search_query(
255                &FuzzyQuery {
256                    field: "tag".into(),
257                    value: "cat".into(),
258                    fuzziness: 1,
259                },
260                10,
261                0,
262            )
263            .unwrap();
264        // cat(0), bat(1), hat(1) — first-char substitution
265        assert_eq!(results.total_hits.value, 3);
266    }
267}