Skip to main content

jmdict_fast/
query.rs

1use crate::dict::{Dict, MatchCandidate};
2use crate::error::JmdictError;
3use crate::model::{LookupResult, MatchMode};
4use std::vec;
5
/// Upper bound on the Levenshtein edit distance accepted by the fuzzy
/// search builders. The `fst` Levenshtein automaton's DFA grows rapidly
/// with distance; values above 4 are rarely useful and risk large
/// allocations.
///
/// Both [`QueryBuilder::max_distance`] and
/// [`BatchQueryBuilder::max_distance`] clamp their argument to this value.
pub const MAX_FUZZY_DISTANCE: u32 = 4;
11
/// Decides whether a list of tag values satisfies a filter.
///
/// An empty `filter` accepts everything. Otherwise the check succeeds as soon
/// as any `filter` string occurs as a case-sensitive substring of any
/// `haystack` value. This is the same rule the `pos` filter has always used
/// on JMdict codes: `"v"` catches every verb POS, `"v1"` only ichidan.
/// A non-empty filter never matches an empty `haystack`.
fn filter_passes(filter: &[String], haystack: &[String]) -> bool {
    if filter.is_empty() {
        return true;
    }

    for value in haystack {
        for needle in filter {
            if value.contains(needle.as_str()) {
                return true;
            }
        }
    }

    false
}
22
23/// An iterator that lazily deserializes dictionary entries from pre-sorted match candidates.
24pub struct LookupResultIter<'d> {
25    dict: &'d Dict,
26    candidates: vec::IntoIter<MatchCandidate>,
27    common_only: bool,
28    pos_filter: Vec<String>,
29    misc_filter: Vec<String>,
30    field_filter: Vec<String>,
31    dialect_filter: Vec<String>,
32    limit: Option<usize>,
33    yielded: usize,
34}
35
36impl<'d> Iterator for LookupResultIter<'d> {
37    type Item = LookupResult;
38
39    fn next(&mut self) -> Option<Self::Item> {
40        if let Some(limit) = self.limit {
41            if self.yielded >= limit {
42                return None;
43            }
44        }
45
46        let any_sense_filter = !self.pos_filter.is_empty()
47            || !self.misc_filter.is_empty()
48            || !self.field_filter.is_empty()
49            || !self.dialect_filter.is_empty();
50
51        loop {
52            let mc = self.candidates.next()?;
53            let entry = match self.dict.load_entry(mc.id) {
54                Some(e) => e,
55                None => continue,
56            };
57
58            if self.common_only && !entry.is_common() {
59                continue;
60            }
61
62            // Sense-level filters are conjunctive *within* a sense: a single
63            // sense must satisfy every active filter for the entry to match.
64            // This mirrors JMdict's structure where pos/misc/field/dialect
65            // are recorded per sense, so a verb-sense + noun-sense entry
66            // won't match `pos=v` + `misc=abbr` unless one sense is both.
67            if any_sense_filter {
68                let any_match = entry.sense.iter().any(|s| {
69                    filter_passes(&self.pos_filter, &s.part_of_speech)
70                        && filter_passes(&self.misc_filter, &s.misc)
71                        && filter_passes(&self.field_filter, &s.field)
72                        && filter_passes(&self.dialect_filter, &s.dialect)
73                });
74                if !any_match {
75                    continue;
76                }
77            }
78
79            self.yielded += 1;
80            return Some(LookupResult {
81                entry,
82                match_type: mc.match_type,
83                match_key: mc.key,
84                score: mc.score,
85                deinflection: mc.deinflection,
86            });
87        }
88    }
89}
90
91/// A builder for configuring and executing dictionary lookups.
92pub struct QueryBuilder<'d> {
93    dict: &'d Dict,
94    term: String,
95    mode: MatchMode,
96    common_only: bool,
97    pos_filter: Vec<String>,
98    misc_filter: Vec<String>,
99    field_filter: Vec<String>,
100    dialect_filter: Vec<String>,
101    limit: Option<usize>,
102    max_distance: u32,
103}
104
105impl<'d> QueryBuilder<'d> {
106    pub(crate) fn new(dict: &'d Dict, term: impl Into<String>) -> Self {
107        Self {
108            dict,
109            term: term.into(),
110            mode: MatchMode::Exact,
111            common_only: false,
112            pos_filter: Vec::new(),
113            misc_filter: Vec::new(),
114            field_filter: Vec::new(),
115            dialect_filter: Vec::new(),
116            limit: None,
117            max_distance: 2,
118        }
119    }
120
121    /// Set the match mode for this query.
122    pub fn mode(mut self, mode: MatchMode) -> Self {
123        self.mode = mode;
124        self
125    }
126
127    /// Filter to entries where any KanjiEntry or KanaEntry has `common: true`.
128    pub fn common_only(mut self, common: bool) -> Self {
129        self.common_only = common;
130        self
131    }
132
133    /// Filter to entries with matching part_of_speech values in any SenseEntry.
134    pub fn pos(mut self, pos: &[&str]) -> Self {
135        self.pos_filter = pos.iter().map(|s| s.to_string()).collect();
136        self
137    }
138
139    /// Filter to entries with any of the given JMdict `misc` codes
140    /// (e.g. `"uk"` for "usually written in kana", `"abbr"` for abbreviation).
141    pub fn misc(mut self, misc: &[&str]) -> Self {
142        self.misc_filter = misc.iter().map(|s| s.to_string()).collect();
143        self
144    }
145
146    /// Filter to entries with any of the given JMdict `field` codes
147    /// (e.g. `"med"` for medicine, `"comp"` for computing).
148    pub fn field(mut self, field: &[&str]) -> Self {
149        self.field_filter = field.iter().map(|s| s.to_string()).collect();
150        self
151    }
152
153    /// Filter to entries with any of the given JMdict `dialect` codes
154    /// (e.g. `"ksb"` for Kansai-ben, `"ktb"` for Kantou-ben).
155    pub fn dialect(mut self, dialect: &[&str]) -> Self {
156        self.dialect_filter = dialect.iter().map(|s| s.to_string()).collect();
157        self
158    }
159
160    /// Set the maximum edit distance for fuzzy search (default: 2).
161    ///
162    /// Clamped to a maximum of [`MAX_FUZZY_DISTANCE`] to keep the Levenshtein DFA
163    /// from blowing up — the automaton's state space grows quickly with distance.
164    pub fn max_distance(mut self, n: u32) -> Self {
165        self.max_distance = n.min(MAX_FUZZY_DISTANCE);
166        self
167    }
168
169    /// Cap results after filtering and sorting.
170    pub fn limit(mut self, limit: usize) -> Self {
171        self.limit = Some(limit);
172        self
173    }
174
175    /// Execute the query and return all results collected into a Vec.
176    pub fn execute(self) -> Result<Vec<LookupResult>, JmdictError> {
177        Ok(self.execute_iter()?.collect())
178    }
179
180    /// Execute the query and return a lazy iterator that deserializes entries on demand.
181    ///
182    /// This is more memory-efficient than `execute()` for large result sets (e.g., prefix
183    /// or fuzzy queries with many matches), as entries are only deserialized as consumed.
184    pub fn execute_iter(self) -> Result<LookupResultIter<'d>, JmdictError> {
185        let candidates = match self.mode {
186            MatchMode::Exact => self.dict.exact_candidates(&self.term),
187            MatchMode::Prefix => self.dict.prefix_candidates(&self.term),
188            MatchMode::Deinflect => self.dict.deinflect_candidates(&self.term),
189            MatchMode::Fuzzy => self.dict.fuzzy_candidates(&self.term, self.max_distance)?,
190        };
191
192        Ok(LookupResultIter {
193            dict: self.dict,
194            candidates: candidates.into_iter(),
195            common_only: self.common_only,
196            pos_filter: self.pos_filter,
197            misc_filter: self.misc_filter,
198            field_filter: self.field_filter,
199            dialect_filter: self.dialect_filter,
200            limit: self.limit,
201            yielded: 0,
202        })
203    }
204}
205
206/// A builder for configuring and executing batch dictionary lookups.
207pub struct BatchQueryBuilder<'d> {
208    dict: &'d Dict,
209    terms: Vec<String>,
210    mode: MatchMode,
211    common_only: bool,
212    pos_filter: Vec<String>,
213    misc_filter: Vec<String>,
214    field_filter: Vec<String>,
215    dialect_filter: Vec<String>,
216    limit: Option<usize>,
217    max_distance: u32,
218}
219
220impl<'d> BatchQueryBuilder<'d> {
221    pub(crate) fn new(dict: &'d Dict, terms: Vec<String>) -> Self {
222        Self {
223            dict,
224            terms,
225            mode: MatchMode::Exact,
226            common_only: false,
227            pos_filter: Vec::new(),
228            misc_filter: Vec::new(),
229            field_filter: Vec::new(),
230            dialect_filter: Vec::new(),
231            limit: None,
232            max_distance: 2,
233        }
234    }
235
236    /// Set the match mode for this batch query.
237    pub fn mode(mut self, mode: MatchMode) -> Self {
238        self.mode = mode;
239        self
240    }
241
242    /// Filter to entries where any KanjiEntry or KanaEntry has `common: true`.
243    pub fn common_only(mut self, common: bool) -> Self {
244        self.common_only = common;
245        self
246    }
247
248    /// Filter to entries with matching part_of_speech values in any SenseEntry.
249    pub fn pos(mut self, pos: &[&str]) -> Self {
250        self.pos_filter = pos.iter().map(|s| s.to_string()).collect();
251        self
252    }
253
254    /// Filter to entries with any of the given JMdict `misc` codes.
255    pub fn misc(mut self, misc: &[&str]) -> Self {
256        self.misc_filter = misc.iter().map(|s| s.to_string()).collect();
257        self
258    }
259
260    /// Filter to entries with any of the given JMdict `field` codes.
261    pub fn field(mut self, field: &[&str]) -> Self {
262        self.field_filter = field.iter().map(|s| s.to_string()).collect();
263        self
264    }
265
266    /// Filter to entries with any of the given JMdict `dialect` codes.
267    pub fn dialect(mut self, dialect: &[&str]) -> Self {
268        self.dialect_filter = dialect.iter().map(|s| s.to_string()).collect();
269        self
270    }
271
272    /// Cap results per term after filtering and sorting.
273    pub fn limit(mut self, limit: usize) -> Self {
274        self.limit = Some(limit);
275        self
276    }
277
278    /// Set the maximum edit distance for fuzzy search (default: 2).
279    ///
280    /// Clamped to a maximum of [`MAX_FUZZY_DISTANCE`].
281    pub fn max_distance(mut self, n: u32) -> Self {
282        self.max_distance = n.min(MAX_FUZZY_DISTANCE);
283        self
284    }
285
286    /// Execute the batch query and return results paired with each input term.
287    pub fn execute(self) -> Result<Vec<(String, Vec<LookupResult>)>, JmdictError> {
288        let pos_refs: Vec<&str> = self.pos_filter.iter().map(|s| s.as_str()).collect();
289        let misc_refs: Vec<&str> = self.misc_filter.iter().map(|s| s.as_str()).collect();
290        let field_refs: Vec<&str> = self.field_filter.iter().map(|s| s.as_str()).collect();
291        let dialect_refs: Vec<&str> = self.dialect_filter.iter().map(|s| s.as_str()).collect();
292        let mut batch_results = Vec::with_capacity(self.terms.len());
293        for term in &self.terms {
294            let mut builder = self
295                .dict
296                .lookup(term)
297                .mode(self.mode.clone())
298                .common_only(self.common_only)
299                .pos(&pos_refs)
300                .misc(&misc_refs)
301                .field(&field_refs)
302                .dialect(&dialect_refs)
303                .max_distance(self.max_distance);
304            if let Some(limit) = self.limit {
305                builder = builder.limit(limit);
306            }
307            batch_results.push((term.clone(), builder.execute()?));
308        }
309        Ok(batch_results)
310    }
311}
312
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an owned `Vec<String>` from string literals.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    #[test]
    fn filter_passes_empty_filter_always_matches() {
        // With no filter configured, the haystack contents are irrelevant.
        let empty_haystack = owned(&[]);
        let verb_haystack = owned(&["v1"]);
        assert!(filter_passes(&[], &empty_haystack));
        assert!(filter_passes(&[], &verb_haystack));
    }

    #[test]
    fn filter_passes_substring_match() {
        // A bare "v" is a substring of every verb POS code.
        let any_verb = owned(&["v"]);
        assert!(filter_passes(&any_verb, &owned(&["v1"])));
        assert!(filter_passes(&any_verb, &owned(&["v5k", "vt"])));

        // "v1" is more selective: it needs an ichidan code to be present.
        let ichidan = owned(&["v1"]);
        assert!(filter_passes(&ichidan, &owned(&["v1", "vt"])));
        assert!(!filter_passes(&ichidan, &owned(&["v5k"])));
    }

    #[test]
    fn filter_passes_misses_when_no_haystack_value_matches() {
        let any_verb = owned(&["v"]);
        // No value contains "v".
        assert!(!filter_passes(&any_verb, &owned(&["n"])));
        // A non-empty filter cannot match an empty haystack.
        assert!(!filter_passes(&any_verb, &owned(&[])));
    }
}