jmdict_fast/
dict.rs

1use crate::error::JmdictError;
2use crate::model::{
3    DataVersion, DeinflectionInfo, Entry, LookupResult, MatchType, Xref, FORMAT_VERSION, MAGIC,
4};
5use crate::query::{BatchQueryBuilder, QueryBuilder};
6use fst::{automaton::Levenshtein, automaton::Str, Automaton, IntoStreamer, Map, Streamer};
7use memmap2::Mmap;
8use std::collections::{BTreeSet, HashMap};
9use std::sync::Arc;
10use std::{fs::File, path::Path};
11
12/// Backing storage for [`Dict`] data. Holds either a memory-mapped file,
13/// a `'static` byte slice (used by the embedded feature), or an owned
14/// allocation. All three implement [`AsRef<[u8]>`] so `fst::Map` can wrap
15/// them uniformly.
16#[derive(Clone)]
17pub enum DictStorage {
18    /// Memory-mapped file; the `Arc` keeps the mapping alive while any
19    /// reference into it exists.
20    Mmap(Arc<Mmap>),
21    /// `'static` slice — typically from `include_bytes!`.
22    Static(&'static [u8]),
23    /// Owned buffer on the heap.
24    Owned(Arc<Vec<u8>>),
25}
26
27impl AsRef<[u8]> for DictStorage {
28    fn as_ref(&self) -> &[u8] {
29        match self {
30            DictStorage::Mmap(m) => &m[..],
31            DictStorage::Static(s) => s,
32            DictStorage::Owned(v) => &v[..],
33        }
34    }
35}
36
37/// A raw match candidate from FST search, before entry deserialization.
38#[derive(Clone)]
39pub(crate) struct MatchCandidate {
40    pub(crate) id: u64,
41    pub(crate) key: String,
42    pub(crate) match_type: MatchType,
43    pub(crate) score: f64,
44    pub(crate) deinflection: Option<DeinflectionInfo>,
45}
46
47/// Insert `cand` into `best` keyed by id, replacing the existing value only when
48/// `cand` has a strictly higher score. Used by prefix and fuzzy candidate
49/// collection so a later, better-quality hit (e.g. an `Exact` match in romaji)
50/// can supersede an earlier, lower-quality hit (e.g. a `Fuzzy` match in kana)
51/// for the same entry.
52fn upsert_better(best: &mut HashMap<u64, MatchCandidate>, cand: MatchCandidate) {
53    match best.get(&cand.id) {
54        Some(existing) if existing.score >= cand.score => {}
55        _ => {
56            best.insert(cand.id, cand);
57        }
58    }
59}
60
61pub struct Dict {
62    pub entries_blob: DictStorage,
63    pub kana_fst: Map<DictStorage>,
64    pub kanji_fst: Map<DictStorage>,
65    pub romaji_fst: Map<DictStorage>,
66    pub id_fst: Map<DictStorage>,
67    /// Reverse index: English gloss token → offset into `gloss_postings`.
68    pub gloss_fst: Map<DictStorage>,
69    /// Posting lists keyed by `gloss_fst` offset. At each offset is `u32 count`
70    /// followed by `count × u64` entry ids (little-endian).
71    pub gloss_postings: DictStorage,
72    deinflector: bunpo::deinflector::Deinflector,
73    data_version: DataVersion,
74    header_size: usize,
75    entry_count: u32,
76}
77
78struct HeaderInfo {
79    data_version: DataVersion,
80    /// Total bytes before entry_count (magic + version + metadata strings)
81    header_size: usize,
82    entry_count: u32,
83}
84
85/// Parse the entries.bin header: magic, format version, jmdict_version, generated_at, entry count.
86fn parse_entries_header(data: &[u8]) -> Result<HeaderInfo, JmdictError> {
87    if data.len() < 8 {
88        return Err(JmdictError::DataCorrupted);
89    }
90    if &data[0..4] != MAGIC {
91        return Err(JmdictError::DataCorrupted);
92    }
93    let version = u32::from_le_bytes(data[4..8].try_into().unwrap());
94    if version != FORMAT_VERSION {
95        return Err(JmdictError::DataVersionMismatch {
96            expected: FORMAT_VERSION,
97            found: version,
98        });
99    }
100
101    // Parse jmdict_version (u16 len + bytes)
102    if data.len() < 10 {
103        return Err(JmdictError::DataCorrupted);
104    }
105    let jmdict_ver_len = u16::from_le_bytes(data[8..10].try_into().unwrap()) as usize;
106    let mut pos = 10;
107    if data.len() < pos + jmdict_ver_len + 2 {
108        return Err(JmdictError::DataCorrupted);
109    }
110    let jmdict_version = String::from_utf8_lossy(&data[pos..pos + jmdict_ver_len]).to_string();
111    pos += jmdict_ver_len;
112
113    // Parse generated_at (u16 len + bytes)
114    let gen_at_len = u16::from_le_bytes(data[pos..pos + 2].try_into().unwrap()) as usize;
115    pos += 2;
116    if data.len() < pos + gen_at_len {
117        return Err(JmdictError::DataCorrupted);
118    }
119    let generated_at = String::from_utf8_lossy(&data[pos..pos + gen_at_len]).to_string();
120    pos += gen_at_len;
121
122    // Parse entry_count (u32) — validate presence so later reads don't panic.
123    if data.len() < pos + 4 {
124        return Err(JmdictError::DataCorrupted);
125    }
126    let entry_count = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap());
127
128    Ok(HeaderInfo {
129        data_version: DataVersion {
130            format_version: version,
131            jmdict_version,
132            generated_at,
133        },
134        header_size: pos,
135        entry_count,
136    })
137}
138
/// Report whether `id` occurs in a packed posting list.
///
/// `bytes` is read as consecutive 8-byte little-endian `u64`s in ascending
/// order (the layout `xtask` writes for `gloss_postings.bin`); any trailing
/// bytes that do not fill a whole 8-byte slot are ignored. The list is
/// binary-searched in place, so a membership test never has to materialize
/// a `Vec<u64>`.
fn postings_contains(bytes: &[u8], id: u64) -> bool {
    use std::cmp::Ordering;

    // Decode the `slot`-th id straight from the byte slice.
    let id_at = |slot: usize| -> u64 {
        let begin = slot * 8;
        let mut raw = [0u8; 8];
        raw.copy_from_slice(&bytes[begin..begin + 8]);
        u64::from_le_bytes(raw)
    };

    // Half-open search window [low, high) over slot indices.
    let (mut low, mut high) = (0usize, bytes.len() / 8);
    while low < high {
        let probe = low + (high - low) / 2;
        match id_at(probe).cmp(&id) {
            Ordering::Equal => return true,
            Ordering::Less => low = probe + 1,
            Ordering::Greater => high = probe,
        }
    }
    false
}
159
160fn mmap_storage(path: &Path) -> Result<DictStorage, JmdictError> {
161    let file = File::open(path)?;
162    // SAFETY: the Mmap is kept alive inside an Arc inside DictStorage. We
163    // never mutate the underlying file, and the kernel will surface SIGBUS
164    // if the file is truncated under us — the same constraint memmap2
165    // documents for any user.
166    let map = unsafe { Mmap::map(&file)? };
167    Ok(DictStorage::Mmap(Arc::new(map)))
168}
169
170impl Dict {
171    /// Construct a `Dict` from `'static` in-memory slices, typically produced
172    /// by `include_bytes!` for the `embedded` feature.
173    #[allow(clippy::too_many_arguments)]
174    pub fn from_slices(
175        entries: &'static [u8],
176        kana_fst: &'static [u8],
177        kanji_fst: &'static [u8],
178        romaji_fst: &'static [u8],
179        id_fst: &'static [u8],
180        gloss_fst: &'static [u8],
181        gloss_postings: &'static [u8],
182    ) -> Result<Self, JmdictError> {
183        Self::from_storage(
184            DictStorage::Static(entries),
185            DictStorage::Static(kana_fst),
186            DictStorage::Static(kanji_fst),
187            DictStorage::Static(romaji_fst),
188            DictStorage::Static(id_fst),
189            DictStorage::Static(gloss_fst),
190            DictStorage::Static(gloss_postings),
191        )
192    }
193
194    /// Construct a `Dict` from already-loaded [`DictStorage`] values. Use this
195    /// to wire up custom storage (e.g. data downloaded into an in-memory
196    /// buffer): build `DictStorage::Owned(Arc::new(bytes))` and pass it in.
197    #[allow(clippy::too_many_arguments)]
198    pub fn from_storage(
199        entries: DictStorage,
200        kana_fst: DictStorage,
201        kanji_fst: DictStorage,
202        romaji_fst: DictStorage,
203        id_fst: DictStorage,
204        gloss_fst: DictStorage,
205        gloss_postings: DictStorage,
206    ) -> Result<Self, JmdictError> {
207        let header = parse_entries_header(entries.as_ref())?;
208        Ok(Self {
209            entries_blob: entries,
210            kana_fst: Map::new(kana_fst)?,
211            kanji_fst: Map::new(kanji_fst)?,
212            romaji_fst: Map::new(romaji_fst)?,
213            id_fst: Map::new(id_fst)?,
214            gloss_fst: Map::new(gloss_fst)?,
215            gloss_postings,
216            deinflector: bunpo::deinflector::Deinflector::new(),
217            data_version: header.data_version,
218            header_size: header.header_size,
219            entry_count: header.entry_count,
220        })
221    }
222
223    /// Load all FSTs and entries via real `mmap` (zero-copy). The OS pages in
224    /// data on demand and shares it across processes that map the same file.
225    pub fn load<P: AsRef<Path>>(base_dir: P) -> Result<Self, JmdictError> {
226        let base = base_dir.as_ref();
227        let entries = mmap_storage(&base.join("entries.bin"))?;
228        let kana = mmap_storage(&base.join("kana.fst"))?;
229        let kanji = mmap_storage(&base.join("kanji.fst"))?;
230        let romaji = mmap_storage(&base.join("romaji.fst"))?;
231        let id = mmap_storage(&base.join("id.fst"))?;
232        let gloss = mmap_storage(&base.join("gloss.fst"))?;
233        let gloss_postings = mmap_storage(&base.join("gloss_postings.bin"))?;
234        Self::from_storage(entries, kana, kanji, romaji, id, gloss, gloss_postings)
235    }
236
237    #[cfg(feature = "embedded")]
238    pub fn load_embedded() -> Result<Self, JmdictError> {
239        let entries = include_bytes!(concat!(env!("OUT_DIR"), "/entries.bin"));
240        let kana_fst = include_bytes!(concat!(env!("OUT_DIR"), "/kana.fst"));
241        let kanji_fst = include_bytes!(concat!(env!("OUT_DIR"), "/kanji.fst"));
242        let romaji_fst = include_bytes!(concat!(env!("OUT_DIR"), "/romaji.fst"));
243        let id_fst = include_bytes!(concat!(env!("OUT_DIR"), "/id.fst"));
244        let gloss_fst = include_bytes!(concat!(env!("OUT_DIR"), "/gloss.fst"));
245        let gloss_postings = include_bytes!(concat!(env!("OUT_DIR"), "/gloss_postings.bin"));
246
247        Self::from_slices(
248            entries,
249            kana_fst,
250            kanji_fst,
251            romaji_fst,
252            id_fst,
253            gloss_fst,
254            gloss_postings,
255        )
256    }
257
258    pub fn load_default() -> Result<Self, JmdictError> {
259        #[cfg(feature = "embedded")]
260        {
261            if let Ok(dict) = Self::load_embedded() {
262                return Ok(dict);
263            }
264        }
265
266        if let Ok(data_path) = std::env::var("JMDICT_DATA") {
267            return Self::load(Path::new(&data_path));
268        }
269
270        let dist = Path::new("dist");
271        if dist.join("entries.bin").exists() {
272            return Self::load(dist);
273        }
274
275        // Test-only fallback: when running this crate's own tests, cargo sets CWD to
276        // jmdict-fast/, but `dist/` lives at the workspace root. CARGO_MANIFEST_DIR is
277        // resolved at compile time, so this path is only meaningful for in-repo builds.
278        #[cfg(test)]
279        {
280            let workspace_dist = Path::new(env!("CARGO_MANIFEST_DIR")).join("../dist");
281            if workspace_dist.join("entries.bin").exists() {
282                return Self::load(&workspace_dist);
283            }
284        }
285
286        Self::load(dist)
287    }
288
289    /// Returns the total number of entries in the dictionary.
290    pub fn entry_count(&self) -> usize {
291        self.entry_count as usize
292    }
293
294    /// Returns data version information (format version, JMdict source version, generation timestamp).
295    pub fn version(&self) -> DataVersion {
296        self.data_version.clone()
297    }
298
299    /// Lookup a term exactly across kana, kanji, romaji.
300    ///
301    /// Convenience method equivalent to `dict.lookup(term).mode(MatchMode::Exact).execute()`.
302    pub fn lookup_exact(&self, term: &str) -> Vec<LookupResult> {
303        self.lookup_exact_inner(term)
304    }
305
306    fn lookup_exact_inner(&self, term: &str) -> Vec<LookupResult> {
307        self.candidates_to_results(self.exact_candidates(term))
308    }
309
310    pub(crate) fn exact_candidates(&self, term: &str) -> Vec<MatchCandidate> {
311        let mut ids = Vec::new();
312
313        if let Some(id) = self.kana_fst.get(term) {
314            ids.push(id);
315        }
316        if let Some(id) = self.kanji_fst.get(term) {
317            ids.push(id);
318        }
319        if let Some(id) = self.romaji_fst.get(term) {
320            ids.push(id);
321        }
322
323        ids.sort();
324        ids.dedup();
325
326        ids.into_iter()
327            .map(|id| MatchCandidate {
328                id,
329                key: term.to_string(),
330                match_type: MatchType::Exact,
331                score: 1.0,
332                deinflection: None,
333            })
334            .collect()
335    }
336
337    /// Lookup a term with deinflection fallback.
338    ///
339    /// Convenience method equivalent to `dict.lookup(term).mode(MatchMode::Deinflect).execute()`.
340    pub fn lookup_exact_with_deinflection(&self, term: &str) -> Vec<LookupResult> {
341        self.lookup_exact_with_deinflection_inner(term)
342    }
343
344    fn lookup_exact_with_deinflection_inner(&self, term: &str) -> Vec<LookupResult> {
345        self.candidates_to_results(self.deinflect_candidates(term))
346    }
347
348    pub(crate) fn deinflect_candidates(&self, term: &str) -> Vec<MatchCandidate> {
349        // First try exact
350        let exact = self.exact_candidates(term);
351        if !exact.is_empty() {
352            return exact;
353        }
354
355        // Then deinflect
356        let deinflected = self.deinflector.deinflect(term);
357        let mut seen_ids = BTreeSet::new();
358        let mut candidates = Vec::new();
359        for candidate in deinflected {
360            let exact = self.exact_candidates(&candidate.word);
361            for mc in exact {
362                if !seen_ids.insert(mc.id) {
363                    continue;
364                }
365                candidates.push(MatchCandidate {
366                    id: mc.id,
367                    key: candidate.word.clone(),
368                    match_type: MatchType::Deinflected,
369                    score: 0.75,
370                    deinflection: Some(DeinflectionInfo {
371                        original_form: term.to_string(),
372                        base_form: candidate.word.clone(),
373                        rules: candidate
374                            .reason_chains
375                            .iter()
376                            .flatten()
377                            .map(|r| format!("{:?}", r))
378                            .collect(),
379                    }),
380                });
381            }
382        }
383
384        // Sort by score descending
385        candidates.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
386        candidates
387    }
388
389    /// Lookup entries that start with the given term (prefix search).
390    ///
391    /// Convenience method equivalent to `dict.lookup(term).mode(MatchMode::Prefix).execute()`.
392    pub fn lookup_partial(&self, prefix: &str) -> Vec<LookupResult> {
393        self.lookup_partial_inner(prefix)
394    }
395
396    fn lookup_partial_inner(&self, prefix: &str) -> Vec<LookupResult> {
397        self.candidates_to_results(self.prefix_candidates(prefix))
398    }
399
400    pub(crate) fn prefix_candidates(&self, prefix: &str) -> Vec<MatchCandidate> {
401        let automaton = Str::new(prefix).starts_with();
402
403        // Keep the highest-scored match per entry id. Walking the FSTs in order
404        // alone is not enough: a low-quality prefix hit in one index can be
405        // followed by an exact hit in another for the same id, and the better
406        // hit must win.
407        let mut best: HashMap<u64, MatchCandidate> = HashMap::new();
408        for fst in [&self.kana_fst, &self.kanji_fst, &self.romaji_fst] {
409            let mut stream = fst.search(&automaton).into_stream();
410            while let Some((key, id)) = stream.next() {
411                let key_str = String::from_utf8_lossy(key).to_string();
412                let is_exact = key_str == prefix;
413                let (match_type, score) = if is_exact {
414                    (MatchType::Exact, 1.0)
415                } else {
416                    (MatchType::Prefix, 0.5)
417                };
418                upsert_better(
419                    &mut best,
420                    MatchCandidate {
421                        id,
422                        key: key_str,
423                        match_type,
424                        score,
425                        deinflection: None,
426                    },
427                );
428            }
429        }
430
431        let mut candidates: Vec<MatchCandidate> = best.into_values().collect();
432        // Secondary key on `id` keeps the order deterministic for equal scores —
433        // `HashMap` iteration order varies between instances.
434        candidates.sort_by(|a, b| {
435            b.score
436                .partial_cmp(&a.score)
437                .unwrap()
438                .then(a.id.cmp(&b.id))
439        });
440        candidates
441    }
442
443    pub(crate) fn fuzzy_candidates(
444        &self,
445        term: &str,
446        max_distance: u32,
447    ) -> Result<Vec<MatchCandidate>, JmdictError> {
448        let automaton = Levenshtein::new(term, max_distance)
449            .map_err(|_| JmdictError::InvalidQuery)?;
450
451        let mut best: HashMap<u64, MatchCandidate> = HashMap::new();
452        for fst in [&self.kana_fst, &self.kanji_fst, &self.romaji_fst] {
453            let mut stream = fst.search(&automaton).into_stream();
454            while let Some((key, id)) = stream.next() {
455                let key_str = String::from_utf8_lossy(key).to_string();
456                let is_exact = key_str == term;
457                let (match_type, score) = if is_exact {
458                    (MatchType::Exact, 1.0)
459                } else {
460                    let key_len = key_str.chars().count().max(1) as f64;
461                    let term_len = term.chars().count().max(1) as f64;
462                    let len_diff = (key_len - term_len).abs();
463                    let score = 0.5 - (len_diff / (key_len + term_len)) * 0.2;
464                    (MatchType::Fuzzy, score.max(0.1))
465                };
466                upsert_better(
467                    &mut best,
468                    MatchCandidate {
469                        id,
470                        key: key_str,
471                        match_type,
472                        score,
473                        deinflection: None,
474                    },
475                );
476            }
477        }
478
479        let mut candidates: Vec<MatchCandidate> = best.into_values().collect();
480        // Secondary key on `id` keeps the order deterministic for equal scores —
481        // `HashMap` iteration order varies between instances.
482        candidates.sort_by(|a, b| {
483            b.score
484                .partial_cmp(&a.score)
485                .unwrap()
486                .then(a.id.cmp(&b.id))
487        });
488        Ok(candidates)
489    }
490
491    /// Create a query builder for the given term.
492    pub fn lookup(&self, term: &str) -> QueryBuilder<'_> {
493        QueryBuilder::new(self, term)
494    }
495
496    /// Create a batch query builder for multiple terms.
497    pub fn lookup_batch(&self, terms: &[&str]) -> BatchQueryBuilder<'_> {
498        BatchQueryBuilder::new(self, terms.iter().map(|s| s.to_string()).collect())
499    }
500
501    /// Reverse lookup: find entries whose English glosses contain every token
502    /// in `query` (ANDed). Tokenization at query time mirrors the build-time
503    /// tokenizer: ASCII alphanumeric, lowercased, non-ASCII treated as a
504    /// separator.
505    ///
506    /// Results are ranked: rarer tokens (shorter posting lists) raise the
507    /// score; the rank cap is 0.6 so kanji/kana exact lookups still win in
508    /// mixed pipelines. Returns an empty `Vec` if the query has no usable
509    /// tokens or any token is absent.
510    pub fn lookup_gloss(&self, query: &str) -> Vec<LookupResult> {
511        // Mirror the build-time tokenizer in xtask: ASCII alphanumeric,
512        // lowercased, sorted and deduped. Sorting+dedup makes "cat cat" and
513        // "cat box" / "box cat" canonical so the match_key is stable and we
514        // don't read the same posting list twice.
515        let mut tokens: Vec<String> = query
516            .split(|c: char| !c.is_ascii_alphanumeric())
517            .filter(|s| !s.is_empty())
518            .map(|s| s.to_ascii_lowercase())
519            .collect();
520        tokens.sort();
521        tokens.dedup();
522        if tokens.is_empty() {
523            return Vec::new();
524        }
525
526        // For each token, take a *borrowed slice* into the mmap'd postings
527        // file. No allocation — the slice is already the array of u64 ids
528        // (sorted, deduped, little-endian) at the right offset.
529        let mut posting_lists: Vec<&[u8]> = Vec::with_capacity(tokens.len());
530        for tok in &tokens {
531            match self.gloss_postings_for(tok) {
532                Some(bytes) => posting_lists.push(bytes),
533                None => return Vec::new(),
534            }
535        }
536
537        // Intersect smallest-first to minimize work. Only the smallest list
538        // is iterated; the rest are binary-searched in place.
539        posting_lists.sort_by_key(|p| p.len());
540        let smallest = posting_lists[0];
541        let rest = &posting_lists[1..];
542
543        let intersected: Vec<u64> = smallest
544            .chunks_exact(8)
545            .map(|c| u64::from_le_bytes(c.try_into().unwrap()))
546            .filter(|id| rest.iter().all(|other| postings_contains(other, *id)))
547            .collect();
548
549        // Score: 0.6 ceiling; rarer tokens nudge the score up via 1 /
550        // posting-list-size (in entry counts, not bytes).
551        let total_entries: usize = posting_lists
552            .iter()
553            .map(|p| p.len() / 8)
554            .sum::<usize>()
555            .max(1);
556        let score = 0.6f64.min(0.3 + (tokens.len() as f64) / (total_entries as f64));
557
558        let key = tokens.join(" ");
559        intersected
560            .into_iter()
561            .filter_map(|id| {
562                self.load_entry(id).map(|entry| LookupResult {
563                    entry,
564                    match_type: MatchType::Gloss,
565                    match_key: key.clone(),
566                    score,
567                    deinflection: None,
568                })
569            })
570            .collect()
571    }
572
573    /// Borrow the posting bytes for a single gloss token. The returned slice
574    /// is `count × 8` bytes of little-endian `u64` entry ids, already sorted
575    /// and deduplicated by `xtask`. Returns `None` if the token is absent or
576    /// the postings file is truncated.
577    fn gloss_postings_for(&self, token: &str) -> Option<&[u8]> {
578        let offset = self.gloss_fst.get(token)? as usize;
579        let postings = self.gloss_postings.as_ref();
580        let count = u32::from_le_bytes(postings.get(offset..offset + 4)?.try_into().ok()?) as usize;
581        let start = offset + 4;
582        let end = start + count * 8;
583        postings.get(start..end)
584    }
585
586    /// Resolve a cross-reference ([`Xref`]) to dictionary entries.
587    ///
588    /// Looks up `xref.term` across kanji and kana indexes. If `xref.reading`
589    /// is set, results are further restricted to entries whose kana matches
590    /// that reading — this disambiguates homographs like 生 (なま / せい).
591    /// `xref.sense_index` is preserved on the caller side: this returns whole
592    /// entries, since the surrounding `LookupResult` is per-entry.
593    pub fn resolve_xref(&self, xref: &Xref) -> Vec<LookupResult> {
594        let mut results = self.lookup_exact(&xref.term);
595        if let Some(reading) = xref.reading.as_deref() {
596            results.retain(|r| r.entry.kana.iter().any(|k| k.text == reading));
597        }
598        results
599    }
600
601    /// Lookup an entry by its JMdict ID (the string `entry.id`, e.g. `"1467640"`).
602    ///
603    /// Returns `None` if no entry with that ID exists.
604    pub fn lookup_by_id(&self, jmdict_id: &str) -> Option<LookupResult> {
605        let seq_id = self.id_fst.get(jmdict_id)?;
606        let entry = self.load_entry(seq_id)?;
607        Some(LookupResult {
608            entry,
609            match_type: MatchType::Exact,
610            match_key: jmdict_id.to_string(),
611            score: 1.0,
612            deinflection: None,
613        })
614    }
615
616    /// Fetch an entry by its sequential (internal) index in `0..entry_count()`.
617    ///
618    /// Sequential IDs are stable for a given `entries.bin` but may change when
619    /// the data is regenerated; use [`Dict::lookup_by_id`] for stable lookups
620    /// across regenerations.
621    pub fn get(&self, seq_id: u64) -> Option<Entry> {
622        self.load_entry(seq_id)
623    }
624
625    /// Iterate over every entry in the dictionary, in sequential-ID order.
626    ///
627    /// Entries are deserialized lazily as the iterator advances.
628    pub fn iter_entries(&self) -> EntryIter<'_> {
629        EntryIter {
630            dict: self,
631            next: 0,
632            end: self.entry_count as u64,
633        }
634    }
635
636    /// Convert match candidates to results by deserializing entries.
637    fn candidates_to_results(&self, candidates: Vec<MatchCandidate>) -> Vec<LookupResult> {
638        candidates
639            .into_iter()
640            .filter_map(|mc| {
641                self.load_entry(mc.id).map(|entry| LookupResult {
642                    entry,
643                    match_type: mc.match_type,
644                    match_key: mc.key,
645                    score: mc.score,
646                    deinflection: mc.deinflection,
647                })
648            })
649            .collect()
650    }
651
652    // When reading offsets, start after header + entry_count (4 bytes)
653    pub(crate) fn load_entry(&self, id: u64) -> Option<Entry> {
654        let count = self.entry_count as usize;
655        if id as usize >= count {
656            return None;
657        }
658        let hs = self.header_size;
659        let offset_index = hs + 4 + (id as usize) * 8;
660        let blob = self.entries_blob.as_ref();
661        let off = u32::from_le_bytes(blob[offset_index..offset_index + 4].try_into().ok()?);
662        let len = u32::from_le_bytes(blob[offset_index + 4..offset_index + 8].try_into().ok()?);
663
664        let data_start = hs + 4 + count * 8;
665        let start = data_start + (off as usize);
666        let end = start + len as usize;
667
668        postcard::from_bytes(&blob[start..end]).ok()
669    }
670}
671
672/// Iterator over every [`Entry`] in a [`Dict`], in sequential-ID order.
673pub struct EntryIter<'d> {
674    dict: &'d Dict,
675    next: u64,
676    end: u64,
677}
678
679impl<'d> Iterator for EntryIter<'d> {
680    type Item = Entry;
681
682    fn next(&mut self) -> Option<Self::Item> {
683        while self.next < self.end {
684            let id = self.next;
685            self.next += 1;
686            if let Some(e) = self.dict.load_entry(id) {
687                return Some(e);
688            }
689        }
690        None
691    }
692
693    fn size_hint(&self) -> (usize, Option<usize>) {
694        let remaining = (self.end - self.next) as usize;
695        (0, Some(remaining))
696    }
697}
698
#[cfg(test)]
mod tests {
    use super::*;

    /// Serialize `ids` as back-to-back little-endian `u64`s, the layout
    /// `postings_contains` expects.
    fn pack(ids: &[u64]) -> Vec<u8> {
        ids.iter().flat_map(|id| id.to_le_bytes()).collect()
    }

    #[test]
    fn postings_contains_hits_and_misses() {
        let bytes = pack(&[1, 5, 10, 100, 1_000_000]);
        for present in [1, 10, 1_000_000] {
            assert!(postings_contains(&bytes, present));
        }
        for absent in [0, 2, 99, 1_000_001] {
            assert!(!postings_contains(&bytes, absent));
        }
    }

    #[test]
    fn postings_contains_empty_slice() {
        for id in [0, 42] {
            assert!(!postings_contains(&[], id));
        }
    }

    #[test]
    fn dict_storage_as_ref_owned() {
        let storage = DictStorage::Owned(Arc::new(vec![1, 2, 3]));
        assert_eq!(storage.as_ref(), &[1, 2, 3][..]);
    }

    #[test]
    fn dict_storage_as_ref_static() {
        let storage = DictStorage::Static(b"hello");
        assert_eq!(storage.as_ref(), b"hello");
    }

    #[test]
    fn parse_entries_header_rejects_bad_magic() {
        let bad = b"XXXX\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00";
        let outcome = parse_entries_header(bad);
        assert!(matches!(outcome, Err(JmdictError::DataCorrupted)));
    }

    #[test]
    fn parse_entries_header_rejects_short_buffer() {
        // Both an empty buffer and one shorter than the magic must fail.
        let too_short: [&[u8]; 2] = [&[], b"JMD"];
        for input in too_short {
            assert!(matches!(
                parse_entries_header(input),
                Err(JmdictError::DataCorrupted)
            ));
        }
    }

    #[test]
    fn parse_entries_header_rejects_version_mismatch() {
        let wrong_version = FORMAT_VERSION + 1;
        let mut buf = MAGIC.to_vec();
        buf.extend_from_slice(&wrong_version.to_le_bytes());

        match parse_entries_header(&buf) {
            Err(JmdictError::DataVersionMismatch { expected, found }) => {
                assert_eq!(expected, FORMAT_VERSION);
                assert_eq!(found, wrong_version);
            }
            _ => panic!("expected DataVersionMismatch"),
        }
    }

    #[test]
    #[cfg(feature = "embedded")]
    fn load_dict_embedded() {
        let dict = Dict::load_embedded().expect("load failed");

        // Common words, plus one uncommon kana key (last in the list).
        for kana in ["ねこ", "たべる", "にゃんこ"] {
            assert!(dict.kana_fst.contains_key(kana));
        }
        // Common words, plus one uncommon kanji key (last in the list).
        for kanji in ["猫", "食べる", "鯉"] {
            assert!(dict.kanji_fst.contains_key(kanji));
        }
        assert!(dict.romaji_fst.contains_key("neko"));
    }
}
jmdict_fast/dict.rs

jmdict_fast/
dict.rs