1use crate::error::JmdictError;
2use crate::model::{
3 DataVersion, DeinflectionInfo, Entry, LookupResult, MatchType, Xref, FORMAT_VERSION, MAGIC,
4};
5use crate::query::{BatchQueryBuilder, QueryBuilder};
6use fst::{automaton::Levenshtein, automaton::Str, Automaton, IntoStreamer, Map, Streamer};
7use memmap2::Mmap;
8use std::collections::{BTreeSet, HashMap};
9use std::sync::Arc;
10use std::{fs::File, path::Path};
11
/// Backing bytes for one dictionary artifact (the entries blob, an FST, or
/// the gloss postings).
///
/// Every variant holds either a reference-counted handle or a `'static`
/// slice, so cloning never copies the underlying bytes.
#[derive(Clone)]
pub enum DictStorage {
    /// Bytes served from a shared memory-mapped file.
    Mmap(Arc<Mmap>),
    /// Bytes borrowed for the whole program lifetime.
    Static(&'static [u8]),
    /// Bytes held in a shared heap buffer.
    Owned(Arc<Vec<u8>>),
}
26
27impl AsRef<[u8]> for DictStorage {
28 fn as_ref(&self) -> &[u8] {
29 match self {
30 DictStorage::Mmap(m) => &m[..],
31 DictStorage::Static(s) => s,
32 DictStorage::Owned(v) => &v[..],
33 }
34 }
35}
36
/// A match found in one of the key indexes, before the entry itself has been
/// loaded from the entries blob.
#[derive(Clone)]
pub(crate) struct MatchCandidate {
    /// Value stored in the index for the matched key; it is passed to
    /// `Dict::load_entry` to fetch the entry.
    pub(crate) id: u64,
    /// The key that matched (the query term itself for exact matches, the
    /// index key for prefix/fuzzy matches, the base form for deinflected ones).
    pub(crate) key: String,
    /// How the key was matched.
    pub(crate) match_type: MatchType,
    /// Ranking score; candidate lists are sorted by it in descending order.
    pub(crate) score: f64,
    /// Deinflection details, present only for deinflected matches.
    pub(crate) deinflection: Option<DeinflectionInfo>,
}
46
47fn upsert_better(best: &mut HashMap<u64, MatchCandidate>, cand: MatchCandidate) {
53 match best.get(&cand.id) {
54 Some(existing) if existing.score >= cand.score => {}
55 _ => {
56 best.insert(cand.id, cand);
57 }
58 }
59}
60
/// A loaded dictionary: the serialized entries plus the indexes used to find
/// them.
pub struct Dict {
    /// Header, offset table and serialized entries (see `load_entry`).
    pub entries_blob: DictStorage,
    /// Index consulted for kana keys; values are entry ids.
    pub kana_fst: Map<DictStorage>,
    /// Index consulted for kanji keys; values are entry ids.
    pub kanji_fst: Map<DictStorage>,
    /// Index consulted for romaji keys; values are entry ids.
    pub romaji_fst: Map<DictStorage>,
    /// Maps a JMdict id string to the entry id used by `load_entry`.
    pub id_fst: Map<DictStorage>,
    /// Maps a gloss token to a byte offset into `gloss_postings`.
    pub gloss_fst: Map<DictStorage>,
    /// Posting lists: at each offset a little-endian `u32` count followed by
    /// that many little-endian `u64` entry ids. Lookups binary-search these
    /// lists, so the ids are presumably stored in ascending order — confirm
    /// against the builder.
    pub gloss_postings: DictStorage,
    /// Produces candidate base forms for inflected query terms.
    deinflector: bunpo::deinflector::Deinflector,
    /// Version metadata parsed from the entries header.
    data_version: DataVersion,
    /// Byte offset of the entry-count field inside `entries_blob`.
    header_size: usize,
    /// Number of entries declared by the entries header.
    entry_count: u32,
}
77
/// Values extracted from the header of the entries blob by
/// `parse_entries_header`.
struct HeaderInfo {
    /// Format version plus the two free-form version strings.
    data_version: DataVersion,
    /// Byte offset at which the entry-count field starts.
    header_size: usize,
    /// Entry count read from the header.
    entry_count: u32,
}
84
85fn parse_entries_header(data: &[u8]) -> Result<HeaderInfo, JmdictError> {
87 if data.len() < 8 {
88 return Err(JmdictError::DataCorrupted);
89 }
90 if &data[0..4] != MAGIC {
91 return Err(JmdictError::DataCorrupted);
92 }
93 let version = u32::from_le_bytes(data[4..8].try_into().unwrap());
94 if version != FORMAT_VERSION {
95 return Err(JmdictError::DataVersionMismatch {
96 expected: FORMAT_VERSION,
97 found: version,
98 });
99 }
100
101 if data.len() < 10 {
103 return Err(JmdictError::DataCorrupted);
104 }
105 let jmdict_ver_len = u16::from_le_bytes(data[8..10].try_into().unwrap()) as usize;
106 let mut pos = 10;
107 if data.len() < pos + jmdict_ver_len + 2 {
108 return Err(JmdictError::DataCorrupted);
109 }
110 let jmdict_version = String::from_utf8_lossy(&data[pos..pos + jmdict_ver_len]).to_string();
111 pos += jmdict_ver_len;
112
113 let gen_at_len = u16::from_le_bytes(data[pos..pos + 2].try_into().unwrap()) as usize;
115 pos += 2;
116 if data.len() < pos + gen_at_len {
117 return Err(JmdictError::DataCorrupted);
118 }
119 let generated_at = String::from_utf8_lossy(&data[pos..pos + gen_at_len]).to_string();
120 pos += gen_at_len;
121
122 if data.len() < pos + 4 {
124 return Err(JmdictError::DataCorrupted);
125 }
126 let entry_count = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap());
127
128 Ok(HeaderInfo {
129 data_version: DataVersion {
130 format_version: version,
131 jmdict_version,
132 generated_at,
133 },
134 header_size: pos,
135 entry_count,
136 })
137}
138
/// Reports whether `id` occurs in `bytes`, read as consecutive little-endian
/// `u64` values.
///
/// The values are binary-searched, so they must be in ascending order for the
/// answer to be meaningful. Trailing bytes that do not form a whole value are
/// ignored.
fn postings_contains(bytes: &[u8], id: u64) -> bool {
    let value_at = |slot: usize| -> u64 {
        let mut raw = [0u8; 8];
        raw.copy_from_slice(&bytes[slot * 8..slot * 8 + 8]);
        u64::from_le_bytes(raw)
    };

    let (mut low, mut high) = (0usize, bytes.len() / 8);
    while low < high {
        let probe = low + (high - low) / 2;
        let found = value_at(probe);
        if found == id {
            return true;
        }
        if found < id {
            low = probe + 1;
        } else {
            high = probe;
        }
    }
    false
}
159
160fn mmap_storage(path: &Path) -> Result<DictStorage, JmdictError> {
161 let file = File::open(path)?;
162 let map = unsafe { Mmap::map(&file)? };
167 Ok(DictStorage::Mmap(Arc::new(map)))
168}
169
170impl Dict {
171 #[allow(clippy::too_many_arguments)]
174 pub fn from_slices(
175 entries: &'static [u8],
176 kana_fst: &'static [u8],
177 kanji_fst: &'static [u8],
178 romaji_fst: &'static [u8],
179 id_fst: &'static [u8],
180 gloss_fst: &'static [u8],
181 gloss_postings: &'static [u8],
182 ) -> Result<Self, JmdictError> {
183 Self::from_storage(
184 DictStorage::Static(entries),
185 DictStorage::Static(kana_fst),
186 DictStorage::Static(kanji_fst),
187 DictStorage::Static(romaji_fst),
188 DictStorage::Static(id_fst),
189 DictStorage::Static(gloss_fst),
190 DictStorage::Static(gloss_postings),
191 )
192 }
193
194 #[allow(clippy::too_many_arguments)]
198 pub fn from_storage(
199 entries: DictStorage,
200 kana_fst: DictStorage,
201 kanji_fst: DictStorage,
202 romaji_fst: DictStorage,
203 id_fst: DictStorage,
204 gloss_fst: DictStorage,
205 gloss_postings: DictStorage,
206 ) -> Result<Self, JmdictError> {
207 let header = parse_entries_header(entries.as_ref())?;
208 Ok(Self {
209 entries_blob: entries,
210 kana_fst: Map::new(kana_fst)?,
211 kanji_fst: Map::new(kanji_fst)?,
212 romaji_fst: Map::new(romaji_fst)?,
213 id_fst: Map::new(id_fst)?,
214 gloss_fst: Map::new(gloss_fst)?,
215 gloss_postings,
216 deinflector: bunpo::deinflector::Deinflector::new(),
217 data_version: header.data_version,
218 header_size: header.header_size,
219 entry_count: header.entry_count,
220 })
221 }
222
223 pub fn load<P: AsRef<Path>>(base_dir: P) -> Result<Self, JmdictError> {
226 let base = base_dir.as_ref();
227 let entries = mmap_storage(&base.join("entries.bin"))?;
228 let kana = mmap_storage(&base.join("kana.fst"))?;
229 let kanji = mmap_storage(&base.join("kanji.fst"))?;
230 let romaji = mmap_storage(&base.join("romaji.fst"))?;
231 let id = mmap_storage(&base.join("id.fst"))?;
232 let gloss = mmap_storage(&base.join("gloss.fst"))?;
233 let gloss_postings = mmap_storage(&base.join("gloss_postings.bin"))?;
234 Self::from_storage(entries, kana, kanji, romaji, id, gloss, gloss_postings)
235 }
236
237 #[cfg(feature = "embedded")]
238 pub fn load_embedded() -> Result<Self, JmdictError> {
239 let entries = include_bytes!(concat!(env!("OUT_DIR"), "/entries.bin"));
240 let kana_fst = include_bytes!(concat!(env!("OUT_DIR"), "/kana.fst"));
241 let kanji_fst = include_bytes!(concat!(env!("OUT_DIR"), "/kanji.fst"));
242 let romaji_fst = include_bytes!(concat!(env!("OUT_DIR"), "/romaji.fst"));
243 let id_fst = include_bytes!(concat!(env!("OUT_DIR"), "/id.fst"));
244 let gloss_fst = include_bytes!(concat!(env!("OUT_DIR"), "/gloss.fst"));
245 let gloss_postings = include_bytes!(concat!(env!("OUT_DIR"), "/gloss_postings.bin"));
246
247 Self::from_slices(
248 entries,
249 kana_fst,
250 kanji_fst,
251 romaji_fst,
252 id_fst,
253 gloss_fst,
254 gloss_postings,
255 )
256 }
257
258 pub fn load_default() -> Result<Self, JmdictError> {
259 #[cfg(feature = "embedded")]
260 {
261 if let Ok(dict) = Self::load_embedded() {
262 return Ok(dict);
263 }
264 }
265
266 if let Ok(data_path) = std::env::var("JMDICT_DATA") {
267 return Self::load(Path::new(&data_path));
268 }
269
270 let dist = Path::new("dist");
271 if dist.join("entries.bin").exists() {
272 return Self::load(dist);
273 }
274
275 #[cfg(test)]
279 {
280 let workspace_dist = Path::new(env!("CARGO_MANIFEST_DIR")).join("../dist");
281 if workspace_dist.join("entries.bin").exists() {
282 return Self::load(&workspace_dist);
283 }
284 }
285
286 Self::load(dist)
287 }
288
    /// Returns the number of entries declared by the entries header.
    pub fn entry_count(&self) -> usize {
        self.entry_count as usize
    }
293
    /// Returns a copy of the version metadata parsed from the entries header.
    pub fn version(&self) -> DataVersion {
        self.data_version.clone()
    }
298
    /// Returns the entries for which `term` is an exact key in the kana,
    /// kanji, or romaji index.
    pub fn lookup_exact(&self, term: &str) -> Vec<LookupResult> {
        self.lookup_exact_inner(term)
    }
305
    /// Collects the exact-match candidates for `term` and loads their entries.
    fn lookup_exact_inner(&self, term: &str) -> Vec<LookupResult> {
        self.candidates_to_results(self.exact_candidates(term))
    }
309
310 pub(crate) fn exact_candidates(&self, term: &str) -> Vec<MatchCandidate> {
311 let mut ids = Vec::new();
312
313 if let Some(id) = self.kana_fst.get(term) {
314 ids.push(id);
315 }
316 if let Some(id) = self.kanji_fst.get(term) {
317 ids.push(id);
318 }
319 if let Some(id) = self.romaji_fst.get(term) {
320 ids.push(id);
321 }
322
323 ids.sort();
324 ids.dedup();
325
326 ids.into_iter()
327 .map(|id| MatchCandidate {
328 id,
329 key: term.to_string(),
330 match_type: MatchType::Exact,
331 score: 1.0,
332 deinflection: None,
333 })
334 .collect()
335 }
336
    /// Like [`Dict::lookup_exact`], but when `term` has no exact match its
    /// deinflected forms are looked up instead.
    pub fn lookup_exact_with_deinflection(&self, term: &str) -> Vec<LookupResult> {
        self.lookup_exact_with_deinflection_inner(term)
    }
343
    /// Collects the exact-or-deinflected candidates for `term` and loads
    /// their entries.
    fn lookup_exact_with_deinflection_inner(&self, term: &str) -> Vec<LookupResult> {
        self.candidates_to_results(self.deinflect_candidates(term))
    }
347
348 pub(crate) fn deinflect_candidates(&self, term: &str) -> Vec<MatchCandidate> {
349 let exact = self.exact_candidates(term);
351 if !exact.is_empty() {
352 return exact;
353 }
354
355 let deinflected = self.deinflector.deinflect(term);
357 let mut seen_ids = BTreeSet::new();
358 let mut candidates = Vec::new();
359 for candidate in deinflected {
360 let exact = self.exact_candidates(&candidate.word);
361 for mc in exact {
362 if !seen_ids.insert(mc.id) {
363 continue;
364 }
365 candidates.push(MatchCandidate {
366 id: mc.id,
367 key: candidate.word.clone(),
368 match_type: MatchType::Deinflected,
369 score: 0.75,
370 deinflection: Some(DeinflectionInfo {
371 original_form: term.to_string(),
372 base_form: candidate.word.clone(),
373 rules: candidate
374 .reason_chains
375 .iter()
376 .flatten()
377 .map(|r| format!("{:?}", r))
378 .collect(),
379 }),
380 });
381 }
382 }
383
384 candidates.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
386 candidates
387 }
388
    /// Returns the entries that have a kana, kanji, or romaji key starting
    /// with `prefix`.
    pub fn lookup_partial(&self, prefix: &str) -> Vec<LookupResult> {
        self.lookup_partial_inner(prefix)
    }
395
    /// Collects the prefix candidates for `prefix` and loads their entries.
    fn lookup_partial_inner(&self, prefix: &str) -> Vec<LookupResult> {
        self.candidates_to_results(self.prefix_candidates(prefix))
    }
399
400 pub(crate) fn prefix_candidates(&self, prefix: &str) -> Vec<MatchCandidate> {
401 let automaton = Str::new(prefix).starts_with();
402
403 let mut best: HashMap<u64, MatchCandidate> = HashMap::new();
408 for fst in [&self.kana_fst, &self.kanji_fst, &self.romaji_fst] {
409 let mut stream = fst.search(&automaton).into_stream();
410 while let Some((key, id)) = stream.next() {
411 let key_str = String::from_utf8_lossy(key).to_string();
412 let is_exact = key_str == prefix;
413 let (match_type, score) = if is_exact {
414 (MatchType::Exact, 1.0)
415 } else {
416 (MatchType::Prefix, 0.5)
417 };
418 upsert_better(
419 &mut best,
420 MatchCandidate {
421 id,
422 key: key_str,
423 match_type,
424 score,
425 deinflection: None,
426 },
427 );
428 }
429 }
430
431 let mut candidates: Vec<MatchCandidate> = best.into_values().collect();
432 candidates.sort_by(|a, b| {
435 b.score
436 .partial_cmp(&a.score)
437 .unwrap()
438 .then(a.id.cmp(&b.id))
439 });
440 candidates
441 }
442
443 pub(crate) fn fuzzy_candidates(
444 &self,
445 term: &str,
446 max_distance: u32,
447 ) -> Result<Vec<MatchCandidate>, JmdictError> {
448 let automaton = Levenshtein::new(term, max_distance)
449 .map_err(|_| JmdictError::InvalidQuery)?;
450
451 let mut best: HashMap<u64, MatchCandidate> = HashMap::new();
452 for fst in [&self.kana_fst, &self.kanji_fst, &self.romaji_fst] {
453 let mut stream = fst.search(&automaton).into_stream();
454 while let Some((key, id)) = stream.next() {
455 let key_str = String::from_utf8_lossy(key).to_string();
456 let is_exact = key_str == term;
457 let (match_type, score) = if is_exact {
458 (MatchType::Exact, 1.0)
459 } else {
460 let key_len = key_str.chars().count().max(1) as f64;
461 let term_len = term.chars().count().max(1) as f64;
462 let len_diff = (key_len - term_len).abs();
463 let score = 0.5 - (len_diff / (key_len + term_len)) * 0.2;
464 (MatchType::Fuzzy, score.max(0.1))
465 };
466 upsert_better(
467 &mut best,
468 MatchCandidate {
469 id,
470 key: key_str,
471 match_type,
472 score,
473 deinflection: None,
474 },
475 );
476 }
477 }
478
479 let mut candidates: Vec<MatchCandidate> = best.into_values().collect();
480 candidates.sort_by(|a, b| {
483 b.score
484 .partial_cmp(&a.score)
485 .unwrap()
486 .then(a.id.cmp(&b.id))
487 });
488 Ok(candidates)
489 }
490
    /// Creates a [`QueryBuilder`] bound to this dictionary and `term`.
    pub fn lookup(&self, term: &str) -> QueryBuilder<'_> {
        QueryBuilder::new(self, term)
    }
495
496 pub fn lookup_batch(&self, terms: &[&str]) -> BatchQueryBuilder<'_> {
498 BatchQueryBuilder::new(self, terms.iter().map(|s| s.to_string()).collect())
499 }
500
501 pub fn lookup_gloss(&self, query: &str) -> Vec<LookupResult> {
511 let mut tokens: Vec<String> = query
516 .split(|c: char| !c.is_ascii_alphanumeric())
517 .filter(|s| !s.is_empty())
518 .map(|s| s.to_ascii_lowercase())
519 .collect();
520 tokens.sort();
521 tokens.dedup();
522 if tokens.is_empty() {
523 return Vec::new();
524 }
525
526 let mut posting_lists: Vec<&[u8]> = Vec::with_capacity(tokens.len());
530 for tok in &tokens {
531 match self.gloss_postings_for(tok) {
532 Some(bytes) => posting_lists.push(bytes),
533 None => return Vec::new(),
534 }
535 }
536
537 posting_lists.sort_by_key(|p| p.len());
540 let smallest = posting_lists[0];
541 let rest = &posting_lists[1..];
542
543 let intersected: Vec<u64> = smallest
544 .chunks_exact(8)
545 .map(|c| u64::from_le_bytes(c.try_into().unwrap()))
546 .filter(|id| rest.iter().all(|other| postings_contains(other, *id)))
547 .collect();
548
549 let total_entries: usize = posting_lists
552 .iter()
553 .map(|p| p.len() / 8)
554 .sum::<usize>()
555 .max(1);
556 let score = 0.6f64.min(0.3 + (tokens.len() as f64) / (total_entries as f64));
557
558 let key = tokens.join(" ");
559 intersected
560 .into_iter()
561 .filter_map(|id| {
562 self.load_entry(id).map(|entry| LookupResult {
563 entry,
564 match_type: MatchType::Gloss,
565 match_key: key.clone(),
566 score,
567 deinflection: None,
568 })
569 })
570 .collect()
571 }
572
573 fn gloss_postings_for(&self, token: &str) -> Option<&[u8]> {
578 let offset = self.gloss_fst.get(token)? as usize;
579 let postings = self.gloss_postings.as_ref();
580 let count = u32::from_le_bytes(postings.get(offset..offset + 4)?.try_into().ok()?) as usize;
581 let start = offset + 4;
582 let end = start + count * 8;
583 postings.get(start..end)
584 }
585
586 pub fn resolve_xref(&self, xref: &Xref) -> Vec<LookupResult> {
594 let mut results = self.lookup_exact(&xref.term);
595 if let Some(reading) = xref.reading.as_deref() {
596 results.retain(|r| r.entry.kana.iter().any(|k| k.text == reading));
597 }
598 results
599 }
600
601 pub fn lookup_by_id(&self, jmdict_id: &str) -> Option<LookupResult> {
605 let seq_id = self.id_fst.get(jmdict_id)?;
606 let entry = self.load_entry(seq_id)?;
607 Some(LookupResult {
608 entry,
609 match_type: MatchType::Exact,
610 match_key: jmdict_id.to_string(),
611 score: 1.0,
612 deinflection: None,
613 })
614 }
615
    /// Loads the entry stored under `seq_id`.
    ///
    /// Returns `None` when `seq_id` is not below the entry count or the
    /// stored record fails to decode.
    pub fn get(&self, seq_id: u64) -> Option<Entry> {
        self.load_entry(seq_id)
    }
624
625 pub fn iter_entries(&self) -> EntryIter<'_> {
629 EntryIter {
630 dict: self,
631 next: 0,
632 end: self.entry_count as u64,
633 }
634 }
635
636 fn candidates_to_results(&self, candidates: Vec<MatchCandidate>) -> Vec<LookupResult> {
638 candidates
639 .into_iter()
640 .filter_map(|mc| {
641 self.load_entry(mc.id).map(|entry| LookupResult {
642 entry,
643 match_type: mc.match_type,
644 match_key: mc.key,
645 score: mc.score,
646 deinflection: mc.deinflection,
647 })
648 })
649 .collect()
650 }
651
652 pub(crate) fn load_entry(&self, id: u64) -> Option<Entry> {
654 let count = self.entry_count as usize;
655 if id as usize >= count {
656 return None;
657 }
658 let hs = self.header_size;
659 let offset_index = hs + 4 + (id as usize) * 8;
660 let blob = self.entries_blob.as_ref();
661 let off = u32::from_le_bytes(blob[offset_index..offset_index + 4].try_into().ok()?);
662 let len = u32::from_le_bytes(blob[offset_index + 4..offset_index + 8].try_into().ok()?);
663
664 let data_start = hs + 4 + count * 8;
665 let start = data_start + (off as usize);
666 let end = start + len as usize;
667
668 postcard::from_bytes(&blob[start..end]).ok()
669 }
670}
671
/// Iterator over the entries of a [`Dict`], created by `Dict::iter_entries`.
pub struct EntryIter<'d> {
    /// Dictionary the entries are loaded from.
    dict: &'d Dict,
    /// Id that will be tried next.
    next: u64,
    /// One past the last id to try.
    end: u64,
}
678
679impl<'d> Iterator for EntryIter<'d> {
680 type Item = Entry;
681
682 fn next(&mut self) -> Option<Self::Item> {
683 while self.next < self.end {
684 let id = self.next;
685 self.next += 1;
686 if let Some(e) = self.dict.load_entry(id) {
687 return Some(e);
688 }
689 }
690 None
691 }
692
693 fn size_hint(&self) -> (usize, Option<usize>) {
694 let remaining = (self.end - self.next) as usize;
695 (0, Some(remaining))
696 }
697}
698
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes `ids` as consecutive little-endian `u64` values, the layout
    /// `postings_contains` searches.
    fn pack(ids: &[u64]) -> Vec<u8> {
        ids.iter().flat_map(|id| id.to_le_bytes()).collect()
    }

    #[test]
    fn postings_contains_hits_and_misses() {
        let bytes = pack(&[1, 5, 10, 100, 1_000_000]);
        for present in [1, 10, 1_000_000] {
            assert!(postings_contains(&bytes, present));
        }
        for absent in [0, 2, 99, 1_000_001] {
            assert!(!postings_contains(&bytes, absent));
        }
    }

    #[test]
    fn postings_contains_empty_slice() {
        for id in [0, 42] {
            assert!(!postings_contains(&[], id));
        }
    }

    #[test]
    fn dict_storage_as_ref_owned() {
        let storage = DictStorage::Owned(Arc::new(vec![1, 2, 3]));
        let expected: &[u8] = &[1, 2, 3];
        assert_eq!(storage.as_ref(), expected);
    }

    #[test]
    fn dict_storage_as_ref_static() {
        let storage = DictStorage::Static(b"hello");
        assert_eq!(storage.as_ref(), b"hello");
    }

    #[test]
    fn parse_entries_header_rejects_bad_magic() {
        let bad = b"XXXX\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00";
        let outcome = parse_entries_header(bad);
        assert!(matches!(outcome, Err(JmdictError::DataCorrupted)));
    }

    #[test]
    fn parse_entries_header_rejects_short_buffer() {
        let too_short: [&[u8]; 2] = [&[], b"JMD"];
        for input in too_short {
            assert!(matches!(
                parse_entries_header(input),
                Err(JmdictError::DataCorrupted)
            ));
        }
    }

    #[test]
    fn parse_entries_header_rejects_version_mismatch() {
        let unsupported = FORMAT_VERSION + 1;
        let mut buf = Vec::new();
        buf.extend_from_slice(MAGIC);
        buf.extend_from_slice(&unsupported.to_le_bytes());

        if let Err(JmdictError::DataVersionMismatch { expected, found }) =
            parse_entries_header(&buf)
        {
            assert_eq!(expected, FORMAT_VERSION);
            assert_eq!(found, FORMAT_VERSION + 1);
        } else {
            panic!("expected DataVersionMismatch");
        }
    }

    #[test]
    #[cfg(feature = "embedded")]
    fn load_dict_embedded() {
        let dict = Dict::load_embedded().expect("load failed");

        // One word reachable through all three key indexes.
        assert!(dict.kana_fst.contains_key("ねこ"));
        assert!(dict.kanji_fst.contains_key("猫"));
        assert!(dict.romaji_fst.contains_key("neko"));

        assert!(dict.kana_fst.contains_key("たべる"));
        assert!(dict.kanji_fst.contains_key("食べる"));

        assert!(dict.kana_fst.contains_key("にゃんこ"));
        assert!(dict.kanji_fst.contains_key("鯉"));
    }
}