Skip to main content

jmdict_fast/
model.rs

1use serde::Deserialize;
2
/// How a lookup result matched the query term.
///
/// Implements `Eq` and `Hash` in addition to `PartialEq`, so values can be
/// used as `HashMap`/`HashSet` keys and in contexts that require total
/// equality (for example grouping or de-duplicating results by match type).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum MatchType {
    /// Exact match.
    Exact,
    /// Prefix (starts-with) match.
    Prefix,
    /// Matched after deinflection (see [`DeinflectionInfo`]).
    Deinflected,
    /// Fuzzy (approximate) match.
    Fuzzy,
    /// Matched via reverse lookup in English glosses (see [`crate::Dict::lookup_gloss`]).
    Gloss,
}
13
/// The search mode for a query.
///
/// Implements `Eq` and `Hash` in addition to `PartialEq`, so a mode can be
/// used as a map/set key (for example to cache results per mode) and in
/// contexts that require total equality.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum MatchMode {
    /// Exact match only.
    Exact,
    /// Prefix (starts-with) search.
    Prefix,
    /// Exact match with deinflection fallback.
    Deinflect,
    /// Fuzzy (approximate) match.
    Fuzzy,
}
26
/// Information about how a term was deinflected to find its base form.
///
/// Carried by [`LookupResult::deinflection`].
#[derive(Debug, Clone)]
pub struct DeinflectionInfo {
    /// The form before deinflection.
    /// NOTE(review): presumably the term exactly as the caller supplied it — confirm.
    pub original_form: String,
    /// The base form that deinflection arrived at.
    pub base_form: String,
    /// Rules recorded for this deinflection.
    /// NOTE(review): presumably the names of the applied rules in application
    /// order — confirm against the deinflector.
    pub rules: Vec<String>,
}
34
/// A structured lookup result with metadata about how it matched.
#[derive(Debug, Clone)]
pub struct LookupResult {
    /// The dictionary entry that matched.
    pub entry: Entry,
    /// How this result matched the query term.
    pub match_type: MatchType,
    /// Key associated with the match.
    /// NOTE(review): presumably the indexed text (kanji, kana or gloss) that
    /// the query hit — confirm against the lookup code.
    pub match_key: String,
    /// Score assigned to this match.
    /// NOTE(review): scale and sort direction are not visible in this file —
    /// confirm before comparing scores.
    pub score: f64,
    /// Deinflection details, when available.
    /// NOTE(review): presumably `Some` only for [`MatchType::Deinflected`]
    /// results — confirm.
    pub deinflection: Option<DeinflectionInfo>,
}
44
/// Magic bytes (ASCII `JMDF`) that prefix every `entries.bin`. The library
/// uses these to distinguish a valid jmdict-fast data file from arbitrary
/// input before touching the format version.
pub const MAGIC: &[u8; 4] = b"JMDF";
49
/// Binary format version for `entries.bin`. Bump whenever the on-disk layout,
/// the serialized `Entry` struct, or the set of expected sibling files
/// changes — `Dict::load` rejects mismatched versions rather than attempting
/// a deserialization that may silently succeed.
///
/// History:
/// - 3 → 4: added `gloss.fst` and `gloss_postings.bin` for English reverse
///   lookup. The `entries.bin` layout is unchanged, but loaders now require
///   both new files to exist alongside it.
pub const FORMAT_VERSION: u32 = 4;
60
/// JMdict source version this build was generated against. Exposed to the
/// `install` feature so `Dict::install()` can resolve the right release
/// tarball without the caller hardcoding a version. Kept in lock-step with
/// `xtask::JMDICT_VERSION`; bump both together when upgrading the upstream
/// JMdict release.
pub const JMDICT_VERSION: &str = "3.6.1";
67
/// Dictionary data version information.
#[derive(Debug, Clone)]
pub struct DataVersion {
    /// Binary format version of the data.
    /// NOTE(review): presumably the value checked against [`FORMAT_VERSION`]
    /// at load time — confirm.
    pub format_version: u32,
    /// JMdict release the data was generated from.
    /// NOTE(review): presumably in the same form as [`JMDICT_VERSION`] — confirm.
    pub jmdict_version: String,
    /// When the data was generated.
    /// NOTE(review): the timestamp format is not visible in this file — confirm.
    pub generated_at: String,
}
75
/// A single dictionary entry: its kanji writings, kana readings and senses.
///
/// The serialized form of this struct is part of the on-disk format (see
/// [`FORMAT_VERSION`]), so changes to its fields require a version bump.
#[derive(Debug, Deserialize, Clone)]
pub struct Entry {
    /// Entry identifier, stored as a string.
    pub id: String,
    /// Kanji writings. May be empty: many entries (kana-only words, names)
    /// have no kanji form.
    pub kanji: Vec<KanjiEntry>,
    /// Kana readings. Every JMdict entry is expected to have at least one,
    /// but the type does not enforce it.
    pub kana: Vec<KanaEntry>,
    /// Senses of the entry; each carries its own glosses and part-of-speech tags.
    pub sense: Vec<SenseEntry>,
}
83
84impl Entry {
85    /// The first kanji writing, if any. Many entries (kana-only words, names)
86    /// have no kanji form, so callers should not assume `kanji[0]` exists.
87    pub fn primary_kanji(&self) -> Option<&str> {
88        self.kanji.first().map(|k| k.text.as_str())
89    }
90
91    /// The first kana reading. Every JMdict entry has at least one kana
92    /// reading, but the field is technically a `Vec`, so this returns `Option`
93    /// to stay honest about the data model.
94    pub fn primary_kana(&self) -> Option<&str> {
95        self.kana.first().map(|k| k.text.as_str())
96    }
97
98    /// The preferred headword for display: kanji if present, otherwise kana.
99    pub fn headword(&self) -> Option<&str> {
100        self.primary_kanji().or_else(|| self.primary_kana())
101    }
102
103    /// `true` if any kanji or kana form is marked common by JMdict.
104    pub fn is_common(&self) -> bool {
105        self.kanji.iter().any(|k| k.common) || self.kana.iter().any(|k| k.common)
106    }
107
108    /// Iterate over every gloss matching `lang` (e.g. `"eng"`), across all senses.
109    pub fn glosses<'a>(&'a self, lang: &'a str) -> impl Iterator<Item = &'a str> + 'a {
110        self.sense
111            .iter()
112            .flat_map(move |s| s.gloss.iter())
113            .filter(move |g| g.lang == lang)
114            .map(|g| g.text.as_str())
115    }
116
117    /// Every distinct part-of-speech tag across all senses, preserving first-seen order.
118    pub fn parts_of_speech(&self) -> Vec<&str> {
119        let mut seen = Vec::new();
120        for s in &self.sense {
121            for p in &s.part_of_speech {
122                if !seen.iter().any(|x: &&str| *x == p.as_str()) {
123                    seen.push(p.as_str());
124                }
125            }
126        }
127        seen
128    }
129}
130
/// One kanji writing of an [`Entry`].
#[derive(Debug, Deserialize, Clone)]
pub struct KanjiEntry {
    /// Whether JMdict marks this writing as common.
    pub common: bool,
    /// The written form itself.
    pub text: String,
    /// Tags attached to this writing.
    /// NOTE(review): the tag vocabulary is not visible in this file — confirm.
    pub tags: Vec<String>,
}
137
/// One kana reading of an [`Entry`].
#[derive(Debug, Deserialize, Clone)]
pub struct KanaEntry {
    /// Whether JMdict marks this reading as common.
    pub common: bool,
    /// The reading itself.
    pub text: String,
    /// Tags attached to this reading.
    /// NOTE(review): the tag vocabulary is not visible in this file — confirm.
    pub tags: Vec<String>,
    /// Deserialized from the `appliesToKanji` key.
    /// NOTE(review): presumably the kanji writings this reading applies to;
    /// how "applies to all" is encoded is not visible here — confirm.
    #[serde(rename = "appliesToKanji")]
    pub applies_to_kanji: Vec<String>,
}
146
/// A cross-reference to another term, used by the `related` and `antonym`
/// lists of [`SenseEntry`].
#[derive(Debug, Deserialize, Clone)]
pub struct Xref {
    /// The referenced term.
    pub term: String,
    /// Optional reading accompanying the referenced term.
    /// NOTE(review): presumably disambiguates between readings of `term` — confirm.
    pub reading: Option<String>,
    /// Optional index of a sense within the referenced entry.
    /// NOTE(review): whether this is 0- or 1-based is not visible here — confirm.
    pub sense_index: Option<u32>,
}
153
/// Source-language information attached to a [`SenseEntry`] via its
/// `language_source` list.
#[derive(Debug, Deserialize, Clone)]
pub struct LanguageSource {
    /// Language code of the source.
    /// NOTE(review): presumably the same code style as [`GlossEntry::lang`]
    /// (e.g. `"eng"`) — confirm.
    pub lang: String,
    /// NOTE(review): presumably whether the source word covers the whole term
    /// rather than only part of it — confirm against the JMdict schema.
    pub full: bool,
    /// NOTE(review): presumably marks wasei (Japanese-coined) terms — confirm.
    pub wasei: bool,
    /// Optional source-language text.
    pub text: Option<String>,
}
161
/// One sense (meaning) of an [`Entry`], with its glosses and annotations.
#[derive(Debug, Deserialize, Clone)]
pub struct SenseEntry {
    /// Part-of-speech tags for this sense; deserialized from `partOfSpeech`.
    #[serde(rename = "partOfSpeech")]
    pub part_of_speech: Vec<String>,
    /// Deserialized from `appliesToKanji`.
    /// NOTE(review): presumably restricts the sense to specific kanji writings — confirm.
    #[serde(rename = "appliesToKanji")]
    pub applies_to_kanji: Vec<String>,
    /// Deserialized from `appliesToKana`.
    /// NOTE(review): presumably restricts the sense to specific kana readings — confirm.
    #[serde(rename = "appliesToKana")]
    pub applies_to_kana: Vec<String>,
    /// Cross-references to related terms.
    pub related: Vec<Xref>,
    /// Cross-references to antonyms.
    pub antonym: Vec<Xref>,
    /// NOTE(review): presumably subject-field tags — confirm.
    pub field: Vec<String>,
    /// NOTE(review): presumably dialect tags — confirm.
    pub dialect: Vec<String>,
    /// NOTE(review): presumably miscellaneous usage tags — confirm.
    pub misc: Vec<String>,
    /// NOTE(review): presumably free-text notes about the sense — confirm.
    pub info: Vec<String>,
    /// Source-language information for this sense.
    /// NOTE(review): unlike the camelCase renames above, this field is read
    /// under its snake_case name `language_source` — confirm the data
    /// producer emits that key if a self-describing format is used.
    pub language_source: Vec<LanguageSource>,
    /// Glosses (translations) for this sense, possibly in several languages.
    pub gloss: Vec<GlossEntry>,
}
179
/// A single gloss (translation) belonging to a [`SenseEntry`].
#[derive(Debug, Deserialize, Clone)]
pub struct GlossEntry {
    /// Language code of the gloss (e.g. `"eng"`).
    pub lang: String,
    /// Optional gender annotation.
    /// NOTE(review): the set of values is not visible in this file — confirm.
    pub gender: Option<String>,
    /// Optional gloss type; deserialized from the `type` key (renamed because
    /// `type` is a reserved word in Rust).
    #[serde(rename = "type")]
    pub gloss_type: Option<String>,
    /// The gloss text itself.
    pub text: String,
}
188
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a gloss in `lang` with no gender or type annotation.
    fn gloss(lang: &str, text: &str) -> GlossEntry {
        GlossEntry {
            lang: lang.to_owned(),
            gender: None,
            gloss_type: None,
            text: text.to_owned(),
        }
    }

    /// Builds a sense carrying only part-of-speech tags and glosses; every
    /// other list is left empty.
    fn sense(pos: &[&str], glosses: Vec<GlossEntry>) -> SenseEntry {
        SenseEntry {
            part_of_speech: pos.iter().copied().map(String::from).collect(),
            applies_to_kanji: vec![],
            applies_to_kana: vec![],
            related: vec![],
            antonym: vec![],
            field: vec![],
            dialect: vec![],
            misc: vec![],
            info: vec![],
            language_source: vec![],
            gloss: glosses,
        }
    }

    /// Builds an untagged kanji writing.
    fn kanji(text: &str, common: bool) -> KanjiEntry {
        KanjiEntry {
            common,
            text: text.to_owned(),
            tags: vec![],
        }
    }

    /// Builds an untagged kana reading with no kanji restriction.
    fn kana(text: &str, common: bool) -> KanaEntry {
        KanaEntry {
            common,
            text: text.to_owned(),
            tags: vec![],
            applies_to_kanji: vec![],
        }
    }

    /// Assembles an entry with a fixed placeholder id.
    fn entry(kanji: Vec<KanjiEntry>, kana: Vec<KanaEntry>, sense: Vec<SenseEntry>) -> Entry {
        Entry {
            id: String::from("test"),
            kanji,
            kana,
            sense,
        }
    }

    #[test]
    fn primary_kanji_and_kana_first_element() {
        let writings = vec![kanji("猫", true), kanji("ねこ", false)];
        let readings = vec![kana("ねこ", true)];
        let cat = entry(writings, readings, vec![]);

        assert_eq!(cat.primary_kanji(), Some("猫"));
        assert_eq!(cat.primary_kana(), Some("ねこ"));
    }

    #[test]
    fn primary_kanji_none_when_kana_only() {
        let kitty = entry(vec![], vec![kana("にゃんこ", false)], vec![]);

        assert_eq!(kitty.primary_kanji(), None);
        assert_eq!(kitty.primary_kana(), Some("にゃんこ"));
    }

    #[test]
    fn headword_prefers_kanji_falls_back_to_kana() {
        // Kanji present: it wins over the reading.
        let with_kanji = entry(vec![kanji("猫", false)], vec![kana("ねこ", false)], vec![]);
        assert_eq!(with_kanji.headword(), Some("猫"));

        // No kanji: fall back to the first reading.
        let kana_only = entry(vec![], vec![kana("にゃんこ", false)], vec![]);
        assert_eq!(kana_only.headword(), Some("にゃんこ"));

        // Nothing at all: no headword.
        let empty = entry(vec![], vec![], vec![]);
        assert_eq!(empty.headword(), None);
    }

    #[test]
    fn is_common_true_if_any_form_is_common() {
        // (kanji common?, kana common?, expected is_common())
        let cases = [
            (true, false, true),
            (false, true, true),
            (false, false, false),
        ];

        for &(kanji_common, kana_common, expected) in &cases {
            let e = entry(
                vec![kanji("猫", kanji_common)],
                vec![kana("ねこ", kana_common)],
                vec![],
            );
            assert_eq!(e.is_common(), expected);
        }
    }

    #[test]
    fn glosses_filter_by_lang() {
        let mixed = vec![gloss("eng", "cat"), gloss("fre", "chat"), gloss("eng", "feline")];
        let e = entry(vec![], vec![kana("ねこ", false)], vec![sense(&["n"], mixed)]);

        assert_eq!(e.glosses("eng").collect::<Vec<_>>(), vec!["cat", "feline"]);
        assert_eq!(e.glosses("fre").collect::<Vec<_>>(), vec!["chat"]);
        assert_eq!(e.glosses("jpn").count(), 0);
    }

    #[test]
    fn parts_of_speech_dedup_in_first_seen_order() {
        let senses = vec![
            sense(&["v1", "vt"], vec![]),
            sense(&["vt", "vi"], vec![]),
            sense(&["v1"], vec![]),
        ];
        let e = entry(vec![], vec![kana("ねこ", false)], senses);

        assert_eq!(e.parts_of_speech(), vec!["v1", "vt", "vi"]);
    }
}