jmdict_traverse/
lib.rs

1/*******************************************************************************
2* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
3* SPDX-License-Identifier: Apache-2.0
4* Refer to the file "LICENSE" for details.
5*******************************************************************************/
6
7//! Parsing utilities for the build and test phases of the `jmdict` crate.
8//!
9//! This code is in a separate crate because, if we put it in the `jmdict` crate itself, its
10//! `build.rs` could not import it.
11//!
12//! # Compatibility promise
13//!
14//! **There is none.** Although this crate is published on crates.io for technical reasons, this
15//! crate is internal to the `jmdict` crate. Its API may change at any time, including in
16//! bugfix releases. Use the [API provided by the `jmdict` crate](https://docs.rs/jmdict/) instead.
17
18use jmdict_enums::{
19    AllGlossLanguage, AllPartOfSpeech, Dialect, Enum, GlossLanguage, GlossType, KanjiInfo,
20    PartOfSpeech, Priority, PriorityInCorpus, ReadingInfo, SenseInfo, SenseTopic,
21};
22use json::JsonValue;
23use std::convert::TryInto;
24
25mod entrypack;
26use entrypack::EntryPack;
27
28pub struct RawEntry<'a> {
29    pub ent_seq: u32,
30    pub k_ele: Vec<RawKanjiElement<'a>>,
31    pub r_ele: Vec<RawReadingElement<'a>>,
32    pub sense: Vec<RawSense<'a>>,
33}
34
35pub struct RawKanjiElement<'a> {
36    pub keb: &'a str,
37    pub ke_inf: Vec<KanjiInfo>,
38    pub ke_pri: Priority,
39}
40
41pub struct RawReadingElement<'a> {
42    pub reb: &'a str,
43    pub re_nokanji: bool,
44    pub re_restr: Vec<&'a str>,
45    pub re_inf: Vec<ReadingInfo>,
46    pub re_pri: Priority,
47}
48
49pub struct RawSense<'a> {
50    pub stagk: Vec<&'a str>,
51    pub stagr: Vec<&'a str>,
52    pub pos: Vec<PartOfSpeech>,
53    pub xref: Vec<&'a str>,
54    pub ant: Vec<&'a str>,
55    pub field: Vec<SenseTopic>,
56    pub misc: Vec<SenseInfo>,
57    pub s_inf: Vec<&'a str>,
58    pub lsource: Vec<RawLSource<'a>>,
59    pub dial: Vec<Dialect>,
60    pub gloss: Vec<RawGloss<'a>>,
61}
62
///Loanword source information attached to a [RawSense].
pub struct RawLSource<'a> {
    //NOTE: We do not use the GlossLanguage enum for the lang attribute, because doing so would add
    //a very long tail of rare loanword source languages to that enum. (Also, we could not restrict
    //variants of GlossLanguage to feature flags in the way we currently do.)
    ///Text of the source word, decoded from the `"t"` key.
    pub text: &'a str,
    ///Language code, decoded from the `"l"` key; `"eng"` when that key holds no string.
    pub lang: &'a str,
    ///True when the `"type"` key is `"part"`, false when it is `"full"` or absent.
    pub is_partial: bool,
    ///True when the `"wasei"` key is `"y"`, false when it is `"n"` or absent.
    pub is_wasei: bool,
}
72
73pub struct RawGloss<'a> {
74    //NOTE: g_gend and pri are not mapped since they do not actually occur in any entries
75    pub text: &'a str,
76    pub lang: GlossLanguage,
77    pub g_type: GlossType,
78}
79
80///Strategy for processing a JMdict file.
81pub trait Visitor {
82    fn process_entry(&mut self, entry: &RawEntry);
83
84    ///This is called once for each file that was read from disk. The build script uses this to
85    ///generate `cargo:rerun-if-changed` directives.
86    fn notify_data_file_path(&mut self, _path: &str) {}
87}
88
///Options for traversing a JMdict file. This controls which entries the [Visitor] visits, and
///which parts of the entries it sees.
///
///The type only holds plain flags, so it is cheap to copy and can be compared and printed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Options {
    ///When set, traversal stops at the first entry whose sequence number is 1010000 or higher.
    pub is_db_minimal: bool,
    ///When unset, kanji elements and reading elements without any priority marker are skipped.
    pub with_uncommon: bool,
    ///When unset, senses tagged as archaisms are skipped.
    pub with_archaic: bool,
}
96
97///Entry point for this file. All other functions are called directly or indirectly from this fn.
98pub fn process_dictionary<V: Visitor>(v: &mut V, opts: Options) {
99    let entrypack = EntryPack::locate_or_download();
100    v.notify_data_file_path(&entrypack.path.to_string_lossy());
101
102    for entry_str in entrypack.contents().split('\n') {
103        if !entry_str.is_empty() {
104            let entry_obj = json::parse(entry_str).unwrap();
105            if let Some(entry_raw) = RawEntry::from_obj(&entry_obj, &opts) {
106                if opts.is_db_minimal && entry_raw.ent_seq >= 1010000 {
107                    //for db-minimal, only process entries from data/entries-100.json
108                    return;
109                }
110                v.process_entry(&entry_raw);
111            }
112        }
113    }
114}
115
116trait Object<'a>: Sized {
117    fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self>;
118
119    fn collect(array: &'a JsonValue, opts: &'_ Options) -> Vec<Self> {
120        assert!(array.is_null() || array.is_array());
121        array
122            .members()
123            .filter_map(|obj| Self::from_obj(obj, opts))
124            .collect()
125    }
126
127    fn collect_or_none(array: &'a JsonValue, opts: &'_ Options) -> Option<Vec<Self>> {
128        let vec = Self::collect(array, opts);
129        if vec.is_empty() {
130            None
131        } else {
132            Some(vec)
133        }
134    }
135}
136
137impl<'a> Object<'a> for RawEntry<'a> {
138    fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
139        Some(Self {
140            ent_seq: obj["n"].as_u32().unwrap(),
141            k_ele: RawKanjiElement::collect(&obj["K"], opts),
142            r_ele: RawReadingElement::collect_or_none(&obj["R"], opts)?,
143            sense: RawSense::collect_or_none(&obj["S"], opts)?,
144        })
145    }
146}
147
148impl<'a> Object<'a> for RawKanjiElement<'a> {
149    fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
150        if !opts.with_uncommon && obj["p"].is_empty() {
151            return None;
152        }
153        Some(Self {
154            keb: obj["t"].as_str().unwrap(),
155            ke_inf: Object::collect(&obj["i"], opts),
156            ke_pri: parse_prio(Object::collect(&obj["p"], opts)),
157        })
158    }
159}
160
161impl<'a> Object<'a> for RawReadingElement<'a> {
162    fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
163        if !opts.with_uncommon && obj["p"].is_empty() {
164            return None;
165        }
166        Some(Self {
167            reb: obj["t"].as_str().unwrap(),
168            re_nokanji: obj["n"].as_bool().unwrap_or(false),
169            re_restr: Object::collect(&obj["r"], opts),
170            re_inf: Object::collect(&obj["i"], opts),
171            re_pri: parse_prio(Object::collect(&obj["p"], opts)),
172        })
173    }
174}
175
176fn parse_prio(markers: Vec<&str>) -> Priority {
177    use PriorityInCorpus::*;
178    let mut result = Priority {
179        news: Absent,
180        ichimango: Absent,
181        loanwords: Absent,
182        additional: Absent,
183        frequency_bucket: 0,
184    };
185    for marker in markers {
186        match marker {
187            "news1" => result.news = merge_cprio(result.news, Primary),
188            "news2" => result.news = merge_cprio(result.news, Secondary),
189            "ichi1" => result.ichimango = merge_cprio(result.ichimango, Primary),
190            "ichi2" => result.ichimango = merge_cprio(result.ichimango, Secondary),
191            "gai1" => result.loanwords = merge_cprio(result.loanwords, Primary),
192            "gai2" => result.loanwords = merge_cprio(result.loanwords, Secondary),
193            "spec1" => result.additional = merge_cprio(result.additional, Primary),
194            "spec2" => result.additional = merge_cprio(result.additional, Secondary),
195            _ => match parse_freq_bucket(marker) {
196                Some(bucket) => {
197                    if result.frequency_bucket == 0 || result.frequency_bucket > bucket {
198                        result.frequency_bucket = bucket;
199                    }
200                }
201                None => {
202                    panic!("unknown priority marker: {}", marker);
203                }
204            },
205        };
206    }
207    result
208}
209
210fn merge_cprio(old: PriorityInCorpus, new: PriorityInCorpus) -> PriorityInCorpus {
211    use PriorityInCorpus::*;
212    match (old, new) {
213        (Absent, _) => new,
214        (_, Primary) => Primary,
215        (Primary, _) => Primary,
216        (Secondary, _) => Secondary,
217    }
218}
219
///Parses a frequency bucket marker for the news corpus, e.g. "nf18" => Some(18).
///
///The marker must consist of the prefix "nf" followed by exactly two decimal digits. Anything
///else, as well as bucket numbers outside of 1..=48, yields `None`.
fn parse_freq_bucket(marker: &str) -> Option<u16> {
    //NOTE: A regex would express this more compactly, but this one check does not justify
    //pulling in a regex crate.
    let mut digits = marker.strip_prefix("nf")?.chars();

    let bucket = match (digits.next(), digits.next(), digits.next()) {
        (Some(tens), Some(ones), None) => 10 * tens.to_digit(10)? + ones.to_digit(10)?,
        _ => return None,
    };

    //only nf01..nf48 are allowed
    match bucket {
        1..=48 => Some(bucket as u16),
        _ => None,
    }
}
246
247impl<'a> Object<'a> for RawSense<'a> {
248    fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
249        let misc = Object::collect(&obj["m"], opts);
250        if !opts.with_archaic && misc.contains(&SenseInfo::Archaism) {
251            return None;
252        }
253
254        Some(Self {
255            stagk: Object::collect(&obj["stagk"], opts),
256            stagr: Object::collect(&obj["stagr"], opts),
257            pos: Object::collect(&obj["p"], opts),
258            xref: Object::collect(&obj["xref"], opts),
259            ant: Object::collect(&obj["ant"], opts),
260            field: Object::collect(&obj["f"], opts),
261            misc,
262            s_inf: Object::collect(&obj["i"], opts),
263            lsource: Object::collect(&obj["L"], opts),
264            dial: Object::collect(&obj["dial"], opts),
265            gloss: Object::collect_or_none(&obj["G"], opts)?,
266        })
267    }
268}
269
270impl<'a> Object<'a> for RawLSource<'a> {
271    fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
272        let is_partial = match obj["type"].as_str().unwrap_or("full") {
273            "full" => false,
274            "part" => true,
275            val => panic!("unknown ls_type: {}", val),
276        };
277        let is_wasei = match obj["wasei"].as_str().unwrap_or("n") {
278            "n" => false,
279            "y" => true,
280            val => panic!("unknown ls_wasei: {}", val),
281        };
282        Some(Self {
283            text: obj["t"].as_str().unwrap(),
284            lang: obj["l"].as_str().unwrap_or("eng"),
285            is_partial,
286            is_wasei,
287        })
288    }
289}
290
291impl<'a> Object<'a> for RawGloss<'a> {
292    fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
293        Some(Self {
294            text: obj["t"].as_str().unwrap(),
295            lang: GlossLanguage::from_obj(&obj["l"], opts)?,
296            g_type: optional_enum(&obj["g_type"], "", "GlossType"),
297        })
298    }
299}
300
301impl<'a> Object<'a> for &'a str {
302    fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
303        Some(obj.as_str().unwrap())
304    }
305}
306
307impl<'a> Object<'a> for Dialect {
308    fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
309        Some(required_enum(obj, "Dialect"))
310    }
311}
312
313impl<'a> Object<'a> for GlossLanguage {
314    fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
315        let lang: AllGlossLanguage = optional_enum(obj, "eng", "AllGlossLanguage");
316        lang.try_into().ok()
317    }
318}
319
320impl<'a> Object<'a> for KanjiInfo {
321    fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
322        Some(required_enum(obj, "KanjiInfo"))
323    }
324}
325
326impl<'a> Object<'a> for PartOfSpeech {
327    fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
328        let lang: AllPartOfSpeech = optional_enum(obj, "eng", "AllPartOfSpeech");
329        lang.try_into().ok()
330    }
331}
332
333impl<'a> Object<'a> for ReadingInfo {
334    fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
335        Some(required_enum(obj, "ReadingInfo"))
336    }
337}
338
339impl<'a> Object<'a> for SenseInfo {
340    fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
341        Some(required_enum(obj, "SenseInfo"))
342    }
343}
344
345impl<'a> Object<'a> for SenseTopic {
346    fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
347        Some(required_enum(obj, "SenseTopic"))
348    }
349}
350
351fn optional_enum<E: Enum>(obj: &JsonValue, default: &'static str, enum_name: &'static str) -> E {
352    let code = obj.as_str().unwrap_or(default);
353    match E::from_code(code) {
354        Some(val) => val,
355        None => panic!("unknown {} representation: {}", enum_name, code),
356    }
357}
358
359fn required_enum<E: Enum>(obj: &JsonValue, enum_name: &'static str) -> E {
360    let code = obj.as_str().unwrap();
361    match E::from_code(code) {
362        Some(val) => val,
363        None => panic!("unknown {} representation: {}", enum_name, code),
364    }
365}