1use jmdict_enums::{
19 AllGlossLanguage, AllPartOfSpeech, Dialect, Enum, GlossLanguage, GlossType, KanjiInfo,
20 PartOfSpeech, Priority, PriorityInCorpus, ReadingInfo, SenseInfo, SenseTopic,
21};
22use json::JsonValue;
23use std::convert::TryInto;
24
25mod entrypack;
26use entrypack::EntryPack;
27
28pub struct RawEntry<'a> {
29 pub ent_seq: u32,
30 pub k_ele: Vec<RawKanjiElement<'a>>,
31 pub r_ele: Vec<RawReadingElement<'a>>,
32 pub sense: Vec<RawSense<'a>>,
33}
34
35pub struct RawKanjiElement<'a> {
36 pub keb: &'a str,
37 pub ke_inf: Vec<KanjiInfo>,
38 pub ke_pri: Priority,
39}
40
41pub struct RawReadingElement<'a> {
42 pub reb: &'a str,
43 pub re_nokanji: bool,
44 pub re_restr: Vec<&'a str>,
45 pub re_inf: Vec<ReadingInfo>,
46 pub re_pri: Priority,
47}
48
49pub struct RawSense<'a> {
50 pub stagk: Vec<&'a str>,
51 pub stagr: Vec<&'a str>,
52 pub pos: Vec<PartOfSpeech>,
53 pub xref: Vec<&'a str>,
54 pub ant: Vec<&'a str>,
55 pub field: Vec<SenseTopic>,
56 pub misc: Vec<SenseInfo>,
57 pub s_inf: Vec<&'a str>,
58 pub lsource: Vec<RawLSource<'a>>,
59 pub dial: Vec<Dialect>,
60 pub gloss: Vec<RawGloss<'a>>,
61}
62
/// One loanword-source record of a [`RawSense`].
///
/// The type only holds borrowed strings and flags, so it is cheap to copy and
/// can be compared and debug-printed directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct RawLSource<'a> {
    /// Text of the record (JSON key `"t"`).
    pub text: &'a str,
    /// Language code (JSON key `"l"`); the loader falls back to `"eng"` when
    /// the key is absent.
    pub lang: &'a str,
    /// `true` when JSON key `"type"` is `"part"`, `false` when it is `"full"`
    /// or absent.
    pub is_partial: bool,
    /// `true` when JSON key `"wasei"` is `"y"`, `false` when it is `"n"` or
    /// absent.
    pub is_wasei: bool,
}
72
73pub struct RawGloss<'a> {
74 pub text: &'a str,
76 pub lang: GlossLanguage,
77 pub g_type: GlossType,
78}
79
80pub trait Visitor {
82 fn process_entry(&mut self, entry: &RawEntry);
83
84 fn notify_data_file_path(&mut self, _path: &str) {}
87}
88
/// Switches controlling which parts of the dictionary [`process_dictionary`]
/// passes on to the [`Visitor`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Options {
    /// When set, traversal stops at the first entry whose sequence number is
    /// 1010000 or greater.
    pub is_db_minimal: bool,
    /// When unset, kanji and reading elements without any priority marker
    /// (empty JSON key `"p"`) are dropped.
    pub with_uncommon: bool,
    /// When unset, senses tagged with `SenseInfo::Archaism` are dropped.
    pub with_archaic: bool,
}
96
97pub fn process_dictionary<V: Visitor>(v: &mut V, opts: Options) {
99 let entrypack = EntryPack::locate_or_download();
100 v.notify_data_file_path(&entrypack.path.to_string_lossy());
101
102 for entry_str in entrypack.contents().split('\n') {
103 if !entry_str.is_empty() {
104 let entry_obj = json::parse(entry_str).unwrap();
105 if let Some(entry_raw) = RawEntry::from_obj(&entry_obj, &opts) {
106 if opts.is_db_minimal && entry_raw.ent_seq >= 1010000 {
107 return;
109 }
110 v.process_entry(&entry_raw);
111 }
112 }
113 }
114}
115
116trait Object<'a>: Sized {
117 fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self>;
118
119 fn collect(array: &'a JsonValue, opts: &'_ Options) -> Vec<Self> {
120 assert!(array.is_null() || array.is_array());
121 array
122 .members()
123 .filter_map(|obj| Self::from_obj(obj, opts))
124 .collect()
125 }
126
127 fn collect_or_none(array: &'a JsonValue, opts: &'_ Options) -> Option<Vec<Self>> {
128 let vec = Self::collect(array, opts);
129 if vec.is_empty() {
130 None
131 } else {
132 Some(vec)
133 }
134 }
135}
136
137impl<'a> Object<'a> for RawEntry<'a> {
138 fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
139 Some(Self {
140 ent_seq: obj["n"].as_u32().unwrap(),
141 k_ele: RawKanjiElement::collect(&obj["K"], opts),
142 r_ele: RawReadingElement::collect_or_none(&obj["R"], opts)?,
143 sense: RawSense::collect_or_none(&obj["S"], opts)?,
144 })
145 }
146}
147
148impl<'a> Object<'a> for RawKanjiElement<'a> {
149 fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
150 if !opts.with_uncommon && obj["p"].is_empty() {
151 return None;
152 }
153 Some(Self {
154 keb: obj["t"].as_str().unwrap(),
155 ke_inf: Object::collect(&obj["i"], opts),
156 ke_pri: parse_prio(Object::collect(&obj["p"], opts)),
157 })
158 }
159}
160
161impl<'a> Object<'a> for RawReadingElement<'a> {
162 fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
163 if !opts.with_uncommon && obj["p"].is_empty() {
164 return None;
165 }
166 Some(Self {
167 reb: obj["t"].as_str().unwrap(),
168 re_nokanji: obj["n"].as_bool().unwrap_or(false),
169 re_restr: Object::collect(&obj["r"], opts),
170 re_inf: Object::collect(&obj["i"], opts),
171 re_pri: parse_prio(Object::collect(&obj["p"], opts)),
172 })
173 }
174}
175
176fn parse_prio(markers: Vec<&str>) -> Priority {
177 use PriorityInCorpus::*;
178 let mut result = Priority {
179 news: Absent,
180 ichimango: Absent,
181 loanwords: Absent,
182 additional: Absent,
183 frequency_bucket: 0,
184 };
185 for marker in markers {
186 match marker {
187 "news1" => result.news = merge_cprio(result.news, Primary),
188 "news2" => result.news = merge_cprio(result.news, Secondary),
189 "ichi1" => result.ichimango = merge_cprio(result.ichimango, Primary),
190 "ichi2" => result.ichimango = merge_cprio(result.ichimango, Secondary),
191 "gai1" => result.loanwords = merge_cprio(result.loanwords, Primary),
192 "gai2" => result.loanwords = merge_cprio(result.loanwords, Secondary),
193 "spec1" => result.additional = merge_cprio(result.additional, Primary),
194 "spec2" => result.additional = merge_cprio(result.additional, Secondary),
195 _ => match parse_freq_bucket(marker) {
196 Some(bucket) => {
197 if result.frequency_bucket == 0 || result.frequency_bucket > bucket {
198 result.frequency_bucket = bucket;
199 }
200 }
201 None => {
202 panic!("unknown priority marker: {}", marker);
203 }
204 },
205 };
206 }
207 result
208}
209
210fn merge_cprio(old: PriorityInCorpus, new: PriorityInCorpus) -> PriorityInCorpus {
211 use PriorityInCorpus::*;
212 match (old, new) {
213 (Absent, _) => new,
214 (_, Primary) => Primary,
215 (Primary, _) => Primary,
216 (Secondary, _) => Secondary,
217 }
218}
219
/// Parses a frequency-bucket marker of the exact form `nfXX`.
///
/// `XX` must be two ASCII digits forming a number from 1 to 48; the function
/// returns that number. Every other input (wrong prefix, wrong length,
/// non-ASCII digits, `nf00`, anything above `nf48`) yields `None`.
fn parse_freq_bucket(marker: &str) -> Option<u16> {
    // Highest bucket number accepted.
    const MAX_BUCKET: u16 = 48;

    // Matching on bytes is sufficient: the whole accepted alphabet is ASCII,
    // and any multi-byte character changes the length or fails the patterns.
    match marker.as_bytes() {
        [b'n', b'f', tens @ b'0'..=b'9', ones @ b'0'..=b'9'] => {
            let bucket = 10 * u16::from(tens - b'0') + u16::from(ones - b'0');
            if (1..=MAX_BUCKET).contains(&bucket) {
                Some(bucket)
            } else {
                None
            }
        }
        _ => None,
    }
}
246
247impl<'a> Object<'a> for RawSense<'a> {
248 fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
249 let misc = Object::collect(&obj["m"], opts);
250 if !opts.with_archaic && misc.contains(&SenseInfo::Archaism) {
251 return None;
252 }
253
254 Some(Self {
255 stagk: Object::collect(&obj["stagk"], opts),
256 stagr: Object::collect(&obj["stagr"], opts),
257 pos: Object::collect(&obj["p"], opts),
258 xref: Object::collect(&obj["xref"], opts),
259 ant: Object::collect(&obj["ant"], opts),
260 field: Object::collect(&obj["f"], opts),
261 misc,
262 s_inf: Object::collect(&obj["i"], opts),
263 lsource: Object::collect(&obj["L"], opts),
264 dial: Object::collect(&obj["dial"], opts),
265 gloss: Object::collect_or_none(&obj["G"], opts)?,
266 })
267 }
268}
269
270impl<'a> Object<'a> for RawLSource<'a> {
271 fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
272 let is_partial = match obj["type"].as_str().unwrap_or("full") {
273 "full" => false,
274 "part" => true,
275 val => panic!("unknown ls_type: {}", val),
276 };
277 let is_wasei = match obj["wasei"].as_str().unwrap_or("n") {
278 "n" => false,
279 "y" => true,
280 val => panic!("unknown ls_wasei: {}", val),
281 };
282 Some(Self {
283 text: obj["t"].as_str().unwrap(),
284 lang: obj["l"].as_str().unwrap_or("eng"),
285 is_partial,
286 is_wasei,
287 })
288 }
289}
290
291impl<'a> Object<'a> for RawGloss<'a> {
292 fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
293 Some(Self {
294 text: obj["t"].as_str().unwrap(),
295 lang: GlossLanguage::from_obj(&obj["l"], opts)?,
296 g_type: optional_enum(&obj["g_type"], "", "GlossType"),
297 })
298 }
299}
300
301impl<'a> Object<'a> for &'a str {
302 fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
303 Some(obj.as_str().unwrap())
304 }
305}
306
307impl<'a> Object<'a> for Dialect {
308 fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
309 Some(required_enum(obj, "Dialect"))
310 }
311}
312
313impl<'a> Object<'a> for GlossLanguage {
314 fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
315 let lang: AllGlossLanguage = optional_enum(obj, "eng", "AllGlossLanguage");
316 lang.try_into().ok()
317 }
318}
319
320impl<'a> Object<'a> for KanjiInfo {
321 fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
322 Some(required_enum(obj, "KanjiInfo"))
323 }
324}
325
326impl<'a> Object<'a> for PartOfSpeech {
327 fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
328 let lang: AllPartOfSpeech = optional_enum(obj, "eng", "AllPartOfSpeech");
329 lang.try_into().ok()
330 }
331}
332
333impl<'a> Object<'a> for ReadingInfo {
334 fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
335 Some(required_enum(obj, "ReadingInfo"))
336 }
337}
338
339impl<'a> Object<'a> for SenseInfo {
340 fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
341 Some(required_enum(obj, "SenseInfo"))
342 }
343}
344
345impl<'a> Object<'a> for SenseTopic {
346 fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
347 Some(required_enum(obj, "SenseTopic"))
348 }
349}
350
351fn optional_enum<E: Enum>(obj: &JsonValue, default: &'static str, enum_name: &'static str) -> E {
352 let code = obj.as_str().unwrap_or(default);
353 match E::from_code(code) {
354 Some(val) => val,
355 None => panic!("unknown {} representation: {}", enum_name, code),
356 }
357}
358
359fn required_enum<E: Enum>(obj: &JsonValue, enum_name: &'static str) -> E {
360 let code = obj.as_str().unwrap();
361 match E::from_code(code) {
362 Some(val) => val,
363 None => panic!("unknown {} representation: {}", enum_name, code),
364 }
365}