jmdict/lib.rs
1/*******************************************************************************
2* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
3* SPDX-License-Identifier: Apache-2.0
4* Refer to the file "LICENSE" for details.
5*******************************************************************************/
6
7//! The [JMdict file](https://www.edrdg.org/jmdict/j_jmdict.html) is a comprehensive multilingual
8//! dictionary of the Japanese language. The original JMdict file, included in this repository (and
9//! hence, in releases of this crate) comes as XML. Instead of stuffing the XML in the binary
10//! directly, this crate parses the XML at compile-time and generates an optimized representation
11//! that is compiled into the binary. The crate's API affords type-safe access to this embedded
12//! database.
13//!
14//! # WARNING: Licensing on database files
15//!
16//! The database files compiled into the crate are licensed from the Electronic Dictionary Research
17//! and Development Group under Creative Commons licenses. Applications linking this crate directly
//! or indirectly must display appropriate copyright notices to users. Please refer to the
19//! [EDRDG's license statement](https://www.edrdg.org/edrdg/licence.html) for details.
20//!
21//! # Basic usage
22//!
23//! The database is accessed through the [entries() function](entries) which provides an iterator
24//! over all database entries compiled into the application. While traversing the database and its
25//! entries, you will find that, whenever you expect a list of something, you will get an iterator
26//! instead. These iterators provide an abstraction between you as the user of the library, and the
27//! physical representation of the database as embedded in the binary.
28//!
29//! The following example looks up the reading for お母さん in the database:
30//!
31//! ```
32//! let kanji_form = "お母さん";
33//!
34//! let entry = jmdict::entries().find(|e| {
35//! e.kanji_elements().any(|k| k.text == kanji_form)
36//! }).unwrap();
37//!
38//! let reading_form = entry.reading_elements().next().unwrap().text;
39//! assert_eq!(reading_form, "おかあさん");
40//! ```
41//!
42//! # Cargo features
43//!
44//! ### Common configurations
45//!
46//! * The `default` feature includes the most common words (about 30000 entries) and only their
47//! English translations.
48//! * The `full` feature includes everything in the JMdict.
49//!
50//! ### Entry selection
51//!
52//! * The `scope-uncommon` feature includes uncommon words and glosses.
53//! * The `scope-archaic` feature includes glosses with the "archaic" label. If disabled, the
54//! [PartOfSpeech] enum will not include variants that are only relevant for archaic vocabulary,
55//! such as obsolete conjugation patterns. (The [AllPartOfSpeech] enum always contains all
56//! variants.)
57//!
58//! ### Target languages
59//!
60//! At least one target language must be selected. Selecting a target language will include all
61//! available translations in that language. Entries that do not have any translation in any of the
62//! selected languages will be skipped.
63//!
64//! * `translations-eng`: English (included in `default`)
65//! * `translations-dut`: Dutch
66//! * `translations-fre`: French
67//! * `translations-ger`: German
68//! * `translations-hun`: Hungarian
69//! * `translations-rus`: Russian
70//! * `translations-slv`: Slovenian
71//! * `translations-spa`: Spanish
72//! * `translations-swe`: Swedish
73//!
74//! The [GlossLanguage] enum will only contain variants corresponding to the enabled target
75//! languages. For example, in the default configuration, `GlossLanguage::English` will be the only
76//! variant. (The [AllGlossLanguage] enum always contains all variants.)
77//!
78//! ### Crippled builds: `db-minimal`
79//!
//! When the `db-minimal` feature is enabled, only a severely reduced portion of the JMdict will
81//! be parsed (to be exact, only chunks 000, 100 and 999). This is also completely useless for
82//! actual usage, but allows for quick edit-compile-test cycles while working on this crate's
83//! code.
84//!
85//! ### Crippled builds: `db-empty`
86//!
87//! When the `db-empty` feature is enabled, downloading and parsing of the JMdict contents is
88//! disabled entirely. The crate is compiled as usual, but `entries()` will be an empty list.
89//! This is useful for documentation builds like for `docs.rs`, where `--all-features` is given.
90
91pub use jmdict_enums::{
92 AllGlossLanguage, AllPartOfSpeech, Dialect, DisabledVariant, Enum, GlossLanguage, GlossType,
93 KanjiInfo, PartOfSpeech, Priority, PriorityInCorpus, ReadingInfo, SenseInfo, SenseTopic,
94};
95mod payload;
96use payload::*;
97
98#[cfg(test)]
99mod test_consistency;
100#[cfg(test)]
101mod test_feature_matrix;
102#[cfg(test)]
103mod test_ordering;
104
105///Returns an iterator over all entries in the database.
106pub fn entries() -> Entries {
107 Entries::new()
108}
109
110///An entry in the JMdict dictionary.
111///
112///Each entry has zero or more [kanji elements](KanjiElement), one or more
113///[reading elements](ReadingElement) and one or more [senses](Sense). Elements contain the
114///Japanese representation of the vocabulary or phrase. Whereas reading elements consist of only
115///kana, kanji elements will contain characters from non-kana scripts, most commonly kanji. Senses
116///contain the translation of the vocabulary or phrase in other languages, most commonly English.
117#[derive(Clone, Copy, Debug)]
118pub struct Entry {
119 ///The sequence number for this Entry as it appears in the JMdict. Numbers start around 1000000
120 ///and typically increment in steps of 5 or 10. (It's like BASIC line numbers, if you're old
121 ///enough to understand that reference.) The [Entries] iterator guarantees entries to appear
122 ///ordered by sequence number.
123 pub number: u32,
124 kanji_elements_iter: KanjiElements,
125 reading_elements_iter: ReadingElements,
126 senses_iter: Senses,
127}
128
129impl Entry {
130 pub fn kanji_elements(&self) -> KanjiElements {
131 self.kanji_elements_iter
132 }
133
134 pub fn reading_elements(&self) -> ReadingElements {
135 self.reading_elements_iter
136 }
137
138 pub fn senses(&self) -> Senses {
139 self.senses_iter
140 }
141}
142
143///A representation of a dictionary entry using kanji or other non-kana scripts.
144///
145///Each [Entry] may have any number of these (including none). For each kanji element, the entry
146///will also have [reading elements](ReadingElement) to indicate how to read this kanji element.
147#[derive(Clone, Copy, Debug)]
148pub struct KanjiElement {
149 pub text: &'static str,
150 pub priority: Priority,
151 info_iter: KanjiInfos,
152}
153
154impl KanjiElement {
155 pub fn infos(&self) -> KanjiInfos {
156 self.info_iter
157 }
158}
159
160///A representation of a dictionary entry using only kana.
161///
162///Each [Entry] will have zero or more of these. When an entry has both kanji elements and reading
163///elements, the kana usage will be consistent between them, that is: If the kanji element contains
164///katakana, there is also a corresponding reading element that contains katakana as well.
165#[derive(Clone, Copy, Debug)]
166pub struct ReadingElement {
167 pub text: &'static str,
168 pub priority: Priority,
169 info_iter: ReadingInfos,
170}
171
172impl ReadingElement {
173 pub fn infos(&self) -> ReadingInfos {
174 self.info_iter
175 }
176}
177
178///The translational equivalent of a Japanese word or phrase.
179///
180///Where there are several distinctly different meanings of the word, its [Entry] will have
181///multiple senses. Each particular translation is a [Gloss], of which there may be multiple within
182///a single sense.
183///
184///For instance, the entry for 折角 contains one sense with the glosses "with trouble" and "at
185///great pains". Those glosses all represent the same meaning, so they appear in one sense. There
186///is also a sense with the glosses "rare", "precious", "valuable" and "long-awaited". Those
187///glosses represent a different meaning from "with trouble" or "at great pains", so they appear in
188///a separate sense. (And in fact, 折角 has even more senses.)
189#[derive(Clone, Copy, Debug)]
190pub struct Sense {
191 stagk_iter: Strings,
192 stagr_iter: Strings,
193 pos_iter: PartsOfSpeech,
194 cross_refs_iter: Strings,
195 antonyms_iter: Strings,
196 topics_iter: SenseTopics,
197 info_iter: SenseInfos,
198 freetext_info_iter: Strings,
199 loanword_sources_iter: LoanwordSources,
200 dialects_iter: Dialects,
201 glosses_iter: Glosses,
202}
203
204impl Sense {
205 ///If not empty, this sense only applies to these [KanjiElements] out of all the
206 ///[KanjiElements] in this [Entry].
207 pub fn applicable_kanji_elements(&self) -> Strings {
208 self.stagk_iter
209 }
210
211 ///If not empty, this sense only applies to these [ReadingElements] out of all the
212 ///[ReadingElements] in this [Entry].
213 pub fn applicable_reading_elements(&self) -> Strings {
214 self.stagr_iter
215 }
216
217 pub fn parts_of_speech(&self) -> PartsOfSpeech {
218 self.pos_iter
219 }
220
221 ///If not empty, contains the text of [KanjiElements] or [ReadingElements] of other [Entries]
222 ///with a similar meaning or sense. In some cases, a [KanjiElement]'s text will be followed by
223 ///a [Reading Element]'s text and/or a sense number to provide a precise target for the
224 ///cross-reference. Where this happens, a katakana middle dot (`・`, U+30FB) is placed between
225 ///the components of the cross-reference.
226 ///
227 ///TODO: Provide a structured type for these kinds of references.
228 pub fn cross_references(&self) -> Strings {
229 self.cross_refs_iter
230 }
231
232 ///If not empty, contains the text of [KanjiElements] or [ReadingElements] of other [Entries]
233 ///which are antonyms of this sense.
234 pub fn antonyms(&self) -> Strings {
235 self.antonyms_iter
236 }
237
238 pub fn topics(&self) -> SenseTopics {
239 self.topics_iter
240 }
241
242 pub fn infos(&self) -> SenseInfos {
243 self.info_iter
244 }
245
246 ///If not empty, contains additional information about this sence (e.g. level of currency or
247 ///other nuances) that cannot be expressed by the other, more structured fields.
248 pub fn freetext_infos(&self) -> Strings {
249 self.freetext_info_iter
250 }
251
252 ///If not empty, contains source words in other languages from which this vocabulary has been
253 ///borrowed in this sense.
254 pub fn loanword_sources(&self) -> LoanwordSources {
255 self.loanword_sources_iter
256 }
257
258 ///If not empty, this [Sense] of the [Entry] only appears in the given [Dialects] of Japanese.
259 pub fn dialects(&self) -> Dialects {
260 self.dialects_iter
261 }
262
263 pub fn glosses(&self) -> Glosses {
264 self.glosses_iter
265 }
266}
267
///A source word in another language from which a particular [Sense] of an [Entry] has been
///borrowed.
///
///There may be multiple sources for a single [Sense] when it is not clear from which language a
///word has been borrowed (e.g. "セレナーデ" lists both the French word "sérénade" and the German
///word "Serenade" as loanword sources), or if the vocabulary is a composite word with multiple
///distinct sources (e.g. "サブリュック" is a combination of the English prefix "sub-" and the
///German word "Rucksack").
///
///Within an [Entry], loanword sources appear in the [Sense].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct LoanwordSource {
    ///The source word as written in the language it was borrowed from.
    pub text: &'static str,
    ///The [ISO 639-2/B code](https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes) for the
    ///language from which the word was borrowed, e.g. "ger" for German or "chi" for Chinese.
    pub language: &'static str,
    ///Whether this source applies only to part of the loanword. Note that this flag is not always
    ///present in the JMdict when it should be.
    pub is_partial: bool,
    ///Whether this loanword is a [Wasei-eigo](https://en.wikipedia.org/wiki/Wasei-eigo).
    pub is_wasei: bool,
}
289
290///A particular translation or explanation for a Japanese word or phrase in a different language.
291///
292///Within an [Entry], glosses appear in the [Sense].
293#[derive(Clone, Copy, Debug, PartialEq, Eq)]
294pub struct Gloss {
295 pub language: GlossLanguage,
296 pub text: &'static str,
297 pub gloss_type: GlossType,
298}
299
///Declaring `pub type KanjiElements = Range<KanjiElement, N>` etc. is not possible since
///Range<T, N> is private to the crate. This macro therefore generates, for each public iterator
///type, a newtype around Range<T, N> that forwards all iterator methods to the wrapped range.
macro_rules! wrap_iterator {
    ($item: ty, $n: literal, $wrapper: ident) => {
        ///An iterator providing fast access to objects in the database. Instances of this iterator
        ///can be copied cheaply.
        #[derive(Debug, Clone, Copy)]
        pub struct $wrapper(Range<$item, $n>);

        impl ::std::convert::From<Range<$item, $n>> for $wrapper {
            fn from(inner: Range<$item, $n>) -> Self {
                Self(inner)
            }
        }

        impl ::std::iter::Iterator for $wrapper {
            type Item = $item;

            fn next(&mut self) -> Option<$item> {
                self.0.next()
            }

            fn size_hint(&self) -> (usize, Option<usize>) {
                self.0.size_hint()
            }
        }

        impl ::std::iter::ExactSizeIterator for $wrapper {
            fn len(&self) -> usize {
                self.0.len()
            }
        }
    };
}
334
//The public iterator types of this crate, one per kind of object that can be listed. Each
//invocation names the item type, the second generic argument of the wrapped Range<T, N>, and the
//identifier of the generated wrapper type.
//NOTE(review): the numeric argument presumably describes how much space one item occupies in the
//embedded payload - confirm against the Range<T, N> definition in the payload module.
wrap_iterator!(KanjiElement, 5, KanjiElements);
wrap_iterator!(KanjiInfo, 1, KanjiInfos);
wrap_iterator!(ReadingElement, 5, ReadingElements);
wrap_iterator!(ReadingInfo, 1, ReadingInfos);
wrap_iterator!(Sense, 5, Senses);
wrap_iterator!(&'static str, 2, Strings);
wrap_iterator!(PartOfSpeech, 1, PartsOfSpeech);
wrap_iterator!(SenseTopic, 1, SenseTopics);
wrap_iterator!(SenseInfo, 1, SenseInfos);
wrap_iterator!(LoanwordSource, 4, LoanwordSources);
wrap_iterator!(Dialect, 1, Dialects);
wrap_iterator!(Gloss, 2, Glosses);
347
///An iterator providing fast access to all entries in the database. Instances of this iterator
///can be copied cheaply. Use [entries()](entries) to obtain one.
#[derive(Clone, Copy, Debug)]
pub struct Entries {
    //This iterator is very similar to Range<T, N>, but cannot be implemented in terms of it
    //because it iterates over ALL_ENTRY_OFFSETS instead of ALL_DATA.
    //
    //Index of the next entry that will be yielded.
    start: usize,
    //Index one past the last entry that will be yielded.
    end: usize,
}
357
358impl Entries {
359 fn new() -> Self {
360 Self {
361 start: 0,
362 end: entry_count(),
363 }
364 }
365}
366
367impl std::iter::Iterator for Entries {
368 type Item = Entry;
369
370 fn next(&mut self) -> Option<Self::Item> {
371 if self.start < self.end {
372 let entry = get_entry(self.start);
373 self.start += 1;
374 Some(entry)
375 } else {
376 None
377 }
378 }
379
380 fn size_hint(&self) -> (usize, Option<usize>) {
381 let count = self.end - self.start;
382 (count, Some(count))
383 }
384}
385
386impl std::iter::ExactSizeIterator for Entries {
387 fn len(&self) -> usize {
388 self.end - self.start
389 }
390}