wiktionary_dump_parser/parser/words/
mod.rs1use lazy_static::lazy_static;
2use regex::Regex;
3use std::future::Future;
4use wikitext_parser::Section;
5
6use crate::error::{Error, Result};
7use crate::parser::Wikitext;
8
9lazy_static! {
10 static ref IGNORED_PATTERN: Regex =
11 Regex::new("(Wiktionary:|Appendix:|Help:|Rhymes:|Template:|MediaWiki:|Citations:|Module:|Reconstruction:|Thesaurus:|Concordance:).*|.*(/derived terms)").unwrap();
12 static ref WORD_TYPE_PATTERN: Regex =
13 Regex::new("Word|Noun|Proper noun|Dependent noun|Prenoun|Participle|Gerund(ive)?|Verb|Preverb|Predicative|Conjugation|Adjective|Comparative-only adjectives|Determinative|Adverb|Adnominal|Inflection|Pronoun|Preposition|Postposition|Ambiposition|Circumposition|Conjunction|Initial|Prefix|Suffix|Final|Affix|Infix|Interfix|Circumfix|Clitic|Article|Particle|Locative|Determiner|Classifier|Subordinate modifier|Contraction|Combining form|Compound part|Enclitic|Relative|Phrase|Propositional phrase|Proverb|Idiom|Honorific title|Ideophone|Phonogram|Onomatopoeia|Phoneme|Ligature|Syllable|Letter|Symbol|Counter|Number|Numeral|Multiple parts of speech|Punctuation mark|Diacritical mark|Root")
14 .unwrap();
15 static ref IGNORED_LANGUAGE_PATTERN: Regex = Regex::new("Translingual").unwrap();
16 static ref IGNORED_SUBSECTION_PATTERN: Regex = Regex::new("Variant spellings|Relational forms|Spelling variants|Other usage|Other versions|Possessed forms|Graphical notes|Design|Echo word|From|Description|Derived characters|Derived|Derivatives|Alternate spelling|Accentuation notes|Accentological notes|Usage|Citations?|Examples?|Sources|User notes?|Work to be done|Stem|Sign values|Reconstruction|Production|Logogram|Holonyms?|Meronyms|Forms?|Dialectal synonyms?|Decadents?|Abbreviations?|Borrowed terms?|External (L|l)inks?|Related words?|Standard form|Nom glyph origin|Readings?|Synonyms?|Antonyms?|Hyponyms?|Hypernyms?|Paronyms?|Translations?|Coordinate terms?|Dialectal variants?|Romanization|Statistics?|Declension|Alternative scripts?|Phrasal verbs?|Trivia|Han character|Hanzi|Glyph origin|Definitions?|Compounds?|Descendants?|Kanji|Hanja|Notes?|Derived (t|T)erms?|Usage notes|Alternative forms|Alternative|Etymology|Pronunciation( [1-9][0-9]*)?|Further reading|Anagrams|References?|Refs|Further references?|See ?(a|A)lso|Mutation|Interjection|Quotations|Gallery|Related (t|T)erms?").unwrap();
17}
18
/// A single word entry extracted from a page.
///
/// One value is produced per word-type heading found for a language, or one
/// value with the word type `"Unknown"` when the language section has no
/// subsections at all.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Word {
    /// The word itself, taken from the headline label of the page's root section.
    pub word: String,

    /// The headline label of the language section the word was found in.
    pub language_english_name: String,

    /// The headline label of the matching word-type section, or `"Unknown"`.
    pub word_type: String,
}
32
33pub async fn wikitext_to_words<
37 WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
38>(
39 title: &str,
40 wikitext: &Wikitext,
41 mut result_consumer: impl FnMut(Word) -> WordConsumerResult,
42 mut error_consumer: impl FnMut(Error),
43) -> Result<()> {
44 if IGNORED_PATTERN.is_match(title) {
45 return Ok(());
47 }
48
49 let root_section = &wikitext.root_section;
50
51 if root_section.headline.level == 1 {
52 let word = &root_section.headline.label;
53
54 for subsection in &root_section.subsections {
55 parse_language_subsection(word, subsection, &mut result_consumer, &mut error_consumer)
56 .await?;
57 }
58 } else {
59 error_consumer(Error::Other(
60 "Root section is not at headline level 1".to_string(),
61 ));
62 }
63
64 Ok(())
65}
66
67async fn parse_language_subsection<
68 WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
69>(
70 word: &str,
71 language_subsection: &Section,
72 result_consumer: &mut impl FnMut(Word) -> WordConsumerResult,
73 error_consumer: &mut impl FnMut(Error),
74) -> Result<()> {
75 let language_english_name = language_subsection.headline.label.as_str();
76 if IGNORED_LANGUAGE_PATTERN.is_match(language_english_name) {
77 return Ok(());
79 }
80
81 if language_subsection.subsections.is_empty() {
82 result_consumer(Word {
83 word: word.to_string(),
84 language_english_name: language_english_name.to_string(),
85 word_type: "Unknown".to_string(),
86 })
87 .await
88 .map_err(|error| Error::WordConsumer { source: error })?;
89 } else {
90 let mut toplevel_details = false;
91 let mut bottomlevel_details = false;
92 let mut bottomlevel_errors = Vec::new();
93
94 for unknown_subsection in &language_subsection.subsections {
95 if unknown_subsection.headline.label == "Etymology"
96 || WORD_TYPE_PATTERN.is_match(&unknown_subsection.headline.label)
97 {
98 toplevel_details = true;
99 } else if unknown_subsection.headline.label != "Etymology"
100 && unknown_subsection.headline.label.starts_with("Etymology")
101 {
102 bottomlevel_details = true;
103 parse_details_subsection(
104 word,
105 language_english_name,
106 unknown_subsection,
107 result_consumer,
108 error_consumer,
109 )
110 .await?;
111 } else if IGNORED_SUBSECTION_PATTERN.is_match(&unknown_subsection.headline.label) {
112 } else {
114 bottomlevel_errors.push(Error::Other(format!(
115 "Unknown subsection of language: {}",
116 unknown_subsection.headline.label
117 )));
118 }
119 }
120
121 if toplevel_details {
122 parse_details_subsection(
123 word,
124 language_english_name,
125 language_subsection,
126 result_consumer,
127 error_consumer,
128 )
129 .await?;
130 }
131
132 if toplevel_details && bottomlevel_details {
133 error_consumer(Error::Other(format!(
134 "Found both toplevel and bottomlevel details for language {language_english_name}"
135 )));
136 }
137
138 if bottomlevel_details {
139 for error in bottomlevel_errors {
140 error_consumer(error);
141 }
142 }
143 }
144
145 Ok(())
146}
147
148async fn parse_details_subsection<
149 WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
150>(
151 word: &str,
152 language_english_name: &str,
153 details_subsection: &Section,
154 result_consumer: &mut impl FnMut(Word) -> WordConsumerResult,
155 error_consumer: &mut impl FnMut(Error),
156) -> Result<()> {
157 for details_section in &details_subsection.subsections {
158 let word_type = &details_section.headline.label;
159 if WORD_TYPE_PATTERN.is_match(word_type) {
160 result_consumer(Word {
161 word: word.to_string(),
162 language_english_name: language_english_name.to_string(),
163 word_type: word_type.clone(),
164 })
165 .await
166 .map_err(|error| Error::WordConsumer { source: error })?;
167 } else if IGNORED_SUBSECTION_PATTERN.is_match(word_type) {
168 } else {
170 error_consumer(Error::Other(format!(
171 "Unknown details subsection: {word_type}"
172 )));
173 }
174 }
175
176 Ok(())
177}