wiktionary_dump_parser/parser/words/
mod.rs

1use lazy_static::lazy_static;
2use regex::Regex;
3use std::future::Future;
4use wikitext_parser::Section;
5
6use crate::error::{Error, Result};
7use crate::parser::Wikitext;
8
9lazy_static! {
10    static ref IGNORED_PATTERN: Regex =
11        Regex::new("(Wiktionary:|Appendix:|Help:|Rhymes:|Template:|MediaWiki:|Citations:|Module:|Reconstruction:|Thesaurus:|Concordance:).*|.*(/derived terms)").unwrap();
12    static ref WORD_TYPE_PATTERN: Regex =
13        Regex::new("Word|Noun|Proper noun|Dependent noun|Prenoun|Participle|Gerund(ive)?|Verb|Preverb|Predicative|Conjugation|Adjective|Comparative-only adjectives|Determinative|Adverb|Adnominal|Inflection|Pronoun|Preposition|Postposition|Ambiposition|Circumposition|Conjunction|Initial|Prefix|Suffix|Final|Affix|Infix|Interfix|Circumfix|Clitic|Article|Particle|Locative|Determiner|Classifier|Subordinate modifier|Contraction|Combining form|Compound part|Enclitic|Relative|Phrase|Propositional phrase|Proverb|Idiom|Honorific title|Ideophone|Phonogram|Onomatopoeia|Phoneme|Ligature|Syllable|Letter|Symbol|Counter|Number|Numeral|Multiple parts of speech|Punctuation mark|Diacritical mark|Root")
14            .unwrap();
15    static ref IGNORED_LANGUAGE_PATTERN: Regex = Regex::new("Translingual").unwrap();
16    static ref IGNORED_SUBSECTION_PATTERN: Regex = Regex::new("Variant spellings|Relational forms|Spelling variants|Other usage|Other versions|Possessed forms|Graphical notes|Design|Echo word|From|Description|Derived characters|Derived|Derivatives|Alternate spelling|Accentuation notes|Accentological notes|Usage|Citations?|Examples?|Sources|User notes?|Work to be done|Stem|Sign values|Reconstruction|Production|Logogram|Holonyms?|Meronyms|Forms?|Dialectal synonyms?|Decadents?|Abbreviations?|Borrowed terms?|External (L|l)inks?|Related words?|Standard form|Nom glyph origin|Readings?|Synonyms?|Antonyms?|Hyponyms?|Hypernyms?|Paronyms?|Translations?|Coordinate terms?|Dialectal variants?|Romanization|Statistics?|Declension|Alternative scripts?|Phrasal verbs?|Trivia|Han character|Hanzi|Glyph origin|Definitions?|Compounds?|Descendants?|Kanji|Hanja|Notes?|Derived (t|T)erms?|Usage notes|Alternative forms|Alternative|Etymology|Pronunciation( [1-9][0-9]*)?|Further reading|Anagrams|References?|Refs|Further references?|See ?(a|A)lso|Mutation|Interjection|Quotations|Gallery|Related (t|T)erms?").unwrap();
17}
18
/// A single dictionary entry extracted from a wiktionary page.
///
/// One page can yield several `Word`s: one per combination of language and word type.
/// The common traits are derived so that consumers can log, copy, compare and
/// deduplicate (e.g. in a `HashSet`) the extracted words.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Word {
    /// The word itself.
    /// Multiple `Word`s may have the same `word` if they are of a different language or type.
    pub word: String,

    /// The English name of the language this word is from.
    /// While different languages may contain the same words, there will be a separate word
    /// instance for each.
    pub language_english_name: String,

    /// The word type, as declared by wiktionary.
    /// While a word may have multiple types, there will be a separate word instance for each.
    pub word_type: String,
}
32
33/// Extract words from a wiktionary page.
34/// Errors while extracting are handed to `error_consumer`,
35/// while errors while consuming results are returned.
36pub async fn wikitext_to_words<
37    WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
38>(
39    title: &str,
40    wikitext: &Wikitext,
41    mut result_consumer: impl FnMut(Word) -> WordConsumerResult,
42    mut error_consumer: impl FnMut(Error),
43) -> Result<()> {
44    if IGNORED_PATTERN.is_match(title) {
45        // silently ignore non-words
46        return Ok(());
47    }
48
49    let root_section = &wikitext.root_section;
50
51    if root_section.headline.level == 1 {
52        let word = &root_section.headline.label;
53
54        for subsection in &root_section.subsections {
55            parse_language_subsection(word, subsection, &mut result_consumer, &mut error_consumer)
56                .await?;
57        }
58    } else {
59        error_consumer(Error::Other(
60            "Root section is not at headline level 1".to_string(),
61        ));
62    }
63
64    Ok(())
65}
66
67async fn parse_language_subsection<
68    WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
69>(
70    word: &str,
71    language_subsection: &Section,
72    result_consumer: &mut impl FnMut(Word) -> WordConsumerResult,
73    error_consumer: &mut impl FnMut(Error),
74) -> Result<()> {
75    let language_english_name = language_subsection.headline.label.as_str();
76    if IGNORED_LANGUAGE_PATTERN.is_match(language_english_name) {
77        // silently ignore high-level metalanguages
78        return Ok(());
79    }
80
81    if language_subsection.subsections.is_empty() {
82        result_consumer(Word {
83            word: word.to_string(),
84            language_english_name: language_english_name.to_string(),
85            word_type: "Unknown".to_string(),
86        })
87        .await
88        .map_err(|error| Error::WordConsumer { source: error })?;
89    } else {
90        let mut toplevel_details = false;
91        let mut bottomlevel_details = false;
92        let mut bottomlevel_errors = Vec::new();
93
94        for unknown_subsection in &language_subsection.subsections {
95            if unknown_subsection.headline.label == "Etymology"
96                || WORD_TYPE_PATTERN.is_match(&unknown_subsection.headline.label)
97            {
98                toplevel_details = true;
99            } else if unknown_subsection.headline.label != "Etymology"
100                && unknown_subsection.headline.label.starts_with("Etymology")
101            {
102                bottomlevel_details = true;
103                parse_details_subsection(
104                    word,
105                    language_english_name,
106                    unknown_subsection,
107                    result_consumer,
108                    error_consumer,
109                )
110                .await?;
111            } else if IGNORED_SUBSECTION_PATTERN.is_match(&unknown_subsection.headline.label) {
112                // ignore
113            } else {
114                bottomlevel_errors.push(Error::Other(format!(
115                    "Unknown subsection of language: {}",
116                    unknown_subsection.headline.label
117                )));
118            }
119        }
120
121        if toplevel_details {
122            parse_details_subsection(
123                word,
124                language_english_name,
125                language_subsection,
126                result_consumer,
127                error_consumer,
128            )
129            .await?;
130        }
131
132        if toplevel_details && bottomlevel_details {
133            error_consumer(Error::Other(format!(
134                "Found both toplevel and bottomlevel details for language {language_english_name}"
135            )));
136        }
137
138        if bottomlevel_details {
139            for error in bottomlevel_errors {
140                error_consumer(error);
141            }
142        }
143    }
144
145    Ok(())
146}
147
148async fn parse_details_subsection<
149    WordConsumerResult: Future<Output = std::result::Result<(), Box<dyn std::error::Error + Send + Sync>>>,
150>(
151    word: &str,
152    language_english_name: &str,
153    details_subsection: &Section,
154    result_consumer: &mut impl FnMut(Word) -> WordConsumerResult,
155    error_consumer: &mut impl FnMut(Error),
156) -> Result<()> {
157    for details_section in &details_subsection.subsections {
158        let word_type = &details_section.headline.label;
159        if WORD_TYPE_PATTERN.is_match(word_type) {
160            result_consumer(Word {
161                word: word.to_string(),
162                language_english_name: language_english_name.to_string(),
163                word_type: word_type.clone(),
164            })
165            .await
166            .map_err(|error| Error::WordConsumer { source: error })?;
167        } else if IGNORED_SUBSECTION_PATTERN.is_match(word_type) {
168            // ignore
169        } else {
170            error_consumer(Error::Other(format!(
171                "Unknown details subsection: {word_type}"
172            )));
173        }
174    }
175
176    Ok(())
177}
wiktionary_dump_parser/parser/words/mod.rs

wiktionary_dump_parser/parser/words/
mod.rs