harper_core/
word_metadata.rs

1use harper_brill::UPOS;
2use is_macro::Is;
3use itertools::Itertools;
4use paste::paste;
5use serde::{Deserialize, Serialize};
6use strum::{EnumCount, VariantArray};
7use strum_macros::{Display, EnumCount, EnumString, VariantArray};
8
9use std::convert::TryFrom;
10
11use crate::{Document, TokenKind, TokenStringExt, WordId};
12
/// Everything known about a single word: which parts of speech it can act as, plus a handful of
/// auxiliary flags.
///
/// Each `Option<...Data>` part-of-speech field is `Some` when the word can act as that part of
/// speech and `None` otherwise (the generated `is_<category>()` queries test exactly that).
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Hash)]
pub struct WordMetadata {
    /// Noun-specific data; `Some` if the word can be a noun.
    pub noun: Option<NounData>,
    /// Pronoun-specific data; `Some` if the word can be a pronoun.
    pub pronoun: Option<PronounData>,
    /// Verb-specific data; `Some` if the word can be a verb.
    pub verb: Option<VerbData>,
    /// Adjective-specific data; `Some` if the word can be an adjective.
    pub adjective: Option<AdjectiveData>,
    /// Adverb-specific data; `Some` if the word can be an adverb.
    pub adverb: Option<AdverbData>,
    /// Conjunction-specific data; `Some` if the word can be a conjunction.
    pub conjunction: Option<ConjunctionData>,
    /// Whether the word is a swear. Only `Some(true)` is treated as "definitely a swear" by
    /// `is_swear`.
    pub swear: Option<bool>,
    /// The dialects this word belongs to.
    /// If no dialects are defined, it can be assumed that the word is
    /// valid in all dialects of English.
    #[serde(default = "default_default")]
    pub dialects: DialectFlags,
    /// Determiner-specific data; `Some` if the word can be a
    /// [determiner](https://en.wikipedia.org/wiki/English_determiners).
    pub determiner: Option<DeterminerData>,
    /// Whether the word is a [preposition](https://www.merriam-webster.com/dictionary/preposition).
    #[serde(default = "default_false")]
    pub preposition: bool,
    /// Whether the word is considered especially common.
    #[serde(default = "default_false")]
    pub common: bool,
    /// NOTE(review): presumably the id of the word this entry was derived from; confirm against
    /// the dictionary-building code.
    #[serde(default = "default_none")]
    pub derived_from: Option<WordId>,
    /// Generated by a chunker
    pub np_member: Option<bool>,
    /// Generated by a POS tagger
    pub pos_tag: Option<UPOS>,
}
42
/// Serde default hook: a boolean field that is absent from the input deserializes as `false`.
///
/// Referenced by name from `#[serde(default = "default_false")]`.
fn default_false() -> bool {
    bool::default()
}
47
/// Serde default hook: an optional field that is absent from the input deserializes as `None`.
///
/// Referenced by name from `#[serde(default = "default_none")]`.
fn default_none<T>() -> Option<T> {
    Option::<T>::None
}
52
/// Serde default hook: a field that is absent from the input deserializes as its type's
/// [`Default`] value.
///
/// Referenced by name from `#[serde(default = "default_default")]`.
fn default_default<T>() -> T
where
    T: Default,
{
    Default::default()
}
57
// Generates the part-of-speech query methods for `WordMetadata`.
//
// Input is a `.`-separated list of clauses of the form `<category> has <sub>, <sub>, ...`
// (the list of subs may be empty). `<category>` must name an `Option<...Data>` field of
// `WordMetadata` whose data struct is called `<Category>Data`, and every `<sub>` must
// correspond to an `is_<sub>: Option<bool>` field of that data struct.
//
// Generated methods:
// - `is_likely_homograph()`
// - `is_<category>()` for every category
// - `is_<sub>_<category>()` and `is_non_<sub>_<category>()` for every sub of every category
macro_rules! generate_metadata_queries {
    ($($category:ident has $($sub:ident),*).*) => {
        paste! {
            /// Checks if the word belongs to more than one part-of-speech category, which makes
            /// it likely to be a homograph.
            pub fn is_likely_homograph(&self) -> bool {
                // `preposition` is a plain `bool` field, so it is listed by hand. Every other
                // part of speech (including `determiner`) is supplied as a `$category` and must
                // be counted exactly once; listing `is_determiner()` here as well would count
                // determiners twice and flag every determiner as a homograph.
                [self.preposition, $(
                    self.[< is_ $category >](),
                )*].iter().filter(|is_member| **is_member).count() > 1
            }

            $(
                #[doc = concat!("Checks if the word is definitely a ", stringify!($category), ".")]
                pub fn [< is_ $category >](&self) -> bool {
                    self.$category.is_some()
                }

                $(
                    #[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as (a) ", stringify!($sub), ".")]
                    pub fn [< is_ $sub _ $category >](&self) -> bool {
                        matches!(
                            self.$category,
                            Some([< $category:camel Data >]{
                                [< is_ $sub >]: Some(true),
                                ..
                            })
                        )
                    }

                    #[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as __not__ (a) ", stringify!($sub), ".")]
                    pub fn [< is_non_ $sub _ $category >](&self) -> bool {
                        // An unmarked (`None`) property is treated the same as `Some(false)`.
                        matches!(
                            self.$category,
                            Some([< $category:camel Data >]{
                                [< is_ $sub >]: None | Some(false),
                                ..
                            })
                        )
                    }
                )*
            )*
        }
    };
}
99
100impl WordMetadata {
101    /// Produce a copy of `self` with the known properties of `other` set.
102    pub fn or(&self, other: &Self) -> Self {
103        macro_rules! merge {
104            ($a:expr, $b:expr) => {
105                match ($a, $b) {
106                    (Some(a), Some(b)) => Some(a.or(&b)),
107                    (Some(a), None) => Some(a),
108                    (None, Some(b)) => Some(b),
109                    (None, None) => None,
110                }
111            };
112        }
113
114        Self {
115            noun: merge!(self.noun, other.noun),
116            pronoun: merge!(self.pronoun, other.pronoun),
117            verb: merge!(self.verb, other.verb),
118            adjective: merge!(self.adjective, other.adjective),
119            adverb: merge!(self.adverb, other.adverb),
120            conjunction: merge!(self.conjunction, other.conjunction),
121            dialects: self.dialects | other.dialects,
122            swear: self.swear.or(other.swear),
123            determiner: merge!(self.determiner, other.determiner),
124            preposition: self.preposition || other.preposition,
125            common: self.common || other.common,
126            derived_from: self.derived_from.or(other.derived_from),
127            pos_tag: self.pos_tag.or(other.pos_tag),
128            np_member: self.np_member.or(other.np_member),
129        }
130    }
131
132    /// Given a UPOS tag, discard any metadata that would disagree with the given POS tag.
133    /// For example, if the metadata suggests a word could either be a noun or an adjective, and we
134    /// provide a [`UPOS::NOUN`], this function will remove the adjective data.
135    ///
136    /// Additionally, if the metadata does not currently declare the potential of the word to be
137    /// the specific POS, it becomes so. That means if we provide a [`UPOS::ADJ`] to the function
138    /// for a metadata whose `Self::adjective = None`, it will become `Some`.
139    pub fn enforce_pos_exclusivity(&mut self, pos: &UPOS) {
140        use UPOS::*;
141        match pos {
142            NOUN => {
143                if let Some(noun) = self.noun {
144                    self.noun = Some(NounData {
145                        is_proper: Some(false),
146                        ..noun
147                    })
148                } else {
149                    self.noun = Some(NounData {
150                        is_proper: Some(false),
151                        is_singular: None,
152                        is_plural: None,
153                        is_countable: None,
154                        is_mass: None,
155                        is_possessive: None,
156                    })
157                }
158
159                self.pronoun = None;
160                self.verb = None;
161                self.adjective = None;
162                self.adverb = None;
163                self.conjunction = None;
164                self.determiner = None;
165                self.preposition = false;
166            }
167            PROPN => {
168                if let Some(noun) = self.noun {
169                    self.noun = Some(NounData {
170                        is_proper: Some(true),
171                        ..noun
172                    })
173                } else {
174                    self.noun = Some(NounData {
175                        is_proper: Some(true),
176                        is_singular: None,
177                        is_plural: None,
178                        is_countable: None,
179                        is_mass: None,
180                        is_possessive: None,
181                    })
182                }
183
184                self.pronoun = None;
185                self.verb = None;
186                self.adjective = None;
187                self.adverb = None;
188                self.conjunction = None;
189                self.determiner = None;
190                self.preposition = false;
191            }
192            PRON => {
193                if self.pronoun.is_none() {
194                    self.pronoun = Some(PronounData::default())
195                }
196
197                self.noun = None;
198                self.verb = None;
199                self.adjective = None;
200                self.adverb = None;
201                self.conjunction = None;
202                self.determiner = None;
203                self.preposition = false;
204            }
205            VERB => {
206                if let Some(verb) = self.verb {
207                    self.verb = Some(VerbData {
208                        is_auxiliary: Some(false),
209                        ..verb
210                    })
211                } else {
212                    self.verb = Some(VerbData {
213                        is_auxiliary: Some(false),
214                        ..Default::default()
215                    })
216                }
217
218                self.noun = None;
219                self.pronoun = None;
220                self.adjective = None;
221                self.adverb = None;
222                self.conjunction = None;
223                self.determiner = None;
224                self.preposition = false;
225            }
226            AUX => {
227                if let Some(verb) = self.verb {
228                    self.verb = Some(VerbData {
229                        is_auxiliary: Some(true),
230                        ..verb
231                    })
232                } else {
233                    self.verb = Some(VerbData {
234                        is_auxiliary: Some(true),
235                        ..Default::default()
236                    })
237                }
238
239                self.noun = None;
240                self.pronoun = None;
241                self.adjective = None;
242                self.adverb = None;
243                self.conjunction = None;
244                self.determiner = None;
245                self.preposition = false;
246            }
247            ADJ => {
248                if self.adjective.is_none() {
249                    self.adjective = Some(AdjectiveData::default())
250                }
251
252                self.noun = None;
253                self.pronoun = None;
254                self.verb = None;
255                self.adverb = None;
256                self.conjunction = None;
257                self.determiner = None;
258                self.preposition = false;
259            }
260            ADV => {
261                if self.adverb.is_none() {
262                    self.adverb = Some(AdverbData::default())
263                }
264
265                self.noun = None;
266                self.pronoun = None;
267                self.verb = None;
268                self.adjective = None;
269                self.conjunction = None;
270                self.determiner = None;
271                self.preposition = false;
272            }
273            ADP => {
274                self.noun = None;
275                self.pronoun = None;
276                self.verb = None;
277                self.adjective = None;
278                self.adverb = None;
279                self.conjunction = None;
280                self.determiner = None;
281                self.preposition = true;
282            }
283            DET => {
284                self.noun = None;
285                self.pronoun = None;
286                self.verb = None;
287                self.adjective = None;
288                self.adverb = None;
289                self.conjunction = None;
290                self.preposition = false;
291                self.determiner = Some(DeterminerData::default());
292            }
293            CCONJ | SCONJ => {
294                if self.conjunction.is_none() {
295                    self.conjunction = Some(ConjunctionData::default())
296                }
297
298                self.noun = None;
299                self.pronoun = None;
300                self.verb = None;
301                self.adjective = None;
302                self.adverb = None;
303                self.determiner = None;
304                self.preposition = false;
305            }
306            _ => {}
307        }
308    }
309
310    generate_metadata_queries!(
311        // Singular and countable default to true, so their metadata queries are not generated.
312        noun has proper, plural, mass, possessive.
313        pronoun has personal, singular, plural, possessive, reflexive, subject, object.
314        determiner has demonstrative, possessive.
315        verb has linking, auxiliary.
316        conjunction has.
317        adjective has.
318        adverb has
319    );
320
321    // Manual metadata queries
322
323    // Pronoun metadata queries
324
325    pub fn is_first_person_plural_pronoun(&self) -> bool {
326        matches!(
327            self.pronoun,
328            Some(PronounData {
329                person: Some(Person::First),
330                is_plural: Some(true),
331                ..
332            })
333        )
334    }
335
336    pub fn is_first_person_singular_pronoun(&self) -> bool {
337        matches!(
338            self.pronoun,
339            Some(PronounData {
340                person: Some(Person::First),
341                is_singular: Some(true),
342                ..
343            })
344        )
345    }
346
347    pub fn is_third_person_plural_pronoun(&self) -> bool {
348        matches!(
349            self.pronoun,
350            Some(PronounData {
351                person: Some(Person::Third),
352                is_plural: Some(true),
353                ..
354            })
355        )
356    }
357
358    pub fn is_third_person_singular_pronoun(&self) -> bool {
359        matches!(
360            self.pronoun,
361            Some(PronounData {
362                person: Some(Person::Third),
363                is_singular: Some(true),
364                ..
365            })
366        )
367    }
368
369    pub fn is_third_person_pronoun(&self) -> bool {
370        matches!(
371            self.pronoun,
372            Some(PronounData {
373                person: Some(Person::Third),
374                ..
375            })
376        )
377    }
378
379    pub fn is_second_person_pronoun(&self) -> bool {
380        matches!(
381            self.pronoun,
382            Some(PronounData {
383                person: Some(Person::Second),
384                ..
385            })
386        )
387    }
388
389    pub fn is_verb_lemma(&self) -> bool {
390        matches!(
391            self.verb,
392            Some(VerbData {
393                verb_form: Some(VerbForm::LemmaForm),
394                ..
395            })
396        )
397    }
398
399    pub fn is_verb_past_form(&self) -> bool {
400        matches!(
401            self.verb,
402            Some(VerbData {
403                verb_form: Some(VerbForm::PastForm),
404                ..
405            })
406        )
407    }
408
409    pub fn is_verb_progressive_form(&self) -> bool {
410        matches!(
411            self.verb,
412            Some(VerbData {
413                verb_form: Some(VerbForm::ProgressiveForm),
414                ..
415            })
416        )
417    }
418
419    pub fn is_verb_third_person_singular_present_form(&self) -> bool {
420        matches!(
421            self.verb,
422            Some(VerbData {
423                verb_form: Some(VerbForm::ThirdPersonSingularPresentForm),
424                ..
425            })
426        )
427    }
428
429    // Noun metadata queries
430
431    // Singular is default if number is not marked in the dictionary.
432    pub fn is_singular_noun(&self) -> bool {
433        if let Some(noun) = self.noun {
434            matches!(
435                (noun.is_singular, noun.is_plural),
436                (Some(true), _) | (None | Some(false), None | Some(false))
437            )
438        } else {
439            false
440        }
441    }
442    pub fn is_non_singular_noun(&self) -> bool {
443        if let Some(noun) = self.noun {
444            !matches!(
445                (noun.is_singular, noun.is_plural),
446                (Some(true), _) | (None | Some(false), None | Some(false))
447            )
448        } else {
449            false
450        }
451    }
452
453    // Countable is default if countability is not marked in the dictionary.
454    pub fn is_countable_noun(&self) -> bool {
455        if let Some(noun) = self.noun {
456            matches!(
457                (noun.is_singular, noun.is_plural),
458                (Some(true), _) | (None | Some(false), None | Some(false))
459            )
460        } else {
461            false
462        }
463    }
464    pub fn is_non_countable_noun(&self) -> bool {
465        if let Some(noun) = self.noun {
466            !matches!(
467                (noun.is_countable, noun.is_mass),
468                (Some(true), _) | (None | Some(false), None | Some(false))
469            )
470        } else {
471            false
472        }
473    }
474
475    // Nominal metadata queries (noun + pronoun)
476
477    /// Checks if the word is definitely nominal.
478    pub fn is_nominal(&self) -> bool {
479        self.is_noun() || self.is_pronoun()
480    }
481
482    /// Checks if the word is definitely a nominal and more specifically is labeled as (a) singular.
483    pub fn is_singular_nominal(&self) -> bool {
484        self.is_singular_noun() || self.is_singular_pronoun()
485    }
486
487    /// Checks if the word is definitely a nominal and more specifically is labeled as (a) plural.
488    pub fn is_plural_nominal(&self) -> bool {
489        self.is_plural_noun() || self.is_plural_pronoun()
490    }
491
492    /// Checks if the word is definitely a nominal and more specifically is labeled as (a) possessive.
493    pub fn is_possessive_nominal(&self) -> bool {
494        self.is_possessive_noun() || self.is_possessive_pronoun()
495    }
496
497    /// Checks if the word is definitely a nominal and more specifically is labeled as __not__ (a) singular.
498    pub fn is_non_singular_nominal(&self) -> bool {
499        self.is_non_singular_noun() || self.is_non_singular_pronoun()
500    }
501
502    /// Checks if the word is definitely a nominal and more specifically is labeled as __not__ (a) plural.
503    pub fn is_non_plural_nominal(&self) -> bool {
504        self.is_non_plural_noun() || self.is_non_plural_pronoun()
505    }
506
507    /// Checks if the word is definitely a nominal and more specifically is labeled as __not__ (a) possessive.
508    pub fn is_non_possessive_nominal(&self) -> bool {
509        self.is_non_possessive_noun() || self.is_non_possessive_pronoun()
510    }
511
512    /// Checks whether a word is _definitely_ a swear.
513    pub fn is_swear(&self) -> bool {
514        matches!(self.swear, Some(true))
515    }
516
517    /// Same thing as [`Self::or`], except in-place rather than a clone.
518    pub fn append(&mut self, other: &Self) -> &mut Self {
519        *self = self.or(other);
520        self
521    }
522}
523
// These verb forms are morphological variations, distinct from TAM (Tense-Aspect-Mood)
// Each form can be used in various TAM combinations:
// - Lemma form (infinitive, citation form, dictionary form)
//   Used in infinitives (e.g., "to sleep"), imperatives (e.g., "sleep!"), and with modals (e.g., "will sleep")
// - Past form (past participle and simple past)
//   Used as verbs (e.g., "slept") or adjectives (e.g., "closed door")
// - Progressive form (present participle and gerund)
//   Used as verbs (e.g., "sleeping"), nouns (e.g., "sleeping is important"), or adjectives (e.g., "sleeping dog")
// - Third person singular present (-s/-es)
//   Used for third person singular subjects (e.g., "he sleeps", "she reads")
//
// Important notes:
// 1. English expresses time through auxiliary verbs, not verb form alone
// 2. Irregular verbs can have different forms for past participle and simple past
// 3. Future is always expressed through auxiliary verbs (e.g., "will sleep", "going to sleep")
/// The morphological form of a verb.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Is, Hash)]
pub enum VerbForm {
    /// The infinitive / citation / dictionary form (e.g. "sleep").
    LemmaForm,
    /// The past participle and simple past form (e.g. "slept").
    PastForm,
    /// The present participle and gerund form (e.g. "sleeping").
    ProgressiveForm,
    /// The -s/-es form used with third person singular subjects (e.g. "sleeps").
    ThirdPersonSingularPresentForm,
}
546
/// Verb-specific properties of a word. Every property is `None` when it is not known.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct VerbData {
    /// Whether the verb is labeled as a linking verb.
    pub is_linking: Option<bool>,
    /// Whether the verb is labeled as an auxiliary verb.
    pub is_auxiliary: Option<bool>,
    /// The morphological form of the verb, if known.
    pub verb_form: Option<VerbForm>,
}
553
554impl VerbData {
555    /// Produce a copy of `self` with the known properties of `other` set.
556    pub fn or(&self, other: &Self) -> Self {
557        Self {
558            is_linking: self.is_linking.or(other.is_linking),
559            is_auxiliary: self.is_auxiliary.or(other.is_auxiliary),
560            verb_form: self.verb_form.or(other.verb_form),
561        }
562    }
563}
564
// nouns can be both singular and plural: "aircraft", "biceps", "fish", "sheep"
// TODO other noun properties may be worth adding, e.g. abstract
// (count vs mass is covered by `is_countable` / `is_mass` below)
/// Noun-specific properties of a word. Every property is `None` when it is not marked.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct NounData {
    /// Whether the noun is labeled as a proper noun.
    pub is_proper: Option<bool>,
    /// Whether the noun is labeled as singular.
    pub is_singular: Option<bool>,
    /// Whether the noun is labeled as plural.
    pub is_plural: Option<bool>,
    /// Whether the noun is labeled as countable.
    pub is_countable: Option<bool>,
    /// Whether the noun is labeled as a mass noun.
    pub is_mass: Option<bool>,
    /// Whether the noun is labeled as possessive.
    pub is_possessive: Option<bool>,
}
577
578impl NounData {
579    /// Produce a copy of `self` with the known properties of `other` set.
580    pub fn or(&self, other: &Self) -> Self {
581        Self {
582            is_proper: self.is_proper.or(other.is_proper),
583            is_singular: self.is_singular.or(other.is_singular),
584            is_plural: self.is_plural.or(other.is_plural),
585            is_countable: self.is_countable.or(other.is_countable),
586            is_mass: self.is_mass.or(other.is_mass),
587            is_possessive: self.is_possessive.or(other.is_possessive),
588        }
589    }
590}
591
// Person is a property of pronouns; the verb 'be', plus all verbs reflect 3rd person singular with -s
/// Grammatical person.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Is, Hash)]
pub enum Person {
    /// First person.
    First,
    /// Second person.
    Second,
    /// Third person.
    Third,
}
599
// TODO for now focused on personal pronouns?
/// Pronoun-specific properties of a word. Every property is `None` when it is not marked.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct PronounData {
    /// Whether the pronoun is labeled as a personal pronoun.
    pub is_personal: Option<bool>,
    /// Whether the pronoun is labeled as singular.
    pub is_singular: Option<bool>,
    /// Whether the pronoun is labeled as plural.
    pub is_plural: Option<bool>,
    /// Whether the pronoun is labeled as possessive.
    pub is_possessive: Option<bool>,
    /// Whether the pronoun is labeled as reflexive.
    pub is_reflexive: Option<bool>,
    /// The grammatical person of the pronoun, if known.
    pub person: Option<Person>,
    /// Whether the pronoun is labeled as a subject pronoun.
    pub is_subject: Option<bool>,
    /// Whether the pronoun is labeled as an object pronoun.
    pub is_object: Option<bool>,
}
612
613impl PronounData {
614    /// Produce a copy of `self` with the known properties of `other` set.
615    pub fn or(&self, other: &Self) -> Self {
616        Self {
617            is_personal: self.is_personal.or(other.is_personal),
618            is_singular: self.is_singular.or(other.is_singular),
619            is_plural: self.is_plural.or(other.is_plural),
620            is_possessive: self.is_possessive.or(other.is_possessive),
621            is_reflexive: self.is_reflexive.or(other.is_reflexive),
622            person: self.person.or(other.person),
623            is_subject: self.is_subject.or(other.is_subject),
624            is_object: self.is_object.or(other.is_object),
625        }
626    }
627}
628
/// Determiner-specific properties of a word. Every property is `None` when it is not marked.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct DeterminerData {
    /// Whether the determiner is labeled as demonstrative.
    pub is_demonstrative: Option<bool>,
    /// Whether the determiner is labeled as possessive.
    pub is_possessive: Option<bool>,
}
634
635impl DeterminerData {
636    /// Produce a copy of `self` with the known properties of `other` set.
637    pub fn or(&self, other: &Self) -> Self {
638        Self {
639            is_demonstrative: self.is_demonstrative.or(other.is_demonstrative),
640            is_possessive: self.is_possessive.or(other.is_possessive),
641        }
642    }
643}
644
// Degree is a property of adjectives: positive is not inflected
// Comparative is inflected with -er or comes after the word "more"
// Superlative is inflected with -est or comes after the word "most"
/// The degree of comparison of an adjective.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Is, Hash)]
pub enum Degree {
    /// The uninflected form.
    Positive,
    /// The form inflected with -er, or used after "more".
    Comparative,
    /// The form inflected with -est, or used after "most".
    Superlative,
}
654
// Some adjectives are not comparable so don't have -er or -est forms and can't be used with "more" or "most".
// Some adjectives can only be used "attributively" (before a noun); some only predicatively (after "is" etc.).
// In old grammars words like the articles and determiners are classified as adjectives but behave differently.
/// Adjective-specific properties of a word.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct AdjectiveData {
    /// The degree of comparison, or `None` when it is not known.
    pub degree: Option<Degree>,
}
662
663impl AdjectiveData {
664    /// Produce a copy of `self` with the known properties of `other` set.
665    pub fn or(&self, other: &Self) -> Self {
666        Self {
667            degree: self.degree.or(other.degree),
668        }
669    }
670}
671
// Adverb can be a "junk drawer" category for words which don't fit the other major categories.
// The typical adverbs are "adverbs of manner", those derived from adjectives in -ly
// other adverbs (time, place, etc) should probably not be considered adverbs for Harper's purposes
/// Adverb-specific properties of a word. It currently has no fields; its presence alone marks a
/// word as an adverb.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct AdverbData {}
677
678impl AdverbData {
679    /// Produce a copy of `self` with the known properties of `other` set.
680    pub fn or(&self, _other: &Self) -> Self {
681        Self {}
682    }
683}
684
/// Conjunction-specific properties of a word. It currently has no fields; its presence alone
/// marks a word as a conjunction.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct ConjunctionData {}
687
688impl ConjunctionData {
689    /// Produce a copy of `self` with the known properties of `other` set.
690    pub fn or(&self, _other: &Self) -> Self {
691        Self {}
692    }
693}
694
/// A regional dialect.
///
/// The discriminant of each variant is a distinct single bit, so a variant can be cast directly
/// into the matching `DialectFlags` bit.
#[derive(
    Debug,
    Clone,
    Copy,
    Serialize,
    Deserialize,
    PartialEq,
    PartialOrd,
    Eq,
    Hash,
    EnumCount,
    EnumString,
    Display,
    VariantArray,
)]
pub enum Dialect {
    // Note: these have bit-shifted values so that they can ergonomically integrate with
    // `DialectFlags`. Each value here must have a unique bit index inside
    // `DialectFlagsUnderlyingType`.
    /// American English (abbreviated "US").
    American = 1 << 0,
    /// Canadian English (abbreviated "CA").
    Canadian = 1 << 1,
    /// Australian English (abbreviated "AU").
    Australian = 1 << 2,
    /// British English (abbreviated "GB").
    British = 1 << 3,
}
720impl Dialect {
721    /// Tries to guess the dialect used in the document by finding which dialect is used the most.
722    /// Returns `None` if it fails to find a single dialect that is used the most.
723    #[must_use]
724    pub fn try_guess_from_document(document: &Document) -> Option<Self> {
725        Self::try_from(DialectFlags::get_most_used_dialects_from_document(document)).ok()
726    }
727
728    /// Tries to get a dialect from its abbreviation. Returns `None` if the abbreviation is not
729    /// recognized.
730    ///
731    /// # Examples
732    ///
733    /// ```
734    /// use harper_core::Dialect;
735    ///
736    /// let abbrs = ["US", "CA", "AU", "GB"];
737    /// let mut dialects = abbrs.iter().map(|abbr| Dialect::try_from_abbr(abbr));
738    ///
739    /// assert_eq!(Some(Dialect::American), dialects.next().unwrap()); // US
740    /// assert_eq!(Some(Dialect::Canadian), dialects.next().unwrap()); // CA
741    /// assert_eq!(Some(Dialect::Australian), dialects.next().unwrap()); // AU
742    /// assert_eq!(Some(Dialect::British), dialects.next().unwrap()); // GB
743    /// ```
744    #[must_use]
745    pub fn try_from_abbr(abbr: &str) -> Option<Self> {
746        match abbr {
747            "US" => Some(Self::American),
748            "CA" => Some(Self::Canadian),
749            "AU" => Some(Self::Australian),
750            "GB" => Some(Self::British),
751            _ => None,
752        }
753    }
754}
755impl TryFrom<DialectFlags> for Dialect {
756    type Error = ();
757
758    /// Attempts to convert `DialectFlags` to a single `Dialect`.
759    ///
760    /// # Errors
761    ///
762    /// Will return `Err` if more than one dialect is enabled or if an undefined dialect is
763    /// enabled.
764    fn try_from(dialect_flags: DialectFlags) -> Result<Self, Self::Error> {
765        // Ensure only one dialect is enabled before converting.
766        if dialect_flags.bits().count_ones() == 1 {
767            match dialect_flags {
768                df if df.is_dialect_enabled_strict(Dialect::American) => Ok(Dialect::American),
769                df if df.is_dialect_enabled_strict(Dialect::Canadian) => Ok(Dialect::Canadian),
770                df if df.is_dialect_enabled_strict(Dialect::Australian) => Ok(Dialect::Australian),
771                df if df.is_dialect_enabled_strict(Dialect::British) => Ok(Dialect::British),
772                _ => Err(()),
773            }
774        } else {
775            // More than one dialect enabled; can't soundly convert.
776            Err(())
777        }
778    }
779}
780
// The underlying integer type used for `DialectFlags`.
// This is a `u8`, which has room for 8 dialects (one bit each). If we want to define more than
// 8 dialects in the future, we will need to switch this to a larger type.
type DialectFlagsUnderlyingType = u8;
785
bitflags::bitflags! {
    /// A collection of bit flags used to represent enabled dialects.
    ///
    /// This is generally used to allow a word (or similar) to be tagged with multiple dialects.
    /// Each flag takes its bit value from the discriminant of the matching `Dialect` variant.
    #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash)]
    #[serde(transparent)]
    pub struct DialectFlags: DialectFlagsUnderlyingType {
        /// Flag for `Dialect::American`.
        const AMERICAN = Dialect::American as DialectFlagsUnderlyingType;
        /// Flag for `Dialect::Canadian`.
        const CANADIAN = Dialect::Canadian as DialectFlagsUnderlyingType;
        /// Flag for `Dialect::Australian`.
        const AUSTRALIAN = Dialect::Australian as DialectFlagsUnderlyingType;
        /// Flag for `Dialect::British`.
        const BRITISH = Dialect::British as DialectFlagsUnderlyingType;
    }
}
799impl DialectFlags {
800    /// Checks if the provided dialect is enabled.
801    /// If no dialect is explicitly enabled, it is assumed that all dialects are enabled.
802    #[must_use]
803    pub fn is_dialect_enabled(self, dialect: Dialect) -> bool {
804        self.is_empty() || self.intersects(Self::from_dialect(dialect))
805    }
806
807    /// Checks if the provided dialect is ***explicitly*** enabled.
808    ///
809    /// Unlike `is_dialect_enabled`, this will return false when no dialects are explicitly
810    /// enabled.
811    #[must_use]
812    pub fn is_dialect_enabled_strict(self, dialect: Dialect) -> bool {
813        self.intersects(Self::from_dialect(dialect))
814    }
815
816    /// Constructs a `DialectFlags` from the provided `Dialect`, with only that dialect being
817    /// enabled.
818    ///
819    /// # Panics
820    ///
821    /// This will panic if `dialect` represents a dialect that is not defined in
822    /// `DialectFlags`.
823    #[must_use]
824    pub fn from_dialect(dialect: Dialect) -> Self {
825        let Some(out) = Self::from_bits(dialect as DialectFlagsUnderlyingType) else {
826            panic!("The '{dialect}' dialect isn't defined in DialectFlags!");
827        };
828        out
829    }
830
831    /// Gets the most commonly used dialect(s) in the document.
832    ///
833    /// If multiple dialects are used equally often, they will all be enabled in the returned
834    /// `DialectFlags`. On the other hand, if there is a single dialect that is used the most, it
835    /// will be the only one enabled.
836    #[must_use]
837    pub fn get_most_used_dialects_from_document(document: &Document) -> Self {
838        // Initialize counters.
839        let mut dialect_counters: [(Dialect, usize); Dialect::COUNT] = Dialect::VARIANTS
840            .iter()
841            .map(|d| (*d, 0))
842            .collect_array()
843            .unwrap();
844
845        // Count word dialects.
846        document.iter_words().for_each(|w| {
847            if let TokenKind::Word(Some(word_metadata)) = &w.kind {
848                // If the token is a word, iterate though the dialects in `dialect_counters` and
849                // increment those counters where the word has the respective dialect enabled.
850                dialect_counters.iter_mut().for_each(|(dialect, count)| {
851                    if word_metadata.dialects.is_dialect_enabled(*dialect) {
852                        *count += 1;
853                    }
854                });
855            }
856        });
857
858        // Find max counter.
859        let max_counter = dialect_counters
860            .iter()
861            .map(|(_, count)| count)
862            .max()
863            .unwrap();
864        // Get and convert the collection of most used dialects into a `DialectFlags`.
865        dialect_counters
866            .into_iter()
867            .filter(|(_, count)| count == max_counter)
868            .fold(DialectFlags::empty(), |acc, dialect| {
869                // Fold most used dialects into `DialectFlags` via bitwise or.
870                acc | Self::from_dialect(dialect.0)
871            })
872    }
873}
874impl Default for DialectFlags {
875    /// A default value with no dialects explicitly enabled.
876    /// Implicitly, this state corresponds to all dialects being enabled.
877    fn default() -> Self {
878        Self::empty()
879    }
880}
881
882#[cfg(test)]
883mod tests {
884    use crate::{Dictionary, FstDictionary, WordMetadata};
885
886    // Helper function to get word metadata from the curated dictionary
887    fn md(word: &str) -> WordMetadata {
888        FstDictionary::curated()
889            .get_word_metadata_str(word)
890            .unwrap_or_else(|| panic!("Word '{word}' not found in dictionary"))
891            .clone()
892    }
893
894    mod dialect {
895        use super::super::{Dialect, DialectFlags};
896        use crate::Document;
897
898        #[test]
899        fn guess_british_dialect() {
900            let document = Document::new_plain_english_curated("Aluminium was used.");
901            let df = DialectFlags::get_most_used_dialects_from_document(&document);
902            assert!(
903                df.is_dialect_enabled_strict(Dialect::British)
904                    && !df.is_dialect_enabled_strict(Dialect::American)
905            );
906        }
907
908        #[test]
909        fn guess_american_dialect() {
910            let document = Document::new_plain_english_curated("Aluminum was used.");
911            let df = DialectFlags::get_most_used_dialects_from_document(&document);
912            assert!(
913                df.is_dialect_enabled_strict(Dialect::American)
914                    && !df.is_dialect_enabled_strict(Dialect::British)
915            );
916        }
917    }
918
919    mod noun {
920        use crate::word_metadata::tests::md;
921
922        #[test]
923        fn puppy_is_noun() {
924            assert!(md("puppy").is_noun());
925        }
926
927        #[test]
928        fn prepare_is_not_noun() {
929            assert!(!md("prepare").is_noun());
930        }
931
932        #[test]
933        fn paris_is_proper_noun() {
934            assert!(md("Paris").is_proper_noun());
935        }
936
937        #[test]
938        fn permit_is_non_proper_noun() {
939            assert!(md("lapdog").is_non_proper_noun());
940        }
941
942        #[test]
943        fn hound_is_singular_noun() {
944            assert!(md("hound").is_singular_noun());
945        }
946
947        #[test]
948        fn pooches_is_non_singular_noun() {
949            assert!(md("pooches").is_non_singular_noun());
950        }
951
952        // Make sure is_non_xxx_noun methods don't behave like is_not_xxx_noun.
953        // In other words, make sure they don't return true for words that are not nouns.
954        // They must only pass for words that are nouns but not singular etc.
955        #[test]
956        fn loyal_doesnt_pass_is_non_singular_noun() {
957            assert!(!md("loyal").is_non_singular_noun());
958        }
959
960        #[test]
961        fn hounds_is_plural_noun() {
962            assert!(md("hounds").is_plural_noun());
963        }
964
965        #[test]
966        fn pooch_is_non_plural_noun() {
967            assert!(md("pooch").is_non_plural_noun());
968        }
969
970        #[test]
971        fn fish_is_singular_noun() {
972            assert!(md("fish").is_singular_noun());
973        }
974
975        #[test]
976        fn fish_is_plural_noun() {
977            assert!(md("fish").is_plural_noun());
978        }
979
980        #[test]
981        fn fishes_is_plural_noun() {
982            assert!(md("fishes").is_plural_noun());
983        }
984
985        #[test]
986        fn sheep_is_singular_noun() {
987            assert!(md("sheep").is_singular_noun());
988        }
989
990        #[test]
991        fn sheep_is_plural_noun() {
992            assert!(md("sheep").is_plural_noun());
993        }
994
995        #[test]
996        #[should_panic]
997        fn sheeps_is_not_word() {
998            md("sheeps");
999        }
1000
1001        #[test]
1002        fn bicep_is_singular_noun() {
1003            assert!(md("bicep").is_singular_noun());
1004        }
1005
1006        #[test]
1007        fn biceps_is_singular_noun() {
1008            assert!(md("biceps").is_singular_noun());
1009        }
1010
1011        #[test]
1012        fn biceps_is_plural_noun() {
1013            assert!(md("biceps").is_plural_noun());
1014        }
1015
1016        #[test]
1017        fn aircraft_is_singular_noun() {
1018            assert!(md("aircraft").is_singular_noun());
1019        }
1020
1021        #[test]
1022        fn aircraft_is_plural_noun() {
1023            assert!(md("aircraft").is_plural_noun());
1024        }
1025
1026        #[test]
1027        #[should_panic]
1028        fn aircrafts_is_not_word() {
1029            md("aircrafts");
1030        }
1031
1032        #[test]
1033        fn dog_apostrophe_s_is_possessive_noun() {
1034            assert!(md("dog's").is_possessive_noun());
1035        }
1036
1037        #[test]
1038        fn dogs_is_non_possessive_noun() {
1039            assert!(md("dogs").is_non_possessive_noun());
1040        }
1041
1042        // noun countability
1043
1044        #[test]
1045        fn dog_is_countable() {
1046            assert!(md("dog").is_countable_noun());
1047        }
1048        #[test]
1049        fn dog_is_non_mass_noun() {
1050            assert!(md("dog").is_non_mass_noun());
1051        }
1052
1053        #[test]
1054        fn furniture_is_mass_noun() {
1055            assert!(md("furniture").is_mass_noun());
1056        }
1057        #[test]
1058        fn furniture_is_not_countable_noun() {
1059            assert!(md("furniture").is_non_countable_noun());
1060        }
1061
1062        #[test]
1063        fn beer_is_countable_noun() {
1064            assert!(md("beer").is_countable_noun());
1065        }
1066        #[test]
1067        fn beer_is_mass_noun() {
1068            assert!(md("beer").is_mass_noun());
1069        }
1070    }
1071
1072    mod pronoun {
1073        use crate::word_metadata::tests::md;
1074
1075        mod i_me_myself {
1076            use crate::word_metadata::tests::md;
1077
1078            #[test]
1079            fn i_is_pronoun() {
1080                assert!(md("I").is_pronoun());
1081            }
1082            #[test]
1083            fn i_is_personal_pronoun() {
1084                assert!(md("I").is_personal_pronoun());
1085            }
1086            #[test]
1087            fn i_is_singular_pronoun() {
1088                assert!(md("I").is_singular_pronoun());
1089            }
1090            #[test]
1091            fn i_is_subject_pronoun() {
1092                assert!(md("I").is_subject_pronoun());
1093            }
1094
1095            #[test]
1096            fn me_is_pronoun() {
1097                assert!(md("me").is_pronoun());
1098            }
1099            #[test]
1100            fn me_is_personal_pronoun() {
1101                assert!(md("me").is_personal_pronoun());
1102            }
1103            #[test]
1104            fn me_is_singular_pronoun() {
1105                assert!(md("me").is_singular_pronoun());
1106            }
1107            #[test]
1108            fn me_is_object_pronoun() {
1109                assert!(md("me").is_object_pronoun());
1110            }
1111
1112            #[test]
1113            fn myself_is_pronoun() {
1114                assert!(md("myself").is_pronoun());
1115            }
1116            #[test]
1117            fn myself_is_personal_pronoun() {
1118                assert!(md("myself").is_personal_pronoun());
1119            }
1120            #[test]
1121            fn myself_is_singular_pronoun() {
1122                assert!(md("myself").is_singular_pronoun());
1123            }
1124            #[test]
1125            fn myself_is_reflexive_pronoun() {
1126                assert!(md("myself").is_reflexive_pronoun());
1127            }
1128        }
1129
1130        mod we_us_ourselves {
1131            use crate::word_metadata::tests::md;
1132
1133            #[test]
1134            fn we_is_pronoun() {
1135                assert!(md("we").is_pronoun());
1136            }
1137            #[test]
1138            fn we_is_personal_pronoun() {
1139                assert!(md("we").is_personal_pronoun());
1140            }
1141            #[test]
1142            fn we_is_plural_pronoun() {
1143                assert!(md("we").is_plural_pronoun());
1144            }
1145            #[test]
1146            fn we_is_subject_pronoun() {
1147                assert!(md("we").is_subject_pronoun());
1148            }
1149
1150            #[test]
1151            fn us_is_pronoun() {
1152                assert!(md("us").is_pronoun());
1153            }
1154            #[test]
1155            fn us_is_personal_pronoun() {
1156                assert!(md("us").is_personal_pronoun());
1157            }
1158            #[test]
1159            fn us_is_plural_pronoun() {
1160                assert!(md("us").is_plural_pronoun());
1161            }
1162            #[test]
1163            fn us_is_object_pronoun() {
1164                assert!(md("us").is_object_pronoun());
1165            }
1166
1167            #[test]
1168            fn ourselves_is_pronoun() {
1169                assert!(md("ourselves").is_pronoun());
1170            }
1171            #[test]
1172            fn ourselves_is_personal_pronoun() {
1173                assert!(md("ourselves").is_personal_pronoun());
1174            }
1175            #[test]
1176            fn ourselves_is_plural_pronoun() {
1177                assert!(md("ourselves").is_plural_pronoun());
1178            }
1179            #[test]
1180            fn ourselves_is_reflexive_pronoun() {
1181                assert!(md("ourselves").is_reflexive_pronoun());
1182            }
1183        }
1184
1185        mod you_yourself {
1186            use crate::word_metadata::tests::md;
1187
1188            #[test]
1189            fn you_is_pronoun() {
1190                assert!(md("you").is_pronoun());
1191            }
1192            #[test]
1193            fn you_is_personal_pronoun() {
1194                assert!(md("you").is_personal_pronoun());
1195            }
1196            #[test]
1197            fn you_is_singular_pronoun() {
1198                assert!(md("you").is_singular_pronoun());
1199            }
1200            #[test]
1201            fn you_is_plural_pronoun() {
1202                assert!(md("you").is_plural_pronoun());
1203            }
1204            #[test]
1205            fn you_is_subject_pronoun() {
1206                assert!(md("you").is_subject_pronoun());
1207            }
1208            #[test]
1209            fn you_is_object_pronoun() {
1210                assert!(md("you").is_object_pronoun());
1211            }
1212            #[test]
1213            fn yourself_is_pronoun() {
1214                assert!(md("yourself").is_pronoun());
1215            }
1216            #[test]
1217            fn yourself_is_personal_pronoun() {
1218                assert!(md("yourself").is_personal_pronoun());
1219            }
1220            #[test]
1221            fn yourself_is_singular_pronoun() {
1222                assert!(md("yourself").is_singular_pronoun());
1223            }
1224            #[test]
1225            fn yourself_is_reflexive_pronoun() {
1226                assert!(md("yourself").is_reflexive_pronoun());
1227            }
1228        }
1229
1230        mod he_him_himself {
1231            use crate::word_metadata::tests::md;
1232
1233            #[test]
1234            fn he_is_pronoun() {
1235                assert!(md("he").is_pronoun());
1236            }
1237            #[test]
1238            fn he_is_personal_pronoun() {
1239                assert!(md("he").is_personal_pronoun());
1240            }
1241            #[test]
1242            fn he_is_singular_pronoun() {
1243                assert!(md("he").is_singular_pronoun());
1244            }
1245            #[test]
1246            fn he_is_subject_pronoun() {
1247                assert!(md("he").is_subject_pronoun());
1248            }
1249
1250            #[test]
1251            fn him_is_pronoun() {
1252                assert!(md("him").is_pronoun());
1253            }
1254            #[test]
1255            fn him_is_personal_pronoun() {
1256                assert!(md("him").is_personal_pronoun());
1257            }
1258            #[test]
1259            fn him_is_singular_pronoun() {
1260                assert!(md("him").is_singular_pronoun());
1261            }
1262            #[test]
1263            fn him_is_object_pronoun() {
1264                assert!(md("him").is_object_pronoun());
1265            }
1266
1267            #[test]
1268            fn himself_is_pronoun() {
1269                assert!(md("himself").is_pronoun());
1270            }
1271            #[test]
1272            fn himself_is_personal_pronoun() {
1273                assert!(md("himself").is_personal_pronoun());
1274            }
1275            #[test]
1276            fn himself_is_singular_pronoun() {
1277                assert!(md("himself").is_singular_pronoun());
1278            }
1279            #[test]
1280            fn himself_is_reflexive_pronoun() {
1281                assert!(md("himself").is_reflexive_pronoun());
1282            }
1283        }
1284
1285        mod she_her_herself {
1286            use crate::word_metadata::tests::md;
1287
1288            #[test]
1289            fn she_is_pronoun() {
1290                assert!(md("she").is_pronoun());
1291            }
1292            #[test]
1293            fn she_is_personal_pronoun() {
1294                assert!(md("she").is_personal_pronoun());
1295            }
1296            #[test]
1297            fn she_is_singular_pronoun() {
1298                assert!(md("she").is_singular_pronoun());
1299            }
1300            #[test]
1301            fn she_is_subject_pronoun() {
1302                assert!(md("she").is_subject_pronoun());
1303            }
1304
1305            #[test]
1306            fn her_is_pronoun() {
1307                assert!(md("her").is_pronoun());
1308            }
1309            #[test]
1310            fn her_is_personal_pronoun() {
1311                assert!(md("her").is_personal_pronoun());
1312            }
1313            #[test]
1314            fn her_is_singular_pronoun() {
1315                assert!(md("her").is_singular_pronoun());
1316            }
1317            #[test]
1318            fn her_is_object_pronoun() {
1319                assert!(md("her").is_object_pronoun());
1320            }
1321
1322            #[test]
1323            fn herself_is_pronoun() {
1324                assert!(md("herself").is_pronoun());
1325            }
1326            #[test]
1327            fn herself_is_personal_pronoun() {
1328                assert!(md("herself").is_personal_pronoun());
1329            }
1330            #[test]
1331            fn herself_is_singular_pronoun() {
1332                assert!(md("herself").is_singular_pronoun());
1333            }
1334            #[test]
1335            fn herself_is_reflexive_pronoun() {
1336                assert!(md("herself").is_reflexive_pronoun());
1337            }
1338        }
1339
1340        mod it_itself {
1341            use crate::word_metadata::tests::md;
1342
1343            #[test]
1344            fn it_is_pronoun() {
1345                assert!(md("it").is_pronoun());
1346            }
1347            #[test]
1348            fn it_is_personal_pronoun() {
1349                assert!(md("it").is_personal_pronoun());
1350            }
1351            #[test]
1352            fn it_is_singular_pronoun() {
1353                assert!(md("it").is_singular_pronoun());
1354            }
1355            #[test]
1356            fn it_is_subject_pronoun() {
1357                assert!(md("it").is_subject_pronoun());
1358            }
1359            #[test]
1360            fn it_is_object_pronoun() {
1361                assert!(md("it").is_object_pronoun());
1362            }
1363
1364            #[test]
1365            fn itself_is_pronoun() {
1366                assert!(md("itself").is_pronoun());
1367            }
1368            #[test]
1369            fn itself_is_personal_pronoun() {
1370                assert!(md("itself").is_personal_pronoun());
1371            }
1372            #[test]
1373            fn itself_is_singular_pronoun() {
1374                assert!(md("itself").is_singular_pronoun());
1375            }
1376            #[test]
1377            fn itself_is_reflexive_pronoun() {
1378                assert!(md("itself").is_reflexive_pronoun());
1379            }
1380        }
1381
1382        mod they_them_themselves {
1383            use crate::word_metadata::tests::md;
1384
1385            #[test]
1386            fn they_is_pronoun() {
1387                assert!(md("they").is_pronoun());
1388            }
1389            #[test]
1390            fn they_is_personal_pronoun() {
1391                assert!(md("they").is_personal_pronoun());
1392            }
1393            #[test]
1394            fn they_is_plural_pronoun() {
1395                assert!(md("they").is_plural_pronoun());
1396            }
1397            #[test]
1398            fn they_is_subject_pronoun() {
1399                assert!(md("they").is_subject_pronoun());
1400            }
1401
1402            #[test]
1403            fn them_is_pronoun() {
1404                assert!(md("them").is_pronoun());
1405            }
1406            #[test]
1407            fn them_is_personal_pronoun() {
1408                assert!(md("them").is_personal_pronoun());
1409            }
1410            #[test]
1411            fn them_is_plural_pronoun() {
1412                assert!(md("them").is_plural_pronoun());
1413            }
1414            #[test]
1415            fn them_is_object_pronoun() {
1416                assert!(md("them").is_object_pronoun());
1417            }
1418
1419            #[test]
1420            fn themselves_is_pronoun() {
1421                assert!(md("themselves").is_pronoun());
1422            }
1423            #[test]
1424            fn themselves_is_personal_pronoun() {
1425                assert!(md("themselves").is_personal_pronoun());
1426            }
1427            #[test]
1428            fn themselves_is_plural_pronoun() {
1429                assert!(md("themselves").is_plural_pronoun());
1430            }
1431            #[test]
1432            fn themselves_is_reflexive_pronoun() {
1433                assert!(md("themselves").is_reflexive_pronoun());
1434            }
1435        }
1436
1437        // Possessive pronouns (not to be confused with possessive adjectives/determiners)
1438        #[test]
1439        fn mine_is_pronoun() {
1440            assert!(md("mine").is_pronoun());
1441        }
1442        #[test]
1443        fn ours_is_pronoun() {
1444            assert!(md("ours").is_pronoun());
1445        }
1446        #[test]
1447        fn yours_is_pronoun() {
1448            assert!(md("yours").is_pronoun());
1449        }
1450        #[test]
1451        fn his_is_pronoun() {
1452            assert!(md("his").is_pronoun());
1453        }
1454        #[test]
1455        fn hers_is_pronoun() {
1456            assert!(md("hers").is_pronoun());
1457        }
1458        #[test]
1459        fn its_is_pronoun() {
1460            assert!(md("its").is_pronoun());
1461        }
1462        #[test]
1463        fn theirs_is_pronoun() {
1464            assert!(md("theirs").is_pronoun());
1465        }
1466
1467        // archaic pronouns
1468        #[test]
1469        fn archaic_pronouns() {
1470            assert!(md("thou").is_pronoun());
1471            assert!(md("thee").is_pronoun());
1472            assert!(md("thyself").is_pronoun());
1473            assert!(md("thine").is_pronoun());
1474        }
1475
1476        // generic pronouns
1477        #[test]
1478        fn generic_pronouns() {
1479            assert!(md("one").is_pronoun());
1480            assert!(md("oneself").is_pronoun());
1481        }
1482
1483        // relative and interrogative pronouns
1484        #[test]
1485        fn relative_and_interrogative_pronouns() {
1486            assert!(md("who").is_pronoun());
1487            assert!(md("whom").is_pronoun());
1488            assert!(md("whose").is_pronoun());
1489            assert!(md("which").is_pronoun());
1490            assert!(md("what").is_pronoun());
1491        }
1492
1493        // nonstandard pronouns
1494        #[test]
1495        #[ignore = "not in dictionary"]
1496        fn nonstandard_pronouns() {
1497            assert!(md("themself").pronoun.is_some());
1498            assert!(md("y'all'").pronoun.is_some());
1499        }
1500    }
1501
1502    #[test]
1503    fn the_is_determiner() {
1504        assert!(md("the").is_determiner());
1505    }
1506    #[test]
1507    fn this_is_demonstrative_determiner() {
1508        assert!(md("this").is_demonstrative_determiner());
1509    }
1510    #[test]
1511    fn your_is_possessive_determiner() {
1512        assert!(md("your").is_possessive_determiner());
1513    }
1514}