Skip to main content

harper_core/
dict_word_metadata.rs

1use harper_brill::UPOS;
2use is_macro::Is;
3use itertools::Itertools;
4use paste::paste;
5use serde::{Deserialize, Serialize};
6use smallvec::SmallVec;
7use strum::{EnumCount as _, VariantArray as _};
8use strum_macros::{Display, EnumCount, EnumIter, EnumString, VariantArray};
9
10use std::convert::TryFrom;
11
12use crate::dict_word_metadata_orthography::OrthFlags;
13use crate::spell::WordId;
14use crate::{Document, TokenKind, TokenStringExt};
15
/// This represents a "lexeme" or "headword" which is case-folded but affix-expanded.
/// So not only lemmata but also inflected forms are stored here, with "horn" and "horns" each
/// having their own lexeme, but "Ivy" and "ivy" sharing the same lexeme.
///
/// Each part-of-speech field is `Some` when the word is known to act as that part of speech and
/// `None` otherwise. Several of them may be set at once for the same lexeme.
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Hash)]
pub struct DictWordMetadata {
    // The main parts of speech which have extra data.
    /// Noun-specific data; `Some` if the word can be a noun.
    pub noun: Option<NounData>,
    /// Pronoun-specific data; `Some` if the word can be a pronoun.
    pub pronoun: Option<PronounData>,
    /// Verb-specific data; `Some` if the word can be a verb.
    pub verb: Option<VerbData>,
    /// Adjective-specific data; `Some` if the word can be an adjective.
    pub adjective: Option<AdjectiveData>,
    /// Adverb-specific data; `Some` if the word can be an adverb.
    pub adverb: Option<AdverbData>,
    /// Conjunction-specific data; `Some` if the word can be a conjunction.
    pub conjunction: Option<ConjunctionData>,
    /// Determiner-specific data; `Some` if the word can be a determiner.
    pub determiner: Option<DeterminerData>,
    /// Affix-specific data; `Some` if the entry is an affix.
    pub affix: Option<AffixData>,
    // Parts of speech which don't have extra data.
    /// Whether the word is a [preposition](https://www.merriam-webster.com/dictionary/preposition).
    #[serde(default = "default_false")]
    pub preposition: bool,
    /// Whether the word is an offensive word. `None` means it is not known either way.
    pub swear: Option<bool>,
    /// The dialects this word belongs to.
    /// If no dialects are defined, it can be assumed that the word is
    /// valid in all dialects of English.
    #[serde(default = "default_default")]
    pub dialects: DialectFlags,
    /// Orthographic information: letter case, spaces, hyphens, etc.
    #[serde(default = "OrthFlags::empty")]
    pub orth_info: OrthFlags,
    /// Whether the word is considered especially common.
    #[serde(default = "default_false")]
    pub common: bool,
    /// NOTE(review): presumably the id of the dictionary word this entry was derived from —
    /// confirm against the code that populates it.
    #[serde(default = "default_none")]
    pub derived_from: Option<WordId>,
    /// Generated by a chunker. Declares whether the word is a member of a nominal phrase. Using
    /// this should be preferred over the similarly named `Pattern`.
    ///
    /// For more details, see [the announcement blog post](https://elijahpotter.dev/articles/training_a_chunker_with_burn).
    pub np_member: Option<bool>,
    /// Generated by a POS tagger. Declares what it inferred the word's part of speech to be.
    pub pos_tag: Option<UPOS>,
}
57
/// `serde` default provider for boolean fields that should be `false` when absent.
fn default_false() -> bool {
    bool::default()
}
62
/// `serde` default provider for optional fields that should be `None` when absent.
fn default_none<T>() -> Option<T> {
    Option::None
}
67
/// `serde` default provider that falls back to the field type's own [`Default`] value.
fn default_default<T: Default>() -> T {
    Default::default()
}
72
/// Generates the part-of-speech query methods for the surrounding `impl` block.
///
/// The input is a `.`-separated list of `category has sub, sub, ...` entries. For every
/// `category` it emits `is_<category>()`, and for every `sub` of that category it emits
/// `is_<sub>_<category>()` and `is_non_<sub>_<category>()`. It additionally emits
/// `is_likely_homograph()` and `difference()`, which are built from those queries.
///
/// Each `category` must be the name of an `Option<...Data>` field of the type, and each `sub`
/// must correspond to an `is_<sub>: Option<bool>` field of that data struct.
macro_rules! generate_metadata_queries {
    ($($category:ident has $($sub:ident),*).*) => {
        paste! {
            /// Checks whether the word belongs to more than one part of speech.
            ///
            /// Counts the preposition flag plus every generated category exactly once.
            pub fn is_likely_homograph(&self) -> bool {
                // `determiner` is one of the generated categories, so it must not be listed
                // separately here: counting it twice would flag every determiner.
                [self.preposition, $(
                    self.[< is_ $category >](),
                )*].iter().filter(|&&is_pos| is_pos).count() > 1
            }

            /// How different is this word from another?
            ///
            /// Returns the number of generated queries whose answer differs between `self`
            /// and `other`.
            pub fn difference(&self, other: &Self) -> u32 {
                [
                    $(
                        Self::[< is_ $category >],
                        $(
                            Self::[< is_ $sub _ $category >],
                            Self::[< is_non_ $sub _ $category >],
                        )*
                    )*
                ]
                .iter()
                .fold(0, |acc, func| acc + (func(self) ^ func(other)) as u32)
            }

            $(
                #[doc = concat!("Checks if the word is definitely a ", stringify!($category), ".")]
                pub fn [< is_ $category >](&self) -> bool {
                    self.$category.is_some()
                }

                $(
                    #[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as (a) ", stringify!($sub), ".")]
                    pub fn [< is_ $sub _ $category >](&self) -> bool {
                        matches!(
                            self.$category,
                            Some([< $category:camel Data >]{
                                [< is_ $sub >]: Some(true),
                                ..
                            })
                        )
                    }

                    #[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as __not__ (a) ", stringify!($sub), ".")]
                    pub fn [< is_non_ $sub _ $category >](&self) -> bool {
                        // An unset label (`None`) is treated the same as an explicit `false`.
                        matches!(
                            self.$category,
                            Some([< $category:camel Data >]{
                                [< is_ $sub >]: None | Some(false),
                                ..
                            })
                        )
                    }
                )*
            )*
        }
    };
}
129
130impl DictWordMetadata {
131    /// If there is only one possible interpretation of the metadata, infer its UPOS tag.
132    pub fn infer_pos_tag(&self) -> Option<UPOS> {
133        // If an explicit POS tag exists, return it immediately.
134        if let Some(pos) = self.pos_tag {
135            return Some(pos);
136        }
137
138        // Collect all possible POS tags from metadata
139        let mut candidates = SmallVec::<[UPOS; 14]>::with_capacity(14);
140
141        if self.is_proper_noun() {
142            candidates.push(UPOS::PROPN);
143        }
144
145        if self.is_pronoun() {
146            candidates.push(UPOS::PRON);
147        }
148        if self.is_noun() {
149            candidates.push(UPOS::NOUN);
150        }
151        if self.is_verb() {
152            // Distinguish auxiliary verbs
153            if let Some(data) = &self.verb {
154                if data.is_auxiliary == Some(true) {
155                    candidates.push(UPOS::AUX);
156                } else {
157                    candidates.push(UPOS::VERB);
158                }
159            } else {
160                candidates.push(UPOS::VERB);
161            }
162        }
163        if self.is_adjective() {
164            candidates.push(UPOS::ADJ);
165        }
166        if self.is_adverb() {
167            candidates.push(UPOS::ADV);
168        }
169        if self.is_conjunction() {
170            candidates.push(UPOS::CCONJ);
171        }
172        if self.is_determiner() {
173            candidates.push(UPOS::DET);
174        }
175        if self.preposition {
176            candidates.push(UPOS::ADP);
177        }
178
179        // Remove duplicates
180        candidates.sort();
181        candidates.dedup();
182
183        candidates.into_iter().exactly_one().ok()
184    }
185
186    /// Produce a copy of `self` with the known properties of `other` set.
187    pub fn or(&self, other: &Self) -> Self {
188        let mut clone = self.clone();
189        clone.merge(other);
190        clone
191    }
192
193    /// Given a UPOS tag, discard any metadata that would disagree with the given POS tag.
194    /// For example, if the metadata suggests a word could either be a noun or an adjective, and we
195    /// provide a [`UPOS::NOUN`], this function will remove the adjective data.
196    ///
197    /// Additionally, if the metadata does not currently declare the potential of the word to be
198    /// the specific POS, it becomes so. That means if we provide a [`UPOS::ADJ`] to the function
199    /// for a metadata whose `Self::adjective = None`, it will become `Some`.
200    pub fn enforce_pos_exclusivity(&mut self, pos: &UPOS) {
201        use UPOS::*;
202        match pos {
203            NOUN => {
204                if let Some(noun) = self.noun {
205                    self.noun = Some(NounData {
206                        is_proper: Some(false),
207                        ..noun
208                    })
209                } else {
210                    self.noun = Some(NounData {
211                        is_proper: Some(false),
212                        is_singular: None,
213                        is_plural: None,
214                        is_countable: None,
215                        is_mass: None,
216                        is_possessive: None,
217                    })
218                }
219
220                self.pronoun = None;
221                self.verb = None;
222                self.adjective = None;
223                self.adverb = None;
224                self.conjunction = None;
225                self.determiner = None;
226                self.affix = None;
227                self.preposition = false;
228            }
229            PROPN => {
230                if let Some(noun) = self.noun {
231                    self.noun = Some(NounData {
232                        is_proper: Some(true),
233                        ..noun
234                    })
235                } else {
236                    self.noun = Some(NounData {
237                        is_proper: Some(true),
238                        is_singular: None,
239                        is_plural: None,
240                        is_countable: None,
241                        is_mass: None,
242                        is_possessive: None,
243                    })
244                }
245
246                self.pronoun = None;
247                self.verb = None;
248                self.adjective = None;
249                self.adverb = None;
250                self.conjunction = None;
251                self.determiner = None;
252                self.affix = None;
253                self.preposition = false;
254            }
255            PRON => {
256                if self.pronoun.is_none() {
257                    self.pronoun = Some(PronounData::default())
258                }
259
260                self.noun = None;
261                self.verb = None;
262                self.adjective = None;
263                self.adverb = None;
264                self.conjunction = None;
265                self.determiner = None;
266                self.affix = None;
267                self.preposition = false;
268            }
269            VERB => {
270                if let Some(verb) = self.verb {
271                    self.verb = Some(VerbData {
272                        is_auxiliary: Some(false),
273                        ..verb
274                    })
275                } else {
276                    self.verb = Some(VerbData {
277                        is_auxiliary: Some(false),
278                        ..Default::default()
279                    })
280                }
281
282                self.noun = None;
283                self.pronoun = None;
284                self.adjective = None;
285                self.adverb = None;
286                self.conjunction = None;
287                self.determiner = None;
288                self.affix = None;
289                self.preposition = false;
290            }
291            AUX => {
292                if let Some(verb) = self.verb {
293                    self.verb = Some(VerbData {
294                        is_auxiliary: Some(true),
295                        ..verb
296                    })
297                } else {
298                    self.verb = Some(VerbData {
299                        is_auxiliary: Some(true),
300                        ..Default::default()
301                    })
302                }
303
304                self.noun = None;
305                self.pronoun = None;
306                self.adjective = None;
307                self.adverb = None;
308                self.conjunction = None;
309                self.determiner = None;
310                self.affix = None;
311                self.preposition = false;
312            }
313            ADJ => {
314                if self.adjective.is_none() {
315                    self.adjective = Some(AdjectiveData::default())
316                }
317
318                self.noun = None;
319                self.pronoun = None;
320                self.verb = None;
321                self.adverb = None;
322                self.conjunction = None;
323                self.determiner = None;
324                self.affix = None;
325                self.preposition = false;
326            }
327            ADV => {
328                if self.adverb.is_none() {
329                    self.adverb = Some(AdverbData::default())
330                }
331
332                self.noun = None;
333                self.pronoun = None;
334                self.verb = None;
335                self.adjective = None;
336                self.conjunction = None;
337                self.determiner = None;
338                self.affix = None;
339                self.preposition = false;
340            }
341            ADP => {
342                self.noun = None;
343                self.pronoun = None;
344                self.verb = None;
345                self.adjective = None;
346                self.adverb = None;
347                self.conjunction = None;
348                self.determiner = None;
349                self.affix = None;
350                self.preposition = true;
351            }
352            DET => {
353                self.noun = None;
354                self.pronoun = None;
355                self.verb = None;
356                self.adjective = None;
357                self.adverb = None;
358                self.conjunction = None;
359                self.affix = None;
360                self.preposition = false;
361                self.determiner = Some(DeterminerData::default());
362            }
363            CCONJ | SCONJ => {
364                if self.conjunction.is_none() {
365                    self.conjunction = Some(ConjunctionData::default())
366                }
367
368                self.noun = None;
369                self.pronoun = None;
370                self.verb = None;
371                self.adjective = None;
372                self.adverb = None;
373                self.determiner = None;
374                self.affix = None;
375                self.preposition = false;
376            }
377            _ => {}
378        }
379    }
380
    // Generates `is_<category>()`, `is_<sub>_<category>()` and `is_non_<sub>_<category>()` for
    // every entry listed here, plus `is_likely_homograph()` and `difference()`.
    generate_metadata_queries!(
        // For nouns, singular and countable default to true, so their queries are not generated
        // here; they are written by hand further down in this `impl`.
        noun has proper, plural, mass, possessive.
        pronoun has personal, singular, plural, possessive, reflexive, subject, object.
        determiner has demonstrative, possessive, quantifier.
        verb has linking, auxiliary.
        conjunction has.
        adjective has.
        adverb has manner, frequency, degree
    );
391
392    // Manual metadata queries
393
394    // Pronoun metadata queries
395
396    pub fn get_person(&self) -> Option<Person> {
397        self.pronoun.as_ref().and_then(|p| p.person)
398    }
399
400    pub fn is_first_person_plural_pronoun(&self) -> bool {
401        matches!(
402            self.pronoun,
403            Some(PronounData {
404                person: Some(Person::First),
405                is_plural: Some(true),
406                ..
407            })
408        )
409    }
410
411    pub fn is_first_person_singular_pronoun(&self) -> bool {
412        matches!(
413            self.pronoun,
414            Some(PronounData {
415                person: Some(Person::First),
416                is_singular: Some(true),
417                ..
418            })
419        )
420    }
421
422    pub fn is_third_person_plural_pronoun(&self) -> bool {
423        matches!(
424            self.pronoun,
425            Some(PronounData {
426                person: Some(Person::Third),
427                is_plural: Some(true),
428                ..
429            })
430        )
431    }
432
433    pub fn is_third_person_singular_pronoun(&self) -> bool {
434        matches!(
435            self.pronoun,
436            Some(PronounData {
437                person: Some(Person::Third),
438                is_singular: Some(true),
439                ..
440            })
441        )
442    }
443
444    pub fn is_third_person_pronoun(&self) -> bool {
445        matches!(
446            self.pronoun,
447            Some(PronounData {
448                person: Some(Person::Third),
449                ..
450            })
451        )
452    }
453
454    pub fn is_second_person_pronoun(&self) -> bool {
455        matches!(
456            self.pronoun,
457            Some(PronounData {
458                person: Some(Person::Second),
459                ..
460            })
461        )
462    }
463
464    // Lemma is default if no verb form is specified in the dictionary
465    pub fn is_verb_lemma(&self) -> bool {
466        if let Some(verb) = self.verb {
467            if let Some(forms) = verb.verb_forms {
468                return forms.is_empty() || forms.contains(VerbFormFlags::LEMMA);
469            } else {
470                return true;
471            }
472        }
473        false
474    }
475
476    pub fn is_verb_past_form(&self) -> bool {
477        self.verb.is_some_and(|v| {
478            v.verb_forms
479                .is_some_and(|vf| vf.contains(VerbFormFlags::PAST))
480        })
481    }
482
483    pub fn is_verb_regular_past_form(&self) -> bool {
484        self.verb.is_some_and(|v| {
485            v.verb_forms.is_some_and(|vf| {
486                vf.contains(VerbFormFlags::PRETERITE) && vf.contains(VerbFormFlags::PAST_PARTICIPLE)
487            })
488        })
489    }
490
491    pub fn is_verb_simple_past_form(&self) -> bool {
492        self.verb.is_some_and(|v| {
493            v.verb_forms
494                .is_some_and(|vf| vf.contains(VerbFormFlags::PRETERITE))
495        })
496    }
497
498    pub fn is_verb_past_participle_form(&self) -> bool {
499        self.verb.is_some_and(|v| {
500            v.verb_forms
501                .is_some_and(|vf| vf.contains(VerbFormFlags::PAST_PARTICIPLE))
502        })
503    }
504
505    pub fn is_verb_simple_past_only(&self) -> bool {
506        self.verb.is_some_and(|v| {
507            v.verb_forms.is_some_and(|vf| {
508                vf.contains(VerbFormFlags::PRETERITE)
509                    && !vf.intersects(VerbFormFlags::PAST | VerbFormFlags::PAST_PARTICIPLE)
510            })
511        })
512    }
513
514    pub fn is_verb_past_participle_only(&self) -> bool {
515        self.verb.is_some_and(|v| {
516            v.verb_forms.is_some_and(|vf| {
517                vf.contains(VerbFormFlags::PAST_PARTICIPLE)
518                    && !vf.intersects(VerbFormFlags::PAST | VerbFormFlags::PRETERITE)
519            })
520        })
521    }
522
523    pub fn is_verb_progressive_form(&self) -> bool {
524        self.verb.is_some_and(|v| {
525            v.verb_forms
526                .is_some_and(|vf| vf.contains(VerbFormFlags::PROGRESSIVE))
527        })
528    }
529
530    pub fn is_verb_third_person_singular_present_form(&self) -> bool {
531        self.verb.is_some_and(|v| {
532            v.verb_forms
533                .is_some_and(|vf| vf.contains(VerbFormFlags::THIRD_PERSON_SINGULAR))
534        })
535    }
536
537    // Noun metadata queries
538
539    // Singular is default if number is not marked in the dictionary.
540    pub fn is_singular_noun(&self) -> bool {
541        if let Some(noun) = self.noun {
542            matches!(
543                (noun.is_singular, noun.is_plural),
544                (Some(true), _) | (None | Some(false), None | Some(false))
545            )
546        } else {
547            false
548        }
549    }
550    pub fn is_non_singular_noun(&self) -> bool {
551        if let Some(noun) = self.noun {
552            !matches!(
553                (noun.is_singular, noun.is_plural),
554                (Some(true), _) | (None | Some(false), None | Some(false))
555            )
556        } else {
557            false
558        }
559    }
560
561    // Countable is default if countability is not marked in the dictionary.
562    pub fn is_countable_noun(&self) -> bool {
563        if let Some(noun) = self.noun {
564            matches!(
565                (noun.is_countable, noun.is_mass),
566                (Some(true), _) | (None | Some(false), None | Some(false))
567            )
568        } else {
569            false
570        }
571    }
572    pub fn is_non_countable_noun(&self) -> bool {
573        if let Some(noun) = self.noun {
574            !matches!(
575                (noun.is_countable, noun.is_mass),
576                (Some(true), _) | (None | Some(false), None | Some(false))
577            )
578        } else {
579            false
580        }
581    }
582
583    // Most mass nouns also have countable senses. Match those that are only mass nouns.
584    pub fn is_mass_noun_only(&self) -> bool {
585        if let Some(noun) = self.noun {
586            matches!(
587                (noun.is_countable, noun.is_mass),
588                (None | Some(false), Some(true))
589            )
590        } else {
591            false
592        }
593    }
594
595    // Nominal metadata queries (noun + pronoun)
596
597    /// Checks if the word is definitely nominal.
598    pub fn is_nominal(&self) -> bool {
599        self.is_noun() || self.is_pronoun()
600    }
601
    /// Checks if the word is definitely a nominal and more specifically counts as singular.
    ///
    /// True for a singular noun (where singular is the default for unmarked nouns) or for a
    /// pronoun labeled singular.
    pub fn is_singular_nominal(&self) -> bool {
        self.is_singular_noun() || self.is_singular_pronoun()
    }
606
    /// Checks if the word is definitely a nominal and more specifically is labeled as (a) plural.
    ///
    /// True for a noun labeled plural or a pronoun labeled plural.
    pub fn is_plural_nominal(&self) -> bool {
        self.is_plural_noun() || self.is_plural_pronoun()
    }
611
    /// Checks if the word is definitely a nominal and more specifically is labeled as (a) possessive.
    ///
    /// This is true for possessive nouns and possessive determiners; possessive pronouns are
    /// deliberately not consulted.
    /// NOTE: `possessive pronoun`s are not qualifiers, but words like `mine`, `yours`, etc.
    /// The terminology of `possessive noun`, `possessive pronoun` and `possessive determiner` only
    /// tends to reinforce this confusion.
    pub fn is_possessive_nominal(&self) -> bool {
        self.is_possessive_noun() || self.is_possessive_determiner()
    }
619
    /// Checks if the word is definitely a nominal and more specifically does __not__ count as singular.
    ///
    /// True for a noun that is plural-only, or for a pronoun whose singular label is unset or
    /// `false`.
    pub fn is_non_singular_nominal(&self) -> bool {
        self.is_non_singular_noun() || self.is_non_singular_pronoun()
    }
624
    /// Checks if the word is definitely a nominal and more specifically is labeled as __not__ (a) plural.
    ///
    /// True for a noun or a pronoun whose plural label is unset or `false`.
    pub fn is_non_plural_nominal(&self) -> bool {
        self.is_non_plural_noun() || self.is_non_plural_pronoun()
    }
629
630    // Adjective metadata queries
631
632    pub fn get_degree(&self) -> Option<Degree> {
633        self.adjective.as_ref().and_then(|a| a.degree)
634    }
635
636    pub fn is_comparative_adjective(&self) -> bool {
637        matches!(
638            self.adjective,
639            Some(AdjectiveData {
640                degree: Some(Degree::Comparative)
641            })
642        )
643    }
644
645    pub fn is_superlative_adjective(&self) -> bool {
646        matches!(
647            self.adjective,
648            Some(AdjectiveData {
649                degree: Some(Degree::Superlative)
650            })
651        )
652    }
653
654    // Degree::Positive is the default if degree is not marked in the dictionary.
655    pub fn is_positive_adjective(&self) -> bool {
656        match self.adjective {
657            Some(AdjectiveData {
658                degree: Some(Degree::Positive),
659            }) => true,
660            Some(AdjectiveData { degree: None }) => true,
661            Some(AdjectiveData {
662                degree: Some(degree),
663            }) => !matches!(degree, Degree::Comparative | Degree::Superlative),
664            _ => false,
665        }
666    }
667
668    // Determiner metadata queries
669
    /// Checks if the word is definitely a determiner and more specifically is labeled as (a) quantifier.
    ///
    /// Shorthand for `is_quantifier_determiner()`.
    pub fn is_quantifier(&self) -> bool {
        self.is_quantifier_determiner()
    }
674
675    // Non-POS queries
676
677    /// Checks whether a word is _definitely_ a swear.
678    pub fn is_swear(&self) -> bool {
679        matches!(self.swear, Some(true))
680    }
681
682    // Orthographic queries
683
    /// Does the metadata for this word cover an all-lowercase variant? (e.g., "hello")
    ///
    /// This returns true if all letters in the word are lowercase. Words containing
    /// non-letter characters (like numbers or symbols) are only considered if all
    /// letter characters are lowercase.
    ///
    /// Implemented as a check of the `LOWERCASE` flag in [`Self::orth_info`].
    pub fn is_lowercase(&self) -> bool {
        self.orth_info.contains(OrthFlags::LOWERCASE)
    }
    /// Does the metadata for this word cover a titlecase variant? (e.g., "Hello")
    ///
    /// This returns true if the word is in titlecase form, which means:
    /// - The first letter is uppercase
    /// - All other letters are lowercase
    /// - The word is at least 2 characters long
    ///
    /// Examples: "Hello", "World"
    ///
    /// Note: Words with internal capital letters (like "McDonald") or apostrophes (like "O'Reilly")
    /// are not considered titlecase - they are classified as UPPER_CAMEL instead.
    ///
    /// Implemented as a check of the `TITLECASE` flag in [`Self::orth_info`].
    pub fn is_titlecase(&self) -> bool {
        self.orth_info.contains(OrthFlags::TITLECASE)
    }
    /// Does the metadata for this word cover an all-uppercase variant? (e.g., "HELLO")
    ///
    /// This returns true if all letters in the word are uppercase. Words containing
    /// non-letter characters (like numbers or symbols) are only considered if all
    /// letter characters are uppercase.
    ///
    /// Examples: "HELLO", "NASA", "I"
    ///
    /// Implemented as a check of the `ALLCAPS` flag in [`Self::orth_info`].
    pub fn is_allcaps(&self) -> bool {
        self.orth_info.contains(OrthFlags::ALLCAPS)
    }
    /// Does the metadata for this word cover a lower camel case variant? (e.g., "helloWorld")
    ///
    /// This returns true if the word is in lower camel case, which means:
    /// - The first letter is lowercase
    /// - There is at least one uppercase letter after the first character
    /// - The word must be at least 2 characters long
    ///
    /// Examples: "helloWorld", "getHTTPResponse", "eBay"
    ///
    /// Note: Single words that are all lowercase will return false.
    /// Words starting with an uppercase letter will return false (those would be UpperCamel).
    ///
    /// Implemented as a check of the `LOWER_CAMEL` flag in [`Self::orth_info`].
    pub fn is_lower_camel(&self) -> bool {
        self.orth_info.contains(OrthFlags::LOWER_CAMEL)
    }
    /// Does the metadata for this word cover an upper camel case / pascal case variant? (e.g., "HelloWorld")
    ///
    /// This returns true if the word is in upper camel case (also known as Pascal case), which means:
    /// - The first letter is uppercase
    /// - There is at least one other uppercase letter after the first character
    /// - There is at least one lowercase letter after the first uppercase letter
    /// - The word must be at least 3 characters long
    ///
    /// Examples:
    /// - "HelloWorld" (standard Pascal case)
    /// - "McDonald" (name with internal caps)
    /// - "O'Reilly" (name with apostrophe and internal caps)
    /// - "HttpRequest" (initialism followed by word)
    ///
    /// Note: Single words that are titlecase (like "Hello") will return false.
    /// Words that are all uppercase (like "NASA") will also return false.
    ///
    /// Implemented as a check of the `UPPER_CAMEL` flag in [`Self::orth_info`].
    pub fn is_upper_camel(&self) -> bool {
        self.orth_info.contains(OrthFlags::UPPER_CAMEL)
    }
749
    /// Does the metadata for this word cover an apostrophized variant? (e.g., "doesn't")
    ///
    /// Implemented as a check of the `APOSTROPHE` flag in [`Self::orth_info`].
    pub fn is_apostrophized(&self) -> bool {
        self.orth_info.contains(OrthFlags::APOSTROPHE)
    }
754
    /// Is the `ROMAN_NUMERALS` flag set in [`Self::orth_info`]?
    ///
    /// NOTE(review): presumably this marks words that are written as Roman numerals — confirm
    /// against the code that computes `OrthFlags`.
    pub fn is_roman_numerals(&self) -> bool {
        self.orth_info.contains(OrthFlags::ROMAN_NUMERALS)
    }
758
759    /// Same thing as [`Self::or`], except in-place rather than a clone.
760    pub fn merge(&mut self, other: &Self) -> &mut Self {
761        macro_rules! merge {
762            ($a:expr, $b:expr) => {
763                match ($a, $b) {
764                    (Some(a), Some(b)) => Some(a.or(&b)),
765                    (Some(a), None) => Some(a),
766                    (None, Some(b)) => Some(b),
767                    (None, None) => None,
768                }
769            };
770        }
771
772        self.noun = merge!(self.noun, other.noun);
773        self.pronoun = merge!(self.pronoun, other.pronoun);
774        self.verb = merge!(self.verb, other.verb);
775        self.adjective = merge!(self.adjective, other.adjective);
776        self.adverb = merge!(self.adverb, other.adverb);
777        self.conjunction = merge!(self.conjunction, other.conjunction);
778        self.determiner = merge!(self.determiner, other.determiner);
779        self.affix = merge!(self.affix, other.affix);
780        self.preposition |= other.preposition;
781        self.dialects |= other.dialects;
782        self.orth_info |= other.orth_info;
783        self.swear = self.swear.or(other.swear);
784        self.common |= other.common;
785        self.derived_from = self.derived_from.or(other.derived_from);
786        self.pos_tag = self.pos_tag.or(other.pos_tag);
787        self.np_member = self.np_member.or(other.np_member);
788
789        self
790    }
791}
792
// These verb forms are morphological variations, distinct from TAM (Tense-Aspect-Mood)
// Each form can be used in various TAM combinations:
// - Lemma form (infinitive, citation form, dictionary form)
//   Used in infinitives (e.g., "to sleep"), imperatives (e.g., "sleep!"), and with modals (e.g., "will sleep")
// - Past form (past participle and simple past)
//   Used as verbs (e.g., "slept") or adjectives (e.g., "closed door")
// - Progressive form (present participle and gerund)
//   Used as verbs (e.g., "sleeping"), nouns (e.g., "sleeping is important"), or adjectives (e.g., "sleeping dog")
// - Third person singular present (-s/-es)
//   Used for third person singular subjects (e.g., "he sleeps", "she reads")
//
// Important notes:
// 1. English expresses time through auxiliary verbs, not verb form alone
// 2. Irregular verbs can have different forms for past participle and simple past
// 3. Future is always expressed through auxiliary verbs (e.g., "will sleep", "going to sleep")
/// A single morphological form of a verb.
///
/// Every variant has its own bit as discriminant, so the values can be combined into a set;
/// see `VerbFormFlags`, whose constants are defined from these discriminants.
#[repr(u32)]
pub enum VerbForm {
    /// The uninflected verb form: "walk", "eat"
    LemmaForm = 1 << 0,
    /// The past form for regular verbs: "walked"
    PastForm = 1 << 1,
    /// The simple past/preterite form for irregular verbs: "ate"
    SimplePastForm = 1 << 2,
    /// The past participle form for irregular verbs: "eaten"
    PastParticipleForm = 1 << 3,
    /// The progressive/continuous/gerund/present participle form: "walking", "eating"
    ProgressiveForm = 1 << 4,
    /// The third person singular present form: "walks", "eats"
    ThirdPersonSingularPresentForm = 1 << 5,
}
823
/// The underlying type used for verb form flags.
///
/// Matches the `#[repr(u32)]` of [`VerbForm`], whose discriminants supply the flag values.
pub type VerbFormFlagsUnderlyingType = u32;

bitflags::bitflags! {
    /// A collection of bit flags used to represent verb forms.
    ///
    /// This allows a word to be tagged with multiple verb forms when applicable.
    /// Each constant takes its bit from the corresponding [`VerbForm`] variant.
    #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
    #[serde(transparent)]
    pub struct VerbFormFlags: VerbFormFlagsUnderlyingType {
        /// Bit of [`VerbForm::LemmaForm`].
        const LEMMA = VerbForm::LemmaForm as VerbFormFlagsUnderlyingType;
        /// Bit of [`VerbForm::PastForm`].
        const PAST = VerbForm::PastForm as VerbFormFlagsUnderlyingType;
        /// Bit of [`VerbForm::SimplePastForm`].
        const PRETERITE = VerbForm::SimplePastForm as VerbFormFlagsUnderlyingType;
        /// Bit of [`VerbForm::PastParticipleForm`].
        const PAST_PARTICIPLE = VerbForm::PastParticipleForm as VerbFormFlagsUnderlyingType;
        /// Bit of [`VerbForm::ProgressiveForm`].
        const PROGRESSIVE = VerbForm::ProgressiveForm as VerbFormFlagsUnderlyingType;
        /// Bit of [`VerbForm::ThirdPersonSingularPresentForm`].
        const THIRD_PERSON_SINGULAR = VerbForm::ThirdPersonSingularPresentForm as VerbFormFlagsUnderlyingType;
    }
}
842
843#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
844pub struct VerbData {
845    pub is_linking: Option<bool>,
846    pub is_auxiliary: Option<bool>,
847    #[serde(rename = "verb_form", default)]
848    pub verb_forms: Option<VerbFormFlags>,
849}
850
851impl VerbData {
852    /// Produce a copy of `self` with the known properties of `other` set.
853    pub fn or(&self, other: &Self) -> Self {
854        let verb_forms = match (self.verb_forms, other.verb_forms) {
855            (Some(self_verb_forms), Some(other_verb_forms)) => {
856                Some(self_verb_forms | other_verb_forms)
857            }
858            (Some(self_verb_forms), None) => Some(self_verb_forms),
859            (None, Some(other_verb_forms)) => Some(other_verb_forms),
860            (None, None) => None,
861        };
862
863        Self {
864            is_linking: self.is_linking.or(other.is_linking),
865            is_auxiliary: self.is_auxiliary.or(other.is_auxiliary),
866            verb_forms,
867        }
868    }
869}
870
871// nouns can be both singular and plural: "aircraft", "biceps", "fish", "sheep"
872// TODO other noun properties may be worth adding: abstract
873#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
874pub struct NounData {
875    pub is_proper: Option<bool>,
876    pub is_singular: Option<bool>,
877    pub is_plural: Option<bool>,
878    pub is_countable: Option<bool>,
879    pub is_mass: Option<bool>,
880    pub is_possessive: Option<bool>,
881}
882
883impl NounData {
884    /// Produce a copy of `self` with the known properties of `other` set.
885    pub fn or(&self, other: &Self) -> Self {
886        Self {
887            is_proper: self.is_proper.or(other.is_proper),
888            is_singular: self.is_singular.or(other.is_singular),
889            is_plural: self.is_plural.or(other.is_plural),
890            is_countable: self.is_countable.or(other.is_countable),
891            is_mass: self.is_mass.or(other.is_mass),
892            is_possessive: self.is_possessive.or(other.is_possessive),
893        }
894    }
895}
896
897// Person is a property of pronouns; the verb 'be', plus all verbs reflect 3rd person singular with -s
898#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Is, Hash)]
899pub enum Person {
900    First,
901    Second,
902    Third,
903}
904
905// TODO for now focused on personal pronouns?
906#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
907pub struct PronounData {
908    pub is_personal: Option<bool>,
909    pub is_singular: Option<bool>,
910    pub is_plural: Option<bool>,
911    pub is_possessive: Option<bool>,
912    pub is_reflexive: Option<bool>,
913    pub person: Option<Person>,
914    pub is_subject: Option<bool>,
915    pub is_object: Option<bool>,
916}
917
918impl PronounData {
919    /// Produce a copy of `self` with the known properties of `other` set.
920    pub fn or(&self, other: &Self) -> Self {
921        Self {
922            is_personal: self.is_personal.or(other.is_personal),
923            is_singular: self.is_singular.or(other.is_singular),
924            is_plural: self.is_plural.or(other.is_plural),
925            is_possessive: self.is_possessive.or(other.is_possessive),
926            is_reflexive: self.is_reflexive.or(other.is_reflexive),
927            person: self.person.or(other.person),
928            is_subject: self.is_subject.or(other.is_subject),
929            is_object: self.is_object.or(other.is_object),
930        }
931    }
932}
933
934/// Additional metadata for determiners
935#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
936pub struct DeterminerData {
937    pub is_demonstrative: Option<bool>,
938    pub is_possessive: Option<bool>,
939    pub is_quantifier: Option<bool>,
940}
941
942impl DeterminerData {
943    /// Produce a copy of `self` with the known properties of `other` set.
944    pub fn or(&self, other: &Self) -> Self {
945        Self {
946            is_demonstrative: self.is_demonstrative.or(other.is_demonstrative),
947            is_possessive: self.is_possessive.or(other.is_possessive),
948            is_quantifier: self.is_quantifier.or(other.is_quantifier),
949        }
950    }
951}
952
953/// Degree is a property of adjectives: positive is not inflected
954/// Comparative is inflected with -er or comes after the word "more"
955/// Superlative is inflected with -est or comes after the word "most"
956#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Is, Hash)]
957pub enum Degree {
958    Positive,
959    Comparative,
960    Superlative,
961}
962
963/// Some adjectives are not comparable so don't have -er or -est forms and can't be used with "more" or "most".
964/// Some adjectives can only be used "attributively" (before a noun); some only predicatively (after "is" etc.).
965/// In old grammars words like the articles and determiners are classified as adjectives but behave differently.
966#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
967pub struct AdjectiveData {
968    pub degree: Option<Degree>,
969}
970
971impl AdjectiveData {
972    /// Produce a copy of `self` with the known properties of `other` set.
973    pub fn or(&self, other: &Self) -> Self {
974        Self {
975            degree: self.degree.or(other.degree),
976        }
977    }
978}
979
980/// Adverb can be a "junk drawer" category for words which don't fit the other major categories.
981/// The typical adverbs are "adverbs of manner", those derived from adjectives in -ly
982/// other adverbs (time, place, etc) should probably not be considered adverbs for Harper's purposes
983#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
984pub struct AdverbData {
985    pub is_manner: Option<bool>,
986    pub is_frequency: Option<bool>,
987    pub is_degree: Option<bool>,
988}
989
990impl AdverbData {
991    /// Produce a copy of `self` with the known properties of `other` set.
992    pub fn or(&self, _other: &Self) -> Self {
993        Self {
994            is_manner: self.is_manner.or(_other.is_manner),
995            is_frequency: self.is_frequency.or(_other.is_frequency),
996            is_degree: self.is_degree.or(_other.is_degree),
997        }
998    }
999}
1000
1001#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
1002pub struct ConjunctionData {}
1003
1004impl ConjunctionData {
1005    /// Produce a copy of `self` with the known properties of `other` set.
1006    pub fn or(&self, _other: &Self) -> Self {
1007        Self {}
1008    }
1009}
1010
1011#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
1012pub struct AffixData {
1013    pub is_prefix: Option<bool>,
1014    pub is_suffix: Option<bool>,
1015}
1016
1017impl AffixData {
1018    /// Produce a copy of `self` with the known properties of `other` set.
1019    pub fn or(&self, _other: &Self) -> Self {
1020        Self {
1021            is_prefix: self.is_prefix.or(_other.is_prefix),
1022            is_suffix: self.is_suffix.or(_other.is_suffix),
1023        }
1024    }
1025}
1026
1027/// A regional dialect.
1028///
1029/// Note: these have bit-shifted values so that they can ergonomically integrate with
1030/// `DialectFlags`. Each value here must have a unique bit index inside
1031/// `DialectsUnderlyingType`.
1032#[derive(
1033    Debug,
1034    Clone,
1035    Copy,
1036    Serialize,
1037    Deserialize,
1038    PartialEq,
1039    PartialOrd,
1040    Eq,
1041    Hash,
1042    EnumCount,
1043    EnumString,
1044    EnumIter,
1045    Display,
1046    VariantArray,
1047)]
1048pub enum Dialect {
1049    American = 1 << 0,
1050    Canadian = 1 << 1,
1051    Australian = 1 << 2,
1052    British = 1 << 3,
1053    Indian = 1 << 4,
1054}
1055impl Dialect {
1056    /// Tries to guess the dialect used in the document by finding which dialect is used the most.
1057    /// Returns `None` if it fails to find a single dialect that is used the most.
1058    #[must_use]
1059    pub fn try_guess_from_document(document: &Document) -> Option<Self> {
1060        Self::try_from(DialectFlags::get_most_used_dialects_from_document(document)).ok()
1061    }
1062
1063    /// Tries to get a dialect from its abbreviation. Returns `None` if the abbreviation is not
1064    /// recognized.
1065    ///
1066    /// # Examples
1067    ///
1068    /// ```
1069    /// use harper_core::Dialect;
1070    ///
1071    /// let abbrs = ["US", "CA", "AU", "GB", "IN"];
1072    /// let mut dialects = abbrs.iter().map(|abbr| Dialect::try_from_abbr(abbr));
1073    ///
1074    /// assert_eq!(Some(Dialect::American), dialects.next().unwrap()); // US
1075    /// assert_eq!(Some(Dialect::Canadian), dialects.next().unwrap()); // CA
1076    /// assert_eq!(Some(Dialect::Australian), dialects.next().unwrap()); // AU
1077    /// assert_eq!(Some(Dialect::British), dialects.next().unwrap()); // GB
1078    /// assert_eq!(Some(Dialect::Indian), dialects.next().unwrap()); // IN
1079    /// ```
1080    #[must_use]
1081    pub fn try_from_abbr(abbr: &str) -> Option<Self> {
1082        match abbr {
1083            "US" => Some(Self::American),
1084            "CA" => Some(Self::Canadian),
1085            "AU" => Some(Self::Australian),
1086            "GB" => Some(Self::British),
1087            "IN" => Some(Self::Indian),
1088            _ => None,
1089        }
1090    }
1091    // BCP-47 https://www.rfc-editor.org/rfc/rfc5646
1092    pub fn try_from_bcp47(bcp47: &str) -> Option<Self> {
1093        bcp47.strip_prefix("en-").and_then(Self::try_from_abbr)
1094    }
1095}
1096impl TryFrom<DialectFlags> for Dialect {
1097    type Error = ();
1098
1099    /// Attempts to convert `DialectFlags` to a single `Dialect`.
1100    ///
1101    /// # Errors
1102    ///
1103    /// Will return `Err` if more than one dialect is enabled or if an undefined dialect is
1104    /// enabled.
1105    fn try_from(dialect_flags: DialectFlags) -> Result<Self, Self::Error> {
1106        // Ensure only one dialect is enabled before converting.
1107        if dialect_flags.bits().count_ones() == 1 {
1108            match dialect_flags {
1109                df if df.is_dialect_enabled_strict(Dialect::American) => Ok(Dialect::American),
1110                df if df.is_dialect_enabled_strict(Dialect::Canadian) => Ok(Dialect::Canadian),
1111                df if df.is_dialect_enabled_strict(Dialect::Australian) => Ok(Dialect::Australian),
1112                df if df.is_dialect_enabled_strict(Dialect::British) => Ok(Dialect::British),
1113                df if df.is_dialect_enabled_strict(Dialect::Indian) => Ok(Dialect::Indian),
1114                _ => Err(()),
1115            }
1116        } else {
1117            // More than one dialect enabled; can't soundly convert.
1118            Err(())
1119        }
1120    }
1121}
1122
1123// The underlying type used for DialectFlags.
1124// At the time of writing, this is currently a `u8`. If we want to define more than 8 dialects in
1125// the future, we will need to switch this to a larger type.
1126type DialectFlagsUnderlyingType = u8;
1127
1128bitflags::bitflags! {
1129    /// A collection of bit flags used to represent enabled dialects.
1130    ///
1131    /// This is generally used to allow a word (or similar) to be tagged with multiple dialects.
1132    #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash)]
1133    #[serde(transparent)]
1134    pub struct DialectFlags: DialectFlagsUnderlyingType {
1135        const AMERICAN = Dialect::American as DialectFlagsUnderlyingType;
1136        const CANADIAN = Dialect::Canadian as DialectFlagsUnderlyingType;
1137        const AUSTRALIAN = Dialect::Australian as DialectFlagsUnderlyingType;
1138        const BRITISH = Dialect::British as DialectFlagsUnderlyingType;
1139        const INDIAN = Dialect::Indian as DialectFlagsUnderlyingType;
1140    }
1141}
1142impl DialectFlags {
1143    /// Checks if the provided dialect is enabled.
1144    /// If no dialect is explicitly enabled, it is assumed that all dialects are enabled.
1145    #[must_use]
1146    pub fn is_dialect_enabled(self, dialect: Dialect) -> bool {
1147        self.is_empty() || self.intersects(Self::from_dialect(dialect))
1148    }
1149
1150    /// Checks if the provided dialect is ***explicitly*** enabled.
1151    ///
1152    /// Unlike `is_dialect_enabled`, this will return false when no dialects are explicitly
1153    /// enabled.
1154    #[must_use]
1155    pub fn is_dialect_enabled_strict(self, dialect: Dialect) -> bool {
1156        self.intersects(Self::from_dialect(dialect))
1157    }
1158
1159    /// Constructs a `DialectFlags` from the provided `Dialect`, with only that dialect being
1160    /// enabled.
1161    ///
1162    /// # Panics
1163    ///
1164    /// This will panic if `dialect` represents a dialect that is not defined in
1165    /// `DialectFlags`.
1166    #[must_use]
1167    pub fn from_dialect(dialect: Dialect) -> Self {
1168        let Some(out) = Self::from_bits(dialect as DialectFlagsUnderlyingType) else {
1169            panic!("The '{dialect}' dialect isn't defined in DialectFlags!");
1170        };
1171        out
1172    }
1173
1174    /// Gets the most commonly used dialect(s) in the document.
1175    ///
1176    /// If multiple dialects are used equally often, they will all be enabled in the returned
1177    /// `DialectFlags`. On the other hand, if there is a single dialect that is used the most, it
1178    /// will be the only one enabled.
1179    #[must_use]
1180    pub fn get_most_used_dialects_from_document(document: &Document) -> Self {
1181        // Initialize counters.
1182        let mut dialect_counters: [(Dialect, usize); Dialect::COUNT] = Dialect::VARIANTS
1183            .iter()
1184            .map(|d| (*d, 0))
1185            .collect_array()
1186            .unwrap();
1187
1188        // Count word dialects.
1189        document.iter_words().for_each(|w| {
1190            if let TokenKind::Word(Some(lexeme_metadata)) = &w.kind {
1191                // If the token is a word, iterate though the dialects in `dialect_counters` and
1192                // increment those counters where the word has the respective dialect enabled.
1193                dialect_counters.iter_mut().for_each(|(dialect, count)| {
1194                    if lexeme_metadata.dialects.is_dialect_enabled(*dialect) {
1195                        *count += 1;
1196                    }
1197                });
1198            }
1199        });
1200
1201        // Find max counter.
1202        let max_counter = dialect_counters
1203            .iter()
1204            .map(|(_, count)| count)
1205            .max()
1206            .unwrap();
1207        // Get and convert the collection of most used dialects into a `DialectFlags`.
1208        dialect_counters
1209            .into_iter()
1210            .filter(|(_, count)| count == max_counter)
1211            .fold(DialectFlags::empty(), |acc, dialect| {
1212                // Fold most used dialects into `DialectFlags` via bitwise or.
1213                acc | Self::from_dialect(dialect.0)
1214            })
1215    }
1216}
1217impl Default for DialectFlags {
1218    /// A default value with no dialects explicitly enabled.
1219    /// Implicitly, this state corresponds to all dialects being enabled.
1220    fn default() -> Self {
1221        Self::empty()
1222    }
1223}
1224
1225#[cfg(test)]
1226pub mod tests {
1227    use crate::DictWordMetadata;
1228    use crate::spell::{Dictionary, FstDictionary};
1229
1230    // Helper function to get metadata from the curated dictionary
1231    pub fn md(word: &str) -> DictWordMetadata {
1232        FstDictionary::curated()
1233            .get_word_metadata_str(word)
1234            .unwrap_or_else(|| panic!("Word '{word}' not found in dictionary"))
1235            .into_owned()
1236    }
1237
1238    mod dialect {
1239        use super::super::{Dialect, DialectFlags};
1240        use crate::Document;
1241
1242        #[test]
1243        fn guess_british_dialect() {
1244            let document = Document::new_plain_english_curated("Aluminium was used.");
1245            let df = DialectFlags::get_most_used_dialects_from_document(&document);
1246            assert!(
1247                df.is_dialect_enabled_strict(Dialect::British)
1248                    && !df.is_dialect_enabled_strict(Dialect::American)
1249            );
1250        }
1251
1252        #[test]
1253        fn guess_american_dialect() {
1254            let document = Document::new_plain_english_curated("Aluminum was used.");
1255            let df = DialectFlags::get_most_used_dialects_from_document(&document);
1256            assert!(
1257                df.is_dialect_enabled_strict(Dialect::American)
1258                    && !df.is_dialect_enabled_strict(Dialect::British)
1259            );
1260        }
1261    }
1262
1263    mod noun {
1264        use crate::dict_word_metadata::tests::md;
1265
1266        #[test]
1267        fn puppy_is_noun() {
1268            assert!(md("puppy").is_noun());
1269        }
1270
1271        #[test]
1272        fn prepare_is_not_noun() {
1273            assert!(!md("prepare").is_noun());
1274        }
1275
1276        #[test]
1277        fn paris_is_proper_noun() {
1278            assert!(md("Paris").is_proper_noun());
1279        }
1280
1281        #[test]
1282        fn permit_is_non_proper_noun() {
1283            assert!(md("lapdog").is_non_proper_noun());
1284        }
1285
1286        #[test]
1287        fn hound_is_singular_noun() {
1288            assert!(md("hound").is_singular_noun());
1289        }
1290
1291        #[test]
1292        fn pooches_is_non_singular_noun() {
1293            assert!(md("pooches").is_non_singular_noun());
1294        }
1295
1296        // Make sure is_non_xxx_noun methods don't behave like is_not_xxx_noun.
1297        // In other words, make sure they don't return true for words that are not nouns.
1298        // They must only pass for words that are nouns but not singular etc.
1299        #[test]
1300        fn loyal_doesnt_pass_is_non_singular_noun() {
1301            assert!(!md("loyal").is_non_singular_noun());
1302        }
1303
1304        #[test]
1305        fn hounds_is_plural_noun() {
1306            assert!(md("hounds").is_plural_noun());
1307        }
1308
1309        #[test]
1310        fn pooch_is_non_plural_noun() {
1311            assert!(md("pooch").is_non_plural_noun());
1312        }
1313
1314        #[test]
1315        fn fish_is_singular_noun() {
1316            assert!(md("fish").is_singular_noun());
1317        }
1318
1319        #[test]
1320        fn fish_is_plural_noun() {
1321            assert!(md("fish").is_plural_noun());
1322        }
1323
1324        #[test]
1325        fn fishes_is_plural_noun() {
1326            assert!(md("fishes").is_plural_noun());
1327        }
1328
1329        #[test]
1330        fn sheep_is_singular_noun() {
1331            assert!(md("sheep").is_singular_noun());
1332        }
1333
1334        #[test]
1335        fn sheep_is_plural_noun() {
1336            assert!(md("sheep").is_plural_noun());
1337        }
1338
1339        #[test]
1340        #[should_panic]
1341        fn sheeps_is_not_word() {
1342            md("sheeps");
1343        }
1344
1345        #[test]
1346        fn bicep_is_singular_noun() {
1347            assert!(md("bicep").is_singular_noun());
1348        }
1349
1350        #[test]
1351        fn biceps_is_singular_noun() {
1352            assert!(md("biceps").is_singular_noun());
1353        }
1354
1355        #[test]
1356        fn biceps_is_plural_noun() {
1357            assert!(md("biceps").is_plural_noun());
1358        }
1359
1360        #[test]
1361        fn aircraft_is_singular_noun() {
1362            assert!(md("aircraft").is_singular_noun());
1363        }
1364
1365        #[test]
1366        fn aircraft_is_plural_noun() {
1367            assert!(md("aircraft").is_plural_noun());
1368        }
1369
1370        #[test]
1371        #[should_panic]
1372        fn aircrafts_is_not_word() {
1373            md("aircrafts");
1374        }
1375
1376        #[test]
1377        fn dog_apostrophe_s_is_possessive_noun() {
1378            assert!(md("dog's").is_possessive_noun());
1379        }
1380
1381        #[test]
1382        fn dogs_is_non_possessive_noun() {
1383            assert!(md("dogs").is_non_possessive_noun());
1384        }
1385
1386        // noun countability
1387
1388        #[test]
1389        fn dog_is_countable() {
1390            assert!(md("dog").is_countable_noun());
1391        }
1392        #[test]
1393        fn dog_is_non_mass_noun() {
1394            assert!(md("dog").is_non_mass_noun());
1395        }
1396
1397        #[test]
1398        fn furniture_is_mass_noun() {
1399            assert!(md("furniture").is_mass_noun());
1400        }
1401        #[test]
1402        fn furniture_is_non_countable_noun() {
1403            assert!(md("furniture").is_non_countable_noun());
1404        }
1405
1406        #[test]
1407        fn equipment_is_mass_noun() {
1408            assert!(md("equipment").is_mass_noun());
1409        }
1410        #[test]
1411        fn equipment_is_non_countable_noun() {
1412            assert!(md("equipment").is_non_countable_noun());
1413        }
1414
1415        #[test]
1416        fn beer_is_countable_noun() {
1417            assert!(md("beer").is_countable_noun());
1418        }
1419        #[test]
1420        fn beer_is_mass_noun() {
1421            assert!(md("beer").is_mass_noun());
1422        }
1423    }
1424
1425    mod pronoun {
1426        use crate::dict_word_metadata::tests::md;
1427
1428        mod i_me_myself {
1429            use crate::dict_word_metadata::tests::md;
1430
1431            #[test]
1432            fn i_is_pronoun() {
1433                assert!(md("I").is_pronoun());
1434            }
1435            #[test]
1436            fn i_is_personal_pronoun() {
1437                assert!(md("I").is_personal_pronoun());
1438            }
1439            #[test]
1440            fn i_is_singular_pronoun() {
1441                assert!(md("I").is_singular_pronoun());
1442            }
1443            #[test]
1444            fn i_is_subject_pronoun() {
1445                assert!(md("I").is_subject_pronoun());
1446            }
1447
1448            #[test]
1449            fn me_is_pronoun() {
1450                assert!(md("me").is_pronoun());
1451            }
1452            #[test]
1453            fn me_is_personal_pronoun() {
1454                assert!(md("me").is_personal_pronoun());
1455            }
1456            #[test]
1457            fn me_is_singular_pronoun() {
1458                assert!(md("me").is_singular_pronoun());
1459            }
1460            #[test]
1461            fn me_is_object_pronoun() {
1462                assert!(md("me").is_object_pronoun());
1463            }
1464
1465            #[test]
1466            fn myself_is_pronoun() {
1467                assert!(md("myself").is_pronoun());
1468            }
1469            #[test]
1470            fn myself_is_personal_pronoun() {
1471                assert!(md("myself").is_personal_pronoun());
1472            }
1473            #[test]
1474            fn myself_is_singular_pronoun() {
1475                assert!(md("myself").is_singular_pronoun());
1476            }
1477            #[test]
1478            fn myself_is_reflexive_pronoun() {
1479                assert!(md("myself").is_reflexive_pronoun());
1480            }
1481        }
1482
1483        mod we_us_ourselves {
1484            use crate::dict_word_metadata::tests::md;
1485
1486            #[test]
1487            fn we_is_pronoun() {
1488                assert!(md("we").is_pronoun());
1489            }
1490            #[test]
1491            fn we_is_personal_pronoun() {
1492                assert!(md("we").is_personal_pronoun());
1493            }
1494            #[test]
1495            fn we_is_plural_pronoun() {
1496                assert!(md("we").is_plural_pronoun());
1497            }
1498            #[test]
1499            fn we_is_subject_pronoun() {
1500                assert!(md("we").is_subject_pronoun());
1501            }
1502
1503            #[test]
1504            fn us_is_pronoun() {
1505                assert!(md("us").is_pronoun());
1506            }
1507            #[test]
1508            fn us_is_personal_pronoun() {
1509                assert!(md("us").is_personal_pronoun());
1510            }
1511            #[test]
1512            fn us_is_plural_pronoun() {
1513                assert!(md("us").is_plural_pronoun());
1514            }
1515            #[test]
1516            fn us_is_object_pronoun() {
1517                assert!(md("us").is_object_pronoun());
1518            }
1519
1520            #[test]
1521            fn ourselves_is_pronoun() {
1522                assert!(md("ourselves").is_pronoun());
1523            }
1524            #[test]
1525            fn ourselves_is_personal_pronoun() {
1526                assert!(md("ourselves").is_personal_pronoun());
1527            }
1528            #[test]
1529            fn ourselves_is_plural_pronoun() {
1530                assert!(md("ourselves").is_plural_pronoun());
1531            }
1532            #[test]
1533            fn ourselves_is_reflexive_pronoun() {
1534                assert!(md("ourselves").is_reflexive_pronoun());
1535            }
1536        }
1537
1538        mod you_yourself {
1539            use crate::dict_word_metadata::tests::md;
1540
1541            #[test]
1542            fn you_is_pronoun() {
1543                assert!(md("you").is_pronoun());
1544            }
1545            #[test]
1546            fn you_is_personal_pronoun() {
1547                assert!(md("you").is_personal_pronoun());
1548            }
1549            #[test]
1550            fn you_is_singular_pronoun() {
1551                assert!(md("you").is_singular_pronoun());
1552            }
1553            #[test]
1554            fn you_is_plural_pronoun() {
1555                assert!(md("you").is_plural_pronoun());
1556            }
1557            #[test]
1558            fn you_is_subject_pronoun() {
1559                assert!(md("you").is_subject_pronoun());
1560            }
1561            #[test]
1562            fn you_is_object_pronoun() {
1563                assert!(md("you").is_object_pronoun());
1564            }
1565            #[test]
1566            fn yourself_is_pronoun() {
1567                assert!(md("yourself").is_pronoun());
1568            }
1569            #[test]
1570            fn yourself_is_personal_pronoun() {
1571                assert!(md("yourself").is_personal_pronoun());
1572            }
1573            #[test]
1574            fn yourself_is_singular_pronoun() {
1575                assert!(md("yourself").is_singular_pronoun());
1576            }
1577            #[test]
1578            fn yourself_is_reflexive_pronoun() {
1579                assert!(md("yourself").is_reflexive_pronoun());
1580            }
1581        }
1582
1583        mod he_him_himself {
1584            use crate::dict_word_metadata::tests::md;
1585
1586            #[test]
1587            fn he_is_pronoun() {
1588                assert!(md("he").is_pronoun());
1589            }
1590            #[test]
1591            fn he_is_personal_pronoun() {
1592                assert!(md("he").is_personal_pronoun());
1593            }
1594            #[test]
1595            fn he_is_singular_pronoun() {
1596                assert!(md("he").is_singular_pronoun());
1597            }
1598            #[test]
1599            fn he_is_subject_pronoun() {
1600                assert!(md("he").is_subject_pronoun());
1601            }
1602
1603            #[test]
1604            fn him_is_pronoun() {
1605                assert!(md("him").is_pronoun());
1606            }
1607            #[test]
1608            fn him_is_personal_pronoun() {
1609                assert!(md("him").is_personal_pronoun());
1610            }
1611            #[test]
1612            fn him_is_singular_pronoun() {
1613                assert!(md("him").is_singular_pronoun());
1614            }
1615            #[test]
1616            fn him_is_object_pronoun() {
1617                assert!(md("him").is_object_pronoun());
1618            }
1619
1620            #[test]
1621            fn himself_is_pronoun() {
1622                assert!(md("himself").is_pronoun());
1623            }
1624            #[test]
1625            fn himself_is_personal_pronoun() {
1626                assert!(md("himself").is_personal_pronoun());
1627            }
1628            #[test]
1629            fn himself_is_singular_pronoun() {
1630                assert!(md("himself").is_singular_pronoun());
1631            }
1632            #[test]
1633            fn himself_is_reflexive_pronoun() {
1634                assert!(md("himself").is_reflexive_pronoun());
1635            }
1636        }
1637
1638        mod she_her_herself {
1639            use crate::dict_word_metadata::tests::md;
1640
1641            #[test]
1642            fn she_is_pronoun() {
1643                assert!(md("she").is_pronoun());
1644            }
1645            #[test]
1646            fn she_is_personal_pronoun() {
1647                assert!(md("she").is_personal_pronoun());
1648            }
1649            #[test]
1650            fn she_is_singular_pronoun() {
1651                assert!(md("she").is_singular_pronoun());
1652            }
1653            #[test]
1654            fn she_is_subject_pronoun() {
1655                assert!(md("she").is_subject_pronoun());
1656            }
1657
1658            #[test]
1659            fn her_is_pronoun() {
1660                assert!(md("her").is_pronoun());
1661            }
1662            #[test]
1663            fn her_is_personal_pronoun() {
1664                assert!(md("her").is_personal_pronoun());
1665            }
1666            #[test]
1667            fn her_is_singular_pronoun() {
1668                assert!(md("her").is_singular_pronoun());
1669            }
1670            #[test]
1671            fn her_is_object_pronoun() {
1672                assert!(md("her").is_object_pronoun());
1673            }
1674
1675            #[test]
1676            fn herself_is_pronoun() {
1677                assert!(md("herself").is_pronoun());
1678            }
1679            #[test]
1680            fn herself_is_personal_pronoun() {
1681                assert!(md("herself").is_personal_pronoun());
1682            }
1683            #[test]
1684            fn herself_is_singular_pronoun() {
1685                assert!(md("herself").is_singular_pronoun());
1686            }
1687            #[test]
1688            fn herself_is_reflexive_pronoun() {
1689                assert!(md("herself").is_reflexive_pronoun());
1690            }
1691        }
1692
1693        mod it_itself {
1694            use crate::dict_word_metadata::tests::md;
1695
1696            #[test]
1697            fn it_is_pronoun() {
1698                assert!(md("it").is_pronoun());
1699            }
1700            #[test]
1701            fn it_is_personal_pronoun() {
1702                assert!(md("it").is_personal_pronoun());
1703            }
1704            #[test]
1705            fn it_is_singular_pronoun() {
1706                assert!(md("it").is_singular_pronoun());
1707            }
1708            #[test]
1709            fn it_is_subject_pronoun() {
1710                assert!(md("it").is_subject_pronoun());
1711            }
1712            #[test]
1713            fn it_is_object_pronoun() {
1714                assert!(md("it").is_object_pronoun());
1715            }
1716
1717            #[test]
1718            fn itself_is_pronoun() {
1719                assert!(md("itself").is_pronoun());
1720            }
1721            #[test]
1722            fn itself_is_personal_pronoun() {
1723                assert!(md("itself").is_personal_pronoun());
1724            }
1725            #[test]
1726            fn itself_is_singular_pronoun() {
1727                assert!(md("itself").is_singular_pronoun());
1728            }
1729            #[test]
1730            fn itself_is_reflexive_pronoun() {
1731                assert!(md("itself").is_reflexive_pronoun());
1732            }
1733        }
1734
1735        mod they_them_themselves {
1736            use crate::dict_word_metadata::tests::md;
1737
1738            #[test]
1739            fn they_is_pronoun() {
1740                assert!(md("they").is_pronoun());
1741            }
1742            #[test]
1743            fn they_is_personal_pronoun() {
1744                assert!(md("they").is_personal_pronoun());
1745            }
1746            #[test]
1747            fn they_is_plural_pronoun() {
1748                assert!(md("they").is_plural_pronoun());
1749            }
1750            #[test]
1751            fn they_is_subject_pronoun() {
1752                assert!(md("they").is_subject_pronoun());
1753            }
1754
1755            #[test]
1756            fn them_is_pronoun() {
1757                assert!(md("them").is_pronoun());
1758            }
1759            #[test]
1760            fn them_is_personal_pronoun() {
1761                assert!(md("them").is_personal_pronoun());
1762            }
1763            #[test]
1764            fn them_is_plural_pronoun() {
1765                assert!(md("them").is_plural_pronoun());
1766            }
1767            #[test]
1768            fn them_is_object_pronoun() {
1769                assert!(md("them").is_object_pronoun());
1770            }
1771
1772            #[test]
1773            fn themselves_is_pronoun() {
1774                assert!(md("themselves").is_pronoun());
1775            }
1776            #[test]
1777            fn themselves_is_personal_pronoun() {
1778                assert!(md("themselves").is_personal_pronoun());
1779            }
1780            #[test]
1781            fn themselves_is_plural_pronoun() {
1782                assert!(md("themselves").is_plural_pronoun());
1783            }
1784            #[test]
1785            fn themselves_is_reflexive_pronoun() {
1786                assert!(md("themselves").is_reflexive_pronoun());
1787            }
1788        }
1789
1790        // Possessive pronouns (not to be confused with possessive adjectives/determiners)
1791        #[test]
1792        fn mine_is_pronoun() {
1793            assert!(md("mine").is_pronoun());
1794        }
1795        #[test]
1796        fn ours_is_pronoun() {
1797            assert!(md("ours").is_pronoun());
1798        }
1799        #[test]
1800        fn yours_is_pronoun() {
1801            assert!(md("yours").is_pronoun());
1802        }
1803        #[test]
1804        fn his_is_pronoun() {
1805            assert!(md("his").is_pronoun());
1806        }
1807        #[test]
1808        fn hers_is_pronoun() {
1809            assert!(md("hers").is_pronoun());
1810        }
1811        #[test]
1812        fn its_is_pronoun() {
1813            assert!(md("its").is_pronoun());
1814        }
1815        #[test]
1816        fn theirs_is_pronoun() {
1817            assert!(md("theirs").is_pronoun());
1818        }
1819
1820        // archaic pronouns
1821        #[test]
1822        fn archaic_pronouns() {
1823            assert!(md("thou").is_pronoun());
1824            assert!(md("thee").is_pronoun());
1825            assert!(md("thyself").is_pronoun());
1826            assert!(md("thine").is_pronoun());
1827        }
1828
1829        // generic pronouns
1830        #[test]
1831        fn generic_pronouns() {
1832            assert!(md("one").is_pronoun());
1833            assert!(md("oneself").is_pronoun());
1834        }
1835
1836        // relative and interrogative pronouns
1837        #[test]
1838        fn relative_and_interrogative_pronouns() {
1839            assert!(md("who").is_pronoun());
1840            assert!(md("whom").is_pronoun());
1841            assert!(md("whose").is_pronoun());
1842            assert!(md("which").is_pronoun());
1843            assert!(md("what").is_pronoun());
1844        }
1845
1846        // nonstandard pronouns
1847        #[test]
1848        #[ignore = "not in dictionary"]
1849        fn nonstandard_pronouns() {
1850            assert!(md("themself").pronoun.is_some());
1851            assert!(md("y'all'").pronoun.is_some());
1852        }
1853    }
1854
1855    mod nominal {
1856        use crate::dict_word_metadata::tests::md;
1857
1858        #[test]
1859        fn my_is_possessive_nominal() {
1860            assert!(md("my").is_possessive_nominal());
1861        }
1862
1863        #[test]
1864        fn mine_is_not_possessive_nominal() {
1865            assert!(!md("mine").is_possessive_nominal());
1866        }
1867
1868        #[test]
1869        fn freds_is_possessive_nominal() {
1870            assert!(md("Fred's").is_possessive_nominal());
1871        }
1872
1873        #[test]
1874        fn fred_is_not_possessive_nominal() {
1875            assert!(!md("Fred").is_possessive_nominal());
1876        }
1877
1878        #[test]
1879        fn dogs_is_possessive_nominal() {
1880            assert!(md("dog's").is_possessive_nominal());
1881        }
1882
1883        #[test]
1884        fn microsofts_is_possessive_nominal() {
1885            assert!(md("Microsoft's").is_possessive_nominal());
1886        }
1887    }
1888
1889    mod adjective {
1890        use crate::{Degree, dict_word_metadata::tests::md};
1891
1892        // Getting degrees
1893
1894        #[test]
1895        #[ignore = "not marked yet because it might not be reliable"]
1896        fn big_is_positive() {
1897            assert_eq!(md("big").get_degree(), Some(Degree::Positive));
1898        }
1899
1900        #[test]
1901        fn bigger_is_comparative() {
1902            assert_eq!(md("bigger").get_degree(), Some(Degree::Comparative));
1903        }
1904
1905        #[test]
1906        fn biggest_is_superlative() {
1907            assert_eq!(md("biggest").get_degree(), Some(Degree::Superlative));
1908        }
1909
1910        #[test]
1911        #[should_panic(expected = "Word 'bigly' not found in dictionary")]
1912        fn bigly_is_not_an_adjective_form_we_track() {
1913            assert_eq!(md("bigly").get_degree(), None);
1914        }
1915
1916        // Calling is_ methods
1917
1918        // TODO: positive degree not implemented
1919
1920        #[test]
1921        fn bigger_is_comparative_adjective() {
1922            assert!(md("bigger").is_comparative_adjective());
1923        }
1924
1925        #[test]
1926        fn biggest_is_superlative_adjective() {
1927            assert!(md("biggest").is_superlative_adjective());
1928        }
1929    }
1930
1931    #[test]
1932    fn the_is_determiner() {
1933        assert!(md("the").is_determiner());
1934    }
1935    #[test]
1936    fn this_is_demonstrative_determiner() {
1937        assert!(md("this").is_demonstrative_determiner());
1938    }
1939    #[test]
1940    fn your_is_possessive_determiner() {
1941        assert!(md("your").is_possessive_determiner());
1942    }
1943
1944    #[test]
1945    fn every_is_quantifier() {
1946        assert!(md("every").is_quantifier());
1947    }
1948
1949    #[test]
1950    fn the_isnt_quantifier() {
1951        assert!(!md("the").is_quantifier());
1952    }
1953
1954    #[test]
1955    fn equipment_is_mass_noun() {
1956        assert!(md("equipment").is_mass_noun());
1957    }
1958
1959    #[test]
1960    fn equipment_is_non_countable_noun() {
1961        assert!(md("equipment").is_non_countable_noun());
1962    }
1963
1964    #[test]
1965    fn equipment_isnt_countable_noun() {
1966        assert!(!md("equipment").is_countable_noun());
1967    }
1968
1969    mod verb {
1970        use crate::dict_word_metadata::tests::md;
1971
1972        #[test]
1973        fn lemma_walk() {
1974            let md = md("walk");
1975            assert!(md.is_verb_lemma())
1976        }
1977
1978        #[test]
1979        fn lemma_fix() {
1980            let md = md("fix");
1981            assert!(md.is_verb_lemma())
1982        }
1983
1984        #[test]
1985        fn progressive_walking() {
1986            let md = md("walking");
1987            assert!(md.is_verb_progressive_form())
1988        }
1989
1990        #[test]
1991        fn past_walked() {
1992            let md = md("walked");
1993            assert!(md.is_verb_past_form())
1994        }
1995
1996        #[test]
1997        fn regular_past_thought() {
1998            let md = md("thought");
1999            assert!(md.is_verb_regular_past_form())
2000        }
2001
2002        #[test]
2003        fn simple_past_ate() {
2004            let md = md("ate");
2005            assert!(md.is_verb_simple_past_form())
2006        }
2007
2008        #[test]
2009        fn past_participle_eaten() {
2010            let md = md("eaten");
2011            assert!(md.is_verb_past_participle_form())
2012        }
2013
2014        #[test]
2015        fn ate_is_simple_past_only() {
2016            let md = md("ate");
2017            assert!(md.is_verb_simple_past_only());
2018            assert!(!md.is_verb_past_participle_only());
2019        }
2020
2021        #[test]
2022        fn eaten_is_past_participle_only() {
2023            let md = md("eaten");
2024            assert!(md.is_verb_past_participle_only());
2025            assert!(!md.is_verb_simple_past_only());
2026        }
2027
2028        #[test]
2029        fn thought_is_neither_past_form_only() {
2030            let md = md("thought");
2031            assert!(!md.is_verb_simple_past_only());
2032            assert!(!md.is_verb_past_participle_only());
2033        }
2034
2035        #[test]
2036        fn shared_past_forms_are_neither_past_form_only() {
2037            let md = md("thought");
2038            assert!(!md.is_verb_simple_past_only());
2039            assert!(!md.is_verb_past_participle_only());
2040            assert!(md.is_verb_regular_past_form());
2041        }
2042
2043        #[test]
2044        fn distinct_past_forms_are_not_regular_past() {
2045            assert!(!md("ate").is_verb_regular_past_form());
2046            assert!(!md("eaten").is_verb_regular_past_form());
2047            assert!(!md("walked").is_verb_regular_past_form());
2048        }
2049
2050        #[test]
2051        fn third_pers_sing_walks() {
2052            let md = md("walks");
2053            assert!(md.is_verb_third_person_singular_present_form())
2054        }
2055    }
2056}