harper_core/
word_metadata.rs

1use harper_brill::UPOS;
2use is_macro::Is;
3use paste::paste;
4use serde::{Deserialize, Serialize};
5use strum_macros::{Display, EnumString};
6
7use std::convert::TryFrom;
8
9use crate::WordId;
10
/// Linguistic metadata attached to a single word.
///
/// Each part-of-speech field is `Some` when the word is known to be usable as that part of
/// speech; several may be `Some` at once. The nested data structs carry finer-grained,
/// individually optional properties.
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Hash)]
pub struct WordMetadata {
    /// Noun-specific properties, present if the word can be a noun.
    pub noun: Option<NounData>,
    /// Pronoun-specific properties, present if the word can be a pronoun.
    pub pronoun: Option<PronounData>,
    /// Verb-specific properties, present if the word can be a verb.
    pub verb: Option<VerbData>,
    /// Adjective-specific properties, present if the word can be an adjective.
    pub adjective: Option<AdjectiveData>,
    /// Adverb-specific properties, present if the word can be an adverb.
    pub adverb: Option<AdverbData>,
    /// Conjunction-specific properties, present if the word can be a conjunction.
    pub conjunction: Option<ConjunctionData>,
    /// Whether the word is a swear. `None` means it is not known either way.
    pub swear: Option<bool>,
    /// The dialects this word belongs to.
    /// If no dialects are defined, it can be assumed that the word is
    /// valid in all dialects of English.
    #[serde(default = "default_default")]
    pub dialects: DialectFlags,
    /// Determiner-specific properties, present if the word can be a
    /// [determiner](https://en.wikipedia.org/wiki/English_determiners).
    pub determiner: Option<DeterminerData>,
    /// Whether the word is a [preposition](https://www.merriam-webster.com/dictionary/preposition).
    #[serde(default = "default_false")]
    pub preposition: bool,
    /// Whether the word is considered especially common.
    #[serde(default = "default_false")]
    pub common: bool,
    /// Identifier of another word this one was derived from, if any.
    // NOTE(review): presumably points at the base/lemma entry; confirm against the code that
    // populates it.
    #[serde(default = "default_none")]
    pub derived_from: Option<WordId>,
    /// Generated by a chunker
    // NOTE(review): presumably "noun phrase member"; confirm against the chunker.
    pub np_member: Option<bool>,
    /// Generated by a POS tagger
    pub pos_tag: Option<UPOS>,
}
40
/// Fallback used by `serde` for boolean fields that are missing from the input: always
/// `false`.
fn default_false() -> bool {
    // `bool`'s `Default` value is `false`.
    bool::default()
}
45
/// Fallback used by `serde` for optional fields that are missing from the input: always
/// `None`, whatever the payload type.
fn default_none<T>() -> Option<T> {
    // `Option<T>` implements `Default` for every `T`, and that default is `None`.
    Option::default()
}
50
/// Fallback used by `serde` for fields that are missing from the input: the field type's own
/// `Default` value.
fn default_default<T: Default>() -> T {
    Default::default()
}
55
// Generates the boolean part-of-speech query methods. Meant to be invoked inside an `impl`
// block of the metadata type.
//
// Input grammar: `<category> has <sub>, <sub>. <category> has <sub>. <category> has`
//   - categories are separated by `.`
//   - each category is followed by `has` and a (possibly empty) comma-separated list of
//     sub-properties
//
// Requirements on the surrounding type, as used by the generated code:
//   - `self.<category>` is an `Option<<Category>Data>`
//   - `<Category>Data` has a field `is_<sub>: Option<bool>` for every listed `<sub>`
//   - `self.preposition` is a `bool`
//
// Generated methods:
//   - `is_likely_homograph()`
//   - `is_<category>()` for every category
//   - `is_<sub>_<category>()` and `is_not_<sub>_<category>()` for every sub-property
macro_rules! generate_metadata_queries {
    ($($category:ident has $($sub:ident),*).*) => {
        paste! {
            /// Checks whether the word can act as more than one part of speech.
            ///
            /// Each listed category counts once, as does the `preposition` flag (which has no
            /// data struct and therefore is not a category).
            pub fn is_likely_homograph(&self) -> bool {
                // Only flags that are NOT categories may be listed explicitly here. Adding a
                // category such as `is_determiner()` by hand would count it twice and make
                // every word of that category look like a homograph.
                [self.preposition, $(
                    self.[< is_ $category >](),
                )*].iter().filter(|&&is_pos| is_pos).count() > 1
            }

            $(
                #[doc = concat!("Checks if the word is definitely a ", stringify!($category), ".")]
                pub fn [< is_ $category >](&self) -> bool {
                    self.$category.is_some()
                }

                $(
                    #[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as (a) ", stringify!($sub), ".")]
                    pub fn [< is_ $sub _ $category >](&self) -> bool {
                        // True only for an explicit `Some(true)`; an unknown (`None`)
                        // sub-property does not count.
                        matches!(
                            self.$category,
                            Some([< $category:camel Data >]{
                                [< is_ $sub >]: Some(true),
                                ..
                            })
                        )
                    }


                    #[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as __not__ (a) ", stringify!($sub), ".")]
                    pub fn [< is_not_ $sub _ $category >](&self) -> bool {
                        // True only for an explicit `Some(false)`; this is not the negation
                        // of the method above, since `None` satisfies neither.
                        matches!(
                            self.$category,
                            Some([< $category:camel Data >]{
                                [< is_ $sub >]: Some(false),
                                ..
                            })
                        )
                    }
                )*
            )*
        }
    };
}
99
100impl WordMetadata {
101    /// Produce a copy of `self` with the known properties of `other` set.
102    pub fn or(&self, other: &Self) -> Self {
103        macro_rules! merge {
104            ($a:expr, $b:expr) => {
105                match ($a, $b) {
106                    (Some(a), Some(b)) => Some(a.or(&b)),
107                    (Some(a), None) => Some(a),
108                    (None, Some(b)) => Some(b),
109                    (None, None) => None,
110                }
111            };
112        }
113
114        Self {
115            noun: merge!(self.noun, other.noun),
116            pronoun: merge!(self.pronoun, other.pronoun),
117            verb: merge!(self.verb, other.verb),
118            adjective: merge!(self.adjective, other.adjective),
119            adverb: merge!(self.adverb, other.adverb),
120            conjunction: merge!(self.conjunction, other.conjunction),
121            dialects: self.dialects | other.dialects,
122            swear: self.swear.or(other.swear),
123            determiner: merge!(self.determiner, other.determiner),
124            preposition: self.preposition || other.preposition,
125            common: self.common || other.common,
126            derived_from: self.derived_from.or(other.derived_from),
127            pos_tag: self.pos_tag.or(other.pos_tag),
128            np_member: self.np_member.or(other.np_member),
129        }
130    }
131
132    /// Given a UPOS tag, discard any metadata that would disagree with the given POS tag.
133    /// For example, if the metadata suggests a word could either be a noun or an adjective, and we
134    /// provide a [`UPOS::NOUN`], this function will remove the adjective data.
135    ///
136    /// Additionally, if the metadata does not currently declare the potential of the word to be
137    /// the specific POS, it becomes so. That means if we provide a [`UPOS::ADJ`] to the function
138    /// for a metadata whose `Self::adjective = None`, it will become `Some`.
139    pub fn enforce_pos_exclusivity(&mut self, pos: &UPOS) {
140        use UPOS::*;
141        match pos {
142            NOUN => {
143                if let Some(noun) = self.noun {
144                    self.noun = Some(NounData {
145                        is_proper: Some(false),
146                        ..noun
147                    })
148                } else {
149                    self.noun = Some(NounData {
150                        is_proper: Some(false),
151                        is_plural: None,
152                        is_possessive: None,
153                    })
154                }
155
156                self.pronoun = None;
157                self.verb = None;
158                self.adjective = None;
159                self.adverb = None;
160                self.conjunction = None;
161                self.determiner = None;
162                self.preposition = false;
163            }
164            PROPN => {
165                if let Some(noun) = self.noun {
166                    self.noun = Some(NounData {
167                        is_proper: Some(true),
168                        ..noun
169                    })
170                } else {
171                    self.noun = Some(NounData {
172                        is_proper: Some(true),
173                        is_plural: None,
174                        is_possessive: None,
175                    })
176                }
177
178                self.pronoun = None;
179                self.verb = None;
180                self.adjective = None;
181                self.adverb = None;
182                self.conjunction = None;
183                self.determiner = None;
184                self.preposition = false;
185            }
186            PRON => {
187                if self.pronoun.is_none() {
188                    self.pronoun = Some(PronounData::default())
189                }
190
191                self.noun = None;
192                self.verb = None;
193                self.adjective = None;
194                self.adverb = None;
195                self.conjunction = None;
196                self.determiner = None;
197                self.preposition = false;
198            }
199            VERB => {
200                if let Some(verb) = self.verb {
201                    self.verb = Some(VerbData {
202                        is_auxiliary: Some(false),
203                        ..verb
204                    })
205                } else {
206                    self.verb = Some(VerbData {
207                        is_auxiliary: Some(false),
208                        ..Default::default()
209                    })
210                }
211
212                self.noun = None;
213                self.pronoun = None;
214                self.adjective = None;
215                self.adverb = None;
216                self.conjunction = None;
217                self.determiner = None;
218                self.preposition = false;
219            }
220            AUX => {
221                if let Some(verb) = self.verb {
222                    self.verb = Some(VerbData {
223                        is_auxiliary: Some(true),
224                        ..verb
225                    })
226                } else {
227                    self.verb = Some(VerbData {
228                        is_auxiliary: Some(true),
229                        ..Default::default()
230                    })
231                }
232
233                self.noun = None;
234                self.pronoun = None;
235                self.adjective = None;
236                self.adverb = None;
237                self.conjunction = None;
238                self.determiner = None;
239                self.preposition = false;
240            }
241            ADJ => {
242                if self.adjective.is_none() {
243                    self.adjective = Some(AdjectiveData::default())
244                }
245
246                self.noun = None;
247                self.pronoun = None;
248                self.verb = None;
249                self.adverb = None;
250                self.conjunction = None;
251                self.determiner = None;
252                self.preposition = false;
253            }
254            ADV => {
255                if self.adverb.is_none() {
256                    self.adverb = Some(AdverbData::default())
257                }
258
259                self.noun = None;
260                self.pronoun = None;
261                self.verb = None;
262                self.adjective = None;
263                self.conjunction = None;
264                self.determiner = None;
265                self.preposition = false;
266            }
267            ADP => {
268                self.noun = None;
269                self.pronoun = None;
270                self.verb = None;
271                self.adjective = None;
272                self.adverb = None;
273                self.conjunction = None;
274                self.determiner = None;
275                self.preposition = true;
276            }
277            DET => {
278                self.noun = None;
279                self.pronoun = None;
280                self.verb = None;
281                self.adjective = None;
282                self.adverb = None;
283                self.conjunction = None;
284                self.preposition = false;
285                self.determiner = Some(DeterminerData::default());
286            }
287            CCONJ | SCONJ => {
288                if self.conjunction.is_none() {
289                    self.conjunction = Some(ConjunctionData::default())
290                }
291
292                self.noun = None;
293                self.pronoun = None;
294                self.verb = None;
295                self.adjective = None;
296                self.adverb = None;
297                self.determiner = None;
298                self.preposition = false;
299            }
300            _ => {}
301        }
302    }
303
    // Generates `is_likely_homograph`, plus, for every category listed below, `is_<category>`
    // and, for each of its sub-properties, `is_<sub>_<category>` / `is_not_<sub>_<category>`
    // (e.g. `is_noun`, `is_proper_noun`, `is_not_proper_noun`).
    // Categories are separated by `.`; a category with nothing after `has` only gets
    // `is_<category>`.
    generate_metadata_queries!(
        noun has proper, plural, possessive.
        pronoun has plural, possessive, reflexive.
        determiner has demonstrative, possessive.
        verb has linking, auxiliary.
        conjunction has.
        adjective has.
        adverb has
    );
313
314    pub fn is_verb_lemma(&self) -> bool {
315        matches!(
316            self.verb,
317            Some(VerbData {
318                verb_form: Some(VerbForm::LemmaForm),
319                ..
320            })
321        )
322    }
323
324    pub fn is_verb_past_form(&self) -> bool {
325        matches!(
326            self.verb,
327            Some(VerbData {
328                verb_form: Some(VerbForm::PastForm),
329                ..
330            })
331        )
332    }
333
334    pub fn is_verb_progressive_form(&self) -> bool {
335        matches!(
336            self.verb,
337            Some(VerbData {
338                verb_form: Some(VerbForm::ProgressiveForm),
339                ..
340            })
341        )
342    }
343
344    pub fn is_verb_third_person_singular_present_form(&self) -> bool {
345        matches!(
346            self.verb,
347            Some(VerbData {
348                verb_form: Some(VerbForm::ThirdPersonSingularPresentForm),
349                ..
350            })
351        )
352    }
353
354    /// Checks if the word is definitely nominal.
355    pub fn is_nominal(&self) -> bool {
356        self.noun.is_some() || self.pronoun.is_some()
357    }
358
359    /// Checks if the word is definitely a nominal and more specifically is labeled as (a) plural.
360    pub fn is_plural_nominal(&self) -> bool {
361        matches!(
362            self.noun,
363            Some(NounData {
364                is_plural: Some(true),
365                ..
366            })
367        ) || matches!(
368            self.pronoun,
369            Some(PronounData {
370                is_plural: Some(true),
371                ..
372            })
373        )
374    }
375
376    /// Checks if the word is definitely a nominal and more specifically is labeled as (a) possessive.
377    pub fn is_possessive_nominal(&self) -> bool {
378        matches!(
379            self.noun,
380            Some(NounData {
381                is_possessive: Some(true),
382                ..
383            })
384        ) || matches!(
385            self.pronoun,
386            Some(PronounData {
387                is_possessive: Some(true),
388                ..
389            })
390        )
391    }
392
393    /// Checks if the word is definitely a nominal and more specifically is labeled as __not__ (a) plural.
394    pub fn is_not_plural_nominal(&self) -> bool {
395        matches!(
396            self.noun,
397            Some(NounData {
398                is_plural: Some(false),
399                ..
400            })
401        ) || matches!(
402            self.pronoun,
403            Some(PronounData {
404                is_plural: Some(false),
405                ..
406            })
407        )
408    }
409
410    /// Checks if the word is definitely a nominal and more specifically is labeled as __not__ (a) possessive.
411    pub fn is_not_possessive_nominal(&self) -> bool {
412        matches!(
413            self.noun,
414            Some(NounData {
415                is_possessive: Some(false),
416                ..
417            })
418        ) && matches!(
419            self.pronoun,
420            Some(PronounData {
421                is_possessive: Some(false),
422                ..
423            })
424        )
425    }
426
427    /// Checks whether a word is _definitely_ a swear.
428    pub fn is_swear(&self) -> bool {
429        matches!(self.swear, Some(true))
430    }
431
432    /// Same thing as [`Self::or`], except in-place rather than a clone.
433    pub fn append(&mut self, other: &Self) -> &mut Self {
434        *self = self.or(other);
435        self
436    }
437}
438
439// These verb forms are morphological variations, distinct from TAM (Tense-Aspect-Mood)
440// Each form can be used in various TAM combinations:
441// - Lemma form (infinitive, citation form, dictionary form)
442//   Used in infinitives (e.g., "to sleep"), imperatives (e.g., "sleep!"), and with modals (e.g., "will sleep")
443// - Past form (past participle and simple past)
444//   Used as verbs (e.g., "slept") or adjectives (e.g., "closed door")
445// - Progressive form (present participle and gerund)
446//   Used as verbs (e.g., "sleeping"), nouns (e.g., "sleeping is important"), or adjectives (e.g., "sleeping dog")
447// - Third person singular present (-s/-es)
448//   Used for third person singular subjects (e.g., "he sleeps", "she reads")
449//
450// Important notes:
451// 1. English expresses time through auxiliary verbs, not verb form alone
452// 2. Irregular verbs can have different forms for past participle and simple past
453// 3. Future is always expressed through auxiliary verbs (e.g., "will sleep", "going to sleep")
/// The morphological form of a verb. This is distinct from tense, aspect and mood, which
/// English expresses with the help of auxiliary verbs.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Is, Hash)]
pub enum VerbForm {
    /// Infinitive / citation / dictionary form (e.g. "sleep").
    LemmaForm,
    /// Past participle and simple past (e.g. "slept").
    PastForm,
    /// Present participle and gerund (e.g. "sleeping").
    ProgressiveForm,
    /// Third person singular present, formed with -s/-es (e.g. "sleeps").
    ThirdPersonSingularPresentForm,
}
461
/// Verb-specific metadata. Every property is optional; `None` means "not known".
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct VerbData {
    /// Whether the verb is a linking verb.
    pub is_linking: Option<bool>,
    /// Whether the verb is an auxiliary verb.
    pub is_auxiliary: Option<bool>,
    /// The morphological form of the verb, see [`VerbForm`].
    pub verb_form: Option<VerbForm>,
}
468
469impl VerbData {
470    /// Produce a copy of `self` with the known properties of `other` set.
471    pub fn or(&self, other: &Self) -> Self {
472        Self {
473            is_linking: self.is_linking.or(other.is_linking),
474            is_auxiliary: self.is_auxiliary.or(other.is_auxiliary),
475            verb_form: self.verb_form.or(other.verb_form),
476        }
477    }
478}
479
480// TODO other noun properties may be worth adding:
481// TODO count vs mass; abstract
/// Noun-specific metadata. Every property is optional; `None` means "not known".
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct NounData {
    /// Whether the noun is a proper noun.
    pub is_proper: Option<bool>,
    /// Whether the noun is plural.
    pub is_plural: Option<bool>,
    /// Whether the noun is possessive.
    pub is_possessive: Option<bool>,
}
488
489impl NounData {
490    /// Produce a copy of `self` with the known properties of `other` set.
491    pub fn or(&self, other: &Self) -> Self {
492        Self {
493            is_proper: self.is_proper.or(other.is_proper),
494            is_plural: self.is_plural.or(other.is_plural),
495            is_possessive: self.is_possessive.or(other.is_possessive),
496        }
497    }
498}
499
500// Person is a property of pronouns; the verb 'be', plus all verbs reflect 3rd person singular with -s
/// Grammatical person, recorded for pronouns (see [`PronounData::person`]).
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Is, Hash)]
pub enum Person {
    /// First person.
    First,
    /// Second person.
    Second,
    /// Third person.
    Third,
}
507
508// case is a property of pronouns
/// Grammatical case, recorded for pronouns (see [`PronounData::case`]).
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Is, Hash)]
pub enum Case {
    /// Subject case.
    Subject,
    /// Object case.
    Object,
}
514
515// TODO for now focused on personal pronouns?
/// Pronoun-specific metadata. Every property is optional; `None` means "not known".
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct PronounData {
    /// Whether the pronoun is plural.
    pub is_plural: Option<bool>,
    /// Whether the pronoun is possessive.
    pub is_possessive: Option<bool>,
    /// Whether the pronoun is reflexive.
    pub is_reflexive: Option<bool>,
    /// Grammatical person of the pronoun.
    pub person: Option<Person>,
    /// Grammatical case of the pronoun.
    pub case: Option<Case>,
}
524
525impl PronounData {
526    /// Produce a copy of `self` with the known properties of `other` set.
527    pub fn or(&self, other: &Self) -> Self {
528        Self {
529            is_plural: self.is_plural.or(other.is_plural),
530            is_possessive: self.is_possessive.or(other.is_possessive),
531            is_reflexive: self.is_reflexive.or(other.is_reflexive),
532            person: self.person.or(other.person),
533            case: self.case.or(other.case),
534        }
535    }
536}
537
/// Determiner-specific metadata. Every property is optional; `None` means "not known".
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct DeterminerData {
    /// Whether the determiner is demonstrative.
    pub is_demonstrative: Option<bool>,
    /// Whether the determiner is possessive.
    pub is_possessive: Option<bool>,
}
543
544impl DeterminerData {
545    /// Produce a copy of `self` with the known properties of `other` set.
546    pub fn or(&self, other: &Self) -> Self {
547        Self {
548            is_demonstrative: self.is_demonstrative.or(other.is_demonstrative),
549            is_possessive: self.is_possessive.or(other.is_possessive),
550        }
551    }
552}
553
554// Degree is a property of adjectives: positive is not inflected
555// Comparative is inflected with -er or comes after the word "more"
556// Superlative is inflected with -est or comes after the word "most"
/// Degree of comparison of an adjective.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Is, Hash)]
pub enum Degree {
    /// The uninflected form.
    Positive,
    /// Inflected with -er, or used after the word "more".
    Comparative,
    /// Inflected with -est, or used after the word "most".
    Superlative,
}
563
564// Some adjectives are not comparable so don't have -er or -est forms and can't be used with "more" or "most".
565// Some adjectives can only be used "attributively" (before a noun); some only predicatively (after "is" etc.).
566// In old grammars words like the articles and determiners are classified as adjectives but behave differently.
/// Adjective-specific metadata.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct AdjectiveData {
    /// Degree of comparison, if known.
    pub degree: Option<Degree>,
}
571
572impl AdjectiveData {
573    /// Produce a copy of `self` with the known properties of `other` set.
574    pub fn or(&self, other: &Self) -> Self {
575        Self {
576            degree: self.degree.or(other.degree),
577        }
578    }
579}
580
581// Adverb can be a "junk drawer" category for words which don't fit the other major categories.
582// The typical adverbs are "adverbs of manner", those derived from adjectives in -ly
583// other adverbs (time, place, etc) should probably not be considered adverbs for Harper's purposes
/// Adverb-specific metadata. Currently carries no properties; its presence alone marks a word
/// as an adverb.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct AdverbData {}
586
587impl AdverbData {
588    /// Produce a copy of `self` with the known properties of `other` set.
589    pub fn or(&self, _other: &Self) -> Self {
590        Self {}
591    }
592}
593
/// Conjunction-specific metadata. Currently carries no properties; its presence alone marks a
/// word as a conjunction.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct ConjunctionData {}
596
597impl ConjunctionData {
598    /// Produce a copy of `self` with the known properties of `other` set.
599    pub fn or(&self, _other: &Self) -> Self {
600        Self {}
601    }
602}
603
/// A regional dialect.
///
/// Each variant's discriminant is a single bit, so a variant can be cast directly to the
/// matching [`DialectFlags`] flag.
#[derive(
    Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, EnumString, Display,
)]
pub enum Dialect {
    // Note: these have bit-shifted values so that they can ergonomically integrate with
    // `DialectFlags`. Each value here must have a unique bit index inside
    // `DialectFlagsUnderlyingType`.
    American = 1 << 0,
    Canadian = 1 << 1,
    Australian = 1 << 2,
    British = 1 << 3,
}
617impl TryFrom<DialectFlags> for Dialect {
618    type Error = ();
619
620    /// Attempts to convert `DialectFlags` to a single `Dialect`.
621    ///
622    /// # Errors
623    ///
624    /// Will return `Err` if more than one dialect is enabled or if an undefined dialect is
625    /// enabled.
626    fn try_from(dialect_flags: DialectFlags) -> Result<Self, Self::Error> {
627        // Ensure only one dialect is enabled before converting.
628        if dialect_flags.bits().count_ones() == 1 {
629            match dialect_flags {
630                df if df.is_dialect_enabled_strict(Dialect::American) => Ok(Dialect::American),
631                df if df.is_dialect_enabled_strict(Dialect::Canadian) => Ok(Dialect::Canadian),
632                df if df.is_dialect_enabled_strict(Dialect::Australian) => Ok(Dialect::Australian),
633                df if df.is_dialect_enabled_strict(Dialect::British) => Ok(Dialect::British),
634                _ => Err(()),
635            }
636        } else {
637            // More than one dialect enabled; can't soundly convert.
638            Err(())
639        }
640    }
641}
642
// The integer type backing `DialectFlags`.
// A `u8` has room for 8 dialects, one bit each. Defining more than 8 dialects requires
// switching this to a larger type.
type DialectFlagsUnderlyingType = u8;
647
bitflags::bitflags! {
    /// A collection of bit flags used to represent enabled dialects.
    ///
    /// This is generally used to allow a word (or similar) to be tagged with multiple dialects.
    /// Each flag reuses the bit value of the corresponding [`Dialect`] variant.
    #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash)]
    #[serde(transparent)]
    pub struct DialectFlags: DialectFlagsUnderlyingType {
        /// Flag for [`Dialect::American`].
        const AMERICAN = Dialect::American as DialectFlagsUnderlyingType;
        /// Flag for [`Dialect::Canadian`].
        const CANADIAN = Dialect::Canadian as DialectFlagsUnderlyingType;
        /// Flag for [`Dialect::Australian`].
        const AUSTRALIAN = Dialect::Australian as DialectFlagsUnderlyingType;
        /// Flag for [`Dialect::British`].
        const BRITISH = Dialect::British as DialectFlagsUnderlyingType;
    }
}
661impl DialectFlags {
662    /// Checks if the provided dialect is enabled.
663    /// If no dialect is explicitly enabled, it is assumed that all dialects are enabled.
664    #[must_use]
665    pub fn is_dialect_enabled(self, dialect: Dialect) -> bool {
666        self.is_empty() || self.intersects(Self::from_dialect(dialect))
667    }
668
669    /// Checks if the provided dialect is ***explicitly*** enabled.
670    ///
671    /// Unlike `is_dialect_enabled`, this will return false when no dialects are explicitly
672    /// enabled.
673    #[must_use]
674    pub fn is_dialect_enabled_strict(self, dialect: Dialect) -> bool {
675        self.intersects(Self::from_dialect(dialect))
676    }
677
678    /// Constructs a `DialectFlags` from the provided `Dialect`, with only that dialect being
679    /// enabled.
680    ///
681    /// # Panics
682    ///
683    /// This will panic if `dialect` represents a dialect that is not defined in
684    /// `DialectFlags`.
685    #[must_use]
686    pub fn from_dialect(dialect: Dialect) -> Self {
687        let Some(out) = Self::from_bits(dialect as DialectFlagsUnderlyingType) else {
688            panic!("The '{dialect}' dialect isn't defined in DialectFlags!");
689        };
690        out
691    }
692}
693impl Default for DialectFlags {
694    /// A default value with no dialects explicitly enabled.
695    /// Implicitly, this state corresponds to all dialects being enabled.
696    fn default() -> Self {
697        Self::empty()
698    }
699}