harper_core/
token_kind.rs

1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{DictWordMetadata, Number, Punctuation, Quote, TokenKind::Word};
6
/// Expands each listed method name into a `pub fn name(&self) -> bool` that
/// forwards the call to the inner [`DictWordMetadata`].
///
/// The generated method answers `false` unless `self` is a `Word` that actually
/// carries metadata. Every generated method also receives a doc comment naming
/// the metadata method it forwards to.
macro_rules! delegate_to_metadata {
    ($($method:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`DictWordMetadata::",
                stringify!($method),
                "`] when this token is a word.\n\n",
                "Returns `false` if the token is not a word."
            )]
            pub fn $method(&self) -> bool {
                match self {
                    // Only words that were found in the dictionary have
                    // metadata to ask.
                    Word(Some(metadata)) => metadata.$method(),
                    _ => false,
                }
            }
        )*
    };
}
27
28/// The parsed value of a [`Token`](crate::Token).
29/// Has a variety of queries available.
30/// If there is a query missing, it may be easy to implement by just calling the
31/// `delegate_to_metadata` macro.
32#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
33#[serde(tag = "kind", content = "value")]
34pub enum TokenKind {
35    /// `None` if the word does not exist in the dictionary.
36    Word(Option<DictWordMetadata>),
37    Punctuation(Punctuation),
38    Decade,
39    Number(Number),
40    /// A sequence of " " spaces.
41    Space(usize),
42    /// A sequence of "\n" newlines
43    Newline(usize),
44    EmailAddress,
45    Url,
46    Hostname,
47    /// A special token used for things like inline code blocks that should be
48    /// ignored by all linters.
49    #[default]
50    Unlintable,
51    ParagraphBreak,
52    Regexish,
53}
54
55impl TokenKind {
56    // DictWord metadata delegation methods grouped by part of speech
57    delegate_to_metadata! {
58        // Nominal methods (nouns and pronouns)
59        is_nominal,
60        is_noun,
61        is_pronoun,
62        is_proper_noun,
63        is_singular_nominal,
64        is_plural_nominal,
65        is_possessive_nominal,
66        is_non_plural_nominal,
67        is_singular_noun,
68        is_plural_noun,
69        is_non_plural_noun,
70        is_countable_noun,
71        is_non_countable_noun,
72        is_mass_noun,
73        is_mass_noun_only,
74        is_non_mass_noun,
75        is_singular_pronoun,
76        is_plural_pronoun,
77        is_non_plural_pronoun,
78        is_reflexive_pronoun,
79        is_personal_pronoun,
80        is_first_person_singular_pronoun,
81        is_first_person_plural_pronoun,
82        is_second_person_pronoun,
83        is_third_person_pronoun,
84        is_third_person_singular_pronoun,
85        is_third_person_plural_pronoun,
86        is_subject_pronoun,
87        is_object_pronoun,
88        is_possessive_noun,
89        is_possessive_pronoun,
90
91        // Verb methods
92        is_verb,
93        is_auxiliary_verb,
94        is_linking_verb,
95        is_verb_lemma,
96        is_verb_past_form,
97        is_verb_simple_past_form,
98        is_verb_past_participle_form,
99        is_verb_progressive_form,
100        is_verb_third_person_singular_present_form,
101
102        // Adjective methods
103        is_adjective,
104        is_comparative_adjective,
105        is_superlative_adjective,
106        is_positive_adjective,
107
108        // Adverb methods
109        is_adverb,
110        is_manner_adverb,
111        is_frequency_adverb,
112        is_degree_adverb,
113
114        // Determiner methods
115        is_determiner,
116        is_demonstrative_determiner,
117        is_possessive_determiner,
118        is_quantifier,
119        is_non_quantifier_determiner,
120        is_non_demonstrative_determiner,
121
122        // Conjunction methods
123        is_conjunction,
124
125        // Generic word methods
126        is_swear,
127        is_likely_homograph,
128
129        // Orthography methods
130        is_lowercase,
131        is_titlecase,
132        is_allcaps,
133        is_lower_camel,
134        is_upper_camel,
135        is_apostrophized,
136
137        is_roman_numerals
138    }
139
140    // DictWord metadata delegation methods not generated by macro
141    pub fn is_preposition(&self) -> bool {
142        let Word(Some(metadata)) = self else {
143            return false;
144        };
145        metadata.preposition
146    }
147
148    // Generic word is-methods
149
150    pub fn is_common_word(&self) -> bool {
151        let Word(Some(metadata)) = self else {
152            return true;
153        };
154        metadata.common
155    }
156
157    /// Checks whether the token is a member of a nominal phrase.
158    pub fn is_np_member(&self) -> bool {
159        let Word(Some(metadata)) = self else {
160            return false;
161        };
162        metadata.np_member.unwrap_or(false)
163    }
164
165    /// Checks whether a word token is out-of-vocabulary (not found in the dictionary).
166    ///
167    /// Returns `true` if the token is a word that was not found in the dictionary,
168    /// `false` if the token is a word found in the dictionary or is not a word token.
169    pub fn is_oov(&self) -> bool {
170        matches!(self, TokenKind::Word(None))
171    }
172
173    // Number is-methods
174
175    pub fn is_cardinal_number(&self) -> bool {
176        matches!(self, TokenKind::Number(Number { suffix: None, .. }))
177    }
178
179    pub fn is_ordinal_number(&self) -> bool {
180        matches!(
181            self,
182            TokenKind::Number(Number {
183                suffix: Some(_),
184                ..
185            })
186        )
187    }
188
189    // Punctuation and symbol is-methods
190
191    pub fn is_open_square(&self) -> bool {
192        matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
193    }
194
195    pub fn is_close_square(&self) -> bool {
196        matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
197    }
198
199    pub fn is_open_round(&self) -> bool {
200        matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
201    }
202
203    pub fn is_close_round(&self) -> bool {
204        matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
205    }
206
207    pub fn is_pipe(&self) -> bool {
208        matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
209    }
210
211    pub fn is_currency(&self) -> bool {
212        matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
213    }
214
215    pub fn is_ellipsis(&self) -> bool {
216        matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
217    }
218
219    pub fn is_hyphen(&self) -> bool {
220        matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
221    }
222
223    pub fn is_quote(&self) -> bool {
224        matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
225    }
226
227    pub fn is_apostrophe(&self) -> bool {
228        matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
229    }
230
231    pub fn is_period(&self) -> bool {
232        matches!(self, TokenKind::Punctuation(Punctuation::Period))
233    }
234
235    pub fn is_at(&self) -> bool {
236        matches!(self, TokenKind::Punctuation(Punctuation::At))
237    }
238
239    pub fn is_comma(&self) -> bool {
240        matches!(self, TokenKind::Punctuation(Punctuation::Comma))
241    }
242
243    pub fn is_semicolon(&self) -> bool {
244        matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
245    }
246
247    pub fn is_ampersand(&self) -> bool {
248        matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
249    }
250
251    pub fn is_slash(&self) -> bool {
252        matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
253    }
254
255    // Miscellaneous is-methods
256
257    /// Checks whether a token is word-like--meaning it is more complex than punctuation and can
258    /// hold semantic meaning in the way a word does.
259    pub fn is_word_like(&self) -> bool {
260        matches!(
261            self,
262            TokenKind::Word(..)
263                | TokenKind::EmailAddress
264                | TokenKind::Hostname
265                | TokenKind::Decade
266                | TokenKind::Number(..)
267        )
268    }
269
270    pub(crate) fn is_chunk_terminator(&self) -> bool {
271        if self.is_sentence_terminator() {
272            return true;
273        }
274
275        match self {
276            TokenKind::Punctuation(punct) => {
277                matches!(
278                    punct,
279                    Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
280                )
281            }
282            _ => false,
283        }
284    }
285
286    pub(crate) fn is_sentence_terminator(&self) -> bool {
287        match self {
288            TokenKind::Punctuation(punct) => [
289                Punctuation::Period,
290                Punctuation::Bang,
291                Punctuation::Question,
292            ]
293            .contains(punct),
294            TokenKind::ParagraphBreak => true,
295            _ => false,
296        }
297    }
298
299    /// Used by `crate::parsers::CollapseIdentifiers`
300    /// TODO: Separate this into two functions and add OR functionality to
301    /// pattern matching
302    pub fn is_case_separator(&self) -> bool {
303        matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
304            || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
305    }
306
307    /// Checks whether the token is whitespace.
308    pub fn is_whitespace(&self) -> bool {
309        matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
310    }
311
312    pub fn is_upos(&self, upos: UPOS) -> bool {
313        let Some(Some(meta)) = self.as_word() else {
314            return false;
315        };
316
317        meta.pos_tag == Some(upos)
318    }
319
320    // Miscellaneous non-is methods
321
322    /// Checks that `self` is the same enum variant as `other`, regardless of
323    /// whether the inner metadata is also equal.
324    pub fn matches_variant_of(&self, other: &Self) -> bool {
325        self.with_default_data() == other.with_default_data()
326    }
327
328    /// Produces a copy of `self` with any inner data replaced with its default
329    /// value. Useful for making comparisons on just the variant of the
330    /// enum.
331    pub fn with_default_data(&self) -> Self {
332        match self {
333            TokenKind::Word(_) => TokenKind::Word(Default::default()),
334            TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
335            TokenKind::Number(..) => TokenKind::Number(Default::default()),
336            TokenKind::Space(_) => TokenKind::Space(Default::default()),
337            TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
338            _ => self.clone(),
339        }
340    }
341
342    /// Construct a [`TokenKind::Word`] with no metadata.
343    pub fn blank_word() -> Self {
344        Self::Word(None)
345    }
346
347    // Punctuation and symbol non-is methods
348
349    pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
350        self.as_mut_punctuation()?.as_mut_quote()
351    }
352
353    pub fn as_quote(&self) -> Option<&Quote> {
354        self.as_punctuation()?.as_quote()
355    }
356}
357
#[cfg(test)]
mod tests {
    use crate::Document;

    #[test]
    fn car_is_singular_noun() {
        let doc = Document::new_plain_english_curated("car");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_singular_noun());
    }

    #[test]
    fn traffic_is_mass_noun_only() {
        let doc = Document::new_plain_english_curated("traffic");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_mass_noun_only());
    }

    #[test]
    fn equipment_is_mass_noun() {
        let doc = Document::new_plain_english_curated("equipment");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_mass_noun());
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        let doc = Document::new_plain_english_curated("equipment");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_non_countable_noun());
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        let doc = Document::new_plain_english_curated("equipment");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(!tk.is_countable_noun());
    }

    #[test]
    fn oov_word_is_oov() {
        let doc = Document::new_plain_english_curated("nonexistentword");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_oov());
    }

    #[test]
    fn known_word_is_not_oov() {
        let doc = Document::new_plain_english_curated("car");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(!tk.is_oov());
    }

    #[test]
    fn non_word_tokens_are_not_oov() {
        let doc = Document::new_plain_english_curated("Hello, world!");
        let tokens: Vec<_> = doc.tokens().collect();

        // Expected token stream: `Hello` `,` ` ` `world` `!`
        // The space is a token of its own, so the exclamation mark sits at
        // index 4; index 3 is the word "world".

        // Comma should not be OOV
        assert!(tokens[1].kind.is_comma());
        assert!(!tokens[1].kind.is_oov());
        // Exclamation mark should not be OOV
        assert!(tokens[4].kind.is_punctuation());
        assert!(!tokens[4].kind.is_oov());
    }
}