Skip to main content

harper_core/
token_kind.rs

1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{
6    DictWordMetadata, Number, Punctuation, Quote, TokenKind::Word, dict_word_metadata::Person,
7};
8
/// Emits one `pub fn $method(&self) -> bool` per listed identifier.
///
/// Each generated method forwards to the same-named method on the inner
/// [`DictWordMetadata`] when the token is a word carrying metadata, and answers
/// `false` for everything else. Method-level documentation is generated too.
macro_rules! delegate_to_metadata {
    ($($method:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`DictWordMetadata::",
                stringify!($method),
                "`] when this token is a word.\n\n",
                "Returns `false` if the token is not a word."
            )]
            pub fn $method(&self) -> bool {
                match self {
                    Word(Some(metadata)) => metadata.$method(),
                    _ => false,
                }
            }
        )*
    };
}
29
/// The parsed value of a [`Token`](crate::Token).
/// Has a variety of queries available.
/// If there is a query missing, it may be easy to implement by just calling the
/// `delegate_to_metadata` macro.
///
/// Serialized with serde's adjacently tagged representation: the variant name
/// is stored under `kind` and the payload (if any) under `value`.
// NOTE: `PartialOrd` is derived, so the declaration order of the variants below
// determines how values of different variants compare. Reordering variants
// changes that ordering.
#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
#[serde(tag = "kind", content = "value")]
pub enum TokenKind {
    /// A word. `None` if the word does not exist in the dictionary.
    Word(Option<DictWordMetadata>),
    /// A punctuation mark; the payload says which one.
    Punctuation(Punctuation),
    /// NOTE(review): presumably a decade expression such as "1990s" -- confirm
    /// against the lexer.
    Decade,
    /// A number. The payload's optional `suffix` is what `is_cardinal_number`
    /// and `is_ordinal_number` inspect.
    Number(Number),
    /// A sequence of " " spaces.
    Space(usize),
    /// A sequence of "\n" newlines
    Newline(usize),
    /// An email address. Counted as word-like by `is_word_like`.
    EmailAddress,
    /// A URL. Note that `is_word_like` does not include this variant.
    Url,
    /// A hostname. Counted as word-like by `is_word_like`.
    Hostname,
    /// A special token used for things like inline code blocks that should be
    /// ignored by all linters.
    ///
    /// This is the `Default` variant.
    #[default]
    Unlintable,
    /// A break between paragraphs. Treated as a sentence terminator by
    /// `is_sentence_terminator`.
    ParagraphBreak,
    /// NOTE(review): presumably text that looks like a regular expression --
    /// confirm against the lexer.
    Regexish,
    /// NOTE(review): presumably marks the start of a heading -- confirm against
    /// the parsers that emit it.
    HeadingStart,
}
57
58impl TokenKind {
59    // DictWord metadata delegation methods grouped by part of speech
60    delegate_to_metadata! {
61        // Nominal methods (nouns and pronouns)
62        is_nominal,
63        is_noun,
64        is_pronoun,
65        is_proper_noun,
66        is_singular_nominal,
67        is_plural_nominal,
68        is_possessive_nominal,
69        is_non_plural_nominal,
70        is_singular_noun,
71        is_plural_noun,
72        is_non_plural_noun,
73        is_non_possessive_noun,
74        is_countable_noun,
75        is_non_countable_noun,
76        is_mass_noun,
77        is_mass_noun_only,
78        is_non_mass_noun,
79        is_singular_pronoun,
80        is_plural_pronoun,
81        is_non_plural_pronoun,
82        is_reflexive_pronoun,
83        is_personal_pronoun,
84        is_first_person_singular_pronoun,
85        is_first_person_plural_pronoun,
86        is_second_person_pronoun,
87        is_third_person_pronoun,
88        is_third_person_singular_pronoun,
89        is_third_person_plural_pronoun,
90        is_subject_pronoun,
91        is_object_pronoun,
92        is_possessive_noun,
93        // Note: possessive pronouns are: mine, ours, yours, his, hers, its, theirs
94        is_possessive_pronoun,
95
96        // Verb methods
97        is_verb,
98        is_auxiliary_verb,
99        is_linking_verb,
100        is_verb_lemma,
101        is_verb_past_form,
102        is_verb_simple_past_form,
103        is_verb_past_participle_form,
104        is_verb_progressive_form,
105        is_verb_third_person_singular_present_form,
106
107        // Adjective methods
108        is_adjective,
109        is_comparative_adjective,
110        is_superlative_adjective,
111        is_positive_adjective,
112
113        // Adverb methods
114        is_adverb,
115        is_manner_adverb,
116        is_frequency_adverb,
117        is_degree_adverb,
118
119        // Determiner methods
120        is_determiner,
121        is_demonstrative_determiner,
122        is_possessive_determiner,
123        is_quantifier,
124        is_non_quantifier_determiner,
125        is_non_demonstrative_determiner,
126
127        // Conjunction methods
128        is_conjunction,
129
130        // Generic word methods
131        is_swear,
132        is_likely_homograph,
133
134        // Orthography methods
135        is_lowercase,
136        is_titlecase,
137        is_allcaps,
138        is_lower_camel,
139        is_upper_camel,
140        is_apostrophized,
141
142        is_roman_numerals
143    }
144
145    pub fn get_pronoun_person(&self) -> Option<Person> {
146        let Word(Some(metadata)) = self else {
147            return None;
148        };
149        metadata.get_person()
150    }
151
152    // DictWord metadata delegation methods not generated by macro
153    pub fn is_preposition(&self) -> bool {
154        let Word(Some(metadata)) = self else {
155            return false;
156        };
157        metadata.preposition
158    }
159
160    // Generic word is-methods
161
162    pub fn is_common_word(&self) -> bool {
163        let Word(Some(metadata)) = self else {
164            return true;
165        };
166        metadata.common
167    }
168
169    /// Checks whether the token is a member of a nominal phrase.
170    pub fn is_np_member(&self) -> bool {
171        let Word(Some(metadata)) = self else {
172            return false;
173        };
174        metadata.np_member.unwrap_or(false)
175    }
176
177    /// Checks whether a word token is out-of-vocabulary (not found in the dictionary).
178    ///
179    /// Returns `true` if the token is a word that was not found in the dictionary,
180    /// `false` if the token is a word found in the dictionary or is not a word token.
181    pub fn is_oov(&self) -> bool {
182        matches!(self, TokenKind::Word(None))
183    }
184
185    // Number is-methods
186
187    pub fn is_cardinal_number(&self) -> bool {
188        matches!(self, TokenKind::Number(Number { suffix: None, .. }))
189    }
190
191    pub fn is_ordinal_number(&self) -> bool {
192        matches!(
193            self,
194            TokenKind::Number(Number {
195                suffix: Some(_),
196                ..
197            })
198        )
199    }
200
201    // Punctuation and symbol is-methods
202
203    pub fn is_open_square(&self) -> bool {
204        matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
205    }
206
207    pub fn is_close_square(&self) -> bool {
208        matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
209    }
210
211    pub fn is_less_than(&self) -> bool {
212        matches!(self, TokenKind::Punctuation(Punctuation::LessThan))
213    }
214
215    pub fn is_greater_than(&self) -> bool {
216        matches!(self, TokenKind::Punctuation(Punctuation::GreaterThan))
217    }
218
219    pub fn is_open_round(&self) -> bool {
220        matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
221    }
222
223    pub fn is_close_round(&self) -> bool {
224        matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
225    }
226
227    pub fn is_pipe(&self) -> bool {
228        matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
229    }
230
231    pub fn is_currency(&self) -> bool {
232        matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
233    }
234
235    pub fn is_ellipsis(&self) -> bool {
236        matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
237    }
238
239    // AKA 'minus'
240    pub fn is_hyphen(&self) -> bool {
241        matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
242    }
243
244    pub fn is_plus(&self) -> bool {
245        matches!(self, TokenKind::Punctuation(Punctuation::Plus))
246    }
247
248    pub fn is_quote(&self) -> bool {
249        matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
250    }
251
252    pub fn is_apostrophe(&self) -> bool {
253        matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
254    }
255
256    pub fn is_period(&self) -> bool {
257        matches!(self, TokenKind::Punctuation(Punctuation::Period))
258    }
259
260    pub fn is_at(&self) -> bool {
261        matches!(self, TokenKind::Punctuation(Punctuation::At))
262    }
263
264    pub fn is_comma(&self) -> bool {
265        matches!(self, TokenKind::Punctuation(Punctuation::Comma))
266    }
267
268    pub fn is_semicolon(&self) -> bool {
269        matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
270    }
271
272    pub fn is_acute(&self) -> bool {
273        matches!(self, TokenKind::Punctuation(Punctuation::Acute))
274    }
275
276    pub fn is_ampersand(&self) -> bool {
277        matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
278    }
279
280    pub fn is_backslash(&self) -> bool {
281        matches!(self, TokenKind::Punctuation(Punctuation::Backslash))
282    }
283
284    pub fn is_slash(&self) -> bool {
285        matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
286    }
287
288    pub fn is_percent(&self) -> bool {
289        matches!(self, TokenKind::Punctuation(Punctuation::Percent))
290    }
291
292    // Miscellaneous is-methods
293
294    /// Checks whether a token is word-like--meaning it is more complex than punctuation and can
295    /// hold semantic meaning in the way a word does.
296    pub fn is_word_like(&self) -> bool {
297        matches!(
298            self,
299            TokenKind::Word(..)
300                | TokenKind::EmailAddress
301                | TokenKind::Hostname
302                | TokenKind::Decade
303                | TokenKind::Number(..)
304        )
305    }
306
307    pub(crate) fn is_chunk_terminator(&self) -> bool {
308        if self.is_sentence_terminator() {
309            return true;
310        }
311
312        match self {
313            TokenKind::Punctuation(punct) => {
314                matches!(
315                    punct,
316                    Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
317                )
318            }
319            _ => false,
320        }
321    }
322
323    pub fn is_sentence_terminator(&self) -> bool {
324        match self {
325            TokenKind::Punctuation(punct) => [
326                Punctuation::Period,
327                Punctuation::Bang,
328                Punctuation::Question,
329            ]
330            .contains(punct),
331            TokenKind::ParagraphBreak => true,
332            _ => false,
333        }
334    }
335
336    /// Used by `crate::parsers::CollapseIdentifiers`
337    /// TODO: Separate this into two functions and add OR functionality to
338    /// pattern matching
339    pub fn is_case_separator(&self) -> bool {
340        matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
341            || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
342    }
343
344    /// Checks whether the token is whitespace.
345    pub fn is_whitespace(&self) -> bool {
346        matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
347    }
348
349    pub fn is_upos(&self, upos: UPOS) -> bool {
350        let Some(Some(meta)) = self.as_word() else {
351            return false;
352        };
353
354        meta.pos_tag == Some(upos)
355    }
356
357    // Miscellaneous non-is methods
358
359    /// Checks that `self` is the same enum variant as `other`, regardless of
360    /// whether the inner metadata is also equal.
361    pub fn matches_variant_of(&self, other: &Self) -> bool {
362        self.with_default_data() == other.with_default_data()
363    }
364
365    /// Produces a copy of `self` with any inner data replaced with its default
366    /// value. Useful for making comparisons on just the variant of the
367    /// enum.
368    pub fn with_default_data(&self) -> Self {
369        match self {
370            TokenKind::Word(_) => TokenKind::Word(Default::default()),
371            TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
372            TokenKind::Number(..) => TokenKind::Number(Default::default()),
373            TokenKind::Space(_) => TokenKind::Space(Default::default()),
374            TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
375            _ => self.clone(),
376        }
377    }
378
379    /// Construct a [`TokenKind::Word`] with no metadata.
380    pub fn blank_word() -> Self {
381        Self::Word(None)
382    }
383
384    // Punctuation and symbol non-is methods
385
386    pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
387        self.as_mut_punctuation()?.as_mut_quote()
388    }
389
390    pub fn as_quote(&self) -> Option<&Quote> {
391        self.as_punctuation()?.as_quote()
392    }
393}
394
#[cfg(test)]
mod tests {
    use super::TokenKind;
    use crate::Document;

    /// Parses `text` as curated plain English and returns the kind of its first
    /// token.
    fn first_kind(text: &str) -> TokenKind {
        let doc = Document::new_plain_english_curated(text);
        let kind = doc
            .tokens()
            .next()
            .expect("document should contain at least one token")
            .kind
            .clone();
        kind
    }

    #[test]
    fn car_is_singular_noun() {
        assert!(first_kind("car").is_singular_noun());
    }

    #[test]
    fn traffic_is_mass_noun_only() {
        assert!(first_kind("traffic").is_mass_noun_only());
    }

    #[test]
    fn equipment_is_mass_noun() {
        assert!(first_kind("equipment").is_mass_noun());
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        assert!(first_kind("equipment").is_non_countable_noun());
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        assert!(!first_kind("equipment").is_countable_noun());
    }

    #[test]
    fn oov_word_is_oov() {
        assert!(first_kind("nonexistentword").is_oov());
    }

    #[test]
    fn known_word_is_not_oov() {
        assert!(!first_kind("car").is_oov());
    }

    #[test]
    fn non_word_tokens_are_not_oov() {
        let doc = Document::new_plain_english_curated("Hello, world!");

        // Select the non-word tokens by kind rather than by position: indexing
        // is fragile because whitespace occupies a token slot of its own.
        let non_word_kinds: Vec<TokenKind> = doc
            .tokens()
            .map(|token| token.kind.clone())
            .filter(|kind| !matches!(kind, TokenKind::Word(_)))
            .collect();

        // Guard against the check passing vacuously.
        assert!(!non_word_kinds.is_empty());
        // Punctuation (and any whitespace) must never be reported as OOV.
        assert!(non_word_kinds.iter().all(|kind| !kind.is_oov()));
    }
}