harper_core/
token_kind.rs

1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{
6    DictWordMetadata, Number, Punctuation, Quote, TokenKind::Word, dict_word_metadata::Person,
7};
8
/// Generates thin `bool`-returning forwarding methods for a list of method names.
///
/// For every identifier passed in, the macro emits a `pub fn` of the same name that
/// calls the identically named method on the inner [`DictWordMetadata`] when `self`
/// is `Word(Some(_))`, and yields `false` for every other token. Each generated
/// method also receives a doc comment naming the method it forwards to.
///
/// A trailing comma after the last identifier is accepted.
macro_rules! delegate_to_metadata {
    ($($query:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`DictWordMetadata::",
                stringify!($query),
                "`] when this token is a word.\n\n",
                "Returns `false` if the token is not a word."
            )]
            pub fn $query(&self) -> bool {
                match self {
                    Word(Some(metadata)) => metadata.$query(),
                    // Non-words and words without dictionary metadata never match.
                    _ => false,
                }
            }
        )*
    };
}
29
30/// The parsed value of a [`Token`](crate::Token).
31/// Has a variety of queries available.
32/// If there is a query missing, it may be easy to implement by just calling the
33/// `delegate_to_metadata` macro.
34#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
35#[serde(tag = "kind", content = "value")]
36pub enum TokenKind {
37    /// `None` if the word does not exist in the dictionary.
38    Word(Option<DictWordMetadata>),
39    Punctuation(Punctuation),
40    Decade,
41    Number(Number),
42    /// A sequence of " " spaces.
43    Space(usize),
44    /// A sequence of "\n" newlines
45    Newline(usize),
46    EmailAddress,
47    Url,
48    Hostname,
49    /// A special token used for things like inline code blocks that should be
50    /// ignored by all linters.
51    #[default]
52    Unlintable,
53    ParagraphBreak,
54    Regexish,
55    HeadingStart,
56}
57
58impl TokenKind {
59    // DictWord metadata delegation methods grouped by part of speech
60    delegate_to_metadata! {
61        // Nominal methods (nouns and pronouns)
62        is_nominal,
63        is_noun,
64        is_pronoun,
65        is_proper_noun,
66        is_singular_nominal,
67        is_plural_nominal,
68        is_possessive_nominal,
69        is_non_plural_nominal,
70        is_singular_noun,
71        is_plural_noun,
72        is_non_plural_noun,
73        is_non_possessive_noun,
74        is_countable_noun,
75        is_non_countable_noun,
76        is_mass_noun,
77        is_mass_noun_only,
78        is_non_mass_noun,
79        is_singular_pronoun,
80        is_plural_pronoun,
81        is_non_plural_pronoun,
82        is_reflexive_pronoun,
83        is_personal_pronoun,
84        is_first_person_singular_pronoun,
85        is_first_person_plural_pronoun,
86        is_second_person_pronoun,
87        is_third_person_pronoun,
88        is_third_person_singular_pronoun,
89        is_third_person_plural_pronoun,
90        is_subject_pronoun,
91        is_object_pronoun,
92        is_possessive_noun,
93        is_possessive_pronoun,
94
95        // Verb methods
96        is_verb,
97        is_auxiliary_verb,
98        is_linking_verb,
99        is_verb_lemma,
100        is_verb_past_form,
101        is_verb_simple_past_form,
102        is_verb_past_participle_form,
103        is_verb_progressive_form,
104        is_verb_third_person_singular_present_form,
105
106        // Adjective methods
107        is_adjective,
108        is_comparative_adjective,
109        is_superlative_adjective,
110        is_positive_adjective,
111
112        // Adverb methods
113        is_adverb,
114        is_manner_adverb,
115        is_frequency_adverb,
116        is_degree_adverb,
117
118        // Determiner methods
119        is_determiner,
120        is_demonstrative_determiner,
121        is_possessive_determiner,
122        is_quantifier,
123        is_non_quantifier_determiner,
124        is_non_demonstrative_determiner,
125
126        // Conjunction methods
127        is_conjunction,
128
129        // Generic word methods
130        is_swear,
131        is_likely_homograph,
132
133        // Orthography methods
134        is_lowercase,
135        is_titlecase,
136        is_allcaps,
137        is_lower_camel,
138        is_upper_camel,
139        is_apostrophized,
140
141        is_roman_numerals
142    }
143
144    pub fn get_pronoun_person(&self) -> Option<Person> {
145        let Word(Some(metadata)) = self else {
146            return None;
147        };
148        metadata.get_person()
149    }
150
151    // DictWord metadata delegation methods not generated by macro
152    pub fn is_preposition(&self) -> bool {
153        let Word(Some(metadata)) = self else {
154            return false;
155        };
156        metadata.preposition
157    }
158
159    // Generic word is-methods
160
161    pub fn is_common_word(&self) -> bool {
162        let Word(Some(metadata)) = self else {
163            return true;
164        };
165        metadata.common
166    }
167
168    /// Checks whether the token is a member of a nominal phrase.
169    pub fn is_np_member(&self) -> bool {
170        let Word(Some(metadata)) = self else {
171            return false;
172        };
173        metadata.np_member.unwrap_or(false)
174    }
175
176    /// Checks whether a word token is out-of-vocabulary (not found in the dictionary).
177    ///
178    /// Returns `true` if the token is a word that was not found in the dictionary,
179    /// `false` if the token is a word found in the dictionary or is not a word token.
180    pub fn is_oov(&self) -> bool {
181        matches!(self, TokenKind::Word(None))
182    }
183
184    // Number is-methods
185
186    pub fn is_cardinal_number(&self) -> bool {
187        matches!(self, TokenKind::Number(Number { suffix: None, .. }))
188    }
189
190    pub fn is_ordinal_number(&self) -> bool {
191        matches!(
192            self,
193            TokenKind::Number(Number {
194                suffix: Some(_),
195                ..
196            })
197        )
198    }
199
200    // Punctuation and symbol is-methods
201
202    pub fn is_open_square(&self) -> bool {
203        matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
204    }
205
206    pub fn is_close_square(&self) -> bool {
207        matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
208    }
209
210    pub fn is_less_than(&self) -> bool {
211        matches!(self, TokenKind::Punctuation(Punctuation::LessThan))
212    }
213
214    pub fn is_greater_than(&self) -> bool {
215        matches!(self, TokenKind::Punctuation(Punctuation::GreaterThan))
216    }
217
218    pub fn is_open_round(&self) -> bool {
219        matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
220    }
221
222    pub fn is_close_round(&self) -> bool {
223        matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
224    }
225
226    pub fn is_pipe(&self) -> bool {
227        matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
228    }
229
230    pub fn is_currency(&self) -> bool {
231        matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
232    }
233
234    pub fn is_ellipsis(&self) -> bool {
235        matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
236    }
237
238    pub fn is_hyphen(&self) -> bool {
239        matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
240    }
241
242    pub fn is_quote(&self) -> bool {
243        matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
244    }
245
246    pub fn is_apostrophe(&self) -> bool {
247        matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
248    }
249
250    pub fn is_period(&self) -> bool {
251        matches!(self, TokenKind::Punctuation(Punctuation::Period))
252    }
253
254    pub fn is_at(&self) -> bool {
255        matches!(self, TokenKind::Punctuation(Punctuation::At))
256    }
257
258    pub fn is_comma(&self) -> bool {
259        matches!(self, TokenKind::Punctuation(Punctuation::Comma))
260    }
261
262    pub fn is_semicolon(&self) -> bool {
263        matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
264    }
265
266    pub fn is_ampersand(&self) -> bool {
267        matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
268    }
269
270    pub fn is_slash(&self) -> bool {
271        matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
272    }
273
274    // Miscellaneous is-methods
275
276    /// Checks whether a token is word-like--meaning it is more complex than punctuation and can
277    /// hold semantic meaning in the way a word does.
278    pub fn is_word_like(&self) -> bool {
279        matches!(
280            self,
281            TokenKind::Word(..)
282                | TokenKind::EmailAddress
283                | TokenKind::Hostname
284                | TokenKind::Decade
285                | TokenKind::Number(..)
286        )
287    }
288
289    pub(crate) fn is_chunk_terminator(&self) -> bool {
290        if self.is_sentence_terminator() {
291            return true;
292        }
293
294        match self {
295            TokenKind::Punctuation(punct) => {
296                matches!(
297                    punct,
298                    Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
299                )
300            }
301            _ => false,
302        }
303    }
304
305    pub fn is_sentence_terminator(&self) -> bool {
306        match self {
307            TokenKind::Punctuation(punct) => [
308                Punctuation::Period,
309                Punctuation::Bang,
310                Punctuation::Question,
311            ]
312            .contains(punct),
313            TokenKind::ParagraphBreak => true,
314            _ => false,
315        }
316    }
317
318    /// Used by `crate::parsers::CollapseIdentifiers`
319    /// TODO: Separate this into two functions and add OR functionality to
320    /// pattern matching
321    pub fn is_case_separator(&self) -> bool {
322        matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
323            || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
324    }
325
326    /// Checks whether the token is whitespace.
327    pub fn is_whitespace(&self) -> bool {
328        matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
329    }
330
331    pub fn is_upos(&self, upos: UPOS) -> bool {
332        let Some(Some(meta)) = self.as_word() else {
333            return false;
334        };
335
336        meta.pos_tag == Some(upos)
337    }
338
339    // Miscellaneous non-is methods
340
341    /// Checks that `self` is the same enum variant as `other`, regardless of
342    /// whether the inner metadata is also equal.
343    pub fn matches_variant_of(&self, other: &Self) -> bool {
344        self.with_default_data() == other.with_default_data()
345    }
346
347    /// Produces a copy of `self` with any inner data replaced with its default
348    /// value. Useful for making comparisons on just the variant of the
349    /// enum.
350    pub fn with_default_data(&self) -> Self {
351        match self {
352            TokenKind::Word(_) => TokenKind::Word(Default::default()),
353            TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
354            TokenKind::Number(..) => TokenKind::Number(Default::default()),
355            TokenKind::Space(_) => TokenKind::Space(Default::default()),
356            TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
357            _ => self.clone(),
358        }
359    }
360
361    /// Construct a [`TokenKind::Word`] with no metadata.
362    pub fn blank_word() -> Self {
363        Self::Word(None)
364    }
365
366    // Punctuation and symbol non-is methods
367
368    pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
369        self.as_mut_punctuation()?.as_mut_quote()
370    }
371
372    pub fn as_quote(&self) -> Option<&Quote> {
373        self.as_punctuation()?.as_quote()
374    }
375}
376
#[cfg(test)]
mod tests {
    use crate::Document;

    /// "car" is a dictionary word flagged as a singular noun.
    #[test]
    fn car_is_singular_noun() {
        let doc = Document::new_plain_english_curated("car");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_singular_noun());
    }

    /// "traffic" is flagged as a mass-only noun.
    #[test]
    fn traffic_is_mass_noun_only() {
        let doc = Document::new_plain_english_curated("traffic");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_mass_noun_only());
    }

    /// "equipment" is flagged as a mass noun.
    #[test]
    fn equipment_is_mass_noun() {
        let doc = Document::new_plain_english_curated("equipment");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_mass_noun());
    }

    /// "equipment" is flagged as non-countable.
    #[test]
    fn equipment_is_non_countable_noun() {
        let doc = Document::new_plain_english_curated("equipment");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_non_countable_noun());
    }

    /// "equipment" must not be reported as countable.
    #[test]
    fn equipment_isnt_countable_noun() {
        let doc = Document::new_plain_english_curated("equipment");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(!tk.is_countable_noun());
    }

    /// A word missing from the dictionary is out-of-vocabulary.
    #[test]
    fn oov_word_is_oov() {
        let doc = Document::new_plain_english_curated("nonexistentword");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_oov());
    }

    /// A word present in the dictionary is not out-of-vocabulary.
    #[test]
    fn known_word_is_not_oov() {
        let doc = Document::new_plain_english_curated("car");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(!tk.is_oov());
    }

    /// Punctuation tokens are never out-of-vocabulary.
    ///
    /// The tokens are located by kind rather than by position: whitespace is part
    /// of the token stream, so a hard-coded index can silently land on a word
    /// instead of the punctuation the assertion is meant to cover.
    #[test]
    fn non_word_tokens_are_not_oov() {
        let doc = Document::new_plain_english_curated("Hello, world!");

        // Comma should not be OOV
        let comma = doc
            .tokens()
            .find(|tok| tok.kind.is_comma())
            .expect("the sample text contains a comma");
        assert!(!comma.kind.is_oov());

        // Exclamation mark should not be OOV. It is the only sentence terminator
        // in the sample text.
        let bang = doc
            .tokens()
            .find(|tok| tok.kind.is_sentence_terminator())
            .expect("the sample text ends with an exclamation mark");
        assert!(!bang.kind.is_oov());
    }
}