harper_core/
token_kind.rs

1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{DictWordMetadata, Number, Punctuation, Quote, TokenKind::Word};
6
/// Generates boolean query methods that forward to the identically named
/// method on the inner [`DictWordMetadata`].
///
/// Accepts a comma-separated list of method names (a trailing comma is allowed)
/// and must be invoked inside an `impl` block whose `self` can be matched
/// against `Word(Some(metadata))`.
///
/// Every generated method returns `false` unless the token is a word that
/// actually carries metadata: non-word tokens and words without metadata never
/// reach the metadata call. Method-level documentation is emitted alongside
/// each generated function.
macro_rules! delegate_to_metadata {
    ($($method:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`DictWordMetadata::",
                stringify!($method),
                "`] when this token is a word with metadata.\n\n",
                "Returns `false` if the token is not a word, or if it is a word ",
                "that carries no metadata."
            )]
            pub fn $method(&self) -> bool {
                match self {
                    Word(Some(metadata)) => metadata.$method(),
                    // Non-word tokens and words without metadata have nothing to query.
                    _ => false,
                }
            }
        )*
    };
}
27
28/// The parsed value of a [`Token`](crate::Token).
29/// Has a variety of queries available.
30/// If there is a query missing, it may be easy to implement by just calling the
31/// `delegate_to_metadata` macro.
32#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
33#[serde(tag = "kind", content = "value")]
34pub enum TokenKind {
35    /// `None` if the word does not exist in the dictionary.
36    Word(Option<DictWordMetadata>),
37    Punctuation(Punctuation),
38    Decade,
39    Number(Number),
40    /// A sequence of " " spaces.
41    Space(usize),
42    /// A sequence of "\n" newlines
43    Newline(usize),
44    EmailAddress,
45    Url,
46    Hostname,
47    /// A special token used for things like inline code blocks that should be
48    /// ignored by all linters.
49    #[default]
50    Unlintable,
51    ParagraphBreak,
52    Regexish,
53    HeadingStart,
54}
55
56impl TokenKind {
57    // DictWord metadata delegation methods grouped by part of speech
58    delegate_to_metadata! {
59        // Nominal methods (nouns and pronouns)
60        is_nominal,
61        is_noun,
62        is_pronoun,
63        is_proper_noun,
64        is_singular_nominal,
65        is_plural_nominal,
66        is_possessive_nominal,
67        is_non_plural_nominal,
68        is_singular_noun,
69        is_plural_noun,
70        is_non_plural_noun,
71        is_countable_noun,
72        is_non_countable_noun,
73        is_mass_noun,
74        is_mass_noun_only,
75        is_non_mass_noun,
76        is_singular_pronoun,
77        is_plural_pronoun,
78        is_non_plural_pronoun,
79        is_reflexive_pronoun,
80        is_personal_pronoun,
81        is_first_person_singular_pronoun,
82        is_first_person_plural_pronoun,
83        is_second_person_pronoun,
84        is_third_person_pronoun,
85        is_third_person_singular_pronoun,
86        is_third_person_plural_pronoun,
87        is_subject_pronoun,
88        is_object_pronoun,
89        is_possessive_noun,
90        is_possessive_pronoun,
91
92        // Verb methods
93        is_verb,
94        is_auxiliary_verb,
95        is_linking_verb,
96        is_verb_lemma,
97        is_verb_past_form,
98        is_verb_simple_past_form,
99        is_verb_past_participle_form,
100        is_verb_progressive_form,
101        is_verb_third_person_singular_present_form,
102
103        // Adjective methods
104        is_adjective,
105        is_comparative_adjective,
106        is_superlative_adjective,
107        is_positive_adjective,
108
109        // Adverb methods
110        is_adverb,
111        is_manner_adverb,
112        is_frequency_adverb,
113        is_degree_adverb,
114
115        // Determiner methods
116        is_determiner,
117        is_demonstrative_determiner,
118        is_possessive_determiner,
119        is_quantifier,
120        is_non_quantifier_determiner,
121        is_non_demonstrative_determiner,
122
123        // Conjunction methods
124        is_conjunction,
125
126        // Generic word methods
127        is_swear,
128        is_likely_homograph,
129
130        // Orthography methods
131        is_lowercase,
132        is_titlecase,
133        is_allcaps,
134        is_lower_camel,
135        is_upper_camel,
136        is_apostrophized,
137
138        is_roman_numerals
139    }
140
141    // DictWord metadata delegation methods not generated by macro
142    pub fn is_preposition(&self) -> bool {
143        let Word(Some(metadata)) = self else {
144            return false;
145        };
146        metadata.preposition
147    }
148
149    // Generic word is-methods
150
151    pub fn is_common_word(&self) -> bool {
152        let Word(Some(metadata)) = self else {
153            return true;
154        };
155        metadata.common
156    }
157
158    /// Checks whether the token is a member of a nominal phrase.
159    pub fn is_np_member(&self) -> bool {
160        let Word(Some(metadata)) = self else {
161            return false;
162        };
163        metadata.np_member.unwrap_or(false)
164    }
165
166    /// Checks whether a word token is out-of-vocabulary (not found in the dictionary).
167    ///
168    /// Returns `true` if the token is a word that was not found in the dictionary,
169    /// `false` if the token is a word found in the dictionary or is not a word token.
170    pub fn is_oov(&self) -> bool {
171        matches!(self, TokenKind::Word(None))
172    }
173
174    // Number is-methods
175
176    pub fn is_cardinal_number(&self) -> bool {
177        matches!(self, TokenKind::Number(Number { suffix: None, .. }))
178    }
179
180    pub fn is_ordinal_number(&self) -> bool {
181        matches!(
182            self,
183            TokenKind::Number(Number {
184                suffix: Some(_),
185                ..
186            })
187        )
188    }
189
190    // Punctuation and symbol is-methods
191
192    pub fn is_open_square(&self) -> bool {
193        matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
194    }
195
196    pub fn is_close_square(&self) -> bool {
197        matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
198    }
199
200    pub fn is_open_round(&self) -> bool {
201        matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
202    }
203
204    pub fn is_close_round(&self) -> bool {
205        matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
206    }
207
208    pub fn is_pipe(&self) -> bool {
209        matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
210    }
211
212    pub fn is_currency(&self) -> bool {
213        matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
214    }
215
216    pub fn is_ellipsis(&self) -> bool {
217        matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
218    }
219
220    pub fn is_hyphen(&self) -> bool {
221        matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
222    }
223
224    pub fn is_quote(&self) -> bool {
225        matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
226    }
227
228    pub fn is_apostrophe(&self) -> bool {
229        matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
230    }
231
232    pub fn is_period(&self) -> bool {
233        matches!(self, TokenKind::Punctuation(Punctuation::Period))
234    }
235
236    pub fn is_at(&self) -> bool {
237        matches!(self, TokenKind::Punctuation(Punctuation::At))
238    }
239
240    pub fn is_comma(&self) -> bool {
241        matches!(self, TokenKind::Punctuation(Punctuation::Comma))
242    }
243
244    pub fn is_semicolon(&self) -> bool {
245        matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
246    }
247
248    pub fn is_ampersand(&self) -> bool {
249        matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
250    }
251
252    pub fn is_slash(&self) -> bool {
253        matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
254    }
255
256    // Miscellaneous is-methods
257
258    /// Checks whether a token is word-like--meaning it is more complex than punctuation and can
259    /// hold semantic meaning in the way a word does.
260    pub fn is_word_like(&self) -> bool {
261        matches!(
262            self,
263            TokenKind::Word(..)
264                | TokenKind::EmailAddress
265                | TokenKind::Hostname
266                | TokenKind::Decade
267                | TokenKind::Number(..)
268        )
269    }
270
271    pub(crate) fn is_chunk_terminator(&self) -> bool {
272        if self.is_sentence_terminator() {
273            return true;
274        }
275
276        match self {
277            TokenKind::Punctuation(punct) => {
278                matches!(
279                    punct,
280                    Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
281                )
282            }
283            _ => false,
284        }
285    }
286
287    pub(crate) fn is_sentence_terminator(&self) -> bool {
288        match self {
289            TokenKind::Punctuation(punct) => [
290                Punctuation::Period,
291                Punctuation::Bang,
292                Punctuation::Question,
293            ]
294            .contains(punct),
295            TokenKind::ParagraphBreak => true,
296            _ => false,
297        }
298    }
299
300    /// Used by `crate::parsers::CollapseIdentifiers`
301    /// TODO: Separate this into two functions and add OR functionality to
302    /// pattern matching
303    pub fn is_case_separator(&self) -> bool {
304        matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
305            || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
306    }
307
308    /// Checks whether the token is whitespace.
309    pub fn is_whitespace(&self) -> bool {
310        matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
311    }
312
313    pub fn is_upos(&self, upos: UPOS) -> bool {
314        let Some(Some(meta)) = self.as_word() else {
315            return false;
316        };
317
318        meta.pos_tag == Some(upos)
319    }
320
321    // Miscellaneous non-is methods
322
323    /// Checks that `self` is the same enum variant as `other`, regardless of
324    /// whether the inner metadata is also equal.
325    pub fn matches_variant_of(&self, other: &Self) -> bool {
326        self.with_default_data() == other.with_default_data()
327    }
328
329    /// Produces a copy of `self` with any inner data replaced with its default
330    /// value. Useful for making comparisons on just the variant of the
331    /// enum.
332    pub fn with_default_data(&self) -> Self {
333        match self {
334            TokenKind::Word(_) => TokenKind::Word(Default::default()),
335            TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
336            TokenKind::Number(..) => TokenKind::Number(Default::default()),
337            TokenKind::Space(_) => TokenKind::Space(Default::default()),
338            TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
339            _ => self.clone(),
340        }
341    }
342
343    /// Construct a [`TokenKind::Word`] with no metadata.
344    pub fn blank_word() -> Self {
345        Self::Word(None)
346    }
347
348    // Punctuation and symbol non-is methods
349
350    pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
351        self.as_mut_punctuation()?.as_mut_quote()
352    }
353
354    pub fn as_quote(&self) -> Option<&Quote> {
355        self.as_punctuation()?.as_quote()
356    }
357}
358
#[cfg(test)]
mod tests {
    use crate::{Document, TokenKind};

    /// Parses `text` as curated plain English and returns the kind of its
    /// first token.
    fn first_kind(text: &str) -> TokenKind {
        let doc = Document::new_plain_english_curated(text);
        let kind = doc
            .tokens()
            .next()
            .expect("document should contain at least one token")
            .kind
            .clone();
        kind
    }

    #[test]
    fn car_is_singular_noun() {
        assert!(first_kind("car").is_singular_noun());
    }

    #[test]
    fn traffic_is_mass_noun_only() {
        assert!(first_kind("traffic").is_mass_noun_only());
    }

    #[test]
    fn equipment_is_mass_noun() {
        assert!(first_kind("equipment").is_mass_noun());
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        assert!(first_kind("equipment").is_non_countable_noun());
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        assert!(!first_kind("equipment").is_countable_noun());
    }

    #[test]
    fn oov_word_is_oov() {
        assert!(first_kind("nonexistentword").is_oov());
    }

    #[test]
    fn known_word_is_not_oov() {
        assert!(!first_kind("car").is_oov());
    }

    #[test]
    fn non_word_tokens_are_not_oov() {
        let doc = Document::new_plain_english_curated("Hello, world!");

        // Select the non-word tokens by kind rather than by position: a
        // hard-coded index silently ends up on a word token (index 3 of this
        // sentence is "world", not the exclamation mark).
        let non_words: Vec<_> = doc.tokens().filter(|tok| !tok.kind.is_word()).collect();

        // Guard against a vacuous pass: both the comma and the sentence
        // terminator must actually be among the tokens under test.
        assert!(non_words.iter().any(|tok| tok.kind.is_comma()));
        assert!(non_words.iter().any(|tok| tok.kind.is_sentence_terminator()));

        for tok in non_words {
            assert!(!tok.kind.is_oov());
        }
    }
}