harper_core/
token_kind.rs

1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{Number, Punctuation, Quote, TokenKind::Word, WordMetadata};
6
/// Generates public boolean query methods that forward to the method of the
/// same name on the inner [`WordMetadata`], and attaches rustdoc to each one.
///
/// Accepts a comma-separated list of method names (a trailing comma is allowed).
/// Each generated method evaluates to `false` unless `self` is a `Word` that
/// actually carries metadata, so both non-word tokens and `Word(None)` yield
/// `false`.
macro_rules! delegate_to_metadata {
    ($($query:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`WordMetadata::",
                stringify!($query),
                "`] when this token is a word.\n\n",
                "Returns `false` if the token is not a word."
            )]
            pub fn $query(&self) -> bool {
                match self {
                    // Only a word with metadata can answer the query.
                    Word(Some(word_meta)) => word_meta.$query(),
                    _ => false,
                }
            }
        )*
    };
}
27
28/// The parsed value of a [`Token`](crate::Token).
29/// Has a variety of queries available.
30/// If there is a query missing, it may be easy to implement by just calling the
31/// `delegate_to_metadata` macro.
32#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
33#[serde(tag = "kind", content = "value")]
34pub enum TokenKind {
35    /// `None` if the word does not exist in the dictionary.
36    Word(Option<WordMetadata>),
37    Punctuation(Punctuation),
38    Decade,
39    Number(Number),
40    /// A sequence of " " spaces.
41    Space(usize),
42    /// A sequence of "\n" newlines
43    Newline(usize),
44    EmailAddress,
45    Url,
46    Hostname,
47    /// A special token used for things like inline code blocks that should be
48    /// ignored by all linters.
49    #[default]
50    Unlintable,
51    ParagraphBreak,
52    Regexish,
53}
54
55impl TokenKind {
56    // Word metadata delegation methods grouped by part of speech
57    delegate_to_metadata! {
58        // Nominal methods (nouns and pronouns)
59        is_nominal,
60        is_noun,
61        is_pronoun,
62        is_proper_noun,
63        is_singular_nominal,
64        is_plural_nominal,
65        is_possessive_nominal,
66        is_non_plural_nominal,
67        is_singular_noun,
68        is_plural_noun,
69        is_non_plural_noun,
70        is_countable_noun,
71        is_non_countable_noun,
72        is_mass_noun,
73        is_mass_noun_only,
74        is_non_mass_noun,
75        is_singular_pronoun,
76        is_plural_pronoun,
77        is_non_plural_pronoun,
78        is_reflexive_pronoun,
79        is_personal_pronoun,
80        is_first_person_singular_pronoun,
81        is_first_person_plural_pronoun,
82        is_second_person_pronoun,
83        is_third_person_pronoun,
84        is_third_person_singular_pronoun,
85        is_third_person_plural_pronoun,
86        is_object_pronoun,
87        is_possessive_noun,
88        is_possessive_pronoun,
89
90        // Verb methods
91        is_verb,
92        is_auxiliary_verb,
93        is_linking_verb,
94        is_verb_lemma,
95        is_verb_past_form,
96        is_verb_progressive_form,
97        is_verb_third_person_singular_present_form,
98
99        // Adjective methods
100        is_adjective,
101        is_comparative_adjective,
102        is_superlative_adjective,
103        is_positive_adjective,
104
105        // Adverb methods
106        is_adverb,
107
108        // Determiner methods
109        is_determiner,
110        is_demonstrative_determiner,
111        is_possessive_determiner,
112        is_quantifier,
113        is_non_quantifier_determiner,
114
115        // Conjunction methods
116        is_conjunction,
117
118        // Generic word methods
119        is_swear,
120        is_likely_homograph,
121
122        // Orthography methods
123        is_lowercase,
124        is_titlecase,
125        is_allcaps,
126        is_lower_camel,
127        is_upper_camel,
128        is_apostrophized,
129
130        is_roman_numerals
131    }
132
133    // Word metadata delegation methods not generated by macro
134    pub fn is_preposition(&self) -> bool {
135        let Word(Some(metadata)) = self else {
136            return false;
137        };
138        metadata.preposition
139    }
140
141    pub fn is_common_word(&self) -> bool {
142        let Word(Some(metadata)) = self else {
143            return true;
144        };
145        metadata.common
146    }
147
148    /// Checks whether the token is a member of a nominal phrase.
149    pub fn is_np_member(&self) -> bool {
150        let Word(Some(metadata)) = self else {
151            return false;
152        };
153        metadata.np_member.unwrap_or(false)
154    }
155
156    /// Checks whether a word token is out-of-vocabulary (not found in the dictionary).
157    ///
158    /// Returns `true` if the token is a word that was not found in the dictionary,
159    /// `false` if the token is a word found in the dictionary or is not a word token.
160    pub fn is_oov(&self) -> bool {
161        matches!(self, TokenKind::Word(None))
162    }
163
164    // Punctuation and symbol is-methods
165
166    pub fn is_open_square(&self) -> bool {
167        matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
168    }
169
170    pub fn is_close_square(&self) -> bool {
171        matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
172    }
173
174    pub fn is_open_round(&self) -> bool {
175        matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
176    }
177
178    pub fn is_close_round(&self) -> bool {
179        matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
180    }
181
182    pub fn is_pipe(&self) -> bool {
183        matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
184    }
185
186    pub fn is_currency(&self) -> bool {
187        matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
188    }
189
190    pub fn is_ellipsis(&self) -> bool {
191        matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
192    }
193
194    pub fn is_hyphen(&self) -> bool {
195        matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
196    }
197
198    pub fn is_quote(&self) -> bool {
199        matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
200    }
201
202    pub fn is_apostrophe(&self) -> bool {
203        matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
204    }
205
206    pub fn is_period(&self) -> bool {
207        matches!(self, TokenKind::Punctuation(Punctuation::Period))
208    }
209
210    pub fn is_at(&self) -> bool {
211        matches!(self, TokenKind::Punctuation(Punctuation::At))
212    }
213
214    pub fn is_comma(&self) -> bool {
215        matches!(self, TokenKind::Punctuation(Punctuation::Comma))
216    }
217
218    pub fn is_semicolon(&self) -> bool {
219        matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
220    }
221
222    pub fn is_ampersand(&self) -> bool {
223        matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
224    }
225
226    pub fn is_slash(&self) -> bool {
227        matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
228    }
229
230    // Miscellaneous is-methods
231
232    /// Checks whether a token is word-like--meaning it is more complex than punctuation and can
233    /// hold semantic meaning in the way a word does.
234    pub fn is_word_like(&self) -> bool {
235        matches!(
236            self,
237            TokenKind::Word(..)
238                | TokenKind::EmailAddress
239                | TokenKind::Hostname
240                | TokenKind::Decade
241                | TokenKind::Number(..)
242        )
243    }
244
245    pub(crate) fn is_chunk_terminator(&self) -> bool {
246        if self.is_sentence_terminator() {
247            return true;
248        }
249
250        match self {
251            TokenKind::Punctuation(punct) => {
252                matches!(
253                    punct,
254                    Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
255                )
256            }
257            _ => false,
258        }
259    }
260
261    pub(crate) fn is_sentence_terminator(&self) -> bool {
262        match self {
263            TokenKind::Punctuation(punct) => [
264                Punctuation::Period,
265                Punctuation::Bang,
266                Punctuation::Question,
267            ]
268            .contains(punct),
269            TokenKind::ParagraphBreak => true,
270            _ => false,
271        }
272    }
273
274    /// Used by `crate::parsers::CollapseIdentifiers`
275    /// TODO: Separate this into two functions and add OR functionality to
276    /// pattern matching
277    pub fn is_case_separator(&self) -> bool {
278        matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
279            || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
280    }
281
282    /// Checks whether the token is whitespace.
283    pub fn is_whitespace(&self) -> bool {
284        matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
285    }
286
287    pub fn is_upos(&self, upos: UPOS) -> bool {
288        let Some(Some(meta)) = self.as_word() else {
289            return false;
290        };
291
292        meta.pos_tag == Some(upos)
293    }
294
295    // Miscellaneous non-is methods
296
297    /// Checks that `self` is the same enum variant as `other`, regardless of
298    /// whether the inner metadata is also equal.
299    pub fn matches_variant_of(&self, other: &Self) -> bool {
300        self.with_default_data() == other.with_default_data()
301    }
302
303    /// Produces a copy of `self` with any inner data replaced with its default
304    /// value. Useful for making comparisons on just the variant of the
305    /// enum.
306    pub fn with_default_data(&self) -> Self {
307        match self {
308            TokenKind::Word(_) => TokenKind::Word(Default::default()),
309            TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
310            TokenKind::Number(..) => TokenKind::Number(Default::default()),
311            TokenKind::Space(_) => TokenKind::Space(Default::default()),
312            TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
313            _ => self.clone(),
314        }
315    }
316
317    /// Construct a [`TokenKind::Word`] with no metadata.
318    pub fn blank_word() -> Self {
319        Self::Word(None)
320    }
321
322    // Punctuation and symbol non-is methods
323
324    pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
325        self.as_mut_punctuation()?.as_mut_quote()
326    }
327
328    pub fn as_quote(&self) -> Option<&Quote> {
329        self.as_punctuation()?.as_quote()
330    }
331}
332
#[cfg(test)]
mod tests {
    use super::TokenKind;
    use crate::Document;

    /// Builds a document from `text` via `Document::new_plain_english_curated`
    /// and returns the kind of its first token.
    ///
    /// Panics if the document yields no tokens.
    fn first_kind(text: &str) -> TokenKind {
        let document = Document::new_plain_english_curated(text);
        let first_token = document.tokens().next().unwrap();
        first_token.kind.clone()
    }

    #[test]
    fn car_is_singular_noun() {
        assert!(first_kind("car").is_singular_noun());
    }

    #[test]
    fn traffic_is_mass_noun_only() {
        assert!(first_kind("traffic").is_mass_noun_only());
    }

    #[test]
    fn equipment_is_mass_noun() {
        assert!(first_kind("equipment").is_mass_noun());
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        assert!(first_kind("equipment").is_non_countable_noun());
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        assert!(!first_kind("equipment").is_countable_noun());
    }
}
371}