harper_core/
token_kind.rs

1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{Number, Punctuation, Quote, TokenKind::Word, WordMetadata};
6
/// Generates `pub fn name(&self) -> bool` wrappers that forward to the
/// identically named method on the inner [`WordMetadata`].
///
/// Each generated method returns `false` unless `self` is a `Word` variant
/// that actually carries metadata. A doc comment naming the delegated method
/// is emitted for every wrapper.
macro_rules! delegate_to_metadata {
    ($($query:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`WordMetadata::",
                stringify!($query),
                "`] when this token is a word.\n\n",
                "Returns `false` if the token is not a word."
            )]
            pub fn $query(&self) -> bool {
                match self {
                    Word(Some(meta)) => meta.$query(),
                    _ => false,
                }
            }
        )*
    };
}
27
28/// The parsed value of a [`Token`](crate::Token).
29/// Has a variety of queries available.
30/// If there is a query missing, it may be easy to implement by just calling the
31/// `delegate_to_metadata` macro.
32#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
33#[serde(tag = "kind", content = "value")]
34pub enum TokenKind {
35    /// `None` if the word does not exist in the dictionary.
36    Word(Option<WordMetadata>),
37    Punctuation(Punctuation),
38    Decade,
39    Number(Number),
40    /// A sequence of " " spaces.
41    Space(usize),
42    /// A sequence of "\n" newlines
43    Newline(usize),
44    EmailAddress,
45    Url,
46    Hostname,
47    /// A special token used for things like inline code blocks that should be
48    /// ignored by all linters.
49    #[default]
50    Unlintable,
51    ParagraphBreak,
52    Regexish,
53}
54
55impl TokenKind {
56    // Word metadata delegation methods grouped by part of speech
57    delegate_to_metadata! {
58        // Nominal methods (nouns and pronouns)
59        is_nominal,
60        is_noun,
61        is_pronoun,
62        is_proper_noun,
63        is_singular_nominal,
64        is_plural_nominal,
65        is_possessive_nominal,
66        is_non_plural_nominal,
67        is_singular_noun,
68        is_plural_noun,
69        is_non_plural_noun,
70        is_countable_noun,
71        is_non_countable_noun,
72        is_mass_noun,
73        is_non_mass_noun,
74        is_singular_pronoun,
75        is_plural_pronoun,
76        is_non_plural_pronoun,
77        is_reflexive_pronoun,
78        is_first_person_singular_pronoun,
79        is_first_person_plural_pronoun,
80        is_second_person_pronoun,
81        is_third_person_pronoun,
82        is_third_person_singular_pronoun,
83        is_third_person_plural_pronoun,
84        is_object_pronoun,
85        is_possessive_noun,
86        is_possessive_pronoun,
87
88        // Verb methods
89        is_verb,
90        is_auxiliary_verb,
91        is_linking_verb,
92        is_verb_lemma,
93        is_verb_past_form,
94        is_verb_progressive_form,
95        is_verb_third_person_singular_present_form,
96
97        // Adjective methods
98        is_adjective,
99
100        // Adverb methods
101        is_adverb,
102
103        // Determiner methods
104        is_determiner,
105        is_demonstrative_determiner,
106        is_possessive_determiner,
107
108        // Conjunction methods
109        is_conjunction
110    }
111
112    // Word metadata delegation methods not generated by macro
113    pub fn is_preposition(&self) -> bool {
114        let Word(Some(metadata)) = self else {
115            return false;
116        };
117        metadata.preposition
118    }
119
120    pub fn is_swear(&self) -> bool {
121        let Word(Some(metadata)) = self else {
122            return false;
123        };
124        metadata.is_swear()
125    }
126
127    pub fn is_common_word(&self) -> bool {
128        let Word(Some(metadata)) = self else {
129            return true;
130        };
131        metadata.common
132    }
133
134    pub fn is_likely_homograph(&self) -> bool {
135        let Word(Some(metadata)) = self else {
136            return false;
137        };
138        metadata.is_likely_homograph()
139    }
140
141    // Punctuation and symbol is-methods
142
143    pub fn is_open_square(&self) -> bool {
144        matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
145    }
146
147    pub fn is_close_square(&self) -> bool {
148        matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
149    }
150
151    pub fn is_pipe(&self) -> bool {
152        matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
153    }
154
155    pub fn is_currency(&self) -> bool {
156        matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
157    }
158
159    pub fn is_ellipsis(&self) -> bool {
160        matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
161    }
162
163    pub fn is_hyphen(&self) -> bool {
164        matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
165    }
166
167    pub fn is_quote(&self) -> bool {
168        matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
169    }
170
171    pub fn is_apostrophe(&self) -> bool {
172        matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
173    }
174
175    pub fn is_period(&self) -> bool {
176        matches!(self, TokenKind::Punctuation(Punctuation::Period))
177    }
178
179    pub fn is_at(&self) -> bool {
180        matches!(self, TokenKind::Punctuation(Punctuation::At))
181    }
182
183    pub fn is_comma(&self) -> bool {
184        matches!(self, TokenKind::Punctuation(Punctuation::Comma))
185    }
186
187    // Miscellaneous is-methods
188
189    /// Checks whether a token is word-like--meaning it is more complex than punctuation and can
190    /// hold semantic meaning in the way a word does.
191    pub fn is_word_like(&self) -> bool {
192        matches!(
193            self,
194            TokenKind::Word(..)
195                | TokenKind::EmailAddress
196                | TokenKind::Hostname
197                | TokenKind::Decade
198                | TokenKind::Number(..)
199        )
200    }
201
202    pub(crate) fn is_chunk_terminator(&self) -> bool {
203        if self.is_sentence_terminator() {
204            return true;
205        }
206
207        match self {
208            TokenKind::Punctuation(punct) => {
209                matches!(
210                    punct,
211                    Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
212                )
213            }
214            _ => false,
215        }
216    }
217
218    pub(crate) fn is_sentence_terminator(&self) -> bool {
219        match self {
220            TokenKind::Punctuation(punct) => [
221                Punctuation::Period,
222                Punctuation::Bang,
223                Punctuation::Question,
224            ]
225            .contains(punct),
226            TokenKind::ParagraphBreak => true,
227            _ => false,
228        }
229    }
230
231    /// Used by `crate::parsers::CollapseIdentifiers`
232    /// TODO: Separate this into two functions and add OR functionality to
233    /// pattern matching
234    pub fn is_case_separator(&self) -> bool {
235        matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
236            || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
237    }
238
239    /// Checks whether the token is whitespace.
240    pub fn is_whitespace(&self) -> bool {
241        matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
242    }
243
244    pub fn is_upos(&self, upos: UPOS) -> bool {
245        let Some(Some(meta)) = self.as_word() else {
246            return false;
247        };
248
249        meta.pos_tag == Some(upos)
250    }
251
252    // Miscellaneous non-is methods
253
254    /// Checks that `self` is the same enum variant as `other`, regardless of
255    /// whether the inner metadata is also equal.
256    pub fn matches_variant_of(&self, other: &Self) -> bool {
257        self.with_default_data() == other.with_default_data()
258    }
259
260    /// Produces a copy of `self` with any inner data replaced with its default
261    /// value. Useful for making comparisons on just the variant of the
262    /// enum.
263    pub fn with_default_data(&self) -> Self {
264        match self {
265            TokenKind::Word(_) => TokenKind::Word(Default::default()),
266            TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
267            TokenKind::Number(..) => TokenKind::Number(Default::default()),
268            TokenKind::Space(_) => TokenKind::Space(Default::default()),
269            TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
270            _ => self.clone(),
271        }
272    }
273
274    /// Construct a [`TokenKind::Word`] with no metadata.
275    pub fn blank_word() -> Self {
276        Self::Word(None)
277    }
278
279    // Punctuation and symbol non-is methods
280
281    pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
282        self.as_mut_punctuation()?.as_mut_quote()
283    }
284
285    pub fn as_quote(&self) -> Option<&Quote> {
286        self.as_punctuation()?.as_quote()
287    }
288}
289
#[cfg(test)]
mod tests {
    use super::TokenKind;
    use crate::Document;

    /// Builds a curated plain-English document from `text` and returns the
    /// kind of its first token. Panics if the document has no tokens.
    fn first_kind(text: &str) -> TokenKind {
        Document::new_plain_english_curated(text)
            .tokens()
            .next()
            .unwrap()
            .kind
            .clone()
    }

    #[test]
    fn car_is_singular_noun() {
        assert!(first_kind("car").is_singular_noun());
    }

    #[test]
    fn traffic_is_mass_noun() {
        assert!(first_kind("traffic").is_mass_noun());
    }

    #[test]
    fn equipment_is_mass_noun() {
        assert!(first_kind("equipment").is_mass_noun());
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        assert!(first_kind("equipment").is_non_countable_noun());
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        assert!(!first_kind("equipment").is_countable_noun());
    }
}