harper_core/
token_kind.rs

1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{Number, Punctuation, Quote, TokenKind::Word, WordMetadata};
6
/// Generates boolean predicate methods that forward to word metadata.
///
/// For every identifier in the comma-separated list (a trailing comma is
/// allowed) this expands to `pub fn <name>(&self) -> bool`. The generated
/// method calls the method of the same name on the metadata held by a
/// `Word(Some(..))` value and yields `false` for every other value.
///
/// `Word` is resolved where the macro is invoked, so it must be in scope there.
macro_rules! delegate_to_metadata {
    ($($predicate:ident),* $(,)?) => {
        $(
            pub fn $predicate(&self) -> bool {
                match self {
                    Word(Some(word_metadata)) => word_metadata.$predicate(),
                    // Not a word, or a word without metadata.
                    _ => false,
                }
            }
        )*
    };
}
19
/// The kind of a token, together with any data specific to that kind.
///
/// Serde (de)serializes this enum in adjacently tagged form: the variant name
/// is stored under the `"kind"` key and the variant's payload, if it has one,
/// under the `"value"` key.
///
/// The `Is` derive supplies additional per-variant helper methods; the `impl`
/// block below relies on `as_punctuation` and `as_mut_punctuation`, which are
/// not defined in this file.
///
/// NOTE(review): variant order feeds the derived `PartialOrd`, so reordering
/// variants changes comparison results.
#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
#[serde(tag = "kind", content = "value")]
pub enum TokenKind {
    /// `None` if the word does not exist in the dictionary.
    Word(Option<WordMetadata>),
    /// A punctuation mark; the payload identifies which one.
    Punctuation(Punctuation),
    /// NOTE(review): presumably a decade expression such as "1990s" — confirm
    /// against the lexer.
    Decade,
    /// A number; the payload is the parsed [`Number`].
    Number(Number),
    /// A sequence of " " spaces.
    ///
    /// NOTE(review): the `usize` is presumably the number of spaces — confirm.
    Space(usize),
    /// A sequence of "\n" newlines
    ///
    /// NOTE(review): the `usize` is presumably the number of newlines — confirm.
    Newline(usize),
    /// An email address.
    EmailAddress,
    /// A URL.
    Url,
    /// A hostname.
    Hostname,
    /// A special token used for things like inline code blocks that should be
    /// ignored by all linters.
    ///
    /// This is the variant produced by `TokenKind::default()`.
    #[default]
    Unlintable,
    /// A break between paragraphs. `is_sentence_terminator` treats it as
    /// ending a sentence.
    ParagraphBreak,
    /// NOTE(review): the meaning of this variant is not evident from this
    /// file — document it where it is produced.
    Regexish,
}
42
43impl TokenKind {
44    // Word metadata delegation methods grouped by part of speech
45    delegate_to_metadata! {
46        // Nominal methods (nouns and pronouns)
47        is_nominal,
48        is_noun,
49        is_pronoun,
50        is_proper_noun,
51        is_singular_nominal,
52        is_plural_nominal,
53        is_possessive_nominal,
54        is_non_plural_nominal,
55        is_singular_noun,
56        is_plural_noun,
57        is_non_plural_noun,
58        is_countable_noun,
59        is_mass_noun,
60        is_singular_pronoun,
61        is_plural_pronoun,
62        is_non_plural_pronoun,
63        is_reflexive_pronoun,
64        is_first_person_singular_pronoun,
65        is_first_person_plural_pronoun,
66        is_second_person_pronoun,
67        is_third_person_pronoun,
68        is_third_person_singular_pronoun,
69        is_third_person_plural_pronoun,
70        is_object_pronoun,
71        is_possessive_noun,
72        is_possessive_pronoun,
73
74        // Verb methods
75        is_verb,
76        is_auxiliary_verb,
77        is_linking_verb,
78        is_verb_lemma,
79        is_verb_past_form,
80        is_verb_progressive_form,
81        is_verb_third_person_singular_present_form,
82
83        // Adjective methods
84        is_adjective,
85
86        // Adverb methods
87        is_adverb,
88
89        // Determiner methods
90        is_determiner,
91        is_demonstrative_determiner,
92        is_possessive_determiner,
93
94        // Conjunction methods
95        is_conjunction
96    }
97
98    // Word metadata delegation methods not generated by macro
99    pub fn is_preposition(&self) -> bool {
100        let Word(Some(metadata)) = self else {
101            return false;
102        };
103        metadata.preposition
104    }
105
106    pub fn is_swear(&self) -> bool {
107        let Word(Some(metadata)) = self else {
108            return false;
109        };
110        metadata.is_swear()
111    }
112
113    pub fn is_common_word(&self) -> bool {
114        let Word(Some(metadata)) = self else {
115            return true;
116        };
117        metadata.common
118    }
119
120    pub fn is_likely_homograph(&self) -> bool {
121        let Word(Some(metadata)) = self else {
122            return false;
123        };
124        metadata.is_likely_homograph()
125    }
126
127    // Punctuation and symbol is-methods
128
129    pub fn is_open_square(&self) -> bool {
130        matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
131    }
132
133    pub fn is_close_square(&self) -> bool {
134        matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
135    }
136
137    pub fn is_pipe(&self) -> bool {
138        matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
139    }
140
141    pub fn is_currency(&self) -> bool {
142        matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
143    }
144
145    pub fn is_ellipsis(&self) -> bool {
146        matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
147    }
148
149    pub fn is_hyphen(&self) -> bool {
150        matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
151    }
152
153    pub fn is_quote(&self) -> bool {
154        matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
155    }
156
157    pub fn is_apostrophe(&self) -> bool {
158        matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
159    }
160
161    pub fn is_period(&self) -> bool {
162        matches!(self, TokenKind::Punctuation(Punctuation::Period))
163    }
164
165    pub fn is_at(&self) -> bool {
166        matches!(self, TokenKind::Punctuation(Punctuation::At))
167    }
168
169    pub fn is_comma(&self) -> bool {
170        matches!(self, TokenKind::Punctuation(Punctuation::Comma))
171    }
172
173    // Miscellaneous is-methods
174
175    /// Checks whether a token is word-like--meaning it is more complex than punctuation and can
176    /// hold semantic meaning in the way a word does.
177    pub fn is_word_like(&self) -> bool {
178        matches!(
179            self,
180            TokenKind::Word(..)
181                | TokenKind::EmailAddress
182                | TokenKind::Hostname
183                | TokenKind::Decade
184                | TokenKind::Number(..)
185        )
186    }
187
188    pub(crate) fn is_chunk_terminator(&self) -> bool {
189        if self.is_sentence_terminator() {
190            return true;
191        }
192
193        match self {
194            TokenKind::Punctuation(punct) => {
195                matches!(
196                    punct,
197                    Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
198                )
199            }
200            _ => false,
201        }
202    }
203
204    pub(crate) fn is_sentence_terminator(&self) -> bool {
205        match self {
206            TokenKind::Punctuation(punct) => [
207                Punctuation::Period,
208                Punctuation::Bang,
209                Punctuation::Question,
210            ]
211            .contains(punct),
212            TokenKind::ParagraphBreak => true,
213            _ => false,
214        }
215    }
216
217    /// Used by `crate::parsers::CollapseIdentifiers`
218    /// TODO: Separate this into two functions and add OR functionality to
219    /// pattern matching
220    pub fn is_case_separator(&self) -> bool {
221        matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
222            || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
223    }
224
225    /// Checks whether the token is whitespace.
226    pub fn is_whitespace(&self) -> bool {
227        matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
228    }
229
230    pub fn is_upos(&self, upos: UPOS) -> bool {
231        let Some(Some(meta)) = self.as_word() else {
232            return false;
233        };
234
235        meta.pos_tag == Some(upos)
236    }
237
238    // Miscellaneous non-is methods
239
240    /// Checks that `self` is the same enum variant as `other`, regardless of
241    /// whether the inner metadata is also equal.
242    pub fn matches_variant_of(&self, other: &Self) -> bool {
243        self.with_default_data() == other.with_default_data()
244    }
245
246    /// Produces a copy of `self` with any inner data replaced with its default
247    /// value. Useful for making comparisons on just the variant of the
248    /// enum.
249    pub fn with_default_data(&self) -> Self {
250        match self {
251            TokenKind::Word(_) => TokenKind::Word(Default::default()),
252            TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
253            TokenKind::Number(..) => TokenKind::Number(Default::default()),
254            TokenKind::Space(_) => TokenKind::Space(Default::default()),
255            TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
256            _ => self.clone(),
257        }
258    }
259
260    /// Construct a [`TokenKind::Word`] with no metadata.
261    pub fn blank_word() -> Self {
262        Self::Word(None)
263    }
264
265    // Punctuation and symbol non-is methods
266
267    pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
268        self.as_mut_punctuation()?.as_mut_quote()
269    }
270
271    pub fn as_quote(&self) -> Option<&Quote> {
272        self.as_punctuation()?.as_quote()
273    }
274}
275
#[cfg(test)]
mod tests {
    use crate::Document;

    /// The first token of "car" must be recognised as a singular noun.
    #[test]
    fn car_is_singular_noun() {
        let document = Document::new_plain_english_curated("car");
        let first_token = document.tokens().next().unwrap();
        assert!(first_token.kind.is_singular_noun());
    }

    /// The first token of "traffic" must be recognised as a mass noun.
    #[test]
    fn traffic_is_mass_noun() {
        let document = Document::new_plain_english_curated("traffic");
        let first_token = document.tokens().next().unwrap();
        assert!(first_token.kind.is_mass_noun());
    }
}
293}