1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{Number, Punctuation, Quote, TokenKind::Word, WordMetadata};
6
/// Generates `pub fn <method>(&self) -> bool` predicates that forward to the
/// `WordMetadata` method of the same name.
///
/// Takes a comma-separated list of method identifiers; a trailing comma is
/// accepted. It must be invoked inside an `impl` block where the name `Word`
/// resolves to a variant wrapping `Option<_>`, because the expansion matches
/// `self` against `Word(Some(metadata))`.
///
/// Every generated method returns `false` unless `self` is a word that
/// actually carries metadata, i.e. both non-word tokens and `Word(None)`
/// yield `false`.
macro_rules! delegate_to_metadata {
    ($($method:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`WordMetadata::",
                stringify!($method),
                "`] when this token is a word with metadata.\n\n",
                "Returns `false` if the token is not a word or carries no metadata."
            )]
            pub fn $method(&self) -> bool {
                // Anything other than a word with metadata has nothing to
                // delegate to, so the answer is `false`.
                let Word(Some(metadata)) = self else {
                    return false;
                };
                metadata.$method()
            }
        )*
    };
}
27
/// The kind of a token, together with any payload specific to that kind.
///
/// Serde serializes this enum in adjacently tagged form: the variant name is
/// stored under `"kind"` and the payload under `"value"`.
///
/// `PartialOrd` is derived, so comparisons follow the declaration order of
/// the variants; reordering them changes comparison results.
#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
#[serde(tag = "kind", content = "value")]
pub enum TokenKind {
    /// A word. The metadata is optional; the metadata-backed predicates on
    /// this type fall back to a fixed answer when it is `None`.
    Word(Option<WordMetadata>),
    /// A punctuation mark; the payload says which one.
    Punctuation(Punctuation),
    /// NOTE(review): presumably a decade expression such as "1990s" — confirm
    /// against the lexer.
    Decade,
    /// A number, carrying its parsed representation.
    Number(Number),
    /// Space characters. The `usize` is presumably the number of consecutive
    /// spaces — TODO confirm.
    Space(usize),
    /// Newline characters. The `usize` is presumably the number of
    /// consecutive newlines — TODO confirm.
    Newline(usize),
    /// An email address.
    EmailAddress,
    /// A URL.
    Url,
    /// A hostname.
    Hostname,
    /// The value produced by `TokenKind::default()`.
    ///
    /// NOTE(review): the name suggests content that linters should skip —
    /// confirm with the parsers that emit it.
    #[default]
    Unlintable,
    /// A break between paragraphs; treated as a sentence terminator.
    ParagraphBreak,
    /// NOTE(review): meaning is not evident from this file — presumably text
    /// that looks like a regular expression; confirm.
    Regexish,
}
54
55impl TokenKind {
56 delegate_to_metadata! {
58 is_nominal,
60 is_noun,
61 is_pronoun,
62 is_proper_noun,
63 is_singular_nominal,
64 is_plural_nominal,
65 is_possessive_nominal,
66 is_non_plural_nominal,
67 is_singular_noun,
68 is_plural_noun,
69 is_non_plural_noun,
70 is_countable_noun,
71 is_non_countable_noun,
72 is_mass_noun,
73 is_non_mass_noun,
74 is_singular_pronoun,
75 is_plural_pronoun,
76 is_non_plural_pronoun,
77 is_reflexive_pronoun,
78 is_first_person_singular_pronoun,
79 is_first_person_plural_pronoun,
80 is_second_person_pronoun,
81 is_third_person_pronoun,
82 is_third_person_singular_pronoun,
83 is_third_person_plural_pronoun,
84 is_object_pronoun,
85 is_possessive_noun,
86 is_possessive_pronoun,
87
88 is_verb,
90 is_auxiliary_verb,
91 is_linking_verb,
92 is_verb_lemma,
93 is_verb_past_form,
94 is_verb_progressive_form,
95 is_verb_third_person_singular_present_form,
96
97 is_adjective,
99
100 is_adverb,
102
103 is_determiner,
105 is_demonstrative_determiner,
106 is_possessive_determiner,
107
108 is_conjunction
110 }
111
112 pub fn is_preposition(&self) -> bool {
114 let Word(Some(metadata)) = self else {
115 return false;
116 };
117 metadata.preposition
118 }
119
120 pub fn is_swear(&self) -> bool {
121 let Word(Some(metadata)) = self else {
122 return false;
123 };
124 metadata.is_swear()
125 }
126
127 pub fn is_common_word(&self) -> bool {
128 let Word(Some(metadata)) = self else {
129 return true;
130 };
131 metadata.common
132 }
133
134 pub fn is_likely_homograph(&self) -> bool {
135 let Word(Some(metadata)) = self else {
136 return false;
137 };
138 metadata.is_likely_homograph()
139 }
140
141 pub fn is_open_square(&self) -> bool {
144 matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
145 }
146
147 pub fn is_close_square(&self) -> bool {
148 matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
149 }
150
151 pub fn is_pipe(&self) -> bool {
152 matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
153 }
154
155 pub fn is_currency(&self) -> bool {
156 matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
157 }
158
159 pub fn is_ellipsis(&self) -> bool {
160 matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
161 }
162
163 pub fn is_hyphen(&self) -> bool {
164 matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
165 }
166
167 pub fn is_quote(&self) -> bool {
168 matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
169 }
170
171 pub fn is_apostrophe(&self) -> bool {
172 matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
173 }
174
175 pub fn is_period(&self) -> bool {
176 matches!(self, TokenKind::Punctuation(Punctuation::Period))
177 }
178
179 pub fn is_at(&self) -> bool {
180 matches!(self, TokenKind::Punctuation(Punctuation::At))
181 }
182
183 pub fn is_comma(&self) -> bool {
184 matches!(self, TokenKind::Punctuation(Punctuation::Comma))
185 }
186
187 pub fn is_word_like(&self) -> bool {
192 matches!(
193 self,
194 TokenKind::Word(..)
195 | TokenKind::EmailAddress
196 | TokenKind::Hostname
197 | TokenKind::Decade
198 | TokenKind::Number(..)
199 )
200 }
201
202 pub(crate) fn is_chunk_terminator(&self) -> bool {
203 if self.is_sentence_terminator() {
204 return true;
205 }
206
207 match self {
208 TokenKind::Punctuation(punct) => {
209 matches!(
210 punct,
211 Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
212 )
213 }
214 _ => false,
215 }
216 }
217
218 pub(crate) fn is_sentence_terminator(&self) -> bool {
219 match self {
220 TokenKind::Punctuation(punct) => [
221 Punctuation::Period,
222 Punctuation::Bang,
223 Punctuation::Question,
224 ]
225 .contains(punct),
226 TokenKind::ParagraphBreak => true,
227 _ => false,
228 }
229 }
230
231 pub fn is_case_separator(&self) -> bool {
235 matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
236 || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
237 }
238
239 pub fn is_whitespace(&self) -> bool {
241 matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
242 }
243
244 pub fn is_upos(&self, upos: UPOS) -> bool {
245 let Some(Some(meta)) = self.as_word() else {
246 return false;
247 };
248
249 meta.pos_tag == Some(upos)
250 }
251
252 pub fn matches_variant_of(&self, other: &Self) -> bool {
257 self.with_default_data() == other.with_default_data()
258 }
259
260 pub fn with_default_data(&self) -> Self {
264 match self {
265 TokenKind::Word(_) => TokenKind::Word(Default::default()),
266 TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
267 TokenKind::Number(..) => TokenKind::Number(Default::default()),
268 TokenKind::Space(_) => TokenKind::Space(Default::default()),
269 TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
270 _ => self.clone(),
271 }
272 }
273
274 pub fn blank_word() -> Self {
276 Self::Word(None)
277 }
278
279 pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
282 self.as_mut_punctuation()?.as_mut_quote()
283 }
284
285 pub fn as_quote(&self) -> Option<&Quote> {
286 self.as_punctuation()?.as_quote()
287 }
288}
289
#[cfg(test)]
mod tests {
    use crate::Document;

    /// The first token of "car" must be a singular noun.
    #[test]
    fn car_is_singular_noun() {
        let document = Document::new_plain_english_curated("car");
        let first_token = document.tokens().next().unwrap();
        assert!(first_token.kind.is_singular_noun());
    }

    /// The first token of "traffic" must be a mass noun.
    #[test]
    fn traffic_is_mass_noun() {
        let document = Document::new_plain_english_curated("traffic");
        let first_token = document.tokens().next().unwrap();
        assert!(first_token.kind.is_mass_noun());
    }

    /// The first token of "equipment" must be a mass noun.
    #[test]
    fn equipment_is_mass_noun() {
        let document = Document::new_plain_english_curated("equipment");
        let first_token = document.tokens().next().unwrap();
        assert!(first_token.kind.is_mass_noun());
    }

    /// The first token of "equipment" must be a non-countable noun.
    #[test]
    fn equipment_is_non_countable_noun() {
        let document = Document::new_plain_english_curated("equipment");
        let first_token = document.tokens().next().unwrap();
        assert!(first_token.kind.is_non_countable_noun());
    }

    /// The first token of "equipment" must not be a countable noun.
    #[test]
    fn equipment_isnt_countable_noun() {
        let document = Document::new_plain_english_curated("equipment");
        let first_token = document.tokens().next().unwrap();
        assert!(!first_token.kind.is_countable_noun());
    }
}
328}