1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{
6 DictWordMetadata, Number, Punctuation, Quote, TokenKind::Word, dict_word_metadata::Person,
7};
8
/// Generates public boolean predicate methods that forward to the identically
/// named method on the word's [`DictWordMetadata`].
///
/// Takes a comma-separated list of method names (a trailing comma is allowed).
/// Each generated method returns `false` unless the receiver is a `Word`
/// holding `Some` metadata.
macro_rules! delegate_to_metadata {
    ($($method:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Forwards to [`DictWordMetadata::",
                stringify!($method),
                "`] when this token is a word with dictionary metadata.\n\n",
                "Returns `false` for every other token, including words without metadata."
            )]
            pub fn $method(&self) -> bool {
                match self {
                    Word(Some(metadata)) => metadata.$method(),
                    _ => false,
                }
            }
        )*
    };
}
29
30#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
35#[serde(tag = "kind", content = "value")]
36pub enum TokenKind {
37 Word(Option<DictWordMetadata>),
39 Punctuation(Punctuation),
40 Decade,
41 Number(Number),
42 Space(usize),
44 Newline(usize),
46 EmailAddress,
47 Url,
48 Hostname,
49 #[default]
52 Unlintable,
53 ParagraphBreak,
54 Regexish,
55 HeadingStart,
56}
57
58impl TokenKind {
59 delegate_to_metadata! {
61 is_nominal,
63 is_noun,
64 is_pronoun,
65 is_proper_noun,
66 is_singular_nominal,
67 is_plural_nominal,
68 is_possessive_nominal,
69 is_non_plural_nominal,
70 is_singular_noun,
71 is_plural_noun,
72 is_non_plural_noun,
73 is_non_possessive_noun,
74 is_countable_noun,
75 is_non_countable_noun,
76 is_mass_noun,
77 is_mass_noun_only,
78 is_non_mass_noun,
79 is_singular_pronoun,
80 is_plural_pronoun,
81 is_non_plural_pronoun,
82 is_reflexive_pronoun,
83 is_personal_pronoun,
84 is_first_person_singular_pronoun,
85 is_first_person_plural_pronoun,
86 is_second_person_pronoun,
87 is_third_person_pronoun,
88 is_third_person_singular_pronoun,
89 is_third_person_plural_pronoun,
90 is_subject_pronoun,
91 is_object_pronoun,
92 is_possessive_noun,
93 is_possessive_pronoun,
95
96 is_verb,
98 is_auxiliary_verb,
99 is_linking_verb,
100 is_verb_lemma,
101 is_verb_past_form,
102 is_verb_regular_past_form,
103 is_verb_simple_past_form,
104 is_verb_past_participle_form,
105 is_verb_simple_past_only,
106 is_verb_past_participle_only,
107 is_verb_progressive_form,
108 is_verb_third_person_singular_present_form,
109
110 is_adjective,
112 is_comparative_adjective,
113 is_superlative_adjective,
114 is_positive_adjective,
115
116 is_adverb,
118 is_manner_adverb,
119 is_frequency_adverb,
120 is_degree_adverb,
121
122 is_determiner,
124 is_demonstrative_determiner,
125 is_possessive_determiner,
126 is_quantifier,
127 is_non_quantifier_determiner,
128 is_non_demonstrative_determiner,
129
130 is_conjunction,
132
133 is_swear,
135 is_likely_homograph,
136
137 is_lowercase,
139 is_titlecase,
140 is_allcaps,
141 is_lower_camel,
142 is_upper_camel,
143 is_apostrophized,
144
145 is_roman_numerals
146 }
147
148 pub fn get_pronoun_person(&self) -> Option<Person> {
149 let Word(Some(metadata)) = self else {
150 return None;
151 };
152 metadata.get_person()
153 }
154
155 pub fn is_preposition(&self) -> bool {
157 let Word(Some(metadata)) = self else {
158 return false;
159 };
160 metadata.preposition
161 }
162
163 pub fn is_common_word(&self) -> bool {
166 let Word(Some(metadata)) = self else {
167 return true;
168 };
169 metadata.common
170 }
171
172 pub fn is_np_member(&self) -> bool {
174 let Word(Some(metadata)) = self else {
175 return false;
176 };
177 metadata.np_member.unwrap_or(false)
178 }
179
180 pub fn is_oov(&self) -> bool {
185 matches!(self, TokenKind::Word(None))
186 }
187
188 pub fn is_cardinal_number(&self) -> bool {
191 matches!(self, TokenKind::Number(Number { suffix: None, .. }))
192 }
193
194 pub fn is_ordinal_number(&self) -> bool {
195 matches!(
196 self,
197 TokenKind::Number(Number {
198 suffix: Some(_),
199 ..
200 })
201 )
202 }
203
204 pub fn is_open_square(&self) -> bool {
207 matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
208 }
209
210 pub fn is_close_square(&self) -> bool {
211 matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
212 }
213
214 pub fn is_less_than(&self) -> bool {
215 matches!(self, TokenKind::Punctuation(Punctuation::LessThan))
216 }
217
218 pub fn is_greater_than(&self) -> bool {
219 matches!(self, TokenKind::Punctuation(Punctuation::GreaterThan))
220 }
221
222 pub fn is_open_round(&self) -> bool {
223 matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
224 }
225
226 pub fn is_close_round(&self) -> bool {
227 matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
228 }
229
230 pub fn is_pipe(&self) -> bool {
231 matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
232 }
233
234 pub fn is_currency(&self) -> bool {
235 matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
236 }
237
238 pub fn is_ellipsis(&self) -> bool {
239 matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
240 }
241
242 pub fn is_hyphen(&self) -> bool {
244 matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
245 }
246
247 pub fn is_plus(&self) -> bool {
248 matches!(self, TokenKind::Punctuation(Punctuation::Plus))
249 }
250
251 pub fn is_quote(&self) -> bool {
252 matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
253 }
254
255 pub fn is_apostrophe(&self) -> bool {
256 matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
257 }
258
259 pub fn is_period(&self) -> bool {
260 matches!(self, TokenKind::Punctuation(Punctuation::Period))
261 }
262
263 pub fn is_at(&self) -> bool {
264 matches!(self, TokenKind::Punctuation(Punctuation::At))
265 }
266
267 pub fn is_comma(&self) -> bool {
268 matches!(self, TokenKind::Punctuation(Punctuation::Comma))
269 }
270
271 pub fn is_semicolon(&self) -> bool {
272 matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
273 }
274
275 pub fn is_acute(&self) -> bool {
276 matches!(self, TokenKind::Punctuation(Punctuation::Acute))
277 }
278
279 pub fn is_ampersand(&self) -> bool {
280 matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
281 }
282
283 pub fn is_backslash(&self) -> bool {
284 matches!(self, TokenKind::Punctuation(Punctuation::Backslash))
285 }
286
287 pub fn is_slash(&self) -> bool {
288 matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
289 }
290
291 pub fn is_percent(&self) -> bool {
292 matches!(self, TokenKind::Punctuation(Punctuation::Percent))
293 }
294
295 pub fn is_backtick(&self) -> bool {
296 matches!(self, TokenKind::Punctuation(Punctuation::Backtick))
297 }
298
299 pub fn is_word_like(&self) -> bool {
304 matches!(
305 self,
306 TokenKind::Word(..)
307 | TokenKind::EmailAddress
308 | TokenKind::Hostname
309 | TokenKind::Decade
310 | TokenKind::Number(..)
311 )
312 }
313
314 pub(crate) fn is_chunk_terminator(&self) -> bool {
315 if self.is_sentence_terminator() {
316 return true;
317 }
318
319 match self {
320 TokenKind::Punctuation(punct) => {
321 matches!(
322 punct,
323 Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
324 )
325 }
326 _ => false,
327 }
328 }
329
330 pub fn is_sentence_terminator(&self) -> bool {
331 match self {
332 TokenKind::Punctuation(punct) => [
333 Punctuation::Period,
334 Punctuation::Bang,
335 Punctuation::Question,
336 ]
337 .contains(punct),
338 TokenKind::ParagraphBreak => true,
339 _ => false,
340 }
341 }
342
343 pub fn is_case_separator(&self) -> bool {
347 matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
348 || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
349 }
350
351 pub fn is_whitespace(&self) -> bool {
353 matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
354 }
355
356 pub fn is_upos(&self, upos: UPOS) -> bool {
357 let Some(Some(meta)) = self.as_word() else {
358 return false;
359 };
360
361 meta.pos_tag == Some(upos)
362 }
363
364 pub fn matches_variant_of(&self, other: &Self) -> bool {
369 self.with_default_data() == other.with_default_data()
370 }
371
372 pub fn with_default_data(&self) -> Self {
376 match self {
377 TokenKind::Word(_) => TokenKind::Word(Default::default()),
378 TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
379 TokenKind::Number(..) => TokenKind::Number(Default::default()),
380 TokenKind::Space(_) => TokenKind::Space(Default::default()),
381 TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
382 _ => self.clone(),
383 }
384 }
385
386 pub fn blank_word() -> Self {
388 Self::Word(None)
389 }
390
391 pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
394 self.as_mut_punctuation()?.as_mut_quote()
395 }
396
397 pub fn as_quote(&self) -> Option<&Quote> {
398 self.as_punctuation()?.as_quote()
399 }
400}
401
#[cfg(test)]
mod tests {
    use super::TokenKind;
    use crate::Document;

    /// Parses `text` as curated plain English and returns the kind of its
    /// first token.
    fn first_kind(text: &str) -> TokenKind {
        let doc = Document::new_plain_english_curated(text);
        doc.tokens()
            .next()
            .expect("document should contain at least one token")
            .kind
            .clone()
    }

    #[test]
    fn car_is_singular_noun() {
        assert!(first_kind("car").is_singular_noun());
    }

    #[test]
    fn traffic_is_mass_noun_only() {
        assert!(first_kind("traffic").is_mass_noun_only());
    }

    #[test]
    fn equipment_is_mass_noun() {
        assert!(first_kind("equipment").is_mass_noun());
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        assert!(first_kind("equipment").is_non_countable_noun());
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        assert!(!first_kind("equipment").is_countable_noun());
    }

    #[test]
    fn ate_is_simple_past_only() {
        let kind = first_kind("ate");
        assert!(kind.is_verb_simple_past_only());
        assert!(!kind.is_verb_past_participle_only());
    }

    #[test]
    fn eaten_is_past_participle_only() {
        let kind = first_kind("eaten");
        assert!(kind.is_verb_past_participle_only());
        assert!(!kind.is_verb_simple_past_only());
    }

    #[test]
    fn thought_is_regular_past_form() {
        assert!(first_kind("thought").is_verb_regular_past_form());
    }

    #[test]
    fn oov_word_is_oov() {
        assert!(first_kind("nonexistentword").is_oov());
    }

    #[test]
    fn known_word_is_not_oov() {
        assert!(!first_kind("car").is_oov());
    }

    #[test]
    fn non_word_tokens_are_not_oov() {
        let doc = Document::new_plain_english_curated("Hello, world!");

        // Select tokens by kind rather than by position. Positional indexing is
        // fragile because whitespace is tokenized too: index 3 of this input is
        // presumably the word `world`, so it would leave the trailing `!` unchecked.
        let oov_flags: Vec<bool> = doc
            .tokens()
            .filter(|token| !token.kind.is_word())
            .map(|token| token.kind.is_oov())
            .collect();

        // Guard against a vacuous pass if the input ever yields no non-word tokens.
        assert!(!oov_flags.is_empty());
        assert!(oov_flags.iter().all(|&is_oov| !is_oov));
    }
}