1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{DictWordMetadata, Number, Punctuation, Quote, TokenKind::Word};
6
// Expands a comma-separated list of predicate names (trailing comma allowed) into
// `pub fn name(&self) -> bool` methods. Each generated method forwards to the
// identically named method on the metadata held by `Word(Some(_))`, and yields
// `false` for every other value of `self`.
//
// `Word` is resolved at the expansion site, so it must be in scope there.
macro_rules! delegate_to_metadata {
    ($($predicate:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`DictWordMetadata::",
                stringify!($predicate),
                "`] when this token is a word.\n\n",
                "Returns `false` if the token is not a word."
            )]
            pub fn $predicate(&self) -> bool {
                match self {
                    Word(Some(word_metadata)) => word_metadata.$predicate(),
                    _ => false,
                }
            }
        )*
    };
}
27
/// The kind of a token, together with any data specific to that kind.
///
/// With the serde attributes below, values are (de)serialized in adjacently tagged
/// form: the variant name is stored under `"kind"` and its payload under `"value"`.
///
/// The derived `PartialOrd` compares variants by declaration order first, so
/// reordering the variants changes comparison results.
#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
#[serde(tag = "kind", content = "value")]
pub enum TokenKind {
    /// A word. The payload is `None` when no metadata is attached, which is the
    /// case [`TokenKind::is_oov`] reports.
    Word(Option<DictWordMetadata>),
    /// A punctuation mark; the payload says which one.
    Punctuation(Punctuation),
    /// A decade.
    Decade,
    /// A number; see [`Number`] for the stored fields.
    Number(Number),
    /// A run of spaces. NOTE(review): the `usize` is presumably the run length — confirm.
    Space(usize),
    /// A run of newlines. NOTE(review): the `usize` is presumably the run length — confirm.
    Newline(usize),
    /// An email address.
    EmailAddress,
    /// A URL.
    Url,
    /// A hostname.
    Hostname,
    /// The default variant.
    #[default]
    Unlintable,
    /// A paragraph break. Treated as a sentence terminator by this type's predicates.
    ParagraphBreak,
    /// NOTE(review): presumably regex-like text — confirm against the lexer.
    Regexish,
    /// NOTE(review): presumably marks the start of a heading — confirm against the parsers.
    HeadingStart,
}
55
56impl TokenKind {
57 delegate_to_metadata! {
59 is_nominal,
61 is_noun,
62 is_pronoun,
63 is_proper_noun,
64 is_singular_nominal,
65 is_plural_nominal,
66 is_possessive_nominal,
67 is_non_plural_nominal,
68 is_singular_noun,
69 is_plural_noun,
70 is_non_plural_noun,
71 is_countable_noun,
72 is_non_countable_noun,
73 is_mass_noun,
74 is_mass_noun_only,
75 is_non_mass_noun,
76 is_singular_pronoun,
77 is_plural_pronoun,
78 is_non_plural_pronoun,
79 is_reflexive_pronoun,
80 is_personal_pronoun,
81 is_first_person_singular_pronoun,
82 is_first_person_plural_pronoun,
83 is_second_person_pronoun,
84 is_third_person_pronoun,
85 is_third_person_singular_pronoun,
86 is_third_person_plural_pronoun,
87 is_subject_pronoun,
88 is_object_pronoun,
89 is_possessive_noun,
90 is_possessive_pronoun,
91
92 is_verb,
94 is_auxiliary_verb,
95 is_linking_verb,
96 is_verb_lemma,
97 is_verb_past_form,
98 is_verb_simple_past_form,
99 is_verb_past_participle_form,
100 is_verb_progressive_form,
101 is_verb_third_person_singular_present_form,
102
103 is_adjective,
105 is_comparative_adjective,
106 is_superlative_adjective,
107 is_positive_adjective,
108
109 is_adverb,
111 is_manner_adverb,
112 is_frequency_adverb,
113 is_degree_adverb,
114
115 is_determiner,
117 is_demonstrative_determiner,
118 is_possessive_determiner,
119 is_quantifier,
120 is_non_quantifier_determiner,
121 is_non_demonstrative_determiner,
122
123 is_conjunction,
125
126 is_swear,
128 is_likely_homograph,
129
130 is_lowercase,
132 is_titlecase,
133 is_allcaps,
134 is_lower_camel,
135 is_upper_camel,
136 is_apostrophized,
137
138 is_roman_numerals
139 }
140
141 pub fn is_preposition(&self) -> bool {
143 let Word(Some(metadata)) = self else {
144 return false;
145 };
146 metadata.preposition
147 }
148
149 pub fn is_common_word(&self) -> bool {
152 let Word(Some(metadata)) = self else {
153 return true;
154 };
155 metadata.common
156 }
157
158 pub fn is_np_member(&self) -> bool {
160 let Word(Some(metadata)) = self else {
161 return false;
162 };
163 metadata.np_member.unwrap_or(false)
164 }
165
166 pub fn is_oov(&self) -> bool {
171 matches!(self, TokenKind::Word(None))
172 }
173
174 pub fn is_cardinal_number(&self) -> bool {
177 matches!(self, TokenKind::Number(Number { suffix: None, .. }))
178 }
179
180 pub fn is_ordinal_number(&self) -> bool {
181 matches!(
182 self,
183 TokenKind::Number(Number {
184 suffix: Some(_),
185 ..
186 })
187 )
188 }
189
190 pub fn is_open_square(&self) -> bool {
193 matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
194 }
195
196 pub fn is_close_square(&self) -> bool {
197 matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
198 }
199
200 pub fn is_open_round(&self) -> bool {
201 matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
202 }
203
204 pub fn is_close_round(&self) -> bool {
205 matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
206 }
207
208 pub fn is_pipe(&self) -> bool {
209 matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
210 }
211
212 pub fn is_currency(&self) -> bool {
213 matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
214 }
215
216 pub fn is_ellipsis(&self) -> bool {
217 matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
218 }
219
220 pub fn is_hyphen(&self) -> bool {
221 matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
222 }
223
224 pub fn is_quote(&self) -> bool {
225 matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
226 }
227
228 pub fn is_apostrophe(&self) -> bool {
229 matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
230 }
231
232 pub fn is_period(&self) -> bool {
233 matches!(self, TokenKind::Punctuation(Punctuation::Period))
234 }
235
236 pub fn is_at(&self) -> bool {
237 matches!(self, TokenKind::Punctuation(Punctuation::At))
238 }
239
240 pub fn is_comma(&self) -> bool {
241 matches!(self, TokenKind::Punctuation(Punctuation::Comma))
242 }
243
244 pub fn is_semicolon(&self) -> bool {
245 matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
246 }
247
248 pub fn is_ampersand(&self) -> bool {
249 matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
250 }
251
252 pub fn is_slash(&self) -> bool {
253 matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
254 }
255
256 pub fn is_word_like(&self) -> bool {
261 matches!(
262 self,
263 TokenKind::Word(..)
264 | TokenKind::EmailAddress
265 | TokenKind::Hostname
266 | TokenKind::Decade
267 | TokenKind::Number(..)
268 )
269 }
270
271 pub(crate) fn is_chunk_terminator(&self) -> bool {
272 if self.is_sentence_terminator() {
273 return true;
274 }
275
276 match self {
277 TokenKind::Punctuation(punct) => {
278 matches!(
279 punct,
280 Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
281 )
282 }
283 _ => false,
284 }
285 }
286
287 pub(crate) fn is_sentence_terminator(&self) -> bool {
288 match self {
289 TokenKind::Punctuation(punct) => [
290 Punctuation::Period,
291 Punctuation::Bang,
292 Punctuation::Question,
293 ]
294 .contains(punct),
295 TokenKind::ParagraphBreak => true,
296 _ => false,
297 }
298 }
299
300 pub fn is_case_separator(&self) -> bool {
304 matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
305 || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
306 }
307
308 pub fn is_whitespace(&self) -> bool {
310 matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
311 }
312
313 pub fn is_upos(&self, upos: UPOS) -> bool {
314 let Some(Some(meta)) = self.as_word() else {
315 return false;
316 };
317
318 meta.pos_tag == Some(upos)
319 }
320
321 pub fn matches_variant_of(&self, other: &Self) -> bool {
326 self.with_default_data() == other.with_default_data()
327 }
328
329 pub fn with_default_data(&self) -> Self {
333 match self {
334 TokenKind::Word(_) => TokenKind::Word(Default::default()),
335 TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
336 TokenKind::Number(..) => TokenKind::Number(Default::default()),
337 TokenKind::Space(_) => TokenKind::Space(Default::default()),
338 TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
339 _ => self.clone(),
340 }
341 }
342
343 pub fn blank_word() -> Self {
345 Self::Word(None)
346 }
347
348 pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
351 self.as_mut_punctuation()?.as_mut_quote()
352 }
353
354 pub fn as_quote(&self) -> Option<&Quote> {
355 self.as_punctuation()?.as_quote()
356 }
357}
358
#[cfg(test)]
mod tests {
    use super::TokenKind;
    use crate::Document;

    /// Lexes `text` as curated plain English and returns the kind of its first token.
    fn first_kind(text: &str) -> TokenKind {
        Document::new_plain_english_curated(text)
            .tokens()
            .next()
            .expect("text should produce at least one token")
            .kind
            .clone()
    }

    #[test]
    fn car_is_singular_noun() {
        assert!(first_kind("car").is_singular_noun());
    }

    #[test]
    fn traffic_is_mass_noun_only() {
        assert!(first_kind("traffic").is_mass_noun_only());
    }

    #[test]
    fn equipment_is_mass_noun() {
        assert!(first_kind("equipment").is_mass_noun());
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        assert!(first_kind("equipment").is_non_countable_noun());
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        assert!(!first_kind("equipment").is_countable_noun());
    }

    #[test]
    fn oov_word_is_oov() {
        assert!(first_kind("nonexistentword").is_oov());
    }

    #[test]
    fn known_word_is_not_oov() {
        assert!(!first_kind("car").is_oov());
    }

    #[test]
    fn non_word_tokens_are_not_oov() {
        let doc = Document::new_plain_english_curated("Hello, world!");

        // Select non-word tokens by kind rather than by position: whitespace is a
        // token too, so fixed indices can easily land on a word by mistake.
        let non_words: Vec<_> = doc.tokens().filter(|tok| !tok.kind.is_word()).collect();

        assert!(
            !non_words.is_empty(),
            "expected punctuation and whitespace tokens"
        );
        assert!(non_words.iter().all(|tok| !tok.kind.is_oov()));
    }
}