1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{
6 DictWordMetadata, Number, Punctuation, Quote, TokenKind::Word, dict_word_metadata::Person,
7};
8
/// Generates `pub fn name(&self) -> bool` predicates that forward to the
/// identically named method on the word's [`DictWordMetadata`].
///
/// Invoke inside an `impl` block with a comma-separated list of method names
/// (a trailing comma is accepted). Each generated method returns
/// `metadata.name()` when `self` is `Word(Some(metadata))` and `false` in
/// every other case, including `Word(None)`.
///
/// The expansion refers to the bare name `Word`, so that variant must be in
/// scope at the call site.
macro_rules! delegate_to_metadata {
    ($($method:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`DictWordMetadata::",
                stringify!($method),
                "`] when this token is a word with metadata.\n\n",
                "Returns `false` if the token is not a word, ",
                "or if it is a word without dictionary metadata."
            )]
            pub fn $method(&self) -> bool {
                match self {
                    Word(Some(metadata)) => metadata.$method(),
                    _ => false,
                }
            }
        )*
    };
}
29
30#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
35#[serde(tag = "kind", content = "value")]
36pub enum TokenKind {
37 Word(Option<DictWordMetadata>),
39 Punctuation(Punctuation),
40 Decade,
41 Number(Number),
42 Space(usize),
44 Newline(usize),
46 EmailAddress,
47 Url,
48 Hostname,
49 #[default]
52 Unlintable,
53 ParagraphBreak,
54 Regexish,
55 HeadingStart,
56}
57
58impl TokenKind {
59 delegate_to_metadata! {
61 is_nominal,
63 is_noun,
64 is_pronoun,
65 is_proper_noun,
66 is_singular_nominal,
67 is_plural_nominal,
68 is_possessive_nominal,
69 is_non_plural_nominal,
70 is_singular_noun,
71 is_plural_noun,
72 is_non_plural_noun,
73 is_non_possessive_noun,
74 is_countable_noun,
75 is_non_countable_noun,
76 is_mass_noun,
77 is_mass_noun_only,
78 is_non_mass_noun,
79 is_singular_pronoun,
80 is_plural_pronoun,
81 is_non_plural_pronoun,
82 is_reflexive_pronoun,
83 is_personal_pronoun,
84 is_first_person_singular_pronoun,
85 is_first_person_plural_pronoun,
86 is_second_person_pronoun,
87 is_third_person_pronoun,
88 is_third_person_singular_pronoun,
89 is_third_person_plural_pronoun,
90 is_subject_pronoun,
91 is_object_pronoun,
92 is_possessive_noun,
93 is_possessive_pronoun,
94
95 is_verb,
97 is_auxiliary_verb,
98 is_linking_verb,
99 is_verb_lemma,
100 is_verb_past_form,
101 is_verb_simple_past_form,
102 is_verb_past_participle_form,
103 is_verb_progressive_form,
104 is_verb_third_person_singular_present_form,
105
106 is_adjective,
108 is_comparative_adjective,
109 is_superlative_adjective,
110 is_positive_adjective,
111
112 is_adverb,
114 is_manner_adverb,
115 is_frequency_adverb,
116 is_degree_adverb,
117
118 is_determiner,
120 is_demonstrative_determiner,
121 is_possessive_determiner,
122 is_quantifier,
123 is_non_quantifier_determiner,
124 is_non_demonstrative_determiner,
125
126 is_conjunction,
128
129 is_swear,
131 is_likely_homograph,
132
133 is_lowercase,
135 is_titlecase,
136 is_allcaps,
137 is_lower_camel,
138 is_upper_camel,
139 is_apostrophized,
140
141 is_roman_numerals
142 }
143
144 pub fn get_pronoun_person(&self) -> Option<Person> {
145 let Word(Some(metadata)) = self else {
146 return None;
147 };
148 metadata.get_person()
149 }
150
151 pub fn is_preposition(&self) -> bool {
153 let Word(Some(metadata)) = self else {
154 return false;
155 };
156 metadata.preposition
157 }
158
159 pub fn is_common_word(&self) -> bool {
162 let Word(Some(metadata)) = self else {
163 return true;
164 };
165 metadata.common
166 }
167
168 pub fn is_np_member(&self) -> bool {
170 let Word(Some(metadata)) = self else {
171 return false;
172 };
173 metadata.np_member.unwrap_or(false)
174 }
175
176 pub fn is_oov(&self) -> bool {
181 matches!(self, TokenKind::Word(None))
182 }
183
184 pub fn is_cardinal_number(&self) -> bool {
187 matches!(self, TokenKind::Number(Number { suffix: None, .. }))
188 }
189
190 pub fn is_ordinal_number(&self) -> bool {
191 matches!(
192 self,
193 TokenKind::Number(Number {
194 suffix: Some(_),
195 ..
196 })
197 )
198 }
199
200 pub fn is_open_square(&self) -> bool {
203 matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
204 }
205
206 pub fn is_close_square(&self) -> bool {
207 matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
208 }
209
210 pub fn is_less_than(&self) -> bool {
211 matches!(self, TokenKind::Punctuation(Punctuation::LessThan))
212 }
213
214 pub fn is_greater_than(&self) -> bool {
215 matches!(self, TokenKind::Punctuation(Punctuation::GreaterThan))
216 }
217
218 pub fn is_open_round(&self) -> bool {
219 matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
220 }
221
222 pub fn is_close_round(&self) -> bool {
223 matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
224 }
225
226 pub fn is_pipe(&self) -> bool {
227 matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
228 }
229
230 pub fn is_currency(&self) -> bool {
231 matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
232 }
233
234 pub fn is_ellipsis(&self) -> bool {
235 matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
236 }
237
238 pub fn is_hyphen(&self) -> bool {
239 matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
240 }
241
242 pub fn is_quote(&self) -> bool {
243 matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
244 }
245
246 pub fn is_apostrophe(&self) -> bool {
247 matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
248 }
249
250 pub fn is_period(&self) -> bool {
251 matches!(self, TokenKind::Punctuation(Punctuation::Period))
252 }
253
254 pub fn is_at(&self) -> bool {
255 matches!(self, TokenKind::Punctuation(Punctuation::At))
256 }
257
258 pub fn is_comma(&self) -> bool {
259 matches!(self, TokenKind::Punctuation(Punctuation::Comma))
260 }
261
262 pub fn is_semicolon(&self) -> bool {
263 matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
264 }
265
266 pub fn is_ampersand(&self) -> bool {
267 matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
268 }
269
270 pub fn is_slash(&self) -> bool {
271 matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
272 }
273
274 pub fn is_word_like(&self) -> bool {
279 matches!(
280 self,
281 TokenKind::Word(..)
282 | TokenKind::EmailAddress
283 | TokenKind::Hostname
284 | TokenKind::Decade
285 | TokenKind::Number(..)
286 )
287 }
288
289 pub(crate) fn is_chunk_terminator(&self) -> bool {
290 if self.is_sentence_terminator() {
291 return true;
292 }
293
294 match self {
295 TokenKind::Punctuation(punct) => {
296 matches!(
297 punct,
298 Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
299 )
300 }
301 _ => false,
302 }
303 }
304
305 pub fn is_sentence_terminator(&self) -> bool {
306 match self {
307 TokenKind::Punctuation(punct) => [
308 Punctuation::Period,
309 Punctuation::Bang,
310 Punctuation::Question,
311 ]
312 .contains(punct),
313 TokenKind::ParagraphBreak => true,
314 _ => false,
315 }
316 }
317
318 pub fn is_case_separator(&self) -> bool {
322 matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
323 || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
324 }
325
326 pub fn is_whitespace(&self) -> bool {
328 matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
329 }
330
331 pub fn is_upos(&self, upos: UPOS) -> bool {
332 let Some(Some(meta)) = self.as_word() else {
333 return false;
334 };
335
336 meta.pos_tag == Some(upos)
337 }
338
339 pub fn matches_variant_of(&self, other: &Self) -> bool {
344 self.with_default_data() == other.with_default_data()
345 }
346
347 pub fn with_default_data(&self) -> Self {
351 match self {
352 TokenKind::Word(_) => TokenKind::Word(Default::default()),
353 TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
354 TokenKind::Number(..) => TokenKind::Number(Default::default()),
355 TokenKind::Space(_) => TokenKind::Space(Default::default()),
356 TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
357 _ => self.clone(),
358 }
359 }
360
361 pub fn blank_word() -> Self {
363 Self::Word(None)
364 }
365
366 pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
369 self.as_mut_punctuation()?.as_mut_quote()
370 }
371
372 pub fn as_quote(&self) -> Option<&Quote> {
373 self.as_punctuation()?.as_quote()
374 }
375}
376
#[cfg(test)]
mod tests {
    use crate::Document;

    /// Parses `text` with `Document::new_plain_english_curated` and returns
    /// the kind of its first token.
    fn first_token_kind(text: &str) -> crate::TokenKind {
        let doc = Document::new_plain_english_curated(text);
        doc.tokens()
            .next()
            .expect("test input should produce at least one token")
            .kind
            .clone()
    }

    #[test]
    fn car_is_singular_noun() {
        assert!(first_token_kind("car").is_singular_noun());
    }

    #[test]
    fn traffic_is_mass_noun_only() {
        assert!(first_token_kind("traffic").is_mass_noun_only());
    }

    #[test]
    fn equipment_is_mass_noun() {
        assert!(first_token_kind("equipment").is_mass_noun());
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        assert!(first_token_kind("equipment").is_non_countable_noun());
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        assert!(!first_token_kind("equipment").is_countable_noun());
    }

    #[test]
    fn oov_word_is_oov() {
        assert!(first_token_kind("nonexistentword").is_oov());
    }

    #[test]
    fn known_word_is_not_oov() {
        assert!(!first_token_kind("car").is_oov());
    }

    #[test]
    fn non_word_tokens_are_not_oov() {
        let doc = Document::new_plain_english_curated("Hello, world!");

        // Select by kind rather than by hard-coded index, so the check cannot
        // accidentally land on a word token.
        let non_words: Vec<_> = doc.tokens().filter(|tok| !tok.kind.is_word()).collect();

        // Guard against a vacuous pass: the input contains punctuation.
        assert!(!non_words.is_empty());
        assert!(non_words.iter().all(|tok| !tok.kind.is_oov()));
    }
}