1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{
6 DictWordMetadata, Number, Punctuation, Quote, TokenKind::Word, dict_word_metadata::Person,
7};
8
/// Generates public boolean predicate methods that forward to a word's metadata.
///
/// Accepts a comma-separated list of method names (a trailing comma is allowed). For every
/// name, a method with the same name is emitted that calls the identically named method on the
/// metadata when `self` is a `Word` holding `Some` metadata, and evaluates to `false` for
/// everything else.
macro_rules! delegate_to_metadata {
    ($($predicate:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`DictWordMetadata::",
                stringify!($predicate),
                "`] when this token is a word.\n\n",
                "Returns `false` if the token is not a word."
            )]
            pub fn $predicate(&self) -> bool {
                match self {
                    Word(Some(metadata)) => metadata.$predicate(),
                    _ => false,
                }
            }
        )*
    };
}
29
30#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
35#[serde(tag = "kind", content = "value")]
36pub enum TokenKind {
37 Word(Option<DictWordMetadata>),
39 Punctuation(Punctuation),
40 Decade,
41 Number(Number),
42 Space(usize),
44 Newline(usize),
46 EmailAddress,
47 Url,
48 Hostname,
49 #[default]
52 Unlintable,
53 ParagraphBreak,
54 Regexish,
55 HeadingStart,
56}
57
58impl TokenKind {
59 delegate_to_metadata! {
61 is_nominal,
63 is_noun,
64 is_pronoun,
65 is_proper_noun,
66 is_singular_nominal,
67 is_plural_nominal,
68 is_possessive_nominal,
69 is_non_plural_nominal,
70 is_singular_noun,
71 is_plural_noun,
72 is_non_plural_noun,
73 is_non_possessive_noun,
74 is_countable_noun,
75 is_non_countable_noun,
76 is_mass_noun,
77 is_mass_noun_only,
78 is_non_mass_noun,
79 is_singular_pronoun,
80 is_plural_pronoun,
81 is_non_plural_pronoun,
82 is_reflexive_pronoun,
83 is_personal_pronoun,
84 is_first_person_singular_pronoun,
85 is_first_person_plural_pronoun,
86 is_second_person_pronoun,
87 is_third_person_pronoun,
88 is_third_person_singular_pronoun,
89 is_third_person_plural_pronoun,
90 is_subject_pronoun,
91 is_object_pronoun,
92 is_possessive_noun,
93 is_possessive_pronoun,
95
96 is_verb,
98 is_auxiliary_verb,
99 is_linking_verb,
100 is_verb_lemma,
101 is_verb_past_form,
102 is_verb_simple_past_form,
103 is_verb_past_participle_form,
104 is_verb_progressive_form,
105 is_verb_third_person_singular_present_form,
106
107 is_adjective,
109 is_comparative_adjective,
110 is_superlative_adjective,
111 is_positive_adjective,
112
113 is_adverb,
115 is_manner_adverb,
116 is_frequency_adverb,
117 is_degree_adverb,
118
119 is_determiner,
121 is_demonstrative_determiner,
122 is_possessive_determiner,
123 is_quantifier,
124 is_non_quantifier_determiner,
125 is_non_demonstrative_determiner,
126
127 is_conjunction,
129
130 is_swear,
132 is_likely_homograph,
133
134 is_lowercase,
136 is_titlecase,
137 is_allcaps,
138 is_lower_camel,
139 is_upper_camel,
140 is_apostrophized,
141
142 is_roman_numerals
143 }
144
145 pub fn get_pronoun_person(&self) -> Option<Person> {
146 let Word(Some(metadata)) = self else {
147 return None;
148 };
149 metadata.get_person()
150 }
151
152 pub fn is_preposition(&self) -> bool {
154 let Word(Some(metadata)) = self else {
155 return false;
156 };
157 metadata.preposition
158 }
159
160 pub fn is_common_word(&self) -> bool {
163 let Word(Some(metadata)) = self else {
164 return true;
165 };
166 metadata.common
167 }
168
169 pub fn is_np_member(&self) -> bool {
171 let Word(Some(metadata)) = self else {
172 return false;
173 };
174 metadata.np_member.unwrap_or(false)
175 }
176
177 pub fn is_oov(&self) -> bool {
182 matches!(self, TokenKind::Word(None))
183 }
184
185 pub fn is_cardinal_number(&self) -> bool {
188 matches!(self, TokenKind::Number(Number { suffix: None, .. }))
189 }
190
191 pub fn is_ordinal_number(&self) -> bool {
192 matches!(
193 self,
194 TokenKind::Number(Number {
195 suffix: Some(_),
196 ..
197 })
198 )
199 }
200
201 pub fn is_open_square(&self) -> bool {
204 matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
205 }
206
207 pub fn is_close_square(&self) -> bool {
208 matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
209 }
210
211 pub fn is_less_than(&self) -> bool {
212 matches!(self, TokenKind::Punctuation(Punctuation::LessThan))
213 }
214
215 pub fn is_greater_than(&self) -> bool {
216 matches!(self, TokenKind::Punctuation(Punctuation::GreaterThan))
217 }
218
219 pub fn is_open_round(&self) -> bool {
220 matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
221 }
222
223 pub fn is_close_round(&self) -> bool {
224 matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
225 }
226
227 pub fn is_pipe(&self) -> bool {
228 matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
229 }
230
231 pub fn is_currency(&self) -> bool {
232 matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
233 }
234
235 pub fn is_ellipsis(&self) -> bool {
236 matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
237 }
238
239 pub fn is_hyphen(&self) -> bool {
240 matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
241 }
242
243 pub fn is_quote(&self) -> bool {
244 matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
245 }
246
247 pub fn is_apostrophe(&self) -> bool {
248 matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
249 }
250
251 pub fn is_period(&self) -> bool {
252 matches!(self, TokenKind::Punctuation(Punctuation::Period))
253 }
254
255 pub fn is_at(&self) -> bool {
256 matches!(self, TokenKind::Punctuation(Punctuation::At))
257 }
258
259 pub fn is_comma(&self) -> bool {
260 matches!(self, TokenKind::Punctuation(Punctuation::Comma))
261 }
262
263 pub fn is_semicolon(&self) -> bool {
264 matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
265 }
266
267 pub fn is_ampersand(&self) -> bool {
268 matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
269 }
270
271 pub fn is_slash(&self) -> bool {
272 matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
273 }
274
275 pub fn is_word_like(&self) -> bool {
280 matches!(
281 self,
282 TokenKind::Word(..)
283 | TokenKind::EmailAddress
284 | TokenKind::Hostname
285 | TokenKind::Decade
286 | TokenKind::Number(..)
287 )
288 }
289
290 pub(crate) fn is_chunk_terminator(&self) -> bool {
291 if self.is_sentence_terminator() {
292 return true;
293 }
294
295 match self {
296 TokenKind::Punctuation(punct) => {
297 matches!(
298 punct,
299 Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
300 )
301 }
302 _ => false,
303 }
304 }
305
306 pub fn is_sentence_terminator(&self) -> bool {
307 match self {
308 TokenKind::Punctuation(punct) => [
309 Punctuation::Period,
310 Punctuation::Bang,
311 Punctuation::Question,
312 ]
313 .contains(punct),
314 TokenKind::ParagraphBreak => true,
315 _ => false,
316 }
317 }
318
319 pub fn is_case_separator(&self) -> bool {
323 matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
324 || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
325 }
326
327 pub fn is_whitespace(&self) -> bool {
329 matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
330 }
331
332 pub fn is_upos(&self, upos: UPOS) -> bool {
333 let Some(Some(meta)) = self.as_word() else {
334 return false;
335 };
336
337 meta.pos_tag == Some(upos)
338 }
339
340 pub fn matches_variant_of(&self, other: &Self) -> bool {
345 self.with_default_data() == other.with_default_data()
346 }
347
348 pub fn with_default_data(&self) -> Self {
352 match self {
353 TokenKind::Word(_) => TokenKind::Word(Default::default()),
354 TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
355 TokenKind::Number(..) => TokenKind::Number(Default::default()),
356 TokenKind::Space(_) => TokenKind::Space(Default::default()),
357 TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
358 _ => self.clone(),
359 }
360 }
361
362 pub fn blank_word() -> Self {
364 Self::Word(None)
365 }
366
367 pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
370 self.as_mut_punctuation()?.as_mut_quote()
371 }
372
373 pub fn as_quote(&self) -> Option<&Quote> {
374 self.as_punctuation()?.as_quote()
375 }
376}
377
#[cfg(test)]
mod tests {
    use super::TokenKind;
    use crate::Document;

    /// Builds a document from `text` with `Document::new_plain_english_curated` and returns the
    /// kind of its first token.
    fn first_token_kind(text: &str) -> TokenKind {
        let doc = Document::new_plain_english_curated(text);
        let kind = doc
            .tokens()
            .next()
            .expect("sample text should produce at least one token")
            .kind
            .clone();
        kind
    }

    #[test]
    fn car_is_singular_noun() {
        assert!(first_token_kind("car").is_singular_noun());
    }

    #[test]
    fn traffic_is_mass_noun_only() {
        assert!(first_token_kind("traffic").is_mass_noun_only());
    }

    #[test]
    fn equipment_is_mass_noun() {
        assert!(first_token_kind("equipment").is_mass_noun());
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        assert!(first_token_kind("equipment").is_non_countable_noun());
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        assert!(!first_token_kind("equipment").is_countable_noun());
    }

    #[test]
    fn oov_word_is_oov() {
        assert!(first_token_kind("nonexistentword").is_oov());
    }

    #[test]
    fn known_word_is_not_oov() {
        assert!(!first_token_kind("car").is_oov());
    }

    #[test]
    fn non_word_tokens_are_not_oov() {
        let doc = Document::new_plain_english_curated("Hello, world!");

        // Pick the tokens by kind instead of by position: hard-coded indices silently end up
        // on a word as soon as whitespace is counted as a token of its own, and then the
        // punctuation this test is about is never inspected.
        let non_words: Vec<_> = doc
            .tokens()
            .filter(|token| !token.kind.is_word())
            .collect();

        assert!(
            !non_words.is_empty(),
            "the sample text should contain non-word tokens"
        );
        assert!(non_words.iter().all(|token| !token.kind.is_oov()));
    }
}