1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{DictWordMetadata, Number, Punctuation, Quote, TokenKind::Word};
6
// Generates one public `fn name(&self) -> bool` per listed identifier.
//
// Every generated method forwards to the `DictWordMetadata` method of the same
// name when `self` is a `Word` carrying metadata, and evaluates to `false` for
// every other token kind (including a `Word` without metadata). The macro is
// meant to be invoked inside an `impl` block; a trailing comma is accepted.
macro_rules! delegate_to_metadata {
    ($($method:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`DictWordMetadata::",
                stringify!($method),
                "`] when this token is a word.\n\n",
                "Returns `false` if the token is not a word."
            )]
            pub fn $method(&self) -> bool {
                match self {
                    Word(Some(metadata)) => metadata.$method(),
                    _ => false,
                }
            }
        )*
    };
}
27
/// The kind of a token, together with any kind-specific payload.
///
/// Serde stores the variant name under the `"kind"` key and the variant's
/// payload under the `"value"` key (adjacently tagged representation).
///
/// The `Is` derive supplies per-variant helpers; this file relies on
/// `as_word`, `as_punctuation` and `as_mut_punctuation`.
#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
#[serde(tag = "kind", content = "value")]
pub enum TokenKind {
    /// A word. The payload is `None` when no dictionary metadata is attached,
    /// which is exactly the case `is_oov` reports.
    Word(Option<DictWordMetadata>),
    /// A punctuation mark; the payload identifies which one.
    Punctuation(Punctuation),
    // NOTE(review): presumably a decade expression such as "1990s" — confirm
    // against the lexer.
    Decade,
    /// A number; the payload holds the parsed [`Number`].
    Number(Number),
    // NOTE(review): the `usize` is presumably the count of consecutive space
    // characters — confirm against the lexer.
    Space(usize),
    // NOTE(review): the `usize` is presumably the count of consecutive
    // newlines — confirm against the lexer.
    Newline(usize),
    /// An email address.
    EmailAddress,
    /// A URL.
    Url,
    /// A hostname.
    Hostname,
    /// The default variant (what `TokenKind::default()` returns).
    #[default]
    Unlintable,
    /// A paragraph break; treated as a sentence terminator by
    /// `is_sentence_terminator`.
    ParagraphBreak,
    // NOTE(review): the meaning of this variant is not evident from this file.
    Regexish,
}
54
55impl TokenKind {
56 delegate_to_metadata! {
58 is_nominal,
60 is_noun,
61 is_pronoun,
62 is_proper_noun,
63 is_singular_nominal,
64 is_plural_nominal,
65 is_possessive_nominal,
66 is_non_plural_nominal,
67 is_singular_noun,
68 is_plural_noun,
69 is_non_plural_noun,
70 is_countable_noun,
71 is_non_countable_noun,
72 is_mass_noun,
73 is_mass_noun_only,
74 is_non_mass_noun,
75 is_singular_pronoun,
76 is_plural_pronoun,
77 is_non_plural_pronoun,
78 is_reflexive_pronoun,
79 is_personal_pronoun,
80 is_first_person_singular_pronoun,
81 is_first_person_plural_pronoun,
82 is_second_person_pronoun,
83 is_third_person_pronoun,
84 is_third_person_singular_pronoun,
85 is_third_person_plural_pronoun,
86 is_subject_pronoun,
87 is_object_pronoun,
88 is_possessive_noun,
89 is_possessive_pronoun,
90
91 is_verb,
93 is_auxiliary_verb,
94 is_linking_verb,
95 is_verb_lemma,
96 is_verb_simple_past_form,
97 is_verb_past_participle_form,
98 is_verb_progressive_form,
99 is_verb_third_person_singular_present_form,
100
101 is_adjective,
103 is_comparative_adjective,
104 is_superlative_adjective,
105 is_positive_adjective,
106
107 is_adverb,
109
110 is_determiner,
112 is_demonstrative_determiner,
113 is_possessive_determiner,
114 is_quantifier,
115 is_non_quantifier_determiner,
116
117 is_conjunction,
119
120 is_swear,
122 is_likely_homograph,
123
124 is_lowercase,
126 is_titlecase,
127 is_allcaps,
128 is_lower_camel,
129 is_upper_camel,
130 is_apostrophized,
131
132 is_roman_numerals
133 }
134
135 pub fn is_preposition(&self) -> bool {
137 let Word(Some(metadata)) = self else {
138 return false;
139 };
140 metadata.preposition
141 }
142
143 pub fn is_common_word(&self) -> bool {
146 let Word(Some(metadata)) = self else {
147 return true;
148 };
149 metadata.common
150 }
151
152 pub fn is_np_member(&self) -> bool {
154 let Word(Some(metadata)) = self else {
155 return false;
156 };
157 metadata.np_member.unwrap_or(false)
158 }
159
160 pub fn is_oov(&self) -> bool {
165 matches!(self, TokenKind::Word(None))
166 }
167
168 pub fn is_open_square(&self) -> bool {
171 matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
172 }
173
174 pub fn is_close_square(&self) -> bool {
175 matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
176 }
177
178 pub fn is_open_round(&self) -> bool {
179 matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
180 }
181
182 pub fn is_close_round(&self) -> bool {
183 matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
184 }
185
186 pub fn is_pipe(&self) -> bool {
187 matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
188 }
189
190 pub fn is_currency(&self) -> bool {
191 matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
192 }
193
194 pub fn is_ellipsis(&self) -> bool {
195 matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
196 }
197
198 pub fn is_hyphen(&self) -> bool {
199 matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
200 }
201
202 pub fn is_quote(&self) -> bool {
203 matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
204 }
205
206 pub fn is_apostrophe(&self) -> bool {
207 matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
208 }
209
210 pub fn is_period(&self) -> bool {
211 matches!(self, TokenKind::Punctuation(Punctuation::Period))
212 }
213
214 pub fn is_at(&self) -> bool {
215 matches!(self, TokenKind::Punctuation(Punctuation::At))
216 }
217
218 pub fn is_comma(&self) -> bool {
219 matches!(self, TokenKind::Punctuation(Punctuation::Comma))
220 }
221
222 pub fn is_semicolon(&self) -> bool {
223 matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
224 }
225
226 pub fn is_ampersand(&self) -> bool {
227 matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
228 }
229
230 pub fn is_slash(&self) -> bool {
231 matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
232 }
233
234 pub fn is_word_like(&self) -> bool {
239 matches!(
240 self,
241 TokenKind::Word(..)
242 | TokenKind::EmailAddress
243 | TokenKind::Hostname
244 | TokenKind::Decade
245 | TokenKind::Number(..)
246 )
247 }
248
249 pub(crate) fn is_chunk_terminator(&self) -> bool {
250 if self.is_sentence_terminator() {
251 return true;
252 }
253
254 match self {
255 TokenKind::Punctuation(punct) => {
256 matches!(
257 punct,
258 Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
259 )
260 }
261 _ => false,
262 }
263 }
264
265 pub(crate) fn is_sentence_terminator(&self) -> bool {
266 match self {
267 TokenKind::Punctuation(punct) => [
268 Punctuation::Period,
269 Punctuation::Bang,
270 Punctuation::Question,
271 ]
272 .contains(punct),
273 TokenKind::ParagraphBreak => true,
274 _ => false,
275 }
276 }
277
278 pub fn is_case_separator(&self) -> bool {
282 matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
283 || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
284 }
285
286 pub fn is_whitespace(&self) -> bool {
288 matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
289 }
290
291 pub fn is_upos(&self, upos: UPOS) -> bool {
292 let Some(Some(meta)) = self.as_word() else {
293 return false;
294 };
295
296 meta.pos_tag == Some(upos)
297 }
298
299 pub fn matches_variant_of(&self, other: &Self) -> bool {
304 self.with_default_data() == other.with_default_data()
305 }
306
307 pub fn with_default_data(&self) -> Self {
311 match self {
312 TokenKind::Word(_) => TokenKind::Word(Default::default()),
313 TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
314 TokenKind::Number(..) => TokenKind::Number(Default::default()),
315 TokenKind::Space(_) => TokenKind::Space(Default::default()),
316 TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
317 _ => self.clone(),
318 }
319 }
320
321 pub fn blank_word() -> Self {
323 Self::Word(None)
324 }
325
326 pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
329 self.as_mut_punctuation()?.as_mut_quote()
330 }
331
332 pub fn as_quote(&self) -> Option<&Quote> {
333 self.as_punctuation()?.as_quote()
334 }
335}
336
#[cfg(test)]
mod tests {
    use crate::Document;

    // Each test builds a document from a short text with the curated
    // dictionary and inspects the kind of the resulting token(s).

    #[test]
    fn car_is_singular_noun() {
        let doc = Document::new_plain_english_curated("car");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_singular_noun());
    }

    #[test]
    fn traffic_is_mass_noun_only() {
        let doc = Document::new_plain_english_curated("traffic");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_mass_noun_only());
    }

    #[test]
    fn equipment_is_mass_noun() {
        let doc = Document::new_plain_english_curated("equipment");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_mass_noun());
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        let doc = Document::new_plain_english_curated("equipment");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_non_countable_noun());
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        let doc = Document::new_plain_english_curated("equipment");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(!tk.is_countable_noun());
    }

    #[test]
    fn oov_word_is_oov() {
        let doc = Document::new_plain_english_curated("nonexistentword");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_oov());
    }

    #[test]
    fn known_word_is_not_oov() {
        let doc = Document::new_plain_english_curated("car");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(!tk.is_oov());
    }

    #[test]
    fn non_word_tokens_are_not_oov() {
        let doc = Document::new_plain_english_curated("Hello, world!");

        // Pick tokens by kind instead of by position: fixed indices silently
        // depend on whether whitespace is tokenized, and could end up
        // checking a word instead of the punctuation this test is about.
        let non_words: Vec<_> = doc.tokens().filter(|tok| !tok.kind.is_word()).collect();

        // Make sure the selection really contains punctuation (the comma is
        // the one mark this file has a dedicated predicate for).
        assert!(non_words.iter().any(|tok| tok.kind.is_comma()));
        assert!(non_words.iter().any(|tok| tok.kind.is_punctuation()));

        // No non-word token may be reported as out-of-vocabulary.
        assert!(non_words.iter().all(|tok| !tok.kind.is_oov()));
    }
}