1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{DictWordMetadata, Number, Punctuation, Quote, TokenKind::Word};
6
/// Generates public predicate methods that forward to the same-named method on
/// [`DictWordMetadata`].
///
/// Takes a comma-separated list of method names; a trailing comma is accepted. Every
/// generated method evaluates to `false` unless the receiver is `Word(Some(_))`.
macro_rules! delegate_to_metadata {
    ($($predicate:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`DictWordMetadata::",
                stringify!($predicate),
                "`] when this token is a word.\n\n",
                "Returns `false` if the token is not a word."
            )]
            pub fn $predicate(&self) -> bool {
                match self {
                    // Only a word that carries metadata can answer the question.
                    Word(Some(metadata)) => metadata.$predicate(),
                    _ => false,
                }
            }
        )*
    };
}
27
/// The kind of a token, together with whatever data is attached to that kind.
///
/// Serde represents this enum in adjacently tagged form: the variant name is stored
/// under `"kind"` and the payload under `"value"`.
///
/// Variant order is significant because `PartialOrd` is derived.
#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
#[serde(tag = "kind", content = "value")]
pub enum TokenKind {
    /// A word. The payload is `None` when no metadata is attached; that is the case
    /// reported by [`TokenKind::is_oov`].
    Word(Option<DictWordMetadata>),
    /// A punctuation mark; the payload says which one.
    Punctuation(Punctuation),
    /// A decade. Counted as word-like by [`TokenKind::is_word_like`].
    // NOTE(review): presumably text such as "1990s" — confirm against the lexer.
    Decade,
    /// A number. Counted as word-like by [`TokenKind::is_word_like`].
    Number(Number),
    /// Whitespace made of spaces.
    // NOTE(review): the payload is presumably the number of spaces — confirm.
    Space(usize),
    /// Whitespace made of newlines.
    // NOTE(review): the payload is presumably the number of newlines — confirm.
    Newline(usize),
    /// An email address. Counted as word-like by [`TokenKind::is_word_like`].
    EmailAddress,
    /// A URL. Unlike [`TokenKind::Hostname`], it is not counted as word-like.
    Url,
    /// A hostname. Counted as word-like by [`TokenKind::is_word_like`].
    Hostname,
    /// The default variant.
    // NOTE(review): presumably marks content that linters should skip — confirm.
    #[default]
    Unlintable,
    /// A paragraph break. Treated as a sentence terminator by `is_sentence_terminator`.
    ParagraphBreak,
    // NOTE(review): meaning not visible in this file — document once confirmed.
    Regexish,
}
54
55impl TokenKind {
56 delegate_to_metadata! {
58 is_nominal,
60 is_noun,
61 is_pronoun,
62 is_proper_noun,
63 is_singular_nominal,
64 is_plural_nominal,
65 is_possessive_nominal,
66 is_non_plural_nominal,
67 is_singular_noun,
68 is_plural_noun,
69 is_non_plural_noun,
70 is_countable_noun,
71 is_non_countable_noun,
72 is_mass_noun,
73 is_mass_noun_only,
74 is_non_mass_noun,
75 is_singular_pronoun,
76 is_plural_pronoun,
77 is_non_plural_pronoun,
78 is_reflexive_pronoun,
79 is_personal_pronoun,
80 is_first_person_singular_pronoun,
81 is_first_person_plural_pronoun,
82 is_second_person_pronoun,
83 is_third_person_pronoun,
84 is_third_person_singular_pronoun,
85 is_third_person_plural_pronoun,
86 is_subject_pronoun,
87 is_object_pronoun,
88 is_possessive_noun,
89 is_possessive_pronoun,
90
91 is_verb,
93 is_auxiliary_verb,
94 is_linking_verb,
95 is_verb_lemma,
96 is_verb_past_form,
97 is_verb_simple_past_form,
98 is_verb_past_participle_form,
99 is_verb_progressive_form,
100 is_verb_third_person_singular_present_form,
101
102 is_adjective,
104 is_comparative_adjective,
105 is_superlative_adjective,
106 is_positive_adjective,
107
108 is_adverb,
110 is_manner_adverb,
111 is_frequency_adverb,
112 is_degree_adverb,
113
114 is_determiner,
116 is_demonstrative_determiner,
117 is_possessive_determiner,
118 is_quantifier,
119 is_non_quantifier_determiner,
120 is_non_demonstrative_determiner,
121
122 is_conjunction,
124
125 is_swear,
127 is_likely_homograph,
128
129 is_lowercase,
131 is_titlecase,
132 is_allcaps,
133 is_lower_camel,
134 is_upper_camel,
135 is_apostrophized,
136
137 is_roman_numerals
138 }
139
140 pub fn is_preposition(&self) -> bool {
142 let Word(Some(metadata)) = self else {
143 return false;
144 };
145 metadata.preposition
146 }
147
148 pub fn is_common_word(&self) -> bool {
151 let Word(Some(metadata)) = self else {
152 return true;
153 };
154 metadata.common
155 }
156
157 pub fn is_np_member(&self) -> bool {
159 let Word(Some(metadata)) = self else {
160 return false;
161 };
162 metadata.np_member.unwrap_or(false)
163 }
164
165 pub fn is_oov(&self) -> bool {
170 matches!(self, TokenKind::Word(None))
171 }
172
173 pub fn is_open_square(&self) -> bool {
176 matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
177 }
178
179 pub fn is_close_square(&self) -> bool {
180 matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
181 }
182
183 pub fn is_open_round(&self) -> bool {
184 matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
185 }
186
187 pub fn is_close_round(&self) -> bool {
188 matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
189 }
190
191 pub fn is_pipe(&self) -> bool {
192 matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
193 }
194
195 pub fn is_currency(&self) -> bool {
196 matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
197 }
198
199 pub fn is_ellipsis(&self) -> bool {
200 matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
201 }
202
203 pub fn is_hyphen(&self) -> bool {
204 matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
205 }
206
207 pub fn is_quote(&self) -> bool {
208 matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
209 }
210
211 pub fn is_apostrophe(&self) -> bool {
212 matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
213 }
214
215 pub fn is_period(&self) -> bool {
216 matches!(self, TokenKind::Punctuation(Punctuation::Period))
217 }
218
219 pub fn is_at(&self) -> bool {
220 matches!(self, TokenKind::Punctuation(Punctuation::At))
221 }
222
223 pub fn is_comma(&self) -> bool {
224 matches!(self, TokenKind::Punctuation(Punctuation::Comma))
225 }
226
227 pub fn is_semicolon(&self) -> bool {
228 matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
229 }
230
231 pub fn is_ampersand(&self) -> bool {
232 matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
233 }
234
235 pub fn is_slash(&self) -> bool {
236 matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
237 }
238
239 pub fn is_word_like(&self) -> bool {
244 matches!(
245 self,
246 TokenKind::Word(..)
247 | TokenKind::EmailAddress
248 | TokenKind::Hostname
249 | TokenKind::Decade
250 | TokenKind::Number(..)
251 )
252 }
253
254 pub(crate) fn is_chunk_terminator(&self) -> bool {
255 if self.is_sentence_terminator() {
256 return true;
257 }
258
259 match self {
260 TokenKind::Punctuation(punct) => {
261 matches!(
262 punct,
263 Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
264 )
265 }
266 _ => false,
267 }
268 }
269
270 pub(crate) fn is_sentence_terminator(&self) -> bool {
271 match self {
272 TokenKind::Punctuation(punct) => [
273 Punctuation::Period,
274 Punctuation::Bang,
275 Punctuation::Question,
276 ]
277 .contains(punct),
278 TokenKind::ParagraphBreak => true,
279 _ => false,
280 }
281 }
282
283 pub fn is_case_separator(&self) -> bool {
287 matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
288 || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
289 }
290
291 pub fn is_whitespace(&self) -> bool {
293 matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
294 }
295
296 pub fn is_upos(&self, upos: UPOS) -> bool {
297 let Some(Some(meta)) = self.as_word() else {
298 return false;
299 };
300
301 meta.pos_tag == Some(upos)
302 }
303
304 pub fn matches_variant_of(&self, other: &Self) -> bool {
309 self.with_default_data() == other.with_default_data()
310 }
311
312 pub fn with_default_data(&self) -> Self {
316 match self {
317 TokenKind::Word(_) => TokenKind::Word(Default::default()),
318 TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
319 TokenKind::Number(..) => TokenKind::Number(Default::default()),
320 TokenKind::Space(_) => TokenKind::Space(Default::default()),
321 TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
322 _ => self.clone(),
323 }
324 }
325
326 pub fn blank_word() -> Self {
328 Self::Word(None)
329 }
330
331 pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
334 self.as_mut_punctuation()?.as_mut_quote()
335 }
336
337 pub fn as_quote(&self) -> Option<&Quote> {
338 self.as_punctuation()?.as_quote()
339 }
340}
341
#[cfg(test)]
mod tests {
    use crate::{Document, TokenKind};

    /// Builds a document from `text` with `Document::new_plain_english_curated` and
    /// applies `predicate` to the kind of its first token.
    ///
    /// Panics if the document yields no tokens.
    fn first_kind_satisfies(text: &str, predicate: impl FnOnce(&TokenKind) -> bool) -> bool {
        let doc = Document::new_plain_english_curated(text);
        let first = doc
            .tokens()
            .next()
            .expect("the document should contain at least one token");
        predicate(&first.kind)
    }

    #[test]
    fn car_is_singular_noun() {
        assert!(first_kind_satisfies("car", TokenKind::is_singular_noun));
    }

    #[test]
    fn traffic_is_mass_noun_only() {
        assert!(first_kind_satisfies("traffic", TokenKind::is_mass_noun_only));
    }

    #[test]
    fn equipment_is_mass_noun() {
        assert!(first_kind_satisfies("equipment", TokenKind::is_mass_noun));
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        assert!(first_kind_satisfies(
            "equipment",
            TokenKind::is_non_countable_noun
        ));
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        assert!(first_kind_satisfies("equipment", |kind| !kind
            .is_countable_noun()));
    }

    #[test]
    fn oov_word_is_oov() {
        assert!(first_kind_satisfies("nonexistentword", TokenKind::is_oov));
    }

    #[test]
    fn known_word_is_not_oov() {
        assert!(first_kind_satisfies("car", |kind| !kind.is_oov()));
    }

    #[test]
    fn non_word_tokens_are_not_oov() {
        let doc = Document::new_plain_english_curated("Hello, world!");

        // Select the non-word tokens by variant rather than by position, so the test
        // keeps checking punctuation and whitespace even if tokenization shifts indices.
        let non_words: Vec<_> = doc
            .tokens()
            .filter(|token| !matches!(token.kind, TokenKind::Word(_)))
            .collect();

        // Guard against a vacuous pass: the input contains punctuation and a space.
        assert!(!non_words.is_empty());
        assert!(non_words.iter().all(|token| !token.kind.is_oov()));
    }
}