1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{DictWordMetadata, Number, Punctuation, Quote, TokenKind::Word};
6
// Generates one `pub fn name(&self) -> bool` per identifier passed in.
//
// Each generated method forwards to the same-named method on the metadata
// held by `Word(Some(metadata))`; for any other value of `self` it yields
// `false`. `Word` is resolved at the invocation site, so the macro must be
// expanded where that name is in scope.
macro_rules! delegate_to_metadata {
    ($($method:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`DictWordMetadata::",
                stringify!($method),
                "`] when this token is a word.\n\n",
                "Returns `false` if the token is not a word."
            )]
            pub fn $method(&self) -> bool {
                match self {
                    Word(Some(metadata)) => metadata.$method(),
                    _ => false,
                }
            }
        )*
    };
}
27
28#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
33#[serde(tag = "kind", content = "value")]
34pub enum TokenKind {
35 Word(Option<DictWordMetadata>),
37 Punctuation(Punctuation),
38 Decade,
39 Number(Number),
40 Space(usize),
42 Newline(usize),
44 EmailAddress,
45 Url,
46 Hostname,
47 #[default]
50 Unlintable,
51 ParagraphBreak,
52 Regexish,
53}
54
55impl TokenKind {
56 delegate_to_metadata! {
58 is_nominal,
60 is_noun,
61 is_pronoun,
62 is_proper_noun,
63 is_singular_nominal,
64 is_plural_nominal,
65 is_possessive_nominal,
66 is_non_plural_nominal,
67 is_singular_noun,
68 is_plural_noun,
69 is_non_plural_noun,
70 is_countable_noun,
71 is_non_countable_noun,
72 is_mass_noun,
73 is_mass_noun_only,
74 is_non_mass_noun,
75 is_singular_pronoun,
76 is_plural_pronoun,
77 is_non_plural_pronoun,
78 is_reflexive_pronoun,
79 is_personal_pronoun,
80 is_first_person_singular_pronoun,
81 is_first_person_plural_pronoun,
82 is_second_person_pronoun,
83 is_third_person_pronoun,
84 is_third_person_singular_pronoun,
85 is_third_person_plural_pronoun,
86 is_subject_pronoun,
87 is_object_pronoun,
88 is_possessive_noun,
89 is_possessive_pronoun,
90
91 is_verb,
93 is_auxiliary_verb,
94 is_linking_verb,
95 is_verb_lemma,
96 is_verb_simple_past_form,
97 is_verb_progressive_form,
98 is_verb_third_person_singular_present_form,
99
100 is_adjective,
102 is_comparative_adjective,
103 is_superlative_adjective,
104 is_positive_adjective,
105
106 is_adverb,
108
109 is_determiner,
111 is_demonstrative_determiner,
112 is_possessive_determiner,
113 is_quantifier,
114 is_non_quantifier_determiner,
115
116 is_conjunction,
118
119 is_swear,
121 is_likely_homograph,
122
123 is_lowercase,
125 is_titlecase,
126 is_allcaps,
127 is_lower_camel,
128 is_upper_camel,
129 is_apostrophized,
130
131 is_roman_numerals
132 }
133
134 pub fn is_preposition(&self) -> bool {
136 let Word(Some(metadata)) = self else {
137 return false;
138 };
139 metadata.preposition
140 }
141
142 pub fn is_common_word(&self) -> bool {
145 let Word(Some(metadata)) = self else {
146 return true;
147 };
148 metadata.common
149 }
150
151 pub fn is_np_member(&self) -> bool {
153 let Word(Some(metadata)) = self else {
154 return false;
155 };
156 metadata.np_member.unwrap_or(false)
157 }
158
159 pub fn is_oov(&self) -> bool {
164 matches!(self, TokenKind::Word(None))
165 }
166
167 pub fn is_open_square(&self) -> bool {
170 matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
171 }
172
173 pub fn is_close_square(&self) -> bool {
174 matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
175 }
176
177 pub fn is_open_round(&self) -> bool {
178 matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
179 }
180
181 pub fn is_close_round(&self) -> bool {
182 matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
183 }
184
185 pub fn is_pipe(&self) -> bool {
186 matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
187 }
188
189 pub fn is_currency(&self) -> bool {
190 matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
191 }
192
193 pub fn is_ellipsis(&self) -> bool {
194 matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
195 }
196
197 pub fn is_hyphen(&self) -> bool {
198 matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
199 }
200
201 pub fn is_quote(&self) -> bool {
202 matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
203 }
204
205 pub fn is_apostrophe(&self) -> bool {
206 matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
207 }
208
209 pub fn is_period(&self) -> bool {
210 matches!(self, TokenKind::Punctuation(Punctuation::Period))
211 }
212
213 pub fn is_at(&self) -> bool {
214 matches!(self, TokenKind::Punctuation(Punctuation::At))
215 }
216
217 pub fn is_comma(&self) -> bool {
218 matches!(self, TokenKind::Punctuation(Punctuation::Comma))
219 }
220
221 pub fn is_semicolon(&self) -> bool {
222 matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
223 }
224
225 pub fn is_ampersand(&self) -> bool {
226 matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
227 }
228
229 pub fn is_slash(&self) -> bool {
230 matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
231 }
232
233 pub fn is_word_like(&self) -> bool {
238 matches!(
239 self,
240 TokenKind::Word(..)
241 | TokenKind::EmailAddress
242 | TokenKind::Hostname
243 | TokenKind::Decade
244 | TokenKind::Number(..)
245 )
246 }
247
248 pub(crate) fn is_chunk_terminator(&self) -> bool {
249 if self.is_sentence_terminator() {
250 return true;
251 }
252
253 match self {
254 TokenKind::Punctuation(punct) => {
255 matches!(
256 punct,
257 Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
258 )
259 }
260 _ => false,
261 }
262 }
263
264 pub(crate) fn is_sentence_terminator(&self) -> bool {
265 match self {
266 TokenKind::Punctuation(punct) => [
267 Punctuation::Period,
268 Punctuation::Bang,
269 Punctuation::Question,
270 ]
271 .contains(punct),
272 TokenKind::ParagraphBreak => true,
273 _ => false,
274 }
275 }
276
277 pub fn is_case_separator(&self) -> bool {
281 matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
282 || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
283 }
284
285 pub fn is_whitespace(&self) -> bool {
287 matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
288 }
289
290 pub fn is_upos(&self, upos: UPOS) -> bool {
291 let Some(Some(meta)) = self.as_word() else {
292 return false;
293 };
294
295 meta.pos_tag == Some(upos)
296 }
297
298 pub fn matches_variant_of(&self, other: &Self) -> bool {
303 self.with_default_data() == other.with_default_data()
304 }
305
306 pub fn with_default_data(&self) -> Self {
310 match self {
311 TokenKind::Word(_) => TokenKind::Word(Default::default()),
312 TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
313 TokenKind::Number(..) => TokenKind::Number(Default::default()),
314 TokenKind::Space(_) => TokenKind::Space(Default::default()),
315 TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
316 _ => self.clone(),
317 }
318 }
319
320 pub fn blank_word() -> Self {
322 Self::Word(None)
323 }
324
325 pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
328 self.as_mut_punctuation()?.as_mut_quote()
329 }
330
331 pub fn as_quote(&self) -> Option<&Quote> {
332 self.as_punctuation()?.as_quote()
333 }
334}
335
#[cfg(test)]
mod tests {
    use super::TokenKind;
    use crate::Document;

    /// Parses `text` with the curated plain-English parser and returns the
    /// kind of its first token.
    fn first_kind(text: &str) -> TokenKind {
        let doc = Document::new_plain_english_curated(text);
        let kind = doc
            .tokens()
            .next()
            .expect("document should contain at least one token")
            .kind
            .clone();
        kind
    }

    #[test]
    fn car_is_singular_noun() {
        assert!(first_kind("car").is_singular_noun());
    }

    #[test]
    fn traffic_is_mass_noun_only() {
        assert!(first_kind("traffic").is_mass_noun_only());
    }

    #[test]
    fn equipment_is_mass_noun() {
        assert!(first_kind("equipment").is_mass_noun());
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        assert!(first_kind("equipment").is_non_countable_noun());
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        assert!(!first_kind("equipment").is_countable_noun());
    }

    #[test]
    fn oov_word_is_oov() {
        assert!(first_kind("nonexistentword").is_oov());
    }

    #[test]
    fn known_word_is_not_oov() {
        assert!(!first_kind("car").is_oov());
    }

    #[test]
    fn non_word_tokens_are_not_oov() {
        let doc = Document::new_plain_english_curated("Hello, world!");

        // Select the non-word tokens by kind rather than by position. A fixed
        // index is easy to get wrong: index 3 of "Hello, world!" is the word
        // "world", not the trailing "!".
        let non_words: Vec<_> = doc.tokens().filter(|tok| !tok.kind.is_word()).collect();

        assert!(
            !non_words.is_empty(),
            "expected punctuation and whitespace tokens in the sample text"
        );
        for tok in non_words {
            assert!(
                !tok.kind.is_oov(),
                "non-word token reported as OOV: {:?}",
                tok.kind
            );
        }
    }
}