1use harper_brill::UPOS;
2use is_macro::Is;
3use serde::{Deserialize, Serialize};
4
5use crate::{Number, Punctuation, Quote, TokenKind::Word, WordMetadata};
6
/// Generates boolean predicate methods that forward to the method of the same
/// name on the metadata carried by a `Word` token.
///
/// Takes a comma-separated list of method identifiers (a trailing comma is
/// accepted). For each identifier a `pub fn <name>(&self) -> bool` is emitted
/// that:
///
/// * returns `metadata.<name>()` when `self` is `Word(Some(metadata))`;
/// * returns `false` for every other value, including `Word(None)`.
///
/// The macro must be invoked inside an `impl` block, with `Word` resolvable at
/// the call site.
macro_rules! delegate_to_metadata {
    ($($predicate:ident),* $(,)?) => {
        $(
            pub fn $predicate(&self) -> bool {
                match self {
                    Word(Some(metadata)) => metadata.$predicate(),
                    // Non-words and words without metadata never satisfy a
                    // metadata predicate.
                    _ => false,
                }
            }
        )*
    };
}
19
/// The category of a token, together with any data specific to that category.
///
/// Serialized by serde in adjacently tagged form: the variant name goes under
/// the `kind` key and its payload (if any) under the `value` key.
///
/// The `Is` derive supplies the per-variant accessors used elsewhere in this
/// file (for example `as_word`, `as_punctuation` and `as_mut_punctuation`).
#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
#[serde(tag = "kind", content = "value")]
pub enum TokenKind {
    /// A word, optionally carrying its metadata. `None` means no metadata is
    /// attached (see `TokenKind::blank_word`).
    Word(Option<WordMetadata>),
    /// A punctuation mark; the payload says which one.
    Punctuation(Punctuation),
    /// NOTE(review): presumably a decade expression such as "1990s" — confirm
    /// against the lexer.
    Decade,
    /// A number, with its parsed representation.
    Number(Number),
    /// A run of spaces. NOTE(review): the `usize` is presumably the number of
    /// consecutive space characters — confirm against the lexer.
    Space(usize),
    /// A run of newlines. NOTE(review): the `usize` is presumably the number of
    /// consecutive newline characters — confirm against the lexer.
    Newline(usize),
    /// An e-mail address.
    EmailAddress,
    /// A URL.
    Url,
    /// A hostname.
    Hostname,
    /// The default variant. NOTE(review): presumably marks content that
    /// linters should skip — confirm against the parsers that emit it.
    #[default]
    Unlintable,
    /// A paragraph break. Treated as a sentence terminator by
    /// `TokenKind::is_sentence_terminator`.
    ParagraphBreak,
    /// NOTE(review): presumably a regex-like fragment — confirm against the
    /// lexer.
    Regexish,
}
42
43impl TokenKind {
44 delegate_to_metadata! {
46 is_nominal,
48 is_noun,
49 is_pronoun,
50 is_proper_noun,
51 is_singular_nominal,
52 is_plural_nominal,
53 is_possessive_nominal,
54 is_non_plural_nominal,
55 is_singular_noun,
56 is_plural_noun,
57 is_non_plural_noun,
58 is_countable_noun,
59 is_mass_noun,
60 is_singular_pronoun,
61 is_plural_pronoun,
62 is_non_plural_pronoun,
63 is_reflexive_pronoun,
64 is_first_person_singular_pronoun,
65 is_first_person_plural_pronoun,
66 is_second_person_pronoun,
67 is_third_person_pronoun,
68 is_third_person_singular_pronoun,
69 is_third_person_plural_pronoun,
70 is_object_pronoun,
71 is_possessive_noun,
72 is_possessive_pronoun,
73
74 is_verb,
76 is_auxiliary_verb,
77 is_linking_verb,
78 is_verb_lemma,
79 is_verb_past_form,
80 is_verb_progressive_form,
81 is_verb_third_person_singular_present_form,
82
83 is_adjective,
85
86 is_adverb,
88
89 is_determiner,
91 is_demonstrative_determiner,
92 is_possessive_determiner,
93
94 is_conjunction
96 }
97
98 pub fn is_preposition(&self) -> bool {
100 let Word(Some(metadata)) = self else {
101 return false;
102 };
103 metadata.preposition
104 }
105
106 pub fn is_swear(&self) -> bool {
107 let Word(Some(metadata)) = self else {
108 return false;
109 };
110 metadata.is_swear()
111 }
112
113 pub fn is_common_word(&self) -> bool {
114 let Word(Some(metadata)) = self else {
115 return true;
116 };
117 metadata.common
118 }
119
120 pub fn is_likely_homograph(&self) -> bool {
121 let Word(Some(metadata)) = self else {
122 return false;
123 };
124 metadata.is_likely_homograph()
125 }
126
127 pub fn is_open_square(&self) -> bool {
130 matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
131 }
132
133 pub fn is_close_square(&self) -> bool {
134 matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
135 }
136
137 pub fn is_pipe(&self) -> bool {
138 matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
139 }
140
141 pub fn is_currency(&self) -> bool {
142 matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
143 }
144
145 pub fn is_ellipsis(&self) -> bool {
146 matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
147 }
148
149 pub fn is_hyphen(&self) -> bool {
150 matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
151 }
152
153 pub fn is_quote(&self) -> bool {
154 matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
155 }
156
157 pub fn is_apostrophe(&self) -> bool {
158 matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
159 }
160
161 pub fn is_period(&self) -> bool {
162 matches!(self, TokenKind::Punctuation(Punctuation::Period))
163 }
164
165 pub fn is_at(&self) -> bool {
166 matches!(self, TokenKind::Punctuation(Punctuation::At))
167 }
168
169 pub fn is_comma(&self) -> bool {
170 matches!(self, TokenKind::Punctuation(Punctuation::Comma))
171 }
172
173 pub fn is_word_like(&self) -> bool {
178 matches!(
179 self,
180 TokenKind::Word(..)
181 | TokenKind::EmailAddress
182 | TokenKind::Hostname
183 | TokenKind::Decade
184 | TokenKind::Number(..)
185 )
186 }
187
188 pub(crate) fn is_chunk_terminator(&self) -> bool {
189 if self.is_sentence_terminator() {
190 return true;
191 }
192
193 match self {
194 TokenKind::Punctuation(punct) => {
195 matches!(
196 punct,
197 Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
198 )
199 }
200 _ => false,
201 }
202 }
203
204 pub(crate) fn is_sentence_terminator(&self) -> bool {
205 match self {
206 TokenKind::Punctuation(punct) => [
207 Punctuation::Period,
208 Punctuation::Bang,
209 Punctuation::Question,
210 ]
211 .contains(punct),
212 TokenKind::ParagraphBreak => true,
213 _ => false,
214 }
215 }
216
217 pub fn is_case_separator(&self) -> bool {
221 matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
222 || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
223 }
224
225 pub fn is_whitespace(&self) -> bool {
227 matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
228 }
229
230 pub fn is_upos(&self, upos: UPOS) -> bool {
231 let Some(Some(meta)) = self.as_word() else {
232 return false;
233 };
234
235 meta.pos_tag == Some(upos)
236 }
237
238 pub fn matches_variant_of(&self, other: &Self) -> bool {
243 self.with_default_data() == other.with_default_data()
244 }
245
246 pub fn with_default_data(&self) -> Self {
250 match self {
251 TokenKind::Word(_) => TokenKind::Word(Default::default()),
252 TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
253 TokenKind::Number(..) => TokenKind::Number(Default::default()),
254 TokenKind::Space(_) => TokenKind::Space(Default::default()),
255 TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
256 _ => self.clone(),
257 }
258 }
259
260 pub fn blank_word() -> Self {
262 Self::Word(None)
263 }
264
265 pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
268 self.as_mut_punctuation()?.as_mut_quote()
269 }
270
271 pub fn as_quote(&self) -> Option<&Quote> {
272 self.as_punctuation()?.as_quote()
273 }
274}
275
#[cfg(test)]
mod tests {
    use crate::Document;

    /// The first token of "car" must report itself as a singular noun.
    #[test]
    fn car_is_singular_noun() {
        let document = Document::new_plain_english_curated("car");
        let first_token = document.tokens().next().unwrap();
        assert!(first_token.kind.is_singular_noun());
    }

    /// The first token of "traffic" must report itself as a mass noun.
    #[test]
    fn traffic_is_mass_noun() {
        let document = Document::new_plain_english_curated("traffic");
        let first_token = document.tokens().next().unwrap();
        assert!(first_token.kind.is_mass_noun());
    }
}