1use logos::Logos;
18use std::fmt;
19
/// Lexical tokens for the indentation-aware lexer.
///
/// Most variants are matched directly from input text via the `logos`
/// `#[token]`/`#[regex]` attributes. `Indent` and `Dedent` carry no lexer
/// attributes: they are synthesized after raw lexing (presumably by an
/// indentation-tracking pass in `crate::lex::lexing` — confirm) and hold the
/// spanned tokens of the enclosed region.
#[derive(Logos, Debug, PartialEq, Eq, Hash, Clone, serde::Serialize, serde::Deserialize)]
pub enum Token {
    /// The literal `::` marker.
    #[token("::")]
    LexMarker,

    /// One unit of indentation: exactly four spaces or a single tab.
    /// `priority = 3` makes this beat the shorter `Whitespace` regex when
    /// four or more spaces are available.
    #[regex(r" {4}|\t", priority = 3)]
    Indentation,

    /// Synthetic token opening an indented region; payload is the
    /// `(token, byte-range)` pairs lexed at the deeper level.
    /// Never produced by the raw `logos` lexer itself.
    Indent(Vec<(Token, std::ops::Range<usize>)>),
    /// Synthetic counterpart of [`Token::Indent`], closing an indented region.
    Dedent(Vec<(Token, std::ops::Range<usize>)>),

    /// A newline; the matched text is stored verbatim.
    /// NOTE(review): the regex matches every `\n`, not only empty lines —
    /// the "blank line" meaning is presumably assigned downstream; confirm.
    #[regex(r"\n", |lex| Some(lex.slice().to_owned()))]
    BlankLine(Option<String>),

    /// A run of one to three spaces (four lex as `Indentation`); stores the
    /// run length. Lower priority than `Indentation`.
    #[regex(r" {1,3}", |lex| Some(lex.slice().len()), priority = 1)]
    Whitespace(usize),

    // Structural punctuation (also usable as sequence markers).
    #[token("-")]
    Dash,
    #[token(".")]
    Period,
    #[token("(")]
    OpenParen,
    #[token(")")]
    CloseParen,
    #[token(":")]
    Colon,

    // Sentence/clause-ending punctuation across multiple scripts.
    #[token("!")]
    ExclamationMark,
    #[token("?")]
    QuestionMark,
    #[token(";")]
    Semicolon,
    #[token("¡")]
    InvertedExclamationMark,
    #[token("¿")]
    InvertedQuestionMark,
    #[token("…")]
    Ellipsis,
    #[token("。")]
    IdeographicFullStop,
    #[token("!")]
    FullwidthExclamationMark,
    #[token("?")]
    FullwidthQuestionMark,
    #[token("⁉")]
    ExclamationQuestionMark,
    #[token("⁈")]
    QuestionExclamationMark,
    #[token("؟")]
    ArabicQuestionMark,
    #[token("۔")]
    ArabicFullStop,
    #[token("؍")]
    ArabicTripleDot,
    #[token("،")]
    ArabicComma,
    #[token("।")]
    Danda,
    #[token("॥")]
    DoubleDanda,
    #[token("৷")]
    BengaliCurrencyNumeratorFour,
    #[token("።")]
    EthiopianFullStop,
    #[token("։")]
    ArmenianFullStop,
    #[token("།")]
    TibetanShad,
    #[token("๏")]
    ThaiFongman,
    #[token("၊")]
    MyanmarComma,
    #[token("။")]
    MyanmarFullStop,

    // Other single-character tokens.
    #[token(",")]
    Comma,
    #[token("\"")]
    Quote,
    #[token("=")]
    Equals,

    /// A run of ASCII digits, kept as the matched string.
    /// `priority = 2` resolves the overlap with the `Text` regex.
    #[regex(r"[0-9]+", |lex| lex.slice().to_owned(), priority = 2)]
    Number(String),

    /// Any run of characters not claimed by the tokens above: the character
    /// class excludes whitespace, digits, and every single-character token
    /// (`\n`/`\t` are redundant with `\s` but harmless).
    #[regex(r#"[^\s\n\t\-\.\(\):0-9,="!?;¡¿…。!?⁉⁈؟۔؍،।॥৷።։།๏၊။]+"#, |lex| lex.slice().to_owned())]
    Text(String),
}
135
136impl fmt::Display for Token {
137 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
138 let name = match self {
139 Token::LexMarker => "lex-marker",
140 Token::Indentation => "indentation",
141 Token::Indent(_) => "indent",
142 Token::Dedent(_) => "dedent",
143 Token::BlankLine(_) => "blank-line",
144 Token::Whitespace(_) => "whitespace",
145 Token::Dash => "dash",
146 Token::Period => "period",
147 Token::OpenParen => "open-paren",
148 Token::CloseParen => "close-paren",
149 Token::Colon => "colon",
150 Token::ExclamationMark => "exclamation-mark",
151 Token::QuestionMark => "question-mark",
152 Token::Semicolon => "semicolon",
153 Token::InvertedExclamationMark => "inverted-exclamation-mark",
154 Token::InvertedQuestionMark => "inverted-question-mark",
155 Token::Ellipsis => "ellipsis",
156 Token::IdeographicFullStop => "ideographic-full-stop",
157 Token::FullwidthExclamationMark => "fullwidth-exclamation-mark",
158 Token::FullwidthQuestionMark => "fullwidth-question-mark",
159 Token::ExclamationQuestionMark => "exclamation-question-mark",
160 Token::QuestionExclamationMark => "question-exclamation-mark",
161 Token::ArabicQuestionMark => "arabic-question-mark",
162 Token::ArabicFullStop => "arabic-full-stop",
163 Token::ArabicTripleDot => "arabic-triple-dot",
164 Token::ArabicComma => "arabic-comma",
165 Token::Danda => "danda",
166 Token::DoubleDanda => "double-danda",
167 Token::BengaliCurrencyNumeratorFour => "bengali-currency-numerator-four",
168 Token::EthiopianFullStop => "ethiopian-full-stop",
169 Token::ArmenianFullStop => "armenian-full-stop",
170 Token::TibetanShad => "tibetan-shad",
171 Token::ThaiFongman => "thai-fongman",
172 Token::MyanmarComma => "myanmar-comma",
173 Token::MyanmarFullStop => "myanmar-full-stop",
174 Token::Comma => "comma",
175 Token::Quote => "quote",
176 Token::Equals => "equals",
177 Token::Number(s) => return write!(f, "<number:{s}>"),
178 Token::Text(s) => return write!(f, "<text:{s}>"),
179 };
180 write!(f, "<{name}>")
181 }
182}
183
184impl Token {
185 pub fn simple_name(&self) -> &'static str {
187 match self {
188 Token::LexMarker => "LEX_MARKER",
189 Token::Indentation => "INDENTATION",
190 Token::Indent(_) => "INDENT",
191 Token::Dedent(_) => "DEDENT",
192 Token::BlankLine(_) => "BLANK_LINE",
193 Token::Whitespace(_) => "WHITESPACE",
194 Token::Dash => "DASH",
195 Token::Period => "PERIOD",
196 Token::OpenParen => "OPEN_PAREN",
197 Token::CloseParen => "CLOSE_PAREN",
198 Token::Colon => "COLON",
199 Token::ExclamationMark => "EXCLAMATION_MARK",
200 Token::QuestionMark => "QUESTION_MARK",
201 Token::Semicolon => "SEMICOLON",
202 Token::InvertedExclamationMark => "INVERTED_EXCLAMATION_MARK",
203 Token::InvertedQuestionMark => "INVERTED_QUESTION_MARK",
204 Token::Ellipsis => "ELLIPSIS",
205 Token::IdeographicFullStop => "IDEOGRAPHIC_FULL_STOP",
206 Token::FullwidthExclamationMark => "FULLWIDTH_EXCLAMATION_MARK",
207 Token::FullwidthQuestionMark => "FULLWIDTH_QUESTION_MARK",
208 Token::ExclamationQuestionMark => "EXCLAMATION_QUESTION_MARK",
209 Token::QuestionExclamationMark => "QUESTION_EXCLAMATION_MARK",
210 Token::ArabicQuestionMark => "ARABIC_QUESTION_MARK",
211 Token::ArabicFullStop => "ARABIC_FULL_STOP",
212 Token::ArabicTripleDot => "ARABIC_TRIPLE_DOT",
213 Token::ArabicComma => "ARABIC_COMMA",
214 Token::Danda => "DANDA",
215 Token::DoubleDanda => "DOUBLE_DANDA",
216 Token::BengaliCurrencyNumeratorFour => "BENGALI_CURRENCY_NUMERATOR_FOUR",
217 Token::EthiopianFullStop => "ETHIOPIAN_FULL_STOP",
218 Token::ArmenianFullStop => "ARMENIAN_FULL_STOP",
219 Token::TibetanShad => "TIBETAN_SHAD",
220 Token::ThaiFongman => "THAI_FONGMAN",
221 Token::MyanmarComma => "MYANMAR_COMMA",
222 Token::MyanmarFullStop => "MYANMAR_FULL_STOP",
223 Token::Comma => "COMMA",
224 Token::Quote => "QUOTE",
225 Token::Equals => "EQUALS",
226 Token::Number(_) => "NUMBER",
227 Token::Text(_) => "TEXT",
228 }
229 }
230
231 pub fn is_indent(&self) -> bool {
233 matches!(self, Token::Indentation)
234 }
235
236 pub fn is_indent_level(&self) -> bool {
238 matches!(self, Token::Indent(_))
239 }
240
241 pub fn is_dedent_level(&self) -> bool {
243 matches!(self, Token::Dedent(_))
244 }
245
246 pub fn is_whitespace(&self) -> bool {
248 matches!(
249 self,
250 Token::Indentation
251 | Token::Indent(_)
252 | Token::Dedent(_)
253 | Token::BlankLine(_)
254 | Token::Whitespace(_)
255 )
256 }
257
258 pub fn is_sequence_marker(&self) -> bool {
260 matches!(
261 self,
262 Token::Dash | Token::Period | Token::OpenParen | Token::CloseParen
263 )
264 }
265
266 pub fn is_number(&self) -> bool {
268 matches!(self, Token::Number(_))
269 }
270
271 pub fn is_text(&self) -> bool {
273 matches!(self, Token::Text(_))
274 }
275
276 pub fn is_end_punctuation(&self) -> bool {
277 matches!(
278 self,
279 Token::Period
280 | Token::ExclamationMark
281 | Token::QuestionMark
282 | Token::Semicolon
283 | Token::Comma
284 | Token::InvertedExclamationMark
285 | Token::InvertedQuestionMark
286 | Token::Ellipsis
287 | Token::IdeographicFullStop
288 | Token::FullwidthExclamationMark
289 | Token::FullwidthQuestionMark
290 | Token::ExclamationQuestionMark
291 | Token::QuestionExclamationMark
292 | Token::ArabicQuestionMark
293 | Token::ArabicFullStop
294 | Token::ArabicTripleDot
295 | Token::ArabicComma
296 | Token::Danda
297 | Token::DoubleDanda
298 | Token::BengaliCurrencyNumeratorFour
299 | Token::EthiopianFullStop
300 | Token::ArmenianFullStop
301 | Token::TibetanShad
302 | Token::ThaiFongman
303 | Token::MyanmarComma
304 | Token::MyanmarFullStop
305 )
306 }
307}
308
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lex::lexing::tokenize;

    /// Lex `input` and keep only the tokens, discarding the source spans.
    fn lex(input: &str) -> Vec<Token> {
        tokenize(input).into_iter().map(|(tok, _span)| tok).collect()
    }

    #[test]
    fn test_lex_marker() {
        assert_eq!(lex("::"), vec![Token::LexMarker]);
    }

    #[test]
    fn test_indentation_tokens() {
        // Four spaces and a tab each lex as one indentation unit.
        assert_eq!(lex("    "), vec![Token::Indentation]);
        assert_eq!(lex("\t"), vec![Token::Indentation]);
        // Eight spaces lex as two units.
        assert_eq!(lex("        "), vec![Token::Indentation, Token::Indentation]);
    }

    #[test]
    fn test_sequence_markers() {
        assert_eq!(
            lex("- . ( ) :"),
            vec![
                Token::Dash,
                Token::Whitespace(1),
                Token::Period,
                Token::Whitespace(1),
                Token::OpenParen,
                Token::Whitespace(1),
                Token::CloseParen,
                Token::Whitespace(1),
                Token::Colon
            ]
        );
    }

    #[test]
    fn test_text_tokens() {
        assert_eq!(
            lex("hello world"),
            vec![
                Token::Text("hello".to_string()),
                Token::Whitespace(1),
                Token::Text("world".to_string())
            ]
        );
    }

    #[test]
    fn test_mixed_content() {
        assert_eq!(
            lex("1. Hello world\n    - Item 1"),
            vec![
                Token::Number("1".to_string()),
                Token::Period,
                Token::Whitespace(1),
                Token::Text("Hello".to_string()),
                Token::Whitespace(1),
                Token::Text("world".to_string()),
                Token::BlankLine(Some("\n".to_string())),
                Token::Indentation,
                Token::Dash,
                Token::Whitespace(1),
                Token::Text("Item".to_string()),
                Token::Whitespace(1),
                Token::Number("1".to_string()),
            ]
        );
    }

    #[test]
    fn test_number_tokens() {
        assert_eq!(
            lex("123 456"),
            vec![
                Token::Number("123".to_string()),
                Token::Whitespace(1),
                Token::Number("456".to_string())
            ]
        );
    }

    #[test]
    fn test_token_predicates() {
        // Indentation family.
        assert!(Token::Indentation.is_indent());
        assert!(Token::Indent(Vec::new()).is_indent_level());
        assert!(Token::Dedent(Vec::new()).is_dedent_level());
        assert!(!Token::Text(String::new()).is_indent());

        // Whitespace covers all five whitespace-like variants.
        assert!(Token::Indentation.is_whitespace());
        assert!(Token::Indent(Vec::new()).is_whitespace());
        assert!(Token::Dedent(Vec::new()).is_whitespace());
        assert!(Token::BlankLine(Some(String::new())).is_whitespace());
        assert!(Token::Whitespace(1).is_whitespace());
        assert!(!Token::Text(String::new()).is_whitespace());

        // Sequence markers.
        assert!(Token::Dash.is_sequence_marker());
        assert!(Token::Period.is_sequence_marker());
        assert!(!Token::Text(String::new()).is_sequence_marker());
        assert!(!Token::Number(String::new()).is_sequence_marker());

        // Text vs. number discrimination.
        assert!(Token::Text(String::new()).is_text());
        assert!(!Token::Dash.is_text());
        assert!(!Token::Number(String::new()).is_text());

        assert!(Token::Number(String::new()).is_number());
        assert!(!Token::Text(String::new()).is_number());
        assert!(!Token::Dash.is_number());
    }
}
436}