1use logos::Logos;
7use std::fmt;
8use std::ops::Range;
9
/// A half-open byte range (`start..end`) locating a token in the source text.
///
/// Convertible to and from [`std::ops::Range<usize>`] via the `From` impls
/// below, so it can be used directly for slicing the source string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    // Byte offset of the first byte of the token.
    pub start: usize,
    // Byte offset one past the last byte of the token (exclusive).
    pub end: usize,
}
18
19impl From<Range<usize>> for Span {
20 fn from(range: Range<usize>) -> Self {
21 Self {
22 start: range.start,
23 end: range.end,
24 }
25 }
26}
27
28impl From<Span> for Range<usize> {
29 fn from(span: Span) -> Self {
30 span.start..span.end
31 }
32}
33
34#[derive(Logos, Debug, Clone, PartialEq, Eq)]
36#[logos(skip r"[ \t]+")] pub enum Token<'src> {
38 #[regex(r"\d{4}[-/]\d{1,2}[-/]\d{1,2}")]
42 Date(&'src str),
43
44 #[regex(r"(\d{1,3}(,\d{3})*|\d+)(\.\d*)?")]
51 Number(&'src str),
52
53 #[regex(r#""([^"\\]|\\.)*""#)]
56 String(&'src str),
57
58 #[regex(r"[A-Z][\p{L}0-9-]*(:([A-Z0-9][\p{L}0-9-]*)+)+")]
75 Account(&'src str),
76
77 #[regex(r"/[A-Z][A-Z0-9'._-]*|[A-Z][A-Z0-9'._-]*", priority = 3)]
85 Currency(&'src str),
86
87 #[regex(r"#[a-zA-Z0-9-_/.]+")]
89 Tag(&'src str),
90
91 #[regex(r"\^[a-zA-Z0-9-_/.]+")]
93 Link(&'src str),
94
95 #[token("txn")]
99 Txn,
100 #[token("balance")]
102 Balance,
103 #[token("open")]
105 Open,
106 #[token("close")]
108 Close,
109 #[token("commodity")]
111 Commodity,
112 #[token("pad")]
114 Pad,
115 #[token("event")]
117 Event,
118 #[token("query")]
120 Query,
121 #[token("note")]
123 Note,
124 #[token("document")]
126 Document,
127 #[token("price")]
129 Price,
130 #[token("custom")]
132 Custom,
133 #[token("option")]
135 Option_,
136 #[token("include")]
138 Include,
139 #[token("plugin")]
141 Plugin,
142 #[token("pushtag")]
144 Pushtag,
145 #[token("poptag")]
147 Poptag,
148 #[token("pushmeta")]
150 Pushmeta,
151 #[token("popmeta")]
153 Popmeta,
154 #[token("TRUE")]
156 #[token("True")]
157 #[token("true")]
158 True,
159 #[token("FALSE")]
161 #[token("False")]
162 #[token("false")]
163 False,
164 #[token("NULL")]
166 Null,
167
168 #[token("{{")]
172 LDoubleBrace,
173 #[token("}}")]
175 RDoubleBrace,
176 #[token("{#")]
178 LBraceHash,
179 #[token("{")]
181 LBrace,
182 #[token("}")]
184 RBrace,
185 #[token("(")]
187 LParen,
188 #[token(")")]
190 RParen,
191 #[token("@@")]
193 AtAt,
194 #[token("@")]
196 At,
197 #[token(":")]
199 Colon,
200 #[token(",")]
202 Comma,
203 #[token("~")]
205 Tilde,
206 #[token("|")]
208 Pipe,
209 #[token("+")]
211 Plus,
212 #[token("-")]
214 Minus,
215 #[token("*")]
217 Star,
218 #[token("/")]
220 Slash,
221
222 #[token("!")]
225 Pending,
226
227 #[regex(r"[PSTCURM?&]")]
230 Flag(&'src str),
231
232 #[regex(r"\r?\n")]
235 Newline,
236
237 #[regex(r";[^\n\r]*", allow_greedy = true)]
240 Comment(&'src str),
241
242 #[token("#")]
247 Hash,
248
249 #[regex(r"%[^\n\r]*", allow_greedy = true)]
252 PercentComment(&'src str),
253
254 #[regex(r"#![^\n\r]*", allow_greedy = true)]
257 Shebang(&'src str),
258
259 #[regex(r"#\+[^\n\r]*", allow_greedy = true)]
262 EmacsDirective(&'src str),
263
264 #[regex(r"[a-z][a-zA-Z0-9_-]*:")]
269 MetaKey(&'src str),
270
271 Indent(usize),
275
276 DeepIndent(usize),
278
279 Error(&'src str),
282}
283
284impl Token<'_> {
285 pub const fn is_txn_flag(&self) -> bool {
288 match self {
289 Self::Star | Self::Pending | Self::Flag(_) | Self::Hash => true,
290 Self::Currency(s) => s.len() == 1,
292 _ => false,
293 }
294 }
295
296 pub const fn is_directive_keyword(&self) -> bool {
298 matches!(
299 self,
300 Self::Txn
301 | Self::Balance
302 | Self::Open
303 | Self::Close
304 | Self::Commodity
305 | Self::Pad
306 | Self::Event
307 | Self::Query
308 | Self::Note
309 | Self::Document
310 | Self::Price
311 | Self::Custom
312 | Self::Option_
313 | Self::Include
314 | Self::Plugin
315 | Self::Pushtag
316 | Self::Poptag
317 | Self::Pushmeta
318 | Self::Popmeta
319 )
320 }
321}
322
323impl fmt::Display for Token<'_> {
324 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
325 match self {
326 Self::Date(s) => write!(f, "{s}"),
327 Self::Number(s) => write!(f, "{s}"),
328 Self::String(s) => write!(f, "{s}"),
329 Self::Account(s) => write!(f, "{s}"),
330 Self::Currency(s) => write!(f, "{s}"),
331 Self::Tag(s) => write!(f, "{s}"),
332 Self::Link(s) => write!(f, "{s}"),
333 Self::Txn => write!(f, "txn"),
334 Self::Balance => write!(f, "balance"),
335 Self::Open => write!(f, "open"),
336 Self::Close => write!(f, "close"),
337 Self::Commodity => write!(f, "commodity"),
338 Self::Pad => write!(f, "pad"),
339 Self::Event => write!(f, "event"),
340 Self::Query => write!(f, "query"),
341 Self::Note => write!(f, "note"),
342 Self::Document => write!(f, "document"),
343 Self::Price => write!(f, "price"),
344 Self::Custom => write!(f, "custom"),
345 Self::Option_ => write!(f, "option"),
346 Self::Include => write!(f, "include"),
347 Self::Plugin => write!(f, "plugin"),
348 Self::Pushtag => write!(f, "pushtag"),
349 Self::Poptag => write!(f, "poptag"),
350 Self::Pushmeta => write!(f, "pushmeta"),
351 Self::Popmeta => write!(f, "popmeta"),
352 Self::True => write!(f, "TRUE"),
353 Self::False => write!(f, "FALSE"),
354 Self::Null => write!(f, "NULL"),
355 Self::LDoubleBrace => write!(f, "{{{{"),
356 Self::RDoubleBrace => write!(f, "}}}}"),
357 Self::LBraceHash => write!(f, "{{#"),
358 Self::LBrace => write!(f, "{{"),
359 Self::RBrace => write!(f, "}}"),
360 Self::LParen => write!(f, "("),
361 Self::RParen => write!(f, ")"),
362 Self::AtAt => write!(f, "@@"),
363 Self::At => write!(f, "@"),
364 Self::Colon => write!(f, ":"),
365 Self::Comma => write!(f, ","),
366 Self::Tilde => write!(f, "~"),
367 Self::Pipe => write!(f, "|"),
368 Self::Plus => write!(f, "+"),
369 Self::Minus => write!(f, "-"),
370 Self::Star => write!(f, "*"),
371 Self::Slash => write!(f, "/"),
372 Self::Pending => write!(f, "!"),
373 Self::Flag(s) => write!(f, "{s}"),
374 Self::Newline => write!(f, "\\n"),
375 Self::Comment(s) => write!(f, "{s}"),
376 Self::Hash => write!(f, "#"),
377 Self::PercentComment(s) => write!(f, "{s}"),
378 Self::Shebang(s) => write!(f, "{s}"),
379 Self::EmacsDirective(s) => write!(f, "{s}"),
380 Self::MetaKey(s) => write!(f, "{s}"),
381 Self::Indent(n) => write!(f, "<indent:{n}>"),
382 Self::DeepIndent(n) => write!(f, "<deep-indent:{n}>"),
383 Self::Error(s) => write!(f, "{s}"),
384 }
385 }
386}
387
/// Runs the logos lexer over `source` and post-processes the raw token
/// stream into `(Token, Span)` pairs.
///
/// Two things the raw lexer cannot express are synthesized here:
///
/// * **Indentation** — logos skips `[ \t]+`, so leading whitespace is
///   recovered from the gap between the end of the last newline and the
///   first token on a line, and emitted as `Indent`/`DeepIndent`.
/// * **`#` line comments** — a bare `#` in column 0 swallows the rest of
///   the line as a single `Comment` token.
///
/// Unlexable input is preserved as `Token::Error` rather than dropped, so
/// the output always covers every non-whitespace byte of `source`.
pub fn tokenize(source: &str) -> Vec<(Token<'_>, Span)> {
    let mut tokens = Vec::new();
    let mut lexer = Token::lexer(source);
    // True until the first token after a newline has been seen.
    let mut at_line_start = true;
    // Byte offset just past the most recent newline; the gap between this
    // and the next token's start is the line's leading whitespace.
    let mut last_newline_end = 0usize;

    while let Some(result) = lexer.next() {
        let span = lexer.span();

        match result {
            Ok(Token::Newline) => {
                tokens.push((Token::Newline, span.clone().into()));
                at_line_start = true;
                last_newline_end = span.end;
            }
            // A bare `#` in column 0 exactly (no leading whitespace, since
            // `span.start == last_newline_end`) starts a whole-line comment.
            // `#tag`, `#!...`, and `#+...` match longer rules and never
            // reach this arm.
            Ok(Token::Hash) if at_line_start && span.start == last_newline_end => {
                let comment_start = span.start;
                // Extend to the next '\n' (or EOF) to capture the full line.
                let line_end = source[span.end..]
                    .find('\n')
                    .map_or(source.len(), |i| span.end + i);
                let comment_text = &source[comment_start..line_end];
                tokens.push((
                    Token::Comment(comment_text),
                    Span {
                        start: comment_start,
                        end: line_end,
                    },
                ));
                // Drain the lexer's view of the same line: discard whatever
                // it lexed inside the comment text (including errors) and
                // stop at the terminating newline, which is kept.
                while let Some(peek_result) = lexer.next() {
                    let peek_span = lexer.span();
                    let peek_end = peek_span.end;
                    if peek_result == Ok(Token::Newline) {
                        tokens.push((Token::Newline, peek_span.into()));
                        at_line_start = true;
                        last_newline_end = peek_end;
                        break;
                    }
                }
            }
            Ok(token) => {
                // First token of a line with whitespace before it: turn that
                // skipped whitespace into an indentation token.
                if at_line_start && span.start > last_newline_end {
                    let leading = &source[last_newline_end..span.start];
                    // `space_count` is the visual width (tab = 4 columns);
                    // `char_count` is the byte length of the whitespace run
                    // (spaces and tabs are single-byte, so chars == bytes).
                    let mut space_count = 0;
                    let mut char_count = 0;
                    for c in leading.chars() {
                        match c {
                            ' ' => {
                                space_count += 1;
                                char_count += 1;
                            }
                            '\t' => {
                                space_count += 4;
                                char_count += 1;
                            }
                            _ => break,
                        }
                    }
                    if space_count >= 1 {
                        let indent_start = last_newline_end;
                        let indent_end = last_newline_end + char_count;
                        // 3+ columns is "deep" indentation (e.g. posting
                        // lines); 1–2 columns is shallow.
                        let indent_token = if space_count >= 3 {
                            Token::DeepIndent(space_count)
                        } else {
                            Token::Indent(space_count)
                        };
                        tokens.push((
                            indent_token,
                            Span {
                                start: indent_start,
                                end: indent_end,
                            },
                        ));
                    }
                }
                at_line_start = false;
                tokens.push((token, span.into()));
            }
            Err(()) => {
                // Keep unlexable input as an Error token so downstream
                // consumers can report it with its exact source text.
                at_line_start = false;
                let invalid_text = &source[span.clone()];
                tokens.push((Token::Error(invalid_text), span.into()));
            }
        }
    }

    tokens
}
495
/// Unit tests for the lexer: per-token-kind recognition, the Unicode rules
/// of the `Account` regex, the `Currency`-vs-`Flag` priority interplay, and
/// the indentation synthesis done by `tokenize`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_date() {
        let tokens = tokenize("2024-01-15");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
    }

    // The date regex allows 1- or 2-digit month/day fields.
    #[test]
    fn test_tokenize_date_single_digit_month() {
        let tokens = tokenize("2024-1-15");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-1-15")));
    }

    #[test]
    fn test_tokenize_date_single_digit_day() {
        let tokens = tokenize("2024-01-5");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-01-5")));
    }

    #[test]
    fn test_tokenize_date_single_digit_month_and_day() {
        let tokens = tokenize("2024-1-1");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-1-1")));
    }

    #[test]
    fn test_tokenize_date_slash_separator_single_digit() {
        let tokens = tokenize("2024/1/5");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024/1/5")));
    }

    // Numbers are unsigned at the lexer level: a leading '-' lexes as a
    // separate Minus token.
    #[test]
    fn test_tokenize_number() {
        let tokens = tokenize("1234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("1234.56")));

        let tokens = tokenize("-1,234.56");
        assert_eq!(tokens.len(), 2);
        assert!(matches!(tokens[0].0, Token::Minus));
        assert!(matches!(tokens[1].0, Token::Number("1,234.56")));
    }

    #[test]
    fn test_tokenize_account() {
        let tokens = tokenize("Assets:Bank:Checking");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Account("Assets:Bank:Checking")
        ));
    }

    // Negative cases for the Account regex: non-letter Unicode (emoji) and
    // Unicode letters at the START of a sub-component are rejected, because
    // components must start with [A-Z0-9].
    #[test]
    fn test_tokenize_account_unicode() {
        let tokens = tokenize("Assets:CORP✨");
        assert!(
            !matches!(tokens[0].0, Token::Account("Assets:CORP✨")),
            "Unicode emoji in account name should not tokenize as a valid Account"
        );
        assert!(
            tokens.iter().any(|(t, _)| matches!(t, Token::Error(_))),
            "Unicode emoji should produce at least one Error token"
        );

        let tokens = tokenize("Assets:沪深300");
        assert!(
            !matches!(tokens[0].0, Token::Account("Assets:沪深300")),
            "CJK characters at the start of a sub-component should not tokenize as a valid Account"
        );
        assert!(
            tokens.iter().any(|(t, _)| matches!(t, Token::Error(_))),
            "CJK start should produce at least one Error token"
        );

        let tokens = tokenize("Assets:日本銀行");
        assert!(
            !matches!(tokens[0].0, Token::Account("Assets:日本銀行")),
            "CJK sub-component start should not tokenize as a valid Account"
        );
        assert!(
            tokens.iter().any(|(t, _)| matches!(t, Token::Error(_))),
            "CJK sub-component start should produce at least one Error token"
        );
    }

    // Positive cases: once a component starts with an ASCII letter/digit,
    // any Unicode letter (\p{L}) may follow.
    #[test]
    fn test_tokenize_account_unicode_letters_after_ascii_start() {
        let tokens = tokenize("Assets:Banque-Épargne");
        assert!(
            matches!(tokens[0].0, Token::Account("Assets:Banque-Épargne")),
            "accented Latin letter after ASCII start should tokenize as Account, got: {tokens:?}"
        );

        let tokens = tokenize("Assets:Müller");
        assert!(
            matches!(tokens[0].0, Token::Account("Assets:Müller")),
            "German umlaut after ASCII start should tokenize as Account, got: {tokens:?}"
        );

        let tokens = tokenize("Assets:CorpJP日本");
        assert!(
            matches!(tokens[0].0, Token::Account("Assets:CorpJP日本")),
            "CJK letters after ASCII start should tokenize as Account, got: {tokens:?}"
        );
    }

    #[test]
    fn test_tokenize_currency() {
        let tokens = tokenize("USD");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("USD")));
    }

    // Single uppercase letters lex as Currency (priority 3), even those in
    // the Flag character class.
    #[test]
    fn test_tokenize_single_char_currency() {
        let tokens = tokenize("T");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("T")));

        let tokens = tokenize("V");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("V")));

        let tokens = tokenize("F");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("F")));
    }

    // is_txn_flag reclassifies single-letter Currency tokens as flags.
    #[test]
    fn test_single_char_currency_is_txn_flag() {
        let token = Token::Currency("T");
        assert!(token.is_txn_flag());

        let token = Token::Currency("USD");
        assert!(!token.is_txn_flag());
    }

    // String tokens keep their surrounding quotes.
    #[test]
    fn test_tokenize_string() {
        let tokens = tokenize(r#""Hello, World!""#);
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::String(r#""Hello, World!""#)));
    }

    #[test]
    fn test_tokenize_keywords() {
        let tokens = tokenize("txn balance open close");
        assert_eq!(tokens.len(), 4);
        assert!(matches!(tokens[0].0, Token::Txn));
        assert!(matches!(tokens[1].0, Token::Balance));
        assert!(matches!(tokens[2].0, Token::Open));
        assert!(matches!(tokens[3].0, Token::Close));
    }

    // Tag/Link tokens keep their leading sigil (# / ^).
    #[test]
    fn test_tokenize_tag_and_link() {
        let tokens = tokenize("#my-tag ^my-link");
        assert_eq!(tokens.len(), 2);
        assert!(matches!(tokens[0].0, Token::Tag("#my-tag")));
        assert!(matches!(tokens[1].0, Token::Link("^my-link")));
    }

    #[test]
    fn test_tokenize_comment() {
        let tokens = tokenize("; This is a comment");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Comment("; This is a comment")));
    }

    // Leading whitespace after a newline is synthesized into an Indent token.
    #[test]
    fn test_tokenize_indentation() {
        let tokens = tokenize("txn\n Assets:Bank 100 USD");
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Indent(_))));
    }

    // Smoke test over a realistic two-line transaction.
    #[test]
    fn test_tokenize_transaction_line() {
        let source = "2024-01-15 * \"Grocery Store\" #food\n  Expenses:Food 50.00 USD";
        let tokens = tokenize(source);

        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Date(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Star)));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::String(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Tag(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Newline)));
        assert!(
            tokens
                .iter()
                .any(|(t, _)| matches!(t, Token::Indent(_) | Token::DeepIndent(_)))
        );
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Account(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Number(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Currency(_))));
    }

    // MetaKey includes the trailing colon.
    #[test]
    fn test_tokenize_metadata_key() {
        let tokens = tokenize("filename:");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::MetaKey("filename:")));
    }

    #[test]
    fn test_tokenize_punctuation() {
        let tokens = tokenize("{ } @ @@ , ~");
        let token_types: Vec<_> = tokens.iter().map(|(t, _)| t.clone()).collect();
        assert!(token_types.contains(&Token::LBrace));
        assert!(token_types.contains(&Token::RBrace));
        assert!(token_types.contains(&Token::At));
        assert!(token_types.contains(&Token::AtAt));
        assert!(token_types.contains(&Token::Comma));
        assert!(token_types.contains(&Token::Tilde));
    }
}