1use logos::Logos;
7use std::fmt;
8use std::ops::Range;
9
/// Half-open byte range `[start, end)` locating a token in the source text.
///
/// Mirrors `Range<usize>` but is `Copy`, so it can be stored and passed
/// around freely next to each token.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Offset of the first byte covered by the span.
    pub start: usize,
    /// Offset one past the last byte covered by the span.
    pub end: usize,
}

impl From<Range<usize>> for Span {
    /// Builds a span with the same bounds as `range`.
    fn from(range: Range<usize>) -> Self {
        let Range { start, end } = range;
        Self { start, end }
    }
}

impl From<Span> for Range<usize> {
    /// Turns the span back into a standard range, e.g. for slicing the source.
    fn from(span: Span) -> Self {
        let Span { start, end } = span;
        Range { start, end }
    }
}
33
/// Lexical tokens of the ledger syntax recognised by this lexer
/// (the keyword set suggests Beancount; confirm against the parser).
///
/// Variants carrying `&'src str` borrow the matched text from the source.
/// Runs of spaces and tabs between tokens are skipped by the lexer itself;
/// [`Token::Indent`], [`Token::DeepIndent`] and [`Token::Error`] carry no
/// lexer pattern and are only produced by `tokenize`.
#[derive(Logos, Debug, Clone, PartialEq, Eq)]
#[logos(skip r"[ \t]+")] pub enum Token<'src> {
    /// Date: four digits, then a 1-2 digit month and a 1-2 digit day, each
    /// preceded by `-` or `/` (e.g. `2024-01-15`, `2024/1/5`).
    #[regex(r"\d{4}[-/]\d{1,2}[-/]\d{1,2}")]
    Date(&'src str),

    /// Unsigned number: plain digits or comma-grouped thousands, with an
    /// optional `.` and fractional digits. A leading `-` is lexed separately
    /// as [`Token::Minus`].
    #[regex(r"(\d{1,3}(,\d{3})*|\d+)(\.\d*)?")]
    Number(&'src str),

    /// Double-quoted string with backslash escapes. The slice includes the
    /// surrounding quotes and escapes are left undecoded.
    #[regex(r#""([^"\\]|\\.)*""#)]
    String(&'src str),

    /// Account name: two or more `:`-separated components. The first starts
    /// with an uppercase, titlecase or "other" Unicode letter; later ones may
    /// also start with an ASCII digit. Remaining characters are Unicode
    /// letters, ASCII digits or `-`.
    #[regex(r"[\p{Lu}\p{Lo}\p{Lt}][\p{L}0-9-]*(:([\p{Lu}\p{Lo}\p{Lt}0-9][\p{L}0-9-]*)+)+")]
    Account(&'src str),

    /// Currency/commodity symbol: an ASCII uppercase letter followed by
    /// uppercase letters, digits, `'`, `.`, `_` or `-`, optionally prefixed
    /// with `/`. Declared with explicit priority 3; the tests in this file
    /// show a lone uppercase letter such as `T` lexing as `Currency`.
    #[regex(r"/[A-Z][A-Z0-9'._-]*|[A-Z][A-Z0-9'._-]*", priority = 3)]
    Currency(&'src str),

    /// Tag: `#` followed by one or more ASCII letters, digits, `-`, `_`, `/`
    /// or `.`. The slice includes the `#`.
    #[regex(r"#[a-zA-Z0-9-_/.]+")]
    Tag(&'src str),

    /// Link: `^` followed by one or more ASCII letters, digits, `-`, `_`, `/`
    /// or `.`. The slice includes the `^`.
    #[regex(r"\^[a-zA-Z0-9-_/.]+")]
    Link(&'src str),

    /// Keyword `txn`.
    #[token("txn")]
    Txn,
    /// Keyword `balance`.
    #[token("balance")]
    Balance,
    /// Keyword `open`.
    #[token("open")]
    Open,
    /// Keyword `close`.
    #[token("close")]
    Close,
    /// Keyword `commodity`.
    #[token("commodity")]
    Commodity,
    /// Keyword `pad`.
    #[token("pad")]
    Pad,
    /// Keyword `event`.
    #[token("event")]
    Event,
    /// Keyword `query`.
    #[token("query")]
    Query,
    /// Keyword `note`.
    #[token("note")]
    Note,
    /// Keyword `document`.
    #[token("document")]
    Document,
    /// Keyword `price`.
    #[token("price")]
    Price,
    /// Keyword `custom`.
    #[token("custom")]
    Custom,
    /// Keyword `option` (trailing underscore avoids clashing with `Option`).
    #[token("option")]
    Option_,
    /// Keyword `include`.
    #[token("include")]
    Include,
    /// Keyword `plugin`.
    #[token("plugin")]
    Plugin,
    /// Keyword `pushtag`.
    #[token("pushtag")]
    Pushtag,
    /// Keyword `poptag`.
    #[token("poptag")]
    Poptag,
    /// Keyword `pushmeta`.
    #[token("pushmeta")]
    Pushmeta,
    /// Keyword `popmeta`.
    #[token("popmeta")]
    Popmeta,
    /// Boolean true, spelled `TRUE`, `True` or `true`.
    #[token("TRUE")]
    #[token("True")]
    #[token("true")]
    True,
    /// Boolean false, spelled `FALSE`, `False` or `false`.
    #[token("FALSE")]
    #[token("False")]
    #[token("false")]
    False,
    /// Null literal; only the spelling `NULL` is recognised.
    #[token("NULL")]
    Null,

    /// `{{`
    #[token("{{")]
    LDoubleBrace,
    /// `}}`
    #[token("}}")]
    RDoubleBrace,
    /// `{#`
    #[token("{#")]
    LBraceHash,
    /// `{`
    #[token("{")]
    LBrace,
    /// `}`
    #[token("}")]
    RBrace,
    /// `(`
    #[token("(")]
    LParen,
    /// `)`
    #[token(")")]
    RParen,
    /// `@@`
    #[token("@@")]
    AtAt,
    /// `@`
    #[token("@")]
    At,
    /// `:`
    #[token(":")]
    Colon,
    /// `,`
    #[token(",")]
    Comma,
    /// `~`
    #[token("~")]
    Tilde,
    /// `|`
    #[token("|")]
    Pipe,
    /// `+`
    #[token("+")]
    Plus,
    /// `-`
    #[token("-")]
    Minus,
    /// `*`; also accepted as a transaction flag by `is_txn_flag`.
    #[token("*")]
    Star,
    /// `/`
    #[token("/")]
    Slash,

    /// `!`; accepted as a transaction flag by `is_txn_flag`.
    #[token("!")]
    Pending,

    /// Single-character flag, one of `P S T C U R M ? &`.
    /// NOTE(review): the letters also match the higher-priority `Currency`
    /// pattern (the tests show `T` lexing as `Currency`), so presumably only
    /// `?` and `&` reach this variant in practice — confirm.
    #[regex(r"[PSTCURM?&]")]
    Flag(&'src str),

    /// Line terminator: `\n` or `\r\n`.
    #[regex(r"\r?\n")]
    Newline,

    /// Comment starting with `;` and running to the end of the line; the line
    /// terminator is not part of the slice. `tokenize` also emits this
    /// variant for a `#` in the first column of a line.
    #[regex(r";[^\n\r]*", allow_greedy = true)]
    Comment(&'src str),

    /// A lone `#` that is not part of a tag, `#!` or `#+`. `tokenize`
    /// rewrites it into a `Comment` when it sits at the very start of a line.
    #[token("#")]
    Hash,

    /// Text starting with `%` and running to the end of the line.
    #[regex(r"%[^\n\r]*", allow_greedy = true)]
    PercentComment(&'src str),

    /// Text starting with `#!` and running to the end of the line.
    #[regex(r"#![^\n\r]*", allow_greedy = true)]
    Shebang(&'src str),

    /// Text starting with `#+` and running to the end of the line.
    #[regex(r"#\+[^\n\r]*", allow_greedy = true)]
    EmacsDirective(&'src str),

    /// Metadata key: an ASCII lowercase letter followed by ASCII letters,
    /// digits, `_` or `-`, terminated by `:`. The slice includes the colon.
    #[regex(r"[a-z][a-zA-Z0-9_-]*:")]
    MetaKey(&'src str),

    /// Leading whitespace of a line, inserted by `tokenize` when its width is
    /// below the deep-indent threshold. The payload is the width, with each
    /// tab counted as 4.
    Indent(usize),

    /// Leading whitespace of a line, inserted by `tokenize` when its width
    /// reaches the deep-indent threshold (3 in `tokenize`). The payload is
    /// the width, with each tab counted as 4.
    DeepIndent(usize),

    /// Source text the lexer could not match; inserted by `tokenize` in place
    /// of a lexer error, carrying the offending slice.
    Error(&'src str),
}
281
282impl Token<'_> {
283 pub const fn is_txn_flag(&self) -> bool {
286 match self {
287 Self::Star | Self::Pending | Self::Flag(_) | Self::Hash => true,
288 Self::Currency(s) => s.len() == 1,
290 _ => false,
291 }
292 }
293
294 pub const fn is_directive_keyword(&self) -> bool {
296 matches!(
297 self,
298 Self::Txn
299 | Self::Balance
300 | Self::Open
301 | Self::Close
302 | Self::Commodity
303 | Self::Pad
304 | Self::Event
305 | Self::Query
306 | Self::Note
307 | Self::Document
308 | Self::Price
309 | Self::Custom
310 | Self::Option_
311 | Self::Include
312 | Self::Plugin
313 | Self::Pushtag
314 | Self::Poptag
315 | Self::Pushmeta
316 | Self::Popmeta
317 )
318 }
319}
320
321impl fmt::Display for Token<'_> {
322 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
323 match self {
324 Self::Date(s) => write!(f, "{s}"),
325 Self::Number(s) => write!(f, "{s}"),
326 Self::String(s) => write!(f, "{s}"),
327 Self::Account(s) => write!(f, "{s}"),
328 Self::Currency(s) => write!(f, "{s}"),
329 Self::Tag(s) => write!(f, "{s}"),
330 Self::Link(s) => write!(f, "{s}"),
331 Self::Txn => write!(f, "txn"),
332 Self::Balance => write!(f, "balance"),
333 Self::Open => write!(f, "open"),
334 Self::Close => write!(f, "close"),
335 Self::Commodity => write!(f, "commodity"),
336 Self::Pad => write!(f, "pad"),
337 Self::Event => write!(f, "event"),
338 Self::Query => write!(f, "query"),
339 Self::Note => write!(f, "note"),
340 Self::Document => write!(f, "document"),
341 Self::Price => write!(f, "price"),
342 Self::Custom => write!(f, "custom"),
343 Self::Option_ => write!(f, "option"),
344 Self::Include => write!(f, "include"),
345 Self::Plugin => write!(f, "plugin"),
346 Self::Pushtag => write!(f, "pushtag"),
347 Self::Poptag => write!(f, "poptag"),
348 Self::Pushmeta => write!(f, "pushmeta"),
349 Self::Popmeta => write!(f, "popmeta"),
350 Self::True => write!(f, "TRUE"),
351 Self::False => write!(f, "FALSE"),
352 Self::Null => write!(f, "NULL"),
353 Self::LDoubleBrace => write!(f, "{{{{"),
354 Self::RDoubleBrace => write!(f, "}}}}"),
355 Self::LBraceHash => write!(f, "{{#"),
356 Self::LBrace => write!(f, "{{"),
357 Self::RBrace => write!(f, "}}"),
358 Self::LParen => write!(f, "("),
359 Self::RParen => write!(f, ")"),
360 Self::AtAt => write!(f, "@@"),
361 Self::At => write!(f, "@"),
362 Self::Colon => write!(f, ":"),
363 Self::Comma => write!(f, ","),
364 Self::Tilde => write!(f, "~"),
365 Self::Pipe => write!(f, "|"),
366 Self::Plus => write!(f, "+"),
367 Self::Minus => write!(f, "-"),
368 Self::Star => write!(f, "*"),
369 Self::Slash => write!(f, "/"),
370 Self::Pending => write!(f, "!"),
371 Self::Flag(s) => write!(f, "{s}"),
372 Self::Newline => write!(f, "\\n"),
373 Self::Comment(s) => write!(f, "{s}"),
374 Self::Hash => write!(f, "#"),
375 Self::PercentComment(s) => write!(f, "{s}"),
376 Self::Shebang(s) => write!(f, "{s}"),
377 Self::EmacsDirective(s) => write!(f, "{s}"),
378 Self::MetaKey(s) => write!(f, "{s}"),
379 Self::Indent(n) => write!(f, "<indent:{n}>"),
380 Self::DeepIndent(n) => write!(f, "<deep-indent:{n}>"),
381 Self::Error(s) => write!(f, "{s}"),
382 }
383 }
384}
385
386pub fn tokenize(source: &str) -> Vec<(Token<'_>, Span)> {
393 let mut tokens = Vec::new();
394 let mut lexer = Token::lexer(source);
395 let mut at_line_start = true;
396 let mut last_newline_end = 0usize;
397
398 while let Some(result) = lexer.next() {
399 let span = lexer.span();
400
401 match result {
402 Ok(Token::Newline) => {
403 tokens.push((Token::Newline, span.clone().into()));
404 at_line_start = true;
405 last_newline_end = span.end;
406 }
407 Ok(Token::Hash) if at_line_start && span.start == last_newline_end => {
408 let comment_start = span.start;
411 let line_end = source[span.end..]
412 .find('\n')
413 .map_or(source.len(), |i| span.end + i);
414 let comment_text = &source[comment_start..line_end];
415 tokens.push((
416 Token::Comment(comment_text),
417 Span {
418 start: comment_start,
419 end: line_end,
420 },
421 ));
422 while let Some(peek_result) = lexer.next() {
424 let peek_span = lexer.span();
425 let peek_end = peek_span.end;
426 if peek_result == Ok(Token::Newline) {
427 tokens.push((Token::Newline, peek_span.into()));
428 at_line_start = true;
429 last_newline_end = peek_end;
430 break;
431 }
432 }
434 }
435 Ok(token) => {
436 if at_line_start && span.start > last_newline_end {
438 let leading = &source[last_newline_end..span.start];
441 let mut space_count = 0;
442 let mut char_count = 0;
443 for c in leading.chars() {
444 match c {
445 ' ' => {
446 space_count += 1;
447 char_count += 1;
448 }
449 '\t' => {
450 space_count += 4; char_count += 1;
452 }
453 _ => break,
454 }
455 }
456 if space_count >= 1 {
458 let indent_start = last_newline_end;
459 let indent_end = last_newline_end + char_count;
460 let indent_token = if space_count >= 3 {
466 Token::DeepIndent(space_count)
467 } else {
468 Token::Indent(space_count)
469 };
470 tokens.push((
471 indent_token,
472 Span {
473 start: indent_start,
474 end: indent_end,
475 },
476 ));
477 }
478 }
479 at_line_start = false;
480 tokens.push((token, span.into()));
481 }
482 Err(()) => {
483 at_line_start = false;
485 let invalid_text = &source[span.clone()];
486 tokens.push((Token::Error(invalid_text), span.into()));
487 }
488 }
489 }
490
491 tokens
492}
493
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` and drops the spans, leaving only the token sequence.
    fn kinds(src: &str) -> Vec<Token<'_>> {
        tokenize(src).into_iter().map(|(tok, _)| tok).collect()
    }

    #[test]
    fn test_tokenize_date() {
        assert_eq!(kinds("2024-01-15"), [Token::Date("2024-01-15")]);
    }

    #[test]
    fn test_tokenize_date_single_digit_month() {
        // The month may be written without a leading zero.
        assert_eq!(kinds("2024-1-15"), [Token::Date("2024-1-15")]);
    }

    #[test]
    fn test_tokenize_date_single_digit_day() {
        // The day may be written without a leading zero.
        assert_eq!(kinds("2024-01-5"), [Token::Date("2024-01-5")]);
    }

    #[test]
    fn test_tokenize_date_single_digit_month_and_day() {
        assert_eq!(kinds("2024-1-1"), [Token::Date("2024-1-1")]);
    }

    #[test]
    fn test_tokenize_date_slash_separator_single_digit() {
        // `/` is accepted as a separator as well as `-`.
        assert_eq!(kinds("2024/1/5"), [Token::Date("2024/1/5")]);
    }

    #[test]
    fn test_tokenize_number() {
        assert_eq!(kinds("1234.56"), [Token::Number("1234.56")]);

        // The sign is a separate token; the number keeps its thousands commas.
        assert_eq!(
            kinds("-1,234.56"),
            [Token::Minus, Token::Number("1,234.56")]
        );
    }

    #[test]
    fn test_tokenize_account() {
        assert_eq!(
            kinds("Assets:Bank:Checking"),
            [Token::Account("Assets:Bank:Checking")]
        );
    }

    #[test]
    fn test_tokenize_account_unicode() {
        // An emoji is not a letter, so it must not become part of an account.
        let tokens = tokenize("Assets:CORP✨");
        assert_ne!(
            tokens[0].0,
            Token::Account("Assets:CORP✨"),
            "Unicode emoji in account name should not tokenize as a valid Account"
        );
        assert!(
            tokens
                .iter()
                .any(|(tok, _)| matches!(tok, Token::Error(_))),
            "Unicode emoji should produce at least one Error token"
        );

        // Non-Latin letters are fine, in any component.
        let accepted = [
            (
                "Assets:沪深300",
                "CJK characters at the start of a sub-component should tokenize as Account",
            ),
            (
                "Assets:日本銀行",
                "CJK sub-component should tokenize as Account",
            ),
            (
                "Капитал:Retained",
                "Cyrillic-starting account should tokenize as Account",
            ),
            (
                "资产:银行:支票",
                "Fully CJK account should tokenize as Account",
            ),
        ];
        for (src, why) in accepted {
            let tokens = tokenize(src);
            assert_eq!(tokens[0].0, Token::Account(src), "{why}");
        }
    }

    #[test]
    fn test_tokenize_account_unicode_letters_after_ascii_start() {
        let tokens = tokenize("Assets:Banque-Épargne");
        assert!(
            tokens[0].0 == Token::Account("Assets:Banque-Épargne"),
            "accented Latin letter after ASCII start should tokenize as Account, got: {tokens:?}"
        );

        let tokens = tokenize("Assets:Müller");
        assert!(
            tokens[0].0 == Token::Account("Assets:Müller"),
            "German umlaut after ASCII start should tokenize as Account, got: {tokens:?}"
        );

        let tokens = tokenize("Assets:CorpJP日本");
        assert!(
            tokens[0].0 == Token::Account("Assets:CorpJP日本"),
            "CJK letters after ASCII start should tokenize as Account, got: {tokens:?}"
        );
    }

    #[test]
    fn test_tokenize_currency() {
        assert_eq!(kinds("USD"), [Token::Currency("USD")]);
    }

    #[test]
    fn test_tokenize_single_char_currency() {
        // A lone uppercase letter lexes as a currency, even one that is also
        // a flag character such as `T`.
        for symbol in ["T", "V", "F"] {
            assert_eq!(kinds(symbol), [Token::Currency(symbol)]);
        }
    }

    #[test]
    fn test_single_char_currency_is_txn_flag() {
        // One-character currencies double as transaction flags...
        assert!(Token::Currency("T").is_txn_flag());
        // ...longer ones do not.
        assert!(!Token::Currency("USD").is_txn_flag());
    }

    #[test]
    fn test_tokenize_string() {
        // The token text keeps the surrounding quotes.
        assert_eq!(
            kinds(r#""Hello, World!""#),
            [Token::String(r#""Hello, World!""#)]
        );
    }

    #[test]
    fn test_tokenize_keywords() {
        assert_eq!(
            kinds("txn balance open close"),
            [Token::Txn, Token::Balance, Token::Open, Token::Close]
        );
    }

    #[test]
    fn test_tokenize_tag_and_link() {
        assert_eq!(
            kinds("#my-tag ^my-link"),
            [Token::Tag("#my-tag"), Token::Link("^my-link")]
        );
    }

    #[test]
    fn test_tokenize_comment() {
        assert_eq!(
            kinds("; This is a comment"),
            [Token::Comment("; This is a comment")]
        );
    }

    #[test]
    fn test_tokenize_indentation() {
        let toks = kinds("txn\n Assets:Bank 100 USD");
        assert!(toks.iter().any(|tok| matches!(tok, Token::Indent(_))));
    }

    #[test]
    fn test_tokenize_transaction_line() {
        let source = "2024-01-15 * \"Grocery Store\" #food\n Expenses:Food 50.00 USD";
        let toks = kinds(source);

        // Header line.
        assert!(toks.iter().any(|tok| matches!(tok, Token::Date(_))));
        assert!(toks.iter().any(|tok| matches!(tok, Token::Star)));
        assert!(toks.iter().any(|tok| matches!(tok, Token::String(_))));
        assert!(toks.iter().any(|tok| matches!(tok, Token::Tag(_))));
        assert!(toks.iter().any(|tok| matches!(tok, Token::Newline)));
        // Posting line.
        assert!(toks
            .iter()
            .any(|tok| matches!(tok, Token::Indent(_) | Token::DeepIndent(_))));
        assert!(toks.iter().any(|tok| matches!(tok, Token::Account(_))));
        assert!(toks.iter().any(|tok| matches!(tok, Token::Number(_))));
        assert!(toks.iter().any(|tok| matches!(tok, Token::Currency(_))));
    }

    #[test]
    fn test_tokenize_metadata_key() {
        // The trailing colon is part of the key token.
        assert_eq!(kinds("filename:"), [Token::MetaKey("filename:")]);
    }

    #[test]
    fn test_tokenize_punctuation() {
        let toks = kinds("{ } @ @@ , ~");
        let expected = [
            Token::LBrace,
            Token::RBrace,
            Token::At,
            Token::AtAt,
            Token::Comma,
            Token::Tilde,
        ];
        for want in &expected {
            assert!(toks.contains(want));
        }
    }
}