1use logos::Logos;
7use std::fmt;
8use std::ops::Range;
9
/// Half-open byte range `[start, end)` into the source text.
///
/// Interconvertible with [`Range<usize>`]. `Copy` so it travels freely
/// alongside tokens; `Hash`/`Default` derived so spans can key maps and be
/// zero-initialized.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub struct Span {
    /// Byte offset of the first byte of the span.
    pub start: usize,
    /// Byte offset one past the last byte of the span.
    pub end: usize,
}

impl From<Range<usize>> for Span {
    /// Converts a standard range (e.g. `logos::Lexer::span()`) into a `Span`.
    fn from(range: Range<usize>) -> Self {
        Self {
            start: range.start,
            end: range.end,
        }
    }
}

impl From<Span> for Range<usize> {
    /// Converts back to a standard range, e.g. for slicing the source text.
    fn from(span: Span) -> Self {
        span.start..span.end
    }
}
33
34#[derive(Logos, Debug, Clone, PartialEq, Eq)]
36#[logos(skip r"[ \t]+")] pub enum Token<'src> {
38 #[regex(r"\d{4}[-/]\d{2}[-/]\d{2}")]
41 Date(&'src str),
42
43 #[regex(r"-?(\.\d+|(\d{1,3}(,\d{3})*|\d+)(\.\d*)?)")]
47 Number(&'src str),
48
49 #[regex(r#""([^"\\]|\\.)*""#)]
52 String(&'src str),
53
54 #[regex(r"([A-Z]|[^\x00-\x7F])([A-Za-z0-9-]|[^\x00-\x7F])*(:([A-Z0-9]|[^\x00-\x7F])([A-Za-z0-9-]|[^\x00-\x7F])*)+")]
63 Account(&'src str),
64
65 #[regex(r"/[A-Z][A-Z0-9'._-]*|[A-Z][A-Z0-9'._-]*", priority = 3)]
73 Currency(&'src str),
74
75 #[regex(r"#[a-zA-Z0-9-_/.]+")]
77 Tag(&'src str),
78
79 #[regex(r"\^[a-zA-Z0-9-_/.]+")]
81 Link(&'src str),
82
83 #[token("txn")]
87 Txn,
88 #[token("balance")]
90 Balance,
91 #[token("open")]
93 Open,
94 #[token("close")]
96 Close,
97 #[token("commodity")]
99 Commodity,
100 #[token("pad")]
102 Pad,
103 #[token("event")]
105 Event,
106 #[token("query")]
108 Query,
109 #[token("note")]
111 Note,
112 #[token("document")]
114 Document,
115 #[token("price")]
117 Price,
118 #[token("custom")]
120 Custom,
121 #[token("option")]
123 Option_,
124 #[token("include")]
126 Include,
127 #[token("plugin")]
129 Plugin,
130 #[token("pushtag")]
132 Pushtag,
133 #[token("poptag")]
135 Poptag,
136 #[token("pushmeta")]
138 Pushmeta,
139 #[token("popmeta")]
141 Popmeta,
142 #[token("TRUE")]
144 #[token("True")]
145 #[token("true")]
146 True,
147 #[token("FALSE")]
149 #[token("False")]
150 #[token("false")]
151 False,
152 #[token("NULL")]
154 Null,
155
156 #[token("{{")]
160 LDoubleBrace,
161 #[token("}}")]
163 RDoubleBrace,
164 #[token("{#")]
166 LBraceHash,
167 #[token("{")]
169 LBrace,
170 #[token("}")]
172 RBrace,
173 #[token("(")]
175 LParen,
176 #[token(")")]
178 RParen,
179 #[token("@@")]
181 AtAt,
182 #[token("@")]
184 At,
185 #[token(":")]
187 Colon,
188 #[token(",")]
190 Comma,
191 #[token("~")]
193 Tilde,
194 #[token("|")]
196 Pipe,
197 #[token("+")]
199 Plus,
200 #[token("-")]
202 Minus,
203 #[token("*")]
205 Star,
206 #[token("/")]
208 Slash,
209
210 #[token("!")]
213 Pending,
214
215 #[regex(r"[PSTCURM?&]")]
218 Flag(&'src str),
219
220 #[regex(r"\r?\n")]
223 Newline,
224
225 #[regex(r";[^\n\r]*", allow_greedy = true)]
228 Comment(&'src str),
229
230 #[token("#")]
235 Hash,
236
237 #[regex(r"%[^\n\r]*", allow_greedy = true)]
240 PercentComment(&'src str),
241
242 #[regex(r"#![^\n\r]*", allow_greedy = true)]
245 Shebang(&'src str),
246
247 #[regex(r"#\+[^\n\r]*", allow_greedy = true)]
250 EmacsDirective(&'src str),
251
252 #[regex(r"[a-zA-Z][a-zA-Z0-9_-]*:")]
256 MetaKey(&'src str),
257
258 Indent(usize),
262
263 DeepIndent(usize),
265
266 Error(&'src str),
269}
270
271impl Token<'_> {
272 pub const fn is_txn_flag(&self) -> bool {
275 match self {
276 Self::Star | Self::Pending | Self::Flag(_) | Self::Hash => true,
277 Self::Currency(s) => s.len() == 1,
279 _ => false,
280 }
281 }
282
283 pub const fn is_directive_keyword(&self) -> bool {
285 matches!(
286 self,
287 Self::Txn
288 | Self::Balance
289 | Self::Open
290 | Self::Close
291 | Self::Commodity
292 | Self::Pad
293 | Self::Event
294 | Self::Query
295 | Self::Note
296 | Self::Document
297 | Self::Price
298 | Self::Custom
299 | Self::Option_
300 | Self::Include
301 | Self::Plugin
302 | Self::Pushtag
303 | Self::Poptag
304 | Self::Pushmeta
305 | Self::Popmeta
306 )
307 }
308}
309
310impl fmt::Display for Token<'_> {
311 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
312 match self {
313 Self::Date(s) => write!(f, "{s}"),
314 Self::Number(s) => write!(f, "{s}"),
315 Self::String(s) => write!(f, "{s}"),
316 Self::Account(s) => write!(f, "{s}"),
317 Self::Currency(s) => write!(f, "{s}"),
318 Self::Tag(s) => write!(f, "{s}"),
319 Self::Link(s) => write!(f, "{s}"),
320 Self::Txn => write!(f, "txn"),
321 Self::Balance => write!(f, "balance"),
322 Self::Open => write!(f, "open"),
323 Self::Close => write!(f, "close"),
324 Self::Commodity => write!(f, "commodity"),
325 Self::Pad => write!(f, "pad"),
326 Self::Event => write!(f, "event"),
327 Self::Query => write!(f, "query"),
328 Self::Note => write!(f, "note"),
329 Self::Document => write!(f, "document"),
330 Self::Price => write!(f, "price"),
331 Self::Custom => write!(f, "custom"),
332 Self::Option_ => write!(f, "option"),
333 Self::Include => write!(f, "include"),
334 Self::Plugin => write!(f, "plugin"),
335 Self::Pushtag => write!(f, "pushtag"),
336 Self::Poptag => write!(f, "poptag"),
337 Self::Pushmeta => write!(f, "pushmeta"),
338 Self::Popmeta => write!(f, "popmeta"),
339 Self::True => write!(f, "TRUE"),
340 Self::False => write!(f, "FALSE"),
341 Self::Null => write!(f, "NULL"),
342 Self::LDoubleBrace => write!(f, "{{{{"),
343 Self::RDoubleBrace => write!(f, "}}}}"),
344 Self::LBraceHash => write!(f, "{{#"),
345 Self::LBrace => write!(f, "{{"),
346 Self::RBrace => write!(f, "}}"),
347 Self::LParen => write!(f, "("),
348 Self::RParen => write!(f, ")"),
349 Self::AtAt => write!(f, "@@"),
350 Self::At => write!(f, "@"),
351 Self::Colon => write!(f, ":"),
352 Self::Comma => write!(f, ","),
353 Self::Tilde => write!(f, "~"),
354 Self::Pipe => write!(f, "|"),
355 Self::Plus => write!(f, "+"),
356 Self::Minus => write!(f, "-"),
357 Self::Star => write!(f, "*"),
358 Self::Slash => write!(f, "/"),
359 Self::Pending => write!(f, "!"),
360 Self::Flag(s) => write!(f, "{s}"),
361 Self::Newline => write!(f, "\\n"),
362 Self::Comment(s) => write!(f, "{s}"),
363 Self::Hash => write!(f, "#"),
364 Self::PercentComment(s) => write!(f, "{s}"),
365 Self::Shebang(s) => write!(f, "{s}"),
366 Self::EmacsDirective(s) => write!(f, "{s}"),
367 Self::MetaKey(s) => write!(f, "{s}"),
368 Self::Indent(n) => write!(f, "<indent:{n}>"),
369 Self::DeepIndent(n) => write!(f, "<deep-indent:{n}>"),
370 Self::Error(s) => write!(f, "{s}"),
371 }
372 }
373}
374
/// Lexes `source` into `(Token, Span)` pairs.
///
/// On top of the raw logos stream this pass:
/// - folds a `#` at column 0 (immediately after a newline) plus the rest of
///   that line into a single `Comment` token;
/// - emits synthetic `Indent`/`DeepIndent` tokens for leading whitespace,
///   which logos itself skips;
/// - wraps unlexable text in `Error` tokens instead of dropping it.
pub fn tokenize(source: &str) -> Vec<(Token<'_>, Span)> {
    let mut tokens = Vec::new();
    let mut lexer = Token::lexer(source);
    // True until the first token after a newline has been emitted.
    let mut at_line_start = true;
    // Byte offset just past the most recent newline; indentation is measured
    // from here to the next token's start.
    let mut last_newline_end = 0usize;

    while let Some(result) = lexer.next() {
        let span = lexer.span();

        match result {
            Ok(Token::Newline) => {
                tokens.push((Token::Newline, span.clone().into()));
                at_line_start = true;
                last_newline_end = span.end;
            }
            // A `#` flush against the start of the line begins a full-line
            // comment; an indented `#` falls through as an ordinary `Hash`.
            Ok(Token::Hash) if at_line_start && span.start == last_newline_end => {
                let comment_start = span.start;
                // Extend the comment to the end of the physical line (or EOF).
                let line_end = source[span.end..]
                    .find('\n')
                    .map_or(source.len(), |i| span.end + i);
                let comment_text = &source[comment_start..line_end];
                tokens.push((
                    Token::Comment(comment_text),
                    Span {
                        start: comment_start,
                        end: line_end,
                    },
                ));
                // Discard whatever the lexer makes of the comment body, up to
                // and including the terminating newline.
                // NOTE(review): a token whose regex can span a newline (e.g.
                // String allows raw newlines inside quotes) could carry the
                // lexer past `line_end` here, dropping that newline token —
                // TODO confirm this is acceptable.
                while let Some(peek_result) = lexer.next() {
                    let peek_span = lexer.span();
                    let peek_end = peek_span.end;
                    if peek_result == Ok(Token::Newline) {
                        tokens.push((Token::Newline, peek_span.into()));
                        at_line_start = true;
                        last_newline_end = peek_end;
                        break;
                    }
                }
            }
            Ok(token) => {
                // First token of a line that is not at column 0: reconstruct
                // the whitespace logos skipped as a synthetic indent token.
                if at_line_start && span.start > last_newline_end {
                    let leading = &source[last_newline_end..span.start];
                    let mut space_count = 0;
                    let mut char_count = 0;
                    for c in leading.chars() {
                        match c {
                            ' ' => {
                                space_count += 1;
                                char_count += 1;
                            }
                            '\t' => {
                                // A tab counts as 4 spaces of indent width.
                                space_count += 4; char_count += 1;
                            }
                            _ => break,
                        }
                    }
                    if space_count >= 1 {
                        let indent_start = last_newline_end;
                        // char_count is a byte count here: every counted
                        // character (' ' or '\t') is a single byte.
                        let indent_end = last_newline_end + char_count;
                        // Three or more spaces' worth is "deep" indentation.
                        let indent_token = if space_count >= 3 {
                            Token::DeepIndent(space_count)
                        } else {
                            Token::Indent(space_count)
                        };
                        tokens.push((
                            indent_token,
                            Span {
                                start: indent_start,
                                end: indent_end,
                            },
                        ));
                    }
                }
                at_line_start = false;
                tokens.push((token, span.into()));
            }
            Err(()) => {
                // Keep unlexable input verbatim so callers can report it with
                // its span. (No indent token is synthesized for this case.)
                at_line_start = false;
                let invalid_text = &source[span.clone()];
                tokens.push((Token::Error(invalid_text), span.into()));
            }
        }
    }

    tokens
}
482
#[cfg(test)]
mod tests {
    //! Lexer unit tests: one scenario per token category, asserting the
    //! exact token stream `tokenize` produces.
    use super::*;

    #[test]
    fn test_tokenize_date() {
        let tokens = tokenize("2024-01-15");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = tokenize("1234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("1234.56")));

        // Negative number with a thousands separator lexes as one token.
        let tokens = tokenize("-1,234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("-1,234.56")));
    }

    #[test]
    fn test_tokenize_account() {
        let tokens = tokenize("Assets:Bank:Checking");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Account("Assets:Bank:Checking")
        ));
    }

    #[test]
    fn test_tokenize_account_unicode() {
        // Non-ASCII characters are valid anywhere in an account segment.
        let tokens = tokenize("Assets:CORP✨");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:CORP✨")));

        let tokens = tokenize("Assets:沪深300");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:沪深300")));

        let tokens = tokenize("Assets:日本銀行");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:日本銀行")));

        let tokens = tokenize("Assets:Café");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:Café")));

        // A segment may even *start* with a non-ASCII character.
        let tokens = tokenize("Assets:€uro");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:€uro")));

        let tokens = tokenize("Assets:Test💰Account");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Account("Assets:Test💰Account")
        ));
    }

    #[test]
    fn test_tokenize_currency() {
        let tokens = tokenize("USD");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("USD")));
    }

    #[test]
    fn test_tokenize_single_char_currency() {
        // Single capital letters lex as Currency (priority 3), not Flag.
        let tokens = tokenize("T");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("T")));

        let tokens = tokenize("V");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("V")));

        let tokens = tokenize("F");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("F")));
    }

    #[test]
    fn test_single_char_currency_is_txn_flag() {
        // is_txn_flag treats a one-character Currency as a transaction flag.
        let token = Token::Currency("T");
        assert!(token.is_txn_flag());

        let token = Token::Currency("USD");
        assert!(!token.is_txn_flag());
    }

    #[test]
    fn test_tokenize_string() {
        let tokens = tokenize(r#""Hello, World!""#);
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::String(r#""Hello, World!""#)));
    }

    #[test]
    fn test_tokenize_keywords() {
        let tokens = tokenize("txn balance open close");
        assert_eq!(tokens.len(), 4);
        assert!(matches!(tokens[0].0, Token::Txn));
        assert!(matches!(tokens[1].0, Token::Balance));
        assert!(matches!(tokens[2].0, Token::Open));
        assert!(matches!(tokens[3].0, Token::Close));
    }

    #[test]
    fn test_tokenize_tag_and_link() {
        let tokens = tokenize("#my-tag ^my-link");
        assert_eq!(tokens.len(), 2);
        assert!(matches!(tokens[0].0, Token::Tag("#my-tag")));
        assert!(matches!(tokens[1].0, Token::Link("^my-link")));
    }

    #[test]
    fn test_tokenize_comment() {
        let tokens = tokenize("; This is a comment");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Comment("; This is a comment")));
    }

    #[test]
    fn test_tokenize_indentation() {
        // Leading spaces after a newline become a synthetic Indent token.
        let tokens = tokenize("txn\n Assets:Bank 100 USD");
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Indent(_))));
    }

    #[test]
    fn test_tokenize_transaction_line() {
        // A realistic two-line transaction exercises most token categories.
        let source = "2024-01-15 * \"Grocery Store\" #food\n  Expenses:Food  50.00 USD";
        let tokens = tokenize(source);

        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Date(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Star)));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::String(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Tag(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Newline)));
        assert!(
            tokens
                .iter()
                .any(|(t, _)| matches!(t, Token::Indent(_) | Token::DeepIndent(_)))
        );
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Account(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Number(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Currency(_))));
    }

    #[test]
    fn test_tokenize_metadata_key() {
        let tokens = tokenize("filename:");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::MetaKey("filename:")));
    }

    #[test]
    fn test_tokenize_punctuation() {
        let tokens = tokenize("{ } @ @@ , ~");
        let token_types: Vec<_> = tokens.iter().map(|(t, _)| t.clone()).collect();
        assert!(token_types.contains(&Token::LBrace));
        assert!(token_types.contains(&Token::RBrace));
        assert!(token_types.contains(&Token::At));
        assert!(token_types.contains(&Token::AtAt));
        assert!(token_types.contains(&Token::Comma));
        assert!(token_types.contains(&Token::Tilde));
    }
}