Skip to main content

rustledger_parser/
logos_lexer.rs

1//! SIMD-accelerated lexer using Logos.
2//!
3//! This module provides a fast tokenizer for Beancount syntax using the Logos crate,
4//! which generates a DFA-based lexer with SIMD optimizations where available.
5
6use logos::Logos;
7use std::fmt;
8use std::ops::Range;
9
/// Half-open byte range `[start, end)` locating a token inside the source text.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Span {
    /// Byte offset of the first byte covered by the span.
    pub start: usize,
    /// Byte offset one past the last byte covered by the span.
    pub end: usize,
}
18
19impl From<Range<usize>> for Span {
20    fn from(range: Range<usize>) -> Self {
21        Self {
22            start: range.start,
23            end: range.end,
24        }
25    }
26}
27
28impl From<Span> for Range<usize> {
29    fn from(span: Span) -> Self {
30        span.start..span.end
31    }
32}
33
34/// Token types produced by the Logos lexer.
35#[derive(Logos, Debug, Clone, PartialEq, Eq)]
36#[logos(skip r"[ \t]+")] // Skip horizontal whitespace (spaces and tabs)
37pub enum Token<'src> {
38    // ===== Literals =====
39    /// A date in YYYY-MM-DD, YYYY-M-D, YYYY/MM/DD, or YYYY/M/D format.
40    /// Single-digit month and day are accepted (e.g., 2024-1-5).
41    #[regex(r"\d{4}[-/]\d{1,2}[-/]\d{1,2}")]
42    Date(&'src str),
43
44    /// A number with optional thousands separators and decimals.
45    /// Examples: 123, 1,234.56, 1234.5678, 1. (trailing decimal)
46    /// Negative numbers are handled as unary minus (`-` token + number)
47    /// to allow subtraction expressions like `3-2` to parse correctly.
48    /// Python beancount v3 requires an integer part before the decimal point.
49    /// Leading decimals like `.50` are rejected per the beancount v3 spec.
50    #[regex(r"(\d{1,3}(,\d{3})*|\d+)(\.\d*)?")]
51    Number(&'src str),
52
53    /// A double-quoted string (handles escape sequences).
54    /// The slice includes the quotes.
55    #[regex(r#""([^"\\]|\\.)*""#)]
56    String(&'src str),
57
58    /// An account name like Assets:Bank:Checking, Капитал:Retained-Earnings,
59    /// or 资产:银行:支票.
60    ///
61    /// The first component starts with an uppercase letter (`\p{Lu}`), a
62    /// letter without case like CJK ideographs (`\p{Lo}`), or a titlecase
63    /// letter (`\p{Lt}`). Sub-components may also start with a digit.
64    /// Subsequent characters can be any Unicode letter, digit, or hyphen.
65    ///
66    /// Note: The beancount v3 spec restricts the first character to ASCII
67    /// `[A-Z]`, but this is an artifact of the C flex lexer's poor Unicode
68    /// support, not a meaningful language design choice (see
69    /// beancount/beancount#161, #398, #733).
70    ///
71    /// The account type prefix is validated later against options (`name_assets`, etc.).
72    #[regex(r"[\p{Lu}\p{Lo}\p{Lt}][\p{L}0-9-]*(:([\p{Lu}\p{Lo}\p{Lt}0-9][\p{L}0-9-]*)+)+")]
73    Account(&'src str),
74
75    /// A currency/commodity code like USD, EUR, AAPL, BTC, or single-char tickers like T, V, F.
76    /// Uppercase letters, can contain digits, apostrophes, dots, underscores, hyphens.
77    /// Single-character currencies (e.g., T for AT&T, V for Visa) are valid NYSE/NASDAQ tickers.
78    /// Note: Single-char currencies are disambiguated from transaction flags in the parser.
79    /// Also supports `/` prefix for options/futures contracts (e.g., `/ESM24`, `/LOX21_211204_P100.25`).
80    /// The `/` prefix requires an uppercase letter first to avoid matching `/1.14` as currency.
81    /// Priority 3 ensures Currency wins over Flag for single uppercase letters.
82    #[regex(r"/[A-Z][A-Z0-9'._-]*|[A-Z][A-Z0-9'._-]*", priority = 3)]
83    Currency(&'src str),
84
85    /// A tag like #tag-name.
86    #[regex(r"#[a-zA-Z0-9-_/.]+")]
87    Tag(&'src str),
88
89    /// A link like ^link-name.
90    #[regex(r"\^[a-zA-Z0-9-_/.]+")]
91    Link(&'src str),
92
93    // ===== Keywords =====
94    // Using #[token] for exact matches (higher priority than regex)
95    /// The `txn` keyword for transactions.
96    #[token("txn")]
97    Txn,
98    /// The `balance` directive keyword.
99    #[token("balance")]
100    Balance,
101    /// The `open` directive keyword.
102    #[token("open")]
103    Open,
104    /// The `close` directive keyword.
105    #[token("close")]
106    Close,
107    /// The `commodity` directive keyword.
108    #[token("commodity")]
109    Commodity,
110    /// The `pad` directive keyword.
111    #[token("pad")]
112    Pad,
113    /// The `event` directive keyword.
114    #[token("event")]
115    Event,
116    /// The `query` directive keyword.
117    #[token("query")]
118    Query,
119    /// The `note` directive keyword.
120    #[token("note")]
121    Note,
122    /// The `document` directive keyword.
123    #[token("document")]
124    Document,
125    /// The `price` directive keyword.
126    #[token("price")]
127    Price,
128    /// The `custom` directive keyword.
129    #[token("custom")]
130    Custom,
131    /// The `option` directive keyword.
132    #[token("option")]
133    Option_,
134    /// The `include` directive keyword.
135    #[token("include")]
136    Include,
137    /// The `plugin` directive keyword.
138    #[token("plugin")]
139    Plugin,
140    /// The `pushtag` directive keyword.
141    #[token("pushtag")]
142    Pushtag,
143    /// The `poptag` directive keyword.
144    #[token("poptag")]
145    Poptag,
146    /// The `pushmeta` directive keyword.
147    #[token("pushmeta")]
148    Pushmeta,
149    /// The `popmeta` directive keyword.
150    #[token("popmeta")]
151    Popmeta,
152    /// The `TRUE` boolean literal (also True, true).
153    #[token("TRUE")]
154    #[token("True")]
155    #[token("true")]
156    True,
157    /// The `FALSE` boolean literal (also False, false).
158    #[token("FALSE")]
159    #[token("False")]
160    #[token("false")]
161    False,
162    /// The `NULL` literal.
163    #[token("NULL")]
164    Null,
165
166    // ===== Punctuation =====
167    // Order matters: longer tokens first
168    /// Double left brace `{{` for cost specifications (legacy total cost).
169    #[token("{{")]
170    LDoubleBrace,
171    /// Double right brace `}}` for cost specifications.
172    #[token("}}")]
173    RDoubleBrace,
174    /// Left brace with hash `{#` for total cost (new syntax).
175    #[token("{#")]
176    LBraceHash,
177    /// Left brace `{` for cost specifications.
178    #[token("{")]
179    LBrace,
180    /// Right brace `}` for cost specifications.
181    #[token("}")]
182    RBrace,
183    /// Left parenthesis `(` for expressions.
184    #[token("(")]
185    LParen,
186    /// Right parenthesis `)` for expressions.
187    #[token(")")]
188    RParen,
189    /// Double at-sign `@@` for total cost.
190    #[token("@@")]
191    AtAt,
192    /// At-sign `@` for unit cost.
193    #[token("@")]
194    At,
195    /// Colon `:` separator.
196    #[token(":")]
197    Colon,
198    /// Comma `,` separator.
199    #[token(",")]
200    Comma,
201    /// Tilde `~` for tolerance.
202    #[token("~")]
203    Tilde,
204    /// Pipe `|` for deprecated payee/narration separator.
205    #[token("|")]
206    Pipe,
207    /// Plus `+` operator.
208    #[token("+")]
209    Plus,
210    /// Minus `-` operator.
211    #[token("-")]
212    Minus,
213    /// Star `*` for cleared transactions and multiplication.
214    #[token("*")]
215    Star,
216    /// Slash `/` for division.
217    #[token("/")]
218    Slash,
219
220    // ===== Transaction Flags =====
221    /// Pending flag `!` for incomplete transactions.
222    #[token("!")]
223    Pending,
224
225    /// Other transaction flags: P S T C U R M ? &
226    /// Note: # and % are handled as comments when followed by space
227    #[regex(r"[PSTCURM?&]")]
228    Flag(&'src str),
229
230    // ===== Structural =====
231    /// Newline (significant in Beancount for directive boundaries).
232    #[regex(r"\r?\n")]
233    Newline,
234
235    /// A comment starting with semicolon.
236    /// The slice includes the semicolon.
237    #[regex(r";[^\n\r]*", allow_greedy = true)]
238    Comment(&'src str),
239
240    /// Hash token `#` used as separator in cost specs: `{per_unit # total currency}`
241    /// Note: In Python beancount, `#` is only a comment at the START of a line.
242    /// Mid-line `# text` is NOT a comment - it's either a cost separator or syntax error.
243    /// Start-of-line hash comments are handled in post-processing (tokenize function).
244    #[token("#")]
245    Hash,
246
247    /// A percent comment (ledger-style).
248    /// Python beancount accepts % as a comment character for ledger compatibility.
249    #[regex(r"%[^\n\r]*", allow_greedy = true)]
250    PercentComment(&'src str),
251
252    /// Shebang line at start of file (e.g., #!/usr/bin/env bean-web).
253    /// Treated as a comment-like directive to skip.
254    #[regex(r"#![^\n\r]*", allow_greedy = true)]
255    Shebang(&'src str),
256
257    /// Emacs org-mode directive (e.g., "#+STARTUP: showall").
258    /// These are Emacs configuration lines that should be skipped.
259    #[regex(r"#\+[^\n\r]*", allow_greedy = true)]
260    EmacsDirective(&'src str),
261
262    /// A metadata key (identifier followed by colon).
263    /// Examples: filename:, lineno:, custom-key:, nameOnCard:
264    /// The slice includes the trailing colon. Keys must start with a lowercase ASCII letter
265    /// per the beancount v3 spec. Keys starting with uppercase are rejected.
266    #[regex(r"[a-z][a-zA-Z0-9_-]*:")]
267    MetaKey(&'src str),
268
269    /// Indentation token (inserted by post-processing, not by Logos).
270    /// Contains the number of leading spaces.
271    /// This is a placeholder - actual indentation detection happens in [`tokenize`].
272    Indent(usize),
273
274    /// Deep indentation (3+ spaces) - used for posting-level metadata.
275    DeepIndent(usize),
276
277    /// Error token for unrecognized input.
278    /// Contains the invalid source text for better error messages.
279    Error(&'src str),
280}
281
282impl Token<'_> {
283    /// Returns true if this is a transaction flag (* or !).
284    /// Single-character currencies (e.g., T, P, C) can also be used as flags.
285    pub const fn is_txn_flag(&self) -> bool {
286        match self {
287            Self::Star | Self::Pending | Self::Flag(_) | Self::Hash => true,
288            // Single-char currencies can be used as transaction flags
289            Self::Currency(s) => s.len() == 1,
290            _ => false,
291        }
292    }
293
294    /// Returns true if this is a keyword that starts a directive.
295    pub const fn is_directive_keyword(&self) -> bool {
296        matches!(
297            self,
298            Self::Txn
299                | Self::Balance
300                | Self::Open
301                | Self::Close
302                | Self::Commodity
303                | Self::Pad
304                | Self::Event
305                | Self::Query
306                | Self::Note
307                | Self::Document
308                | Self::Price
309                | Self::Custom
310                | Self::Option_
311                | Self::Include
312                | Self::Plugin
313                | Self::Pushtag
314                | Self::Poptag
315                | Self::Pushmeta
316                | Self::Popmeta
317        )
318    }
319}
320
321impl fmt::Display for Token<'_> {
322    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
323        match self {
324            Self::Date(s) => write!(f, "{s}"),
325            Self::Number(s) => write!(f, "{s}"),
326            Self::String(s) => write!(f, "{s}"),
327            Self::Account(s) => write!(f, "{s}"),
328            Self::Currency(s) => write!(f, "{s}"),
329            Self::Tag(s) => write!(f, "{s}"),
330            Self::Link(s) => write!(f, "{s}"),
331            Self::Txn => write!(f, "txn"),
332            Self::Balance => write!(f, "balance"),
333            Self::Open => write!(f, "open"),
334            Self::Close => write!(f, "close"),
335            Self::Commodity => write!(f, "commodity"),
336            Self::Pad => write!(f, "pad"),
337            Self::Event => write!(f, "event"),
338            Self::Query => write!(f, "query"),
339            Self::Note => write!(f, "note"),
340            Self::Document => write!(f, "document"),
341            Self::Price => write!(f, "price"),
342            Self::Custom => write!(f, "custom"),
343            Self::Option_ => write!(f, "option"),
344            Self::Include => write!(f, "include"),
345            Self::Plugin => write!(f, "plugin"),
346            Self::Pushtag => write!(f, "pushtag"),
347            Self::Poptag => write!(f, "poptag"),
348            Self::Pushmeta => write!(f, "pushmeta"),
349            Self::Popmeta => write!(f, "popmeta"),
350            Self::True => write!(f, "TRUE"),
351            Self::False => write!(f, "FALSE"),
352            Self::Null => write!(f, "NULL"),
353            Self::LDoubleBrace => write!(f, "{{{{"),
354            Self::RDoubleBrace => write!(f, "}}}}"),
355            Self::LBraceHash => write!(f, "{{#"),
356            Self::LBrace => write!(f, "{{"),
357            Self::RBrace => write!(f, "}}"),
358            Self::LParen => write!(f, "("),
359            Self::RParen => write!(f, ")"),
360            Self::AtAt => write!(f, "@@"),
361            Self::At => write!(f, "@"),
362            Self::Colon => write!(f, ":"),
363            Self::Comma => write!(f, ","),
364            Self::Tilde => write!(f, "~"),
365            Self::Pipe => write!(f, "|"),
366            Self::Plus => write!(f, "+"),
367            Self::Minus => write!(f, "-"),
368            Self::Star => write!(f, "*"),
369            Self::Slash => write!(f, "/"),
370            Self::Pending => write!(f, "!"),
371            Self::Flag(s) => write!(f, "{s}"),
372            Self::Newline => write!(f, "\\n"),
373            Self::Comment(s) => write!(f, "{s}"),
374            Self::Hash => write!(f, "#"),
375            Self::PercentComment(s) => write!(f, "{s}"),
376            Self::Shebang(s) => write!(f, "{s}"),
377            Self::EmacsDirective(s) => write!(f, "{s}"),
378            Self::MetaKey(s) => write!(f, "{s}"),
379            Self::Indent(n) => write!(f, "<indent:{n}>"),
380            Self::DeepIndent(n) => write!(f, "<deep-indent:{n}>"),
381            Self::Error(s) => write!(f, "{s}"),
382        }
383    }
384}
385
386/// Tokenize source code into a vector of (Token, Span) pairs.
387///
388/// This function:
389/// 1. Runs the Logos lexer for fast tokenization
390/// 2. Post-processes to detect indentation at line starts
391/// 3. Handles lexer errors by producing Error tokens
392pub fn tokenize(source: &str) -> Vec<(Token<'_>, Span)> {
393    let mut tokens = Vec::new();
394    let mut lexer = Token::lexer(source);
395    let mut at_line_start = true;
396    let mut last_newline_end = 0usize;
397
398    while let Some(result) = lexer.next() {
399        let span = lexer.span();
400
401        match result {
402            Ok(Token::Newline) => {
403                tokens.push((Token::Newline, span.clone().into()));
404                at_line_start = true;
405                last_newline_end = span.end;
406            }
407            Ok(Token::Hash) if at_line_start && span.start == last_newline_end => {
408                // Hash at very start of line (no indentation) is a comment
409                // Find end of line and create a comment token for the whole line
410                let comment_start = span.start;
411                let line_end = source[span.end..]
412                    .find('\n')
413                    .map_or(source.len(), |i| span.end + i);
414                let comment_text = &source[comment_start..line_end];
415                tokens.push((
416                    Token::Comment(comment_text),
417                    Span {
418                        start: comment_start,
419                        end: line_end,
420                    },
421                ));
422                // Skip lexer tokens until we reach the newline
423                while let Some(peek_result) = lexer.next() {
424                    let peek_span = lexer.span();
425                    let peek_end = peek_span.end;
426                    if peek_result == Ok(Token::Newline) {
427                        tokens.push((Token::Newline, peek_span.into()));
428                        at_line_start = true;
429                        last_newline_end = peek_end;
430                        break;
431                    }
432                    // Skip other tokens on the comment line
433                }
434            }
435            Ok(token) => {
436                // Check for indentation at line start
437                if at_line_start && span.start > last_newline_end {
438                    // Count leading whitespace between last newline and this token
439                    // Tabs count as indentation (treat 1 tab as 4 spaces for counting purposes)
440                    let leading = &source[last_newline_end..span.start];
441                    let mut space_count = 0;
442                    let mut char_count = 0;
443                    for c in leading.chars() {
444                        match c {
445                            ' ' => {
446                                space_count += 1;
447                                char_count += 1;
448                            }
449                            '\t' => {
450                                space_count += 4; // Treat tab as 4 spaces
451                                char_count += 1;
452                            }
453                            _ => break,
454                        }
455                    }
456                    // Python beancount accepts 1+ space for metadata indentation
457                    if space_count >= 1 {
458                        let indent_start = last_newline_end;
459                        let indent_end = last_newline_end + char_count;
460                        // Use DeepIndent for 3+ spaces (posting metadata level).
461                        // Python beancount allows flexible indentation where posting
462                        // metadata just needs to be more indented than the posting.
463                        // Common patterns: 2-space posting / 4-space meta, or
464                        // 1-space posting / 3-space meta (as in beancount_reds_plugins).
465                        let indent_token = if space_count >= 3 {
466                            Token::DeepIndent(space_count)
467                        } else {
468                            Token::Indent(space_count)
469                        };
470                        tokens.push((
471                            indent_token,
472                            Span {
473                                start: indent_start,
474                                end: indent_end,
475                            },
476                        ));
477                    }
478                }
479                at_line_start = false;
480                tokens.push((token, span.into()));
481            }
482            Err(()) => {
483                // Lexer error - produce an Error token with the invalid source text
484                at_line_start = false;
485                let invalid_text = &source[span.clone()];
486                tokens.push((Token::Error(invalid_text), span.into()));
487            }
488        }
489    }
490
491    tokens
492}
493
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `source` and returns the tokens without their spans.
    fn lex(source: &str) -> Vec<Token<'_>> {
        tokenize(source).into_iter().map(|(tok, _)| tok).collect()
    }

    #[test]
    fn test_tokenize_date() {
        assert_eq!(lex("2024-01-15"), [Token::Date("2024-01-15")]);
    }

    #[test]
    fn test_tokenize_date_single_digit_month() {
        // A one-digit month is still a Date
        assert_eq!(lex("2024-1-15"), [Token::Date("2024-1-15")]);
    }

    #[test]
    fn test_tokenize_date_single_digit_day() {
        // A one-digit day is still a Date
        assert_eq!(lex("2024-01-5"), [Token::Date("2024-01-5")]);
    }

    #[test]
    fn test_tokenize_date_single_digit_month_and_day() {
        // One-digit month and day together are still a Date
        assert_eq!(lex("2024-1-1"), [Token::Date("2024-1-1")]);
    }

    #[test]
    fn test_tokenize_date_slash_separator_single_digit() {
        // Slash-separated date with one-digit parts
        assert_eq!(lex("2024/1/5"), [Token::Date("2024/1/5")]);
    }

    #[test]
    fn test_tokenize_number() {
        assert_eq!(lex("1234.56"), [Token::Number("1234.56")]);

        // A negative number is Minus followed by Number (keeps subtraction parseable)
        assert_eq!(
            lex("-1,234.56"),
            [Token::Minus, Token::Number("1,234.56")]
        );
    }

    #[test]
    fn test_tokenize_account() {
        assert_eq!(
            lex("Assets:Bank:Checking"),
            [Token::Account("Assets:Bank:Checking")]
        );
    }

    #[test]
    fn test_tokenize_account_unicode() {
        // Account components may start with Unicode uppercase letters or CJK
        // characters. Emoji and symbols are never part of an account.

        // Emoji following a valid ASCII start — not accepted
        let lexed = lex("Assets:CORP✨");
        assert_ne!(
            lexed[0],
            Token::Account("Assets:CORP✨"),
            "Unicode emoji in account name should not tokenize as a valid Account"
        );
        assert!(
            lexed.iter().any(|t| matches!(t, Token::Error(_))),
            "Unicode emoji should produce at least one Error token"
        );

        // Sub-component starting with CJK — accepted (CJK ideographs are \p{Lo})
        assert_eq!(
            lex("Assets:沪深300")[0],
            Token::Account("Assets:沪深300"),
            "CJK characters at the start of a sub-component should tokenize as Account"
        );

        // Sub-component made entirely of CJK — accepted
        assert_eq!(
            lex("Assets:日本銀行")[0],
            Token::Account("Assets:日本銀行"),
            "CJK sub-component should tokenize as Account"
        );

        // Cyrillic account type — accepted (Cyrillic uppercase is \p{Lu})
        assert_eq!(
            lex("Капитал:Retained")[0],
            Token::Account("Капитал:Retained"),
            "Cyrillic-starting account should tokenize as Account"
        );

        // Account written entirely in CJK — accepted
        assert_eq!(
            lex("资产:银行:支票")[0],
            Token::Account("资产:银行:支票"),
            "Fully CJK account should tokenize as Account"
        );
    }

    /// Regression for issue #736/#739: Unicode letters AFTER an ASCII start
    /// in account sub-components are valid per the beancount v3 spec.
    #[test]
    fn test_tokenize_account_unicode_letters_after_ascii_start() {
        // French: É following an ASCII start
        let tokens = tokenize("Assets:Banque-Épargne");
        assert!(
            tokens[0].0 == Token::Account("Assets:Banque-Épargne"),
            "accented Latin letter after ASCII start should tokenize as Account, got: {tokens:?}"
        );

        // German: ü following an ASCII start
        let tokens = tokenize("Assets:Müller");
        assert!(
            tokens[0].0 == Token::Account("Assets:Müller"),
            "German umlaut after ASCII start should tokenize as Account, got: {tokens:?}"
        );

        // CJK following an ASCII start — letters are allowed
        let tokens = tokenize("Assets:CorpJP日本");
        assert!(
            tokens[0].0 == Token::Account("Assets:CorpJP日本"),
            "CJK letters after ASCII start should tokenize as Account, got: {tokens:?}"
        );
    }

    #[test]
    fn test_tokenize_currency() {
        assert_eq!(lex("USD"), [Token::Currency("USD")]);
    }

    #[test]
    fn test_tokenize_single_char_currency() {
        // One-letter NYSE/NASDAQ tickers: T (AT&T), V (Visa), F (Ford), X (US Steel)
        for ticker in ["T", "V", "F"] {
            assert_eq!(lex(ticker), [Token::Currency(ticker)]);
        }
    }

    #[test]
    fn test_single_char_currency_is_txn_flag() {
        // A one-letter currency may act as a transaction flag
        assert!(Token::Currency("T").is_txn_flag());

        // A longer currency code may NOT
        assert!(!Token::Currency("USD").is_txn_flag());
    }

    #[test]
    fn test_tokenize_string() {
        let quoted = r#""Hello, World!""#;
        assert_eq!(lex(quoted), [Token::String(r#""Hello, World!""#)]);
    }

    #[test]
    fn test_tokenize_keywords() {
        assert_eq!(
            lex("txn balance open close"),
            [Token::Txn, Token::Balance, Token::Open, Token::Close]
        );
    }

    #[test]
    fn test_tokenize_tag_and_link() {
        assert_eq!(
            lex("#my-tag ^my-link"),
            [Token::Tag("#my-tag"), Token::Link("^my-link")]
        );
    }

    #[test]
    fn test_tokenize_comment() {
        assert_eq!(
            lex("; This is a comment"),
            [Token::Comment("; This is a comment")]
        );
    }

    #[test]
    fn test_tokenize_indentation() {
        // Expected stream: Txn, Newline, Indent, Account, Number, Currency
        let lexed = lex("txn\n  Assets:Bank 100 USD");
        assert!(lexed.iter().any(|t| matches!(t, Token::Indent(_))));
    }

    #[test]
    fn test_tokenize_transaction_line() {
        let lexed = lex("2024-01-15 * \"Grocery Store\" #food\n  Expenses:Food 50.00 USD");
        let has = |pred: fn(&Token<'_>) -> bool| lexed.iter().any(pred);

        // Every key token kind must show up somewhere in the stream
        assert!(has(|t| matches!(t, Token::Date(_))));
        assert!(has(|t| matches!(t, Token::Star)));
        assert!(has(|t| matches!(t, Token::String(_))));
        assert!(has(|t| matches!(t, Token::Tag(_))));
        assert!(has(|t| matches!(t, Token::Newline)));
        assert!(has(|t| matches!(
            t,
            Token::Indent(_) | Token::DeepIndent(_)
        )));
        assert!(has(|t| matches!(t, Token::Account(_))));
        assert!(has(|t| matches!(t, Token::Number(_))));
        assert!(has(|t| matches!(t, Token::Currency(_))));
    }

    #[test]
    fn test_tokenize_metadata_key() {
        assert_eq!(lex("filename:"), [Token::MetaKey("filename:")]);
    }

    #[test]
    fn test_tokenize_punctuation() {
        let lexed = lex("{ } @ @@ , ~");
        for expected in [
            Token::LBrace,
            Token::RBrace,
            Token::At,
            Token::AtAt,
            Token::Comma,
            Token::Tilde,
        ] {
            assert!(lexed.contains(&expected));
        }
    }
}