// rustledger_parser/logos_lexer.rs
1//! SIMD-accelerated lexer using Logos.
2//!
3//! This module provides a fast tokenizer for Beancount syntax using the Logos crate,
4//! which generates a DFA-based lexer with SIMD optimizations where available.
5
6use logos::Logos;
7use std::fmt;
8use std::ops::Range;
9
/// A half-open span of source code, expressed in byte offsets.
///
/// `start` is inclusive and `end` is exclusive, mirroring the semantics of
/// [`std::ops::Range`], so `&source[span.start..span.end]` yields the exact
/// lexeme text. Conversions to and from `Range<usize>` are lossless.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Start byte offset (inclusive).
    pub start: usize,
    /// End byte offset (exclusive).
    pub end: usize,
}

impl From<Range<usize>> for Span {
    /// Builds a `Span` from a byte range by destructuring its endpoints.
    fn from(Range { start, end }: Range<usize>) -> Self {
        Span { start, end }
    }
}

impl From<Span> for Range<usize> {
    /// Recovers the equivalent byte range, suitable for slicing the source.
    fn from(span: Span) -> Self {
        Range {
            start: span.start,
            end: span.end,
        }
    }
}
33
/// Token types produced by the Logos lexer.
///
/// Disambiguation relies on Logos' longest-match rule plus explicit
/// `priority` arguments (e.g. `Currency` at priority 3 beats `Flag` for a
/// single uppercase letter). Variants carrying `&'src str` borrow directly
/// from the source buffer — no allocation per token.
#[derive(Logos, Debug, Clone, PartialEq, Eq)]
#[logos(skip r"[ \t]+")] // Skip horizontal whitespace (spaces and tabs)
pub enum Token<'src> {
    // ===== Literals =====
    /// A date in YYYY-MM-DD, YYYY-M-D, YYYY/MM/DD, or YYYY/M/D format.
    /// Single-digit month and day are accepted (e.g., 2024-1-5).
    /// Calendar validity (month <= 12, etc.) is checked later, not here.
    #[regex(r"\d{4}[-/]\d{1,2}[-/]\d{1,2}")]
    Date(&'src str),

    /// A number with optional thousands separators and decimals.
    /// Examples: 123, 1,234.56, 1234.5678, 1. (trailing decimal)
    /// Negative numbers are handled as unary minus (`-` token + number)
    /// to allow subtraction expressions like `3-2` to parse correctly.
    /// Python beancount v3 requires an integer part before the decimal point.
    /// Leading decimals like `.50` are rejected per the beancount v3 spec.
    #[regex(r"(\d{1,3}(,\d{3})*|\d+)(\.\d*)?")]
    Number(&'src str),

    /// A double-quoted string (handles escape sequences).
    /// The slice includes the quotes.
    #[regex(r#""([^"\\]|\\.)*""#)]
    String(&'src str),

    /// An account name like Assets:Bank:Checking, Aktiva:Banque-Épargne, or Assets:Müller.
    /// Must start with a capitalized word (account type prefix) and have at least one sub-account.
    ///
    /// Per the beancount v3 spec (`formats/beancount/v3/spec/lexical.md`):
    ///
    /// ```text
    /// component = ascii_start (alphanumeric_dash | utf8_char)*
    /// ```
    ///
    /// Each segment must START with an ASCII uppercase letter (or digit, for
    /// sub-segments), but subsequent characters may include Unicode letters —
    /// e.g. `Banque-Épargne` (French), `Müller` (German), `CorpJP日本` after an
    /// ASCII start. Symbols, emoji, and non-letter Unicode are not allowed, and
    /// segments starting with non-ASCII are still rejected (matching beancount).
    ///
    /// The account type prefix is validated later against options (`name_assets`, etc.).
    #[regex(r"[A-Z][\p{L}0-9-]*(:([A-Z0-9][\p{L}0-9-]*)+)+")]
    Account(&'src str),

    /// A currency/commodity code like USD, EUR, AAPL, BTC, or single-char tickers like T, V, F.
    /// Uppercase letters, can contain digits, apostrophes, dots, underscores, hyphens.
    /// Single-character currencies (e.g., T for AT&T, V for Visa) are valid NYSE/NASDAQ tickers.
    /// Note: Single-char currencies are disambiguated from transaction flags in the parser.
    /// Also supports `/` prefix for options/futures contracts (e.g., `/ESM24`, `/LOX21_211204_P100.25`).
    /// The `/` prefix requires an uppercase letter first to avoid matching `/1.14` as currency.
    /// Priority 3 ensures Currency wins over Flag for single uppercase letters.
    #[regex(r"/[A-Z][A-Z0-9'._-]*|[A-Z][A-Z0-9'._-]*", priority = 3)]
    Currency(&'src str),

    /// A tag like #tag-name.
    #[regex(r"#[a-zA-Z0-9-_/.]+")]
    Tag(&'src str),

    /// A link like ^link-name.
    #[regex(r"\^[a-zA-Z0-9-_/.]+")]
    Link(&'src str),

    // ===== Keywords =====
    // Using #[token] for exact matches (higher priority than regex)
    /// The `txn` keyword for transactions.
    #[token("txn")]
    Txn,
    /// The `balance` directive keyword.
    #[token("balance")]
    Balance,
    /// The `open` directive keyword.
    #[token("open")]
    Open,
    /// The `close` directive keyword.
    #[token("close")]
    Close,
    /// The `commodity` directive keyword.
    #[token("commodity")]
    Commodity,
    /// The `pad` directive keyword.
    #[token("pad")]
    Pad,
    /// The `event` directive keyword.
    #[token("event")]
    Event,
    /// The `query` directive keyword.
    #[token("query")]
    Query,
    /// The `note` directive keyword.
    #[token("note")]
    Note,
    /// The `document` directive keyword.
    #[token("document")]
    Document,
    /// The `price` directive keyword.
    #[token("price")]
    Price,
    /// The `custom` directive keyword.
    #[token("custom")]
    Custom,
    /// The `option` directive keyword.
    /// (Trailing underscore avoids clashing with `std::option::Option`.)
    #[token("option")]
    Option_,
    /// The `include` directive keyword.
    #[token("include")]
    Include,
    /// The `plugin` directive keyword.
    #[token("plugin")]
    Plugin,
    /// The `pushtag` directive keyword.
    #[token("pushtag")]
    Pushtag,
    /// The `poptag` directive keyword.
    #[token("poptag")]
    Poptag,
    /// The `pushmeta` directive keyword.
    #[token("pushmeta")]
    Pushmeta,
    /// The `popmeta` directive keyword.
    #[token("popmeta")]
    Popmeta,
    /// The `TRUE` boolean literal (also True, true).
    #[token("TRUE")]
    #[token("True")]
    #[token("true")]
    True,
    /// The `FALSE` boolean literal (also False, false).
    #[token("FALSE")]
    #[token("False")]
    #[token("false")]
    False,
    /// The `NULL` literal.
    #[token("NULL")]
    Null,

    // ===== Punctuation =====
    // Order matters: longer tokens first
    /// Double left brace `{{` for cost specifications (legacy total cost).
    #[token("{{")]
    LDoubleBrace,
    /// Double right brace `}}` for cost specifications.
    #[token("}}")]
    RDoubleBrace,
    /// Left brace with hash `{#` for total cost (new syntax).
    #[token("{#")]
    LBraceHash,
    /// Left brace `{` for cost specifications.
    #[token("{")]
    LBrace,
    /// Right brace `}` for cost specifications.
    #[token("}")]
    RBrace,
    /// Left parenthesis `(` for expressions.
    #[token("(")]
    LParen,
    /// Right parenthesis `)` for expressions.
    #[token(")")]
    RParen,
    /// Double at-sign `@@` for total cost.
    #[token("@@")]
    AtAt,
    /// At-sign `@` for unit cost.
    #[token("@")]
    At,
    /// Colon `:` separator.
    #[token(":")]
    Colon,
    /// Comma `,` separator.
    #[token(",")]
    Comma,
    /// Tilde `~` for tolerance.
    #[token("~")]
    Tilde,
    /// Pipe `|` for deprecated payee/narration separator.
    #[token("|")]
    Pipe,
    /// Plus `+` operator.
    #[token("+")]
    Plus,
    /// Minus `-` operator.
    #[token("-")]
    Minus,
    /// Star `*` for cleared transactions and multiplication.
    #[token("*")]
    Star,
    /// Slash `/` for division.
    #[token("/")]
    Slash,

    // ===== Transaction Flags =====
    /// Pending flag `!` for incomplete transactions.
    #[token("!")]
    Pending,

    /// Other transaction flags: P S T C U R M ? &
    /// Note: # and % are handled as comments when followed by space
    /// (single uppercase letters here lose to Currency, priority 3; the
    /// parser re-interprets single-char currencies as flags when needed).
    #[regex(r"[PSTCURM?&]")]
    Flag(&'src str),

    // ===== Structural =====
    /// Newline (significant in Beancount for directive boundaries).
    #[regex(r"\r?\n")]
    Newline,

    /// A comment starting with semicolon.
    /// The slice includes the semicolon.
    // NOTE(review): `allow_greedy` is not an attribute documented by stock
    // logos (#[regex] takes a callback, `priority`, `ignore(...)`); confirm
    // the pinned logos version/fork actually accepts it.
    #[regex(r";[^\n\r]*", allow_greedy = true)]
    Comment(&'src str),

    /// Hash token `#` used as separator in cost specs: `{per_unit # total currency}`
    /// Note: In Python beancount, `#` is only a comment at the START of a line.
    /// Mid-line `# text` is NOT a comment - it's either a cost separator or syntax error.
    /// Start-of-line hash comments are handled in post-processing (tokenize function).
    #[token("#")]
    Hash,

    /// A percent comment (ledger-style).
    /// Python beancount accepts % as a comment character for ledger compatibility.
    #[regex(r"%[^\n\r]*", allow_greedy = true)]
    PercentComment(&'src str),

    /// Shebang line at start of file (e.g., #!/usr/bin/env bean-web).
    /// Treated as a comment-like directive to skip.
    #[regex(r"#![^\n\r]*", allow_greedy = true)]
    Shebang(&'src str),

    /// Emacs org-mode directive (e.g., "#+STARTUP: showall").
    /// These are Emacs configuration lines that should be skipped.
    #[regex(r"#\+[^\n\r]*", allow_greedy = true)]
    EmacsDirective(&'src str),

    /// A metadata key (identifier followed by colon).
    /// Examples: filename:, lineno:, custom-key:, nameOnCard:
    /// The slice includes the trailing colon. Keys must start with a lowercase ASCII letter
    /// per the beancount v3 spec. Keys starting with uppercase are rejected.
    #[regex(r"[a-z][a-zA-Z0-9_-]*:")]
    MetaKey(&'src str),

    /// Indentation token (inserted by post-processing, not by Logos).
    /// Contains the number of leading spaces.
    /// This is a placeholder - actual indentation detection happens in [`tokenize`].
    Indent(usize),

    /// Deep indentation (3+ spaces) - used for posting-level metadata.
    /// Also synthesized in [`tokenize`], never matched by Logos directly.
    DeepIndent(usize),

    /// Error token for unrecognized input.
    /// Contains the invalid source text for better error messages.
    Error(&'src str),
}
283
284impl Token<'_> {
285    /// Returns true if this is a transaction flag (* or !).
286    /// Single-character currencies (e.g., T, P, C) can also be used as flags.
287    pub const fn is_txn_flag(&self) -> bool {
288        match self {
289            Self::Star | Self::Pending | Self::Flag(_) | Self::Hash => true,
290            // Single-char currencies can be used as transaction flags
291            Self::Currency(s) => s.len() == 1,
292            _ => false,
293        }
294    }
295
296    /// Returns true if this is a keyword that starts a directive.
297    pub const fn is_directive_keyword(&self) -> bool {
298        matches!(
299            self,
300            Self::Txn
301                | Self::Balance
302                | Self::Open
303                | Self::Close
304                | Self::Commodity
305                | Self::Pad
306                | Self::Event
307                | Self::Query
308                | Self::Note
309                | Self::Document
310                | Self::Price
311                | Self::Custom
312                | Self::Option_
313                | Self::Include
314                | Self::Plugin
315                | Self::Pushtag
316                | Self::Poptag
317                | Self::Pushmeta
318                | Self::Popmeta
319        )
320    }
321}
322
323impl fmt::Display for Token<'_> {
324    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
325        match self {
326            Self::Date(s) => write!(f, "{s}"),
327            Self::Number(s) => write!(f, "{s}"),
328            Self::String(s) => write!(f, "{s}"),
329            Self::Account(s) => write!(f, "{s}"),
330            Self::Currency(s) => write!(f, "{s}"),
331            Self::Tag(s) => write!(f, "{s}"),
332            Self::Link(s) => write!(f, "{s}"),
333            Self::Txn => write!(f, "txn"),
334            Self::Balance => write!(f, "balance"),
335            Self::Open => write!(f, "open"),
336            Self::Close => write!(f, "close"),
337            Self::Commodity => write!(f, "commodity"),
338            Self::Pad => write!(f, "pad"),
339            Self::Event => write!(f, "event"),
340            Self::Query => write!(f, "query"),
341            Self::Note => write!(f, "note"),
342            Self::Document => write!(f, "document"),
343            Self::Price => write!(f, "price"),
344            Self::Custom => write!(f, "custom"),
345            Self::Option_ => write!(f, "option"),
346            Self::Include => write!(f, "include"),
347            Self::Plugin => write!(f, "plugin"),
348            Self::Pushtag => write!(f, "pushtag"),
349            Self::Poptag => write!(f, "poptag"),
350            Self::Pushmeta => write!(f, "pushmeta"),
351            Self::Popmeta => write!(f, "popmeta"),
352            Self::True => write!(f, "TRUE"),
353            Self::False => write!(f, "FALSE"),
354            Self::Null => write!(f, "NULL"),
355            Self::LDoubleBrace => write!(f, "{{{{"),
356            Self::RDoubleBrace => write!(f, "}}}}"),
357            Self::LBraceHash => write!(f, "{{#"),
358            Self::LBrace => write!(f, "{{"),
359            Self::RBrace => write!(f, "}}"),
360            Self::LParen => write!(f, "("),
361            Self::RParen => write!(f, ")"),
362            Self::AtAt => write!(f, "@@"),
363            Self::At => write!(f, "@"),
364            Self::Colon => write!(f, ":"),
365            Self::Comma => write!(f, ","),
366            Self::Tilde => write!(f, "~"),
367            Self::Pipe => write!(f, "|"),
368            Self::Plus => write!(f, "+"),
369            Self::Minus => write!(f, "-"),
370            Self::Star => write!(f, "*"),
371            Self::Slash => write!(f, "/"),
372            Self::Pending => write!(f, "!"),
373            Self::Flag(s) => write!(f, "{s}"),
374            Self::Newline => write!(f, "\\n"),
375            Self::Comment(s) => write!(f, "{s}"),
376            Self::Hash => write!(f, "#"),
377            Self::PercentComment(s) => write!(f, "{s}"),
378            Self::Shebang(s) => write!(f, "{s}"),
379            Self::EmacsDirective(s) => write!(f, "{s}"),
380            Self::MetaKey(s) => write!(f, "{s}"),
381            Self::Indent(n) => write!(f, "<indent:{n}>"),
382            Self::DeepIndent(n) => write!(f, "<deep-indent:{n}>"),
383            Self::Error(s) => write!(f, "{s}"),
384        }
385    }
386}
387
/// Tokenize source code into a vector of (Token, Span) pairs.
///
/// This function:
/// 1. Runs the Logos lexer for fast tokenization
/// 2. Post-processes to detect indentation at line starts
/// 3. Handles lexer errors by producing Error tokens
///
/// Spans are byte offsets into `source`; tokens borrow from `source`.
pub fn tokenize(source: &str) -> Vec<(Token<'_>, Span)> {
    let mut tokens = Vec::new();
    let mut lexer = Token::lexer(source);
    // True until the first non-newline token on the current line is seen;
    // used to decide whether to synthesize Indent/DeepIndent tokens and to
    // treat a leading `#` as a whole-line comment.
    let mut at_line_start = true;
    // Byte offset just past the most recent newline (0 for the first line).
    let mut last_newline_end = 0usize;

    while let Some(result) = lexer.next() {
        let span = lexer.span();

        match result {
            Ok(Token::Newline) => {
                tokens.push((Token::Newline, span.clone().into()));
                at_line_start = true;
                last_newline_end = span.end;
            }
            // `span.start == last_newline_end` means zero leading whitespace:
            // the hash sits in column 0, which Python beancount treats as a
            // comment line (mid-line `#` stays a Hash/cost-separator token).
            Ok(Token::Hash) if at_line_start && span.start == last_newline_end => {
                // Hash at very start of line (no indentation) is a comment
                // Find end of line and create a comment token for the whole line
                let comment_start = span.start;
                let line_end = source[span.end..]
                    .find('\n')
                    .map_or(source.len(), |i| span.end + i);
                let comment_text = &source[comment_start..line_end];
                tokens.push((
                    Token::Comment(comment_text),
                    Span {
                        start: comment_start,
                        end: line_end,
                    },
                ));
                // Skip lexer tokens until we reach the newline
                while let Some(peek_result) = lexer.next() {
                    let peek_span = lexer.span();
                    let peek_end = peek_span.end;
                    if peek_result == Ok(Token::Newline) {
                        tokens.push((Token::Newline, peek_span.into()));
                        at_line_start = true;
                        last_newline_end = peek_end;
                        break;
                    }
                    // Skip other tokens on the comment line
                }
            }
            Ok(token) => {
                // Check for indentation at line start
                if at_line_start && span.start > last_newline_end {
                    // Count leading whitespace between last newline and this token
                    // Tabs count as indentation (treat 1 tab as 4 spaces for counting purposes)
                    // (Logos only skips spaces/tabs, so the gap holds nothing else.)
                    let leading = &source[last_newline_end..span.start];
                    // space_count: indentation width; char_count: bytes consumed,
                    // used to compute the Indent token's span end.
                    let mut space_count = 0;
                    let mut char_count = 0;
                    for c in leading.chars() {
                        match c {
                            ' ' => {
                                space_count += 1;
                                char_count += 1;
                            }
                            '\t' => {
                                space_count += 4; // Treat tab as 4 spaces
                                char_count += 1;
                            }
                            _ => break,
                        }
                    }
                    // Python beancount accepts 1+ space for metadata indentation
                    if space_count >= 1 {
                        let indent_start = last_newline_end;
                        let indent_end = last_newline_end + char_count;
                        // Use DeepIndent for 3+ spaces (posting metadata level).
                        // Python beancount allows flexible indentation where posting
                        // metadata just needs to be more indented than the posting.
                        // Common patterns: 2-space posting / 4-space meta, or
                        // 1-space posting / 3-space meta (as in beancount_reds_plugins).
                        let indent_token = if space_count >= 3 {
                            Token::DeepIndent(space_count)
                        } else {
                            Token::Indent(space_count)
                        };
                        tokens.push((
                            indent_token,
                            Span {
                                start: indent_start,
                                end: indent_end,
                            },
                        ));
                    }
                }
                at_line_start = false;
                tokens.push((token, span.into()));
            }
            Err(()) => {
                // Lexer error - produce an Error token with the invalid source text
                at_line_start = false;
                let invalid_text = &source[span.clone()];
                tokens.push((Token::Error(invalid_text), span.into()));
            }
        }
    }

    tokens
}
495
#[cfg(test)]
mod tests {
    // Unit tests for the lexer: each test feeds a small source snippet to
    // `tokenize` and pattern-matches the resulting token stream.
    use super::*;

    #[test]
    fn test_tokenize_date() {
        let tokens = tokenize("2024-01-15");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
    }

    #[test]
    fn test_tokenize_date_single_digit_month() {
        // Single-digit month should be tokenized as Date
        let tokens = tokenize("2024-1-15");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-1-15")));
    }

    #[test]
    fn test_tokenize_date_single_digit_day() {
        // Single-digit day should be tokenized as Date
        let tokens = tokenize("2024-01-5");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-01-5")));
    }

    #[test]
    fn test_tokenize_date_single_digit_month_and_day() {
        // Single-digit month and day should be tokenized as Date
        let tokens = tokenize("2024-1-1");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-1-1")));
    }

    #[test]
    fn test_tokenize_date_slash_separator_single_digit() {
        // Slash separator with single-digit parts
        let tokens = tokenize("2024/1/5");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024/1/5")));
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = tokenize("1234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("1234.56")));

        // Negative numbers are now Minus + Number (enables subtraction expressions)
        let tokens = tokenize("-1,234.56");
        assert_eq!(tokens.len(), 2);
        assert!(matches!(tokens[0].0, Token::Minus));
        assert!(matches!(tokens[1].0, Token::Number("1,234.56")));
    }

    #[test]
    fn test_tokenize_account() {
        let tokens = tokenize("Assets:Bank:Checking");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Account("Assets:Bank:Checking")
        ));
    }

    #[test]
    fn test_tokenize_account_unicode() {
        // Per the beancount v3 spec, account name segments must START with an
        // ASCII uppercase letter (or digit for sub-segments), but subsequent
        // characters may include Unicode letters. Symbols/emoji and segments
        // that start with non-ASCII remain invalid.

        // Non-letter (emoji) after valid ASCII start - tokenizes as partial Account + Error
        let tokens = tokenize("Assets:CORP✨");
        assert!(
            !matches!(tokens[0].0, Token::Account("Assets:CORP✨")),
            "Unicode emoji in account name should not tokenize as a valid Account"
        );
        assert!(
            tokens.iter().any(|(t, _)| matches!(t, Token::Error(_))),
            "Unicode emoji should produce at least one Error token"
        );

        // Sub-component starts with non-ASCII (CJK) — should be rejected
        let tokens = tokenize("Assets:沪深300");
        assert!(
            !matches!(tokens[0].0, Token::Account("Assets:沪深300")),
            "CJK characters at the start of a sub-component should not tokenize as a valid Account"
        );
        assert!(
            tokens.iter().any(|(t, _)| matches!(t, Token::Error(_))),
            "CJK start should produce at least one Error token"
        );

        // Full CJK component - sub-component starts with non-ASCII, should be rejected
        let tokens = tokenize("Assets:日本銀行");
        assert!(
            !matches!(tokens[0].0, Token::Account("Assets:日本銀行")),
            "CJK sub-component start should not tokenize as a valid Account"
        );
        assert!(
            tokens.iter().any(|(t, _)| matches!(t, Token::Error(_))),
            "CJK sub-component start should produce at least one Error token"
        );
    }

    /// Regression for issue #736/#739: Unicode letters AFTER an ASCII start
    /// in account sub-components are valid per the beancount v3 spec.
    #[test]
    fn test_tokenize_account_unicode_letters_after_ascii_start() {
        // French: É after ASCII start
        let tokens = tokenize("Assets:Banque-Épargne");
        assert!(
            matches!(tokens[0].0, Token::Account("Assets:Banque-Épargne")),
            "accented Latin letter after ASCII start should tokenize as Account, got: {tokens:?}"
        );

        // German: ü after ASCII start
        let tokens = tokenize("Assets:Müller");
        assert!(
            matches!(tokens[0].0, Token::Account("Assets:Müller")),
            "German umlaut after ASCII start should tokenize as Account, got: {tokens:?}"
        );

        // Mixed CJK after ASCII start — letters are allowed
        let tokens = tokenize("Assets:CorpJP日本");
        assert!(
            matches!(tokens[0].0, Token::Account("Assets:CorpJP日本")),
            "CJK letters after ASCII start should tokenize as Account, got: {tokens:?}"
        );
    }

    #[test]
    fn test_tokenize_currency() {
        let tokens = tokenize("USD");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("USD")));
    }

    #[test]
    fn test_tokenize_single_char_currency() {
        // Single-char NYSE/NASDAQ tickers: T (AT&T), V (Visa), F (Ford), X (US Steel)
        let tokens = tokenize("T");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("T")));

        let tokens = tokenize("V");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("V")));

        let tokens = tokenize("F");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("F")));
    }

    #[test]
    fn test_single_char_currency_is_txn_flag() {
        // Single-char currencies should be recognized as potential transaction flags
        let token = Token::Currency("T");
        assert!(token.is_txn_flag());

        // Multi-char currencies should NOT be transaction flags
        let token = Token::Currency("USD");
        assert!(!token.is_txn_flag());
    }

    #[test]
    fn test_tokenize_string() {
        let tokens = tokenize(r#""Hello, World!""#);
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::String(r#""Hello, World!""#)));
    }

    #[test]
    fn test_tokenize_keywords() {
        let tokens = tokenize("txn balance open close");
        assert_eq!(tokens.len(), 4);
        assert!(matches!(tokens[0].0, Token::Txn));
        assert!(matches!(tokens[1].0, Token::Balance));
        assert!(matches!(tokens[2].0, Token::Open));
        assert!(matches!(tokens[3].0, Token::Close));
    }

    #[test]
    fn test_tokenize_tag_and_link() {
        let tokens = tokenize("#my-tag ^my-link");
        assert_eq!(tokens.len(), 2);
        assert!(matches!(tokens[0].0, Token::Tag("#my-tag")));
        assert!(matches!(tokens[1].0, Token::Link("^my-link")));
    }

    #[test]
    fn test_tokenize_comment() {
        // The Comment slice includes the leading semicolon.
        let tokens = tokenize("; This is a comment");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Comment("; This is a comment")));
    }

    #[test]
    fn test_tokenize_indentation() {
        // Indent tokens are synthesized by tokenize(), not by Logos.
        let tokens = tokenize("txn\n  Assets:Bank 100 USD");
        // Should have: Txn, Newline, Indent, Account, Number, Currency
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Indent(_))));
    }

    #[test]
    fn test_tokenize_transaction_line() {
        let source = "2024-01-15 * \"Grocery Store\" #food\n  Expenses:Food 50.00 USD";
        let tokens = tokenize(source);

        // Check key tokens are present
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Date(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Star)));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::String(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Tag(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Newline)));
        assert!(
            tokens
                .iter()
                .any(|(t, _)| matches!(t, Token::Indent(_) | Token::DeepIndent(_)))
        );
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Account(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Number(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Currency(_))));
    }

    #[test]
    fn test_tokenize_metadata_key() {
        // The MetaKey slice includes the trailing colon.
        let tokens = tokenize("filename:");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::MetaKey("filename:")));
    }

    #[test]
    fn test_tokenize_punctuation() {
        let tokens = tokenize("{ } @ @@ , ~");
        let token_types: Vec<_> = tokens.iter().map(|(t, _)| t.clone()).collect();
        assert!(token_types.contains(&Token::LBrace));
        assert!(token_types.contains(&Token::RBrace));
        assert!(token_types.contains(&Token::At));
        assert!(token_types.contains(&Token::AtAt));
        assert!(token_types.contains(&Token::Comma));
        assert!(token_types.contains(&Token::Tilde));
    }
}