rustledger_parser/logos_lexer.rs

//! SIMD-accelerated lexer using Logos.
//!
//! This module provides a fast tokenizer for Beancount syntax using the Logos crate,
//! which generates a DFA-based lexer with SIMD optimizations where available.

use logos::Logos;
use std::fmt;
use std::ops::Range;

/// A span in the source code (byte offsets).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Start byte offset (inclusive).
    pub start: usize,
    /// End byte offset (exclusive).
    pub end: usize,
}

impl From<Range<usize>> for Span {
    fn from(range: Range<usize>) -> Self {
        Self {
            start: range.start,
            end: range.end,
        }
    }
}

impl From<Span> for Range<usize> {
    fn from(span: Span) -> Self {
        span.start..span.end
    }
}

/// Token types produced by the Logos lexer.
#[derive(Logos, Debug, Clone, PartialEq, Eq)]
#[logos(skip r"[ \t]+")] // Skip horizontal whitespace (spaces and tabs)
pub enum Token<'src> {
    // ===== Literals =====
    /// A date in YYYY-MM-DD or YYYY/MM/DD format.
    #[regex(r"\d{4}[-/]\d{2}[-/]\d{2}")]
    Date(&'src str),

    /// A number with optional sign, thousands separators, and decimals.
    /// Examples: 123, -456, 1,234.56, 1234.5678, .50, -.50, 1. (trailing decimal)
    /// Python beancount accepts trailing decimal (e.g., "1." meaning "1.0").
    #[regex(r"-?(\.\d+|(\d{1,3}(,\d{3})*|\d+)(\.\d*)?)")]
    Number(&'src str),

    /// A double-quoted string (handles escape sequences).
    /// The slice includes the quotes.
    #[regex(r#""([^"\\]|\\.)*""#)]
    String(&'src str),

    /// An account name like Assets:Bank:Checking, Aktiva:Bank:Girokonto, or Ciste-jmeni:Stav.
    /// Must start with a capitalized word (account type prefix) and have at least one sub-account.
    /// Account type prefix can contain hyphens (e.g., Ciste-jmeni for Czech "Čisté jmění").
    /// Sub-accounts must start with an uppercase letter, digit, or non-ASCII character (matching Python beancount).
    /// Supports Unicode letters, symbols, and emojis (e.g., Expenses:École, Assets:沪深300, Assets:CORP✨).
    /// Pattern matches beancount's lexer.l: `([A-Z]|UTF-8-ONLY)([A-Za-z0-9-]|UTF-8-ONLY)*`.
    /// `[^\x00-\x7F]` matches any non-ASCII UTF-8 character (equivalent to beancount's UTF-8-ONLY).
    /// The account type prefix is validated later against options (`name_assets`, etc.).
    #[regex(r"([A-Z]|[^\x00-\x7F])([A-Za-z0-9-]|[^\x00-\x7F])*(:([A-Z0-9]|[^\x00-\x7F])([A-Za-z0-9-]|[^\x00-\x7F])*)+")]
    Account(&'src str),

    /// A currency/commodity code like USD, EUR, AAPL, BTC.
    /// Uppercase letters; may also contain digits, apostrophes, dots, underscores, and hyphens.
    /// Note: This pattern is lower priority than Account, Keywords, and Flags.
    /// Currency must have at least 2 characters to avoid conflict with single-letter flags.
    /// Also supports `/` prefix for options/futures contracts (e.g., `/ESM24`, `/LOX21_211204_P100.25`).
    /// The `/` prefix requires an uppercase letter first to avoid matching `/1.14` as currency.
    #[regex(r"/[A-Z][A-Z0-9'._-]*|[A-Z][A-Z0-9'._-]+")]
    Currency(&'src str),

    /// A tag like #tag-name.
    #[regex(r"#[a-zA-Z0-9-_/.]+")]
    Tag(&'src str),

    /// A link like ^link-name.
    #[regex(r"\^[a-zA-Z0-9-_/.]+")]
    Link(&'src str),

    // ===== Keywords =====
    // Using #[token] for exact matches (higher priority than regex)
    /// The `txn` keyword for transactions.
    #[token("txn")]
    Txn,
    /// The `balance` directive keyword.
    #[token("balance")]
    Balance,
    /// The `open` directive keyword.
    #[token("open")]
    Open,
    /// The `close` directive keyword.
    #[token("close")]
    Close,
    /// The `commodity` directive keyword.
    #[token("commodity")]
    Commodity,
    /// The `pad` directive keyword.
    #[token("pad")]
    Pad,
    /// The `event` directive keyword.
    #[token("event")]
    Event,
    /// The `query` directive keyword.
    #[token("query")]
    Query,
    /// The `note` directive keyword.
    #[token("note")]
    Note,
    /// The `document` directive keyword.
    #[token("document")]
    Document,
    /// The `price` directive keyword.
    #[token("price")]
    Price,
    /// The `custom` directive keyword.
    #[token("custom")]
    Custom,
    /// The `option` directive keyword.
    #[token("option")]
    Option_,
    /// The `include` directive keyword.
    #[token("include")]
    Include,
    /// The `plugin` directive keyword.
    #[token("plugin")]
    Plugin,
    /// The `pushtag` directive keyword.
    #[token("pushtag")]
    Pushtag,
    /// The `poptag` directive keyword.
    #[token("poptag")]
    Poptag,
    /// The `pushmeta` directive keyword.
    #[token("pushmeta")]
    Pushmeta,
    /// The `popmeta` directive keyword.
    #[token("popmeta")]
    Popmeta,
    /// The `TRUE` boolean literal (also True, true).
    #[token("TRUE")]
    #[token("True")]
    #[token("true")]
    True,
    /// The `FALSE` boolean literal (also False, false).
    #[token("FALSE")]
    #[token("False")]
    #[token("false")]
    False,
    /// The `NULL` literal.
    #[token("NULL")]
    Null,

    // ===== Punctuation =====
    // Multi-character tokens (e.g. `{{`, `@@`) take precedence over their
    // single-character prefixes because Logos always picks the longest match.
    /// Double left brace `{{` for cost specifications (legacy total cost).
    #[token("{{")]
    LDoubleBrace,
    /// Double right brace `}}` for cost specifications.
    #[token("}}")]
    RDoubleBrace,
    /// Left brace with hash `{#` for total cost (new syntax).
    #[token("{#")]
    LBraceHash,
    /// Left brace `{` for cost specifications.
    #[token("{")]
    LBrace,
    /// Right brace `}` for cost specifications.
    #[token("}")]
    RBrace,
    /// Left parenthesis `(` for expressions.
    #[token("(")]
    LParen,
    /// Right parenthesis `)` for expressions.
    #[token(")")]
    RParen,
    /// Double at-sign `@@` for total cost.
    #[token("@@")]
    AtAt,
    /// At-sign `@` for unit cost.
    #[token("@")]
    At,
    /// Colon `:` separator.
    #[token(":")]
    Colon,
    /// Comma `,` separator.
    #[token(",")]
    Comma,
    /// Tilde `~` for tolerance.
    #[token("~")]
    Tilde,
    /// Pipe `|` for deprecated payee/narration separator.
    #[token("|")]
    Pipe,
    /// Plus `+` operator.
    #[token("+")]
    Plus,
    /// Minus `-` operator.
    #[token("-")]
    Minus,
    /// Star `*` for cleared transactions and multiplication.
    #[token("*")]
    Star,
    /// Slash `/` for division.
    #[token("/")]
    Slash,

    // ===== Transaction Flags =====
    /// Pending flag `!` for incomplete transactions.
    #[token("!")]
    Pending,

    /// Other transaction flags: P S T C U R M ? &
    /// Note: `#` and `%` are not lexed as flags here; see the Hash and
    /// comment tokens below and the line-start handling in [`tokenize`].
    #[regex(r"[PSTCURM?&]")]
    Flag(&'src str),

    // ===== Structural =====
    /// Newline (significant in Beancount for directive boundaries).
    #[regex(r"\r?\n")]
    Newline,

    /// A comment starting with a semicolon.
    /// The slice includes the semicolon.
    #[regex(r";[^\n\r]*")]
    Comment(&'src str),

    /// Hash token `#` used as separator in cost specs: `{per_unit # total currency}`
    /// Note: In Python beancount, `#` is only a comment at the START of a line.
    /// Mid-line `# text` is NOT a comment - it's either a cost separator or syntax error.
    /// Start-of-line hash comments are handled in post-processing (tokenize function).
    #[token("#")]
    Hash,

    /// A percent comment (ledger-style).
    /// Python beancount accepts % as a comment character for ledger compatibility.
    #[regex(r"%[^\n\r]*")]
    PercentComment(&'src str),

    /// Shebang line at start of file (e.g., #!/usr/bin/env bean-web).
    /// Treated as a comment-like directive to skip.
    #[regex(r"#![^\n\r]*")]
    Shebang(&'src str),

    /// Emacs org-mode directive (e.g., "#+STARTUP: showall").
    /// These are Emacs configuration lines that should be skipped.
    #[regex(r"#\+[^\n\r]*")]
    EmacsDirective(&'src str),

    /// A metadata key (identifier followed by colon).
    /// Examples: filename:, lineno:, custom-key:, nameOnCard:
    /// The slice includes the trailing colon. Keys can use camelCase or `snake_case`.
    #[regex(r"[a-zA-Z][a-zA-Z0-9_-]*:")]
    MetaKey(&'src str),

    /// Indentation token (inserted by post-processing, not by Logos).
    /// Contains the number of leading spaces.
    /// This is a placeholder - actual indentation detection happens in [`tokenize`].
    Indent(usize),

    /// Deep indentation (4+ spaces) - used for posting-level metadata.
    DeepIndent(usize),

    /// Error token for unrecognized input.
    /// Contains the invalid source text for better error messages.
    Error(&'src str),
}

impl Token<'_> {
    /// Returns true if this token can serve as a transaction flag
    /// (`*`, `!`, `#`, or one of the single-letter flags).
    pub const fn is_txn_flag(&self) -> bool {
        matches!(
            self,
            Self::Star | Self::Pending | Self::Flag(_) | Self::Hash
        )
    }

    /// Returns true if this is a keyword that starts a directive.
    pub const fn is_directive_keyword(&self) -> bool {
        matches!(
            self,
            Self::Txn
                | Self::Balance
                | Self::Open
                | Self::Close
                | Self::Commodity
                | Self::Pad
                | Self::Event
                | Self::Query
                | Self::Note
                | Self::Document
                | Self::Price
                | Self::Custom
                | Self::Option_
                | Self::Include
                | Self::Plugin
                | Self::Pushtag
                | Self::Poptag
                | Self::Pushmeta
                | Self::Popmeta
        )
    }
}

impl fmt::Display for Token<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Date(s) => write!(f, "{s}"),
            Self::Number(s) => write!(f, "{s}"),
            Self::String(s) => write!(f, "{s}"),
            Self::Account(s) => write!(f, "{s}"),
            Self::Currency(s) => write!(f, "{s}"),
            Self::Tag(s) => write!(f, "{s}"),
            Self::Link(s) => write!(f, "{s}"),
            Self::Txn => write!(f, "txn"),
            Self::Balance => write!(f, "balance"),
            Self::Open => write!(f, "open"),
            Self::Close => write!(f, "close"),
            Self::Commodity => write!(f, "commodity"),
            Self::Pad => write!(f, "pad"),
            Self::Event => write!(f, "event"),
            Self::Query => write!(f, "query"),
            Self::Note => write!(f, "note"),
            Self::Document => write!(f, "document"),
            Self::Price => write!(f, "price"),
            Self::Custom => write!(f, "custom"),
            Self::Option_ => write!(f, "option"),
            Self::Include => write!(f, "include"),
            Self::Plugin => write!(f, "plugin"),
            Self::Pushtag => write!(f, "pushtag"),
            Self::Poptag => write!(f, "poptag"),
            Self::Pushmeta => write!(f, "pushmeta"),
            Self::Popmeta => write!(f, "popmeta"),
            Self::True => write!(f, "TRUE"),
            Self::False => write!(f, "FALSE"),
            Self::Null => write!(f, "NULL"),
            Self::LDoubleBrace => write!(f, "{{{{"),
            Self::RDoubleBrace => write!(f, "}}}}"),
            Self::LBraceHash => write!(f, "{{#"),
            Self::LBrace => write!(f, "{{"),
            Self::RBrace => write!(f, "}}"),
            Self::LParen => write!(f, "("),
            Self::RParen => write!(f, ")"),
            Self::AtAt => write!(f, "@@"),
            Self::At => write!(f, "@"),
            Self::Colon => write!(f, ":"),
            Self::Comma => write!(f, ","),
            Self::Tilde => write!(f, "~"),
            Self::Pipe => write!(f, "|"),
            Self::Plus => write!(f, "+"),
            Self::Minus => write!(f, "-"),
            Self::Star => write!(f, "*"),
            Self::Slash => write!(f, "/"),
            Self::Pending => write!(f, "!"),
            Self::Flag(s) => write!(f, "{s}"),
            Self::Newline => write!(f, "\\n"),
            Self::Comment(s) => write!(f, "{s}"),
            Self::Hash => write!(f, "#"),
            Self::PercentComment(s) => write!(f, "{s}"),
            Self::Shebang(s) => write!(f, "{s}"),
            Self::EmacsDirective(s) => write!(f, "{s}"),
            Self::MetaKey(s) => write!(f, "{s}"),
            Self::Indent(n) => write!(f, "<indent:{n}>"),
            Self::DeepIndent(n) => write!(f, "<deep-indent:{n}>"),
            Self::Error(s) => write!(f, "{s}"),
        }
    }
}

/// Tokenize source code into a vector of (Token, Span) pairs.
///
/// This function:
/// 1. Runs the Logos lexer for fast tokenization
/// 2. Post-processes to detect indentation at line starts
/// 3. Handles lexer errors by producing Error tokens
pub fn tokenize(source: &str) -> Vec<(Token<'_>, Span)> {
    let mut tokens = Vec::new();
    let mut lexer = Token::lexer(source);
    let mut at_line_start = true;
    let mut last_newline_end = 0usize;

    while let Some(result) = lexer.next() {
        let span = lexer.span();

        match result {
            Ok(Token::Newline) => {
                tokens.push((Token::Newline, span.clone().into()));
                at_line_start = true;
                last_newline_end = span.end;
            }
            Ok(Token::Hash) if at_line_start && span.start == last_newline_end => {
                // Hash at very start of line (no indentation) is a comment
                // Find end of line and create a comment token for the whole line
                let comment_start = span.start;
                let line_end = source[span.end..]
                    .find('\n')
                    .map_or(source.len(), |i| span.end + i);
                let comment_text = &source[comment_start..line_end];
                tokens.push((
                    Token::Comment(comment_text),
                    Span {
                        start: comment_start,
                        end: line_end,
                    },
                ));
                // Skip lexer tokens until we reach the newline
                while let Some(peek_result) = lexer.next() {
                    let peek_span = lexer.span();
                    let peek_end = peek_span.end;
                    if peek_result == Ok(Token::Newline) {
                        tokens.push((Token::Newline, peek_span.into()));
                        at_line_start = true;
                        last_newline_end = peek_end;
                        break;
                    }
                    // Skip other tokens on the comment line
                }
            }
            Ok(token) => {
                // Check for indentation at line start
                if at_line_start && span.start > last_newline_end {
                    // Count leading whitespace between last newline and this token
                    // Tabs count as indentation (treat 1 tab as 4 spaces for counting purposes)
                    let leading = &source[last_newline_end..span.start];
                    let mut space_count = 0;
                    let mut char_count = 0;
                    for c in leading.chars() {
                        match c {
                            ' ' => {
                                space_count += 1;
                                char_count += 1;
                            }
                            '\t' => {
                                space_count += 4; // Treat tab as 4 spaces
                                char_count += 1;
                            }
                            _ => break,
                        }
                    }
                    // Python beancount accepts 1+ space for metadata indentation
                    if space_count >= 1 {
                        let indent_start = last_newline_end;
                        let indent_end = last_newline_end + char_count;
                        // Use DeepIndent for 4+ spaces (posting metadata level)
                        let indent_token = if space_count >= 4 {
                            Token::DeepIndent(space_count)
                        } else {
                            Token::Indent(space_count)
                        };
                        tokens.push((
                            indent_token,
                            Span {
                                start: indent_start,
                                end: indent_end,
                            },
                        ));
                    }
                }
                at_line_start = false;
                tokens.push((token, span.into()));
            }
            Err(()) => {
                // Lexer error - produce an Error token with the invalid source text
                at_line_start = false;
                let invalid_text = &source[span.clone()];
                tokens.push((Token::Error(invalid_text), span.into()));
            }
        }
    }

    tokens
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_date() {
        let tokens = tokenize("2024-01-15");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
    }
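
    // The Date pattern also accepts `/` as the separator (YYYY/MM/DD); a
    // minimal sketch of that form, assuming the pattern above is unchanged.
    #[test]
    fn test_tokenize_date_slash_separator() {
        let tokens = tokenize("2024/01/15");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024/01/15")));
    }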

    #[test]
    fn test_tokenize_number() {
        let tokens = tokenize("1234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("1234.56")));

        let tokens = tokenize("-1,234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("-1,234.56")));
    }
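
    // Sketch of the edge formats listed in the Number docs (leading-dot and
    // trailing-decimal forms); assumes the Number regex above is unchanged.
    #[test]
    fn test_tokenize_number_edge_formats() {
        // Leading dot
        let tokens = tokenize(".50");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number(".50")));

        // Negative leading dot
        let tokens = tokenize("-.50");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("-.50")));

        // Trailing decimal point ("1." is read as "1.0" by Python beancount)
        let tokens = tokenize("1.");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("1.")));
    }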

    #[test]
    fn test_tokenize_account() {
        let tokens = tokenize("Assets:Bank:Checking");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Account("Assets:Bank:Checking")
        ));
    }

    #[test]
    fn test_tokenize_account_unicode() {
        // Test various Unicode characters in account names
        // Matching beancount's UTF-8-ONLY support (any non-ASCII character)

        // Emoji (Unicode Symbol category So)
        let tokens = tokenize("Assets:CORP✨");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:CORP✨")));

        // CJK characters (Unicode Letter category Lo)
        let tokens = tokenize("Assets:沪深300");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:沪深300")));

        // Full CJK component
        let tokens = tokenize("Assets:日本銀行");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:日本銀行")));

        // Non-ASCII letters (accented)
        let tokens = tokenize("Assets:Café");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:Café")));

        // Currency symbol (Unicode Symbol category Sc)
        let tokens = tokenize("Assets:€uro");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:€uro")));

        // Emoji in middle of component
        let tokens = tokenize("Assets:Test💰Account");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Account("Assets:Test💰Account")
        ));
    }
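
    // The Account docs above mention hyphenated account-type prefixes (e.g.
    // the Czech chart of accounts); a minimal sketch of that case.
    #[test]
    fn test_tokenize_account_hyphenated_prefix() {
        let tokens = tokenize("Ciste-jmeni:Stav");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Ciste-jmeni:Stav")));
    }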

    #[test]
    fn test_tokenize_currency() {
        let tokens = tokenize("USD");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("USD")));
    }
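
    // The Currency docs above describe a `/` prefix for options/futures
    // symbols; this sketches that longest-match behavior (`/ESM24` should be
    // one Currency token rather than Slash + Currency).
    #[test]
    fn test_tokenize_currency_futures_prefix() {
        let tokens = tokenize("/ESM24");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("/ESM24")));
    }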

    #[test]
    fn test_tokenize_string() {
        let tokens = tokenize(r#""Hello, World!""#);
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::String(r#""Hello, World!""#)));
    }
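
    // The String pattern admits backslash escapes, so an embedded `\"` should
    // not terminate the literal; the slice keeps the escapes as-is. Sketch
    // assuming the String regex above is unchanged.
    #[test]
    fn test_tokenize_string_with_escapes() {
        let tokens = tokenize(r#""He said \"hi\"""#);
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::String(r#""He said \"hi\"""#)));
    }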

    #[test]
    fn test_tokenize_keywords() {
        let tokens = tokenize("txn balance open close");
        assert_eq!(tokens.len(), 4);
        assert!(matches!(tokens[0].0, Token::Txn));
        assert!(matches!(tokens[1].0, Token::Balance));
        assert!(matches!(tokens[2].0, Token::Open));
        assert!(matches!(tokens[3].0, Token::Close));
    }
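
    // Boolean and NULL literals. Per the keyword notes above, exact #[token]
    // matches are expected to win over the Currency regex for slices like
    // "TRUE"; this is a sketch of that expectation, not a priority proof.
    #[test]
    fn test_tokenize_booleans_and_null() {
        let tokens = tokenize("TRUE False NULL");
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[0].0, Token::True));
        assert!(matches!(tokens[1].0, Token::False));
        assert!(matches!(tokens[2].0, Token::Null));
    }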

    #[test]
    fn test_tokenize_tag_and_link() {
        let tokens = tokenize("#my-tag ^my-link");
        assert_eq!(tokens.len(), 2);
        assert!(matches!(tokens[0].0, Token::Tag("#my-tag")));
        assert!(matches!(tokens[1].0, Token::Link("^my-link")));
    }
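
    // Sketch of the pending flag, single-letter flags, and the is_txn_flag()
    // helper; assumes the Flag pattern and helper above are unchanged.
    #[test]
    fn test_tokenize_flags() {
        let tokens = tokenize("! P ?");
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[0].0, Token::Pending));
        assert!(matches!(tokens[1].0, Token::Flag("P")));
        assert!(matches!(tokens[2].0, Token::Flag("?")));
        assert!(tokens.iter().all(|(t, _)| t.is_txn_flag()));
        assert!(!Token::At.is_txn_flag());
    }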

    #[test]
    fn test_tokenize_comment() {
        let tokens = tokenize("; This is a comment");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Comment("; This is a comment")));
    }
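
    // Sketch of the other comment-like lines described above: a `#` at the
    // very start of a line is folded into a single Comment by tokenize()'s
    // post-processing. Token counts assume those rules are unchanged.
    #[test]
    fn test_tokenize_hash_comment_at_line_start() {
        let tokens = tokenize("# org-style heading\n2024-01-15");
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[0].0, Token::Comment("# org-style heading")));
        assert!(matches!(tokens[1].0, Token::Newline));
        assert!(matches!(tokens[2].0, Token::Date("2024-01-15")));
    }

    // `%`, `#!`, and `#+` lines are matched directly by their own patterns
    // and should each come back as one token. Sketch only.
    #[test]
    fn test_tokenize_percent_shebang_and_emacs_lines() {
        let tokens = tokenize("% ledger-style comment");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::PercentComment(_)));

        let tokens = tokenize("#!/usr/bin/env bean-web");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Shebang(_)));

        let tokens = tokenize("#+STARTUP: showall");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::EmacsDirective(_)));
    }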

    #[test]
    fn test_tokenize_indentation() {
        let tokens = tokenize("txn\n  Assets:Bank 100 USD");
        // Should have: Txn, Newline, Indent, Account, Number, Currency
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Indent(_))));
    }
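
    // Four or more leading spaces should become DeepIndent, per the
    // post-processing rules in tokenize(). Sketch assuming a tab-free input.
    #[test]
    fn test_tokenize_deep_indentation() {
        let tokens = tokenize("txn\n    filename: \"a.csv\"");
        assert!(
            tokens
                .iter()
                .any(|(t, _)| matches!(t, Token::DeepIndent(4)))
        );
        assert!(
            tokens
                .iter()
                .any(|(t, _)| matches!(t, Token::MetaKey("filename:")))
        );
    }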

    #[test]
    fn test_tokenize_transaction_line() {
        let source = "2024-01-15 * \"Grocery Store\" #food\n  Expenses:Food 50.00 USD";
        let tokens = tokenize(source);

        // Check key tokens are present
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Date(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Star)));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::String(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Tag(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Newline)));
        assert!(
            tokens
                .iter()
                .any(|(t, _)| matches!(t, Token::Indent(_) | Token::DeepIndent(_)))
        );
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Account(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Number(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Currency(_))));
    }
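
    // Sketch of the multi-character cost-spec delimiters: longest match means
    // "{{", "{#", and "@@" each come back as a single token rather than pairs.
    #[test]
    fn test_tokenize_cost_punctuation() {
        let tokens = tokenize("{{ }} {# @@ @");
        assert_eq!(tokens.len(), 5);
        assert!(matches!(tokens[0].0, Token::LDoubleBrace));
        assert!(matches!(tokens[1].0, Token::RDoubleBrace));
        assert!(matches!(tokens[2].0, Token::LBraceHash));
        assert!(matches!(tokens[3].0, Token::AtAt));
        assert!(matches!(tokens[4].0, Token::At));
    }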

    #[test]
    fn test_tokenize_metadata_key() {
        let tokens = tokenize("filename:");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::MetaKey("filename:")));
    }
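
    // Input that no pattern covers should surface as Token::Error carrying
    // the offending slice, per the Error variant docs; `$` is just an
    // arbitrary character none of the rules match. Sketch only.
    #[test]
    fn test_tokenize_error_token() {
        let tokens = tokenize("$");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Error("$")));
    }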

    #[test]
    fn test_tokenize_punctuation() {
        let tokens = tokenize("{ } @ @@ , ~");
        let token_types: Vec<_> = tokens.iter().map(|(t, _)| t.clone()).collect();
        assert!(token_types.contains(&Token::LBrace));
        assert!(token_types.contains(&Token::RBrace));
        assert!(token_types.contains(&Token::At));
        assert!(token_types.contains(&Token::AtAt));
        assert!(token_types.contains(&Token::Comma));
        assert!(token_types.contains(&Token::Tilde));
630    }
631}