rustledger_parser/
logos_lexer.rs

//! Logos-based lexer for Beancount syntax.
//!
//! This module provides a fast tokenizer for Beancount syntax using the Logos crate,
//! which compiles the token definitions below into an optimized DFA-based lexer.

use logos::Logos;
use std::fmt;
use std::ops::Range;

/// A span in the source code (byte offsets).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Start byte offset (inclusive).
    pub start: usize,
    /// End byte offset (exclusive).
    pub end: usize,
}

impl From<Range<usize>> for Span {
    fn from(range: Range<usize>) -> Self {
        Self {
            start: range.start,
            end: range.end,
        }
    }
}

impl From<Span> for Range<usize> {
    fn from(span: Span) -> Self {
        span.start..span.end
    }
}

/// Token types produced by the Logos lexer.
#[derive(Logos, Debug, Clone, PartialEq, Eq)]
#[logos(skip r"[ \t]+")] // Skip horizontal whitespace (spaces and tabs)
pub enum Token<'src> {
    // ===== Literals =====
    /// A date in YYYY-MM-DD or YYYY/MM/DD format.
    #[regex(r"\d{4}[-/]\d{2}[-/]\d{2}")]
    Date(&'src str),

    /// A number with optional sign, thousands separators, and decimals.
    /// Examples: 123, -456, 1,234.56, 1234.5678, .50, -.50
    #[regex(r"-?(\.\d+|(\d{1,3}(,\d{3})*|\d+)(\.\d+)?)")]
    Number(&'src str),

    /// A double-quoted string (handles escape sequences).
    /// The slice includes the quotes.
    #[regex(r#""([^"\\]|\\.)*""#)]
    String(&'src str),

    /// An account name like Assets:Bank:Checking or Assets:401k:Fidelity.
    /// Must start with one of the five account types and have at least one sub-account.
    /// Sub-account components may start with a letter or digit.
    #[regex(r"(Assets|Liabilities|Equity|Income|Expenses)(:[A-Za-z0-9][a-zA-Z0-9-]*)+")]
    Account(&'src str),

    /// A currency/commodity code like USD, EUR, AAPL, BTC.
    /// Uppercase letters, can contain digits, apostrophes, dots, underscores, hyphens.
    /// Note: This pattern is lower priority than Account, Keywords, and Flags.
    /// Currency must have at least 2 characters to avoid conflict with single-letter flags.
    /// Also supports `/` prefix for options/futures contracts (e.g., `/LOX21_211204_P100.25`).
    #[regex(r"/[A-Z0-9'._-]+|[A-Z][A-Z0-9'._-]+")]
    Currency(&'src str),

    /// A tag like #tag-name.
    #[regex(r"#[a-zA-Z0-9-_/.]+")]
    Tag(&'src str),

    /// A link like ^link-name.
    #[regex(r"\^[a-zA-Z0-9-_/.]+")]
    Link(&'src str),

    // ===== Keywords =====
    // Using #[token] for exact matches (higher priority than regex)
    /// The `txn` keyword for transactions.
    #[token("txn")]
    Txn,
    /// The `balance` directive keyword.
    #[token("balance")]
    Balance,
    /// The `open` directive keyword.
    #[token("open")]
    Open,
    /// The `close` directive keyword.
    #[token("close")]
    Close,
    /// The `commodity` directive keyword.
    #[token("commodity")]
    Commodity,
    /// The `pad` directive keyword.
    #[token("pad")]
    Pad,
    /// The `event` directive keyword.
    #[token("event")]
    Event,
    /// The `query` directive keyword.
    #[token("query")]
    Query,
    /// The `note` directive keyword.
    #[token("note")]
    Note,
    /// The `document` directive keyword.
    #[token("document")]
    Document,
    /// The `price` directive keyword.
    #[token("price")]
    Price,
    /// The `custom` directive keyword.
    #[token("custom")]
    Custom,
    /// The `option` directive keyword.
    #[token("option")]
    Option_,
    /// The `include` directive keyword.
    #[token("include")]
    Include,
    /// The `plugin` directive keyword.
    #[token("plugin")]
    Plugin,
    /// The `pushtag` directive keyword.
    #[token("pushtag")]
    Pushtag,
    /// The `poptag` directive keyword.
    #[token("poptag")]
    Poptag,
    /// The `pushmeta` directive keyword.
    #[token("pushmeta")]
    Pushmeta,
    /// The `popmeta` directive keyword.
    #[token("popmeta")]
    Popmeta,
    /// The `TRUE` boolean literal (also True, true).
    #[token("TRUE")]
    #[token("True")]
    #[token("true")]
    True,
    /// The `FALSE` boolean literal (also False, false).
    #[token("FALSE")]
    #[token("False")]
    #[token("false")]
    False,
    /// The `NULL` literal.
    #[token("NULL")]
    Null,

    // ===== Punctuation =====
    // Order matters: longer tokens first
    /// Double left brace `{{` for cost specifications (legacy total cost).
    #[token("{{")]
    LDoubleBrace,
    /// Double right brace `}}` for cost specifications.
    #[token("}}")]
    RDoubleBrace,
    /// Left brace with hash `{#` for total cost (new syntax).
    #[token("{#")]
    LBraceHash,
    /// Left brace `{` for cost specifications.
    #[token("{")]
    LBrace,
    /// Right brace `}` for cost specifications.
    #[token("}")]
    RBrace,
    /// Left parenthesis `(` for expressions.
    #[token("(")]
    LParen,
    /// Right parenthesis `)` for expressions.
    #[token(")")]
    RParen,
    /// Double at-sign `@@` for a total price annotation.
    #[token("@@")]
    AtAt,
    /// At-sign `@` for a per-unit price annotation.
    #[token("@")]
    At,
    /// Colon `:` separator.
    #[token(":")]
    Colon,
    /// Comma `,` separator.
    #[token(",")]
    Comma,
    /// Tilde `~` for tolerance.
    #[token("~")]
    Tilde,
    /// Plus `+` operator.
    #[token("+")]
    Plus,
    /// Minus `-` operator.
    #[token("-")]
    Minus,
    /// Star `*` for cleared transactions and multiplication.
    #[token("*")]
    Star,
    /// Slash `/` for division.
    #[token("/")]
    Slash,

    // ===== Transaction Flags =====
    /// Pending flag `!` for incomplete transactions.
    #[token("!")]
    Pending,

    /// Other transaction flags: P S T C U R M # ? % &
    /// Note: # is only a flag when NOT followed by tag characters
    #[regex(r"[PSTCURM#?%&]")]
    Flag(&'src str),

    // ===== Structural =====
    /// Newline (significant in Beancount for directive boundaries).
    #[regex(r"\r?\n")]
    Newline,

    /// A comment starting with semicolon.
    /// The slice includes the semicolon.
    #[regex(r";[^\n\r]*")]
    Comment(&'src str),

    /// Shebang line at start of file (e.g., #!/usr/bin/env bean-web).
    /// Treated as a comment-like directive to skip.
    #[regex(r"#![^\n\r]*")]
    Shebang(&'src str),

    /// Emacs org-mode directive (e.g., "#+STARTUP: showall").
    /// These are Emacs configuration lines that should be skipped.
    #[regex(r"#\+[^\n\r]*")]
    EmacsDirective(&'src str),

    /// A metadata key (identifier followed by colon).
    /// Examples: filename:, lineno:, custom-key:, nameOnCard:
    /// The slice includes the trailing colon. Keys can use camelCase or `snake_case`.
    #[regex(r"[a-zA-Z][a-zA-Z0-9_-]*:")]
    MetaKey(&'src str),

    /// Indentation token (inserted by post-processing, not by Logos).
    /// Contains the number of leading spaces.
    /// This is a placeholder - actual indentation detection happens in [`tokenize`].
    Indent(usize),

    /// Deep indentation (4+ spaces) - used for posting-level metadata.
    DeepIndent(usize),

    /// Error token for unrecognized input.
    Error,
}

impl Token<'_> {
    /// Returns true if this is a transaction flag (`*`, `!`, or another flag character).
    pub const fn is_txn_flag(&self) -> bool {
        matches!(self, Self::Star | Self::Pending | Self::Flag(_))
    }

    /// Returns true if this is a keyword that starts a directive.
    pub const fn is_directive_keyword(&self) -> bool {
        matches!(
            self,
            Self::Txn
                | Self::Balance
                | Self::Open
                | Self::Close
                | Self::Commodity
                | Self::Pad
                | Self::Event
                | Self::Query
                | Self::Note
                | Self::Document
                | Self::Price
                | Self::Custom
                | Self::Option_
                | Self::Include
                | Self::Plugin
                | Self::Pushtag
                | Self::Poptag
                | Self::Pushmeta
                | Self::Popmeta
        )
    }
}

impl fmt::Display for Token<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Date(s) => write!(f, "{s}"),
            Self::Number(s) => write!(f, "{s}"),
            Self::String(s) => write!(f, "{s}"),
            Self::Account(s) => write!(f, "{s}"),
            Self::Currency(s) => write!(f, "{s}"),
            Self::Tag(s) => write!(f, "{s}"),
            Self::Link(s) => write!(f, "{s}"),
            Self::Txn => write!(f, "txn"),
            Self::Balance => write!(f, "balance"),
            Self::Open => write!(f, "open"),
            Self::Close => write!(f, "close"),
            Self::Commodity => write!(f, "commodity"),
            Self::Pad => write!(f, "pad"),
            Self::Event => write!(f, "event"),
            Self::Query => write!(f, "query"),
            Self::Note => write!(f, "note"),
            Self::Document => write!(f, "document"),
            Self::Price => write!(f, "price"),
            Self::Custom => write!(f, "custom"),
            Self::Option_ => write!(f, "option"),
            Self::Include => write!(f, "include"),
            Self::Plugin => write!(f, "plugin"),
            Self::Pushtag => write!(f, "pushtag"),
            Self::Poptag => write!(f, "poptag"),
            Self::Pushmeta => write!(f, "pushmeta"),
            Self::Popmeta => write!(f, "popmeta"),
            Self::True => write!(f, "TRUE"),
            Self::False => write!(f, "FALSE"),
            Self::Null => write!(f, "NULL"),
            Self::LDoubleBrace => write!(f, "{{{{"),
            Self::RDoubleBrace => write!(f, "}}}}"),
            Self::LBraceHash => write!(f, "{{#"),
            Self::LBrace => write!(f, "{{"),
            Self::RBrace => write!(f, "}}"),
            Self::LParen => write!(f, "("),
            Self::RParen => write!(f, ")"),
            Self::AtAt => write!(f, "@@"),
            Self::At => write!(f, "@"),
            Self::Colon => write!(f, ":"),
            Self::Comma => write!(f, ","),
            Self::Tilde => write!(f, "~"),
            Self::Plus => write!(f, "+"),
            Self::Minus => write!(f, "-"),
            Self::Star => write!(f, "*"),
            Self::Slash => write!(f, "/"),
            Self::Pending => write!(f, "!"),
            Self::Flag(s) => write!(f, "{s}"),
            Self::Newline => write!(f, "\\n"),
            Self::Comment(s) => write!(f, "{s}"),
            Self::Shebang(s) => write!(f, "{s}"),
            Self::EmacsDirective(s) => write!(f, "{s}"),
            Self::MetaKey(s) => write!(f, "{s}"),
            Self::Indent(n) => write!(f, "<indent:{n}>"),
            Self::DeepIndent(n) => write!(f, "<deep-indent:{n}>"),
            Self::Error => write!(f, "<error>"),
        }
    }
}

/// Tokenize source code into a vector of (Token, Span) pairs.
///
/// This function:
/// 1. Runs the Logos lexer for fast tokenization
/// 2. Post-processes to detect indentation at line starts
/// 3. Handles lexer errors by producing Error tokens
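///
/// # Example
///
/// A minimal usage sketch (the `rustledger_parser::logos_lexer` import path is an
/// assumption about how this module is exported, hence the `ignore` fence):
///
/// ```ignore
/// use rustledger_parser::logos_lexer::{tokenize, Token};
///
/// let tokens = tokenize("2024-01-15 * \"Coffee\"");
/// assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
/// assert!(matches!(tokens[1].0, Token::Star));
/// assert!(matches!(tokens[2].0, Token::String("\"Coffee\"")));
/// ```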
pub fn tokenize(source: &str) -> Vec<(Token<'_>, Span)> {
    let mut tokens = Vec::new();
    let mut lexer = Token::lexer(source);
    let mut at_line_start = true;
    let mut last_newline_end = 0usize;

    while let Some(result) = lexer.next() {
        let span = lexer.span();

        match result {
            Ok(Token::Newline) => {
                tokens.push((Token::Newline, span.clone().into()));
                at_line_start = true;
                last_newline_end = span.end;
            }
            Ok(token) => {
                // Check for indentation at line start
                if at_line_start && span.start > last_newline_end {
                    // Count leading whitespace between last newline and this token
                    // Tabs count as indentation (treat 1 tab as 4 spaces for counting purposes)
                    let leading = &source[last_newline_end..span.start];
                    let mut space_count = 0;
                    let mut char_count = 0;
                    for c in leading.chars() {
                        match c {
                            ' ' => {
                                space_count += 1;
                                char_count += 1;
                            }
                            '\t' => {
                                space_count += 4; // Treat tab as 4 spaces
                                char_count += 1;
                            }
                            _ => break,
                        }
                    }
                    if space_count >= 2 {
                        let indent_start = last_newline_end;
                        let indent_end = last_newline_end + char_count;
                        // Use DeepIndent for 4+ spaces (posting metadata level)
                        let indent_token = if space_count >= 4 {
                            Token::DeepIndent(space_count)
                        } else {
                            Token::Indent(space_count)
                        };
                        tokens.push((
                            indent_token,
                            Span {
                                start: indent_start,
                                end: indent_end,
                            },
                        ));
                    }
                }
                at_line_start = false;
                tokens.push((token, span.into()));
            }
            Err(()) => {
                // Lexer error - produce an Error token
                at_line_start = false;
                tokens.push((Token::Error, span.into()));
            }
        }
    }

    tokens
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_date() {
        let tokens = tokenize("2024-01-15");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = tokenize("1234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("1234.56")));

        let tokens = tokenize("-1,234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("-1,234.56")));
    }
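
    // Illustrative sketch: the Number pattern above also accepts the
    // leading-dot forms listed in its doc comment.
    #[test]
    fn test_tokenize_number_leading_dot() {
        let tokens = tokenize(".50");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number(".50")));

        let tokens = tokenize("-.50");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("-.50")));
    }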

    #[test]
    fn test_tokenize_account() {
        let tokens = tokenize("Assets:Bank:Checking");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Account("Assets:Bank:Checking")
        ));
    }

    #[test]
    fn test_tokenize_currency() {
        let tokens = tokenize("USD");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("USD")));
    }
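
    // Illustrative sketch: per the Currency and Flag definitions above, a lone
    // uppercase flag letter should lex as a Flag (Currency needs 2+ characters),
    // and the `/`-prefixed contract form from the Currency doc as a Currency.
    #[test]
    fn test_tokenize_flag_vs_currency() {
        let tokens = tokenize("P");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Flag("P")));

        let tokens = tokenize("BTC");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("BTC")));

        let tokens = tokenize("/LOX21_211204_P100.25");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("/LOX21_211204_P100.25")));
    }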

    #[test]
    fn test_tokenize_string() {
        let tokens = tokenize(r#""Hello, World!""#);
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::String(r#""Hello, World!""#)));
    }

    #[test]
    fn test_tokenize_keywords() {
        let tokens = tokenize("txn balance open close");
        assert_eq!(tokens.len(), 4);
        assert!(matches!(tokens[0].0, Token::Txn));
        assert!(matches!(tokens[1].0, Token::Balance));
        assert!(matches!(tokens[2].0, Token::Open));
        assert!(matches!(tokens[3].0, Token::Close));
    }
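
    // Illustrative sketch: the boolean and NULL keywords accept all documented
    // spellings. This assumes the exact-match #[token] definitions take
    // precedence over the Currency pattern, as its doc comment states.
    #[test]
    fn test_tokenize_booleans_and_null() {
        let tokens = tokenize("TRUE True true FALSE NULL");
        assert_eq!(tokens.len(), 5);
        assert!(matches!(tokens[0].0, Token::True));
        assert!(matches!(tokens[1].0, Token::True));
        assert!(matches!(tokens[2].0, Token::True));
        assert!(matches!(tokens[3].0, Token::False));
        assert!(matches!(tokens[4].0, Token::Null));
    }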

    #[test]
    fn test_tokenize_tag_and_link() {
        let tokens = tokenize("#my-tag ^my-link");
        assert_eq!(tokens.len(), 2);
        assert!(matches!(tokens[0].0, Token::Tag("#my-tag")));
        assert!(matches!(tokens[1].0, Token::Link("^my-link")));
    }

    #[test]
    fn test_tokenize_comment() {
        let tokens = tokenize("; This is a comment");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Comment("; This is a comment")));
    }
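
    // Illustrative sketch: "#!" and "#+" lines should lex as single
    // Shebang/EmacsDirective tokens, since Logos prefers the longest match
    // over the shorter `#` flag or `#tag` patterns.
    #[test]
    fn test_tokenize_shebang_and_emacs_directive() {
        let tokens = tokenize("#!/usr/bin/env bean-web");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Shebang("#!/usr/bin/env bean-web")));

        let tokens = tokenize("#+STARTUP: showall");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::EmacsDirective("#+STARTUP: showall")));
    }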

    #[test]
    fn test_tokenize_indentation() {
        let tokens = tokenize("txn\n  Assets:Bank 100 USD");
        // Should have: Txn, Newline, Indent, Account, Number, Currency
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Indent(_))));
    }
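
    // Illustrative sketch of the thresholds documented in `tokenize`: two
    // spaces yield Indent, four spaces yield DeepIndent, and a tab is counted
    // as four spaces.
    #[test]
    fn test_tokenize_indent_depth() {
        let tokens = tokenize("txn\n  Assets:Cash");
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Indent(2))));

        let tokens = tokenize("txn\n    Assets:Cash");
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::DeepIndent(4))));

        let tokens = tokenize("txn\n\tAssets:Cash");
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::DeepIndent(4))));
    }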

    #[test]
    fn test_tokenize_transaction_line() {
        let source = "2024-01-15 * \"Grocery Store\" #food\n  Expenses:Food 50.00 USD";
        let tokens = tokenize(source);

        // Check key tokens are present
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Date(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Star)));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::String(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Tag(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Newline)));
        assert!(tokens
            .iter()
            .any(|(t, _)| matches!(t, Token::Indent(_) | Token::DeepIndent(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Account(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Number(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Currency(_))));
    }

    #[test]
    fn test_tokenize_metadata_key() {
        let tokens = tokenize("filename:");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::MetaKey("filename:")));
    }

    #[test]
    fn test_tokenize_punctuation() {
        let tokens = tokenize("{ } @ @@ , ~");
        let token_types: Vec<_> = tokens.iter().map(|(t, _)| t.clone()).collect();
        assert!(token_types.contains(&Token::LBrace));
        assert!(token_types.contains(&Token::RBrace));
        assert!(token_types.contains(&Token::At));
        assert!(token_types.contains(&Token::AtAt));
        assert!(token_types.contains(&Token::Comma));
        assert!(token_types.contains(&Token::Tilde));
    }
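
    // Illustrative sketch of the helper predicates defined on `Token`.
    #[test]
    fn test_token_predicates() {
        assert!(Token::Star.is_txn_flag());
        assert!(Token::Pending.is_txn_flag());
        assert!(Token::Flag("P").is_txn_flag());
        assert!(!Token::Comma.is_txn_flag());

        assert!(Token::Open.is_directive_keyword());
        assert!(Token::Pushtag.is_directive_keyword());
        assert!(!Token::Newline.is_directive_keyword());
    }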
531}