rustledger_parser/
logos_lexer.rs

//! Fast tokenizer for Beancount syntax, built on the Logos crate.
//!
//! This module defines the token types and the [`tokenize`] entry point; Logos
//! compiles the token patterns below into an optimized, table-driven lexer.

use logos::Logos;
use std::fmt;
use std::ops::Range;

/// A span in the source code (byte offsets).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Start byte offset (inclusive).
    pub start: usize,
    /// End byte offset (exclusive).
    pub end: usize,
}

impl From<Range<usize>> for Span {
    fn from(range: Range<usize>) -> Self {
        Self {
            start: range.start,
            end: range.end,
        }
    }
}

impl From<Span> for Range<usize> {
    fn from(span: Span) -> Self {
        span.start..span.end
    }
}

/// Token types produced by the Logos lexer.
#[derive(Logos, Debug, Clone, PartialEq, Eq)]
#[logos(skip r"[ \t]+")] // Skip horizontal whitespace (spaces and tabs)
pub enum Token<'src> {
    // ===== Literals =====
    /// A date in YYYY-MM-DD or YYYY/MM/DD format.
    #[regex(r"\d{4}[-/]\d{2}[-/]\d{2}")]
    Date(&'src str),

    /// A number with optional sign, thousands separators, and decimals.
    /// Examples: 123, -456, 1,234.56, 1234.5678, .50, -.50
    #[regex(r"-?(\.\d+|(\d{1,3}(,\d{3})*|\d+)(\.\d+)?)")]
    Number(&'src str),

    /// A double-quoted string (handles escape sequences).
    /// The slice includes the quotes.
    #[regex(r#""([^"\\]|\\.)*""#)]
    String(&'src str),

    /// An account name like Assets:Bank:Checking or Assets:401k:Fidelity.
    /// Must start with one of the 5 account types and have at least one sub-account.
    /// Sub-accounts can start with uppercase letter or digit.
    #[regex(r"(Assets|Liabilities|Equity|Income|Expenses)(:[A-Za-z0-9][a-zA-Z0-9-]*)+")]
    Account(&'src str),

    /// A currency/commodity code like USD, EUR, AAPL, BTC.
    /// Uppercase letters, can contain digits, apostrophes, dots, underscores, hyphens.
    /// Note: This pattern is lower priority than Account, Keywords, and Flags.
    /// Currency must have at least 2 characters to avoid conflict with single-letter flags.
    /// Also supports `/` prefix for options/futures contracts (e.g., `/LOX21_211204_P100.25`).
    #[regex(r"/[A-Z0-9'._-]+|[A-Z][A-Z0-9'._-]+")]
    Currency(&'src str),

    /// A tag like #tag-name.
    #[regex(r"#[a-zA-Z0-9-_/.]+")]
    Tag(&'src str),

    /// A link like ^link-name.
    #[regex(r"\^[a-zA-Z0-9-_/.]+")]
    Link(&'src str),

    // ===== Keywords =====
    // Using #[token] for exact matches (higher priority than regex)
    /// The `txn` keyword for transactions.
    #[token("txn")]
    Txn,
    /// The `balance` directive keyword.
    #[token("balance")]
    Balance,
    /// The `open` directive keyword.
    #[token("open")]
    Open,
    /// The `close` directive keyword.
    #[token("close")]
    Close,
    /// The `commodity` directive keyword.
    #[token("commodity")]
    Commodity,
    /// The `pad` directive keyword.
    #[token("pad")]
    Pad,
    /// The `event` directive keyword.
    #[token("event")]
    Event,
    /// The `query` directive keyword.
    #[token("query")]
    Query,
    /// The `note` directive keyword.
    #[token("note")]
    Note,
    /// The `document` directive keyword.
    #[token("document")]
    Document,
    /// The `price` directive keyword.
    #[token("price")]
    Price,
    /// The `custom` directive keyword.
    #[token("custom")]
    Custom,
    /// The `option` directive keyword.
    #[token("option")]
    Option_,
    /// The `include` directive keyword.
    #[token("include")]
    Include,
    /// The `plugin` directive keyword.
    #[token("plugin")]
    Plugin,
    /// The `pushtag` directive keyword.
    #[token("pushtag")]
    Pushtag,
    /// The `poptag` directive keyword.
    #[token("poptag")]
    Poptag,
    /// The `pushmeta` directive keyword.
    #[token("pushmeta")]
    Pushmeta,
    /// The `popmeta` directive keyword.
    #[token("popmeta")]
    Popmeta,
    /// The `TRUE` boolean literal (also True, true).
    #[token("TRUE")]
    #[token("True")]
    #[token("true")]
    True,
    /// The `FALSE` boolean literal (also False, false).
    #[token("FALSE")]
    #[token("False")]
    #[token("false")]
    False,
    /// The `NULL` literal.
    #[token("NULL")]
    Null,

    // ===== Punctuation =====
    // Longest match wins, so multi-character tokens like `{{` and `@@` take
    // precedence over their single-character prefixes
    /// Double left brace `{{` for cost specifications (legacy total cost).
    #[token("{{")]
    LDoubleBrace,
    /// Double right brace `}}` for cost specifications.
    #[token("}}")]
    RDoubleBrace,
    /// Left brace with hash `{#` for total cost (new syntax).
    #[token("{#")]
    LBraceHash,
    /// Left brace `{` for cost specifications.
    #[token("{")]
    LBrace,
    /// Right brace `}` for cost specifications.
    #[token("}")]
    RBrace,
    /// Left parenthesis `(` for expressions.
    #[token("(")]
    LParen,
    /// Right parenthesis `)` for expressions.
    #[token(")")]
    RParen,
    /// Double at-sign `@@` for a total-price annotation.
    #[token("@@")]
    AtAt,
    /// At-sign `@` for a per-unit price annotation.
    #[token("@")]
    At,
    /// Colon `:` separator.
    #[token(":")]
    Colon,
    /// Comma `,` separator.
    #[token(",")]
    Comma,
    /// Tilde `~` for tolerance.
    #[token("~")]
    Tilde,
    /// Plus `+` operator.
    #[token("+")]
    Plus,
    /// Minus `-` operator.
    #[token("-")]
    Minus,
    /// Star `*` for cleared transactions and multiplication.
    #[token("*")]
    Star,
    /// Slash `/` for division.
    #[token("/")]
    Slash,

    // ===== Transaction Flags =====
    /// Pending flag `!` for incomplete transactions.
    #[token("!")]
    Pending,

    /// Other transaction flags: P S T C U R M # ? % &
    /// Note: # is only a flag when NOT followed by tag characters
    #[regex(r"[PSTCURM#?%&]")]
    Flag(&'src str),

    // ===== Structural =====
    /// Newline (significant in Beancount for directive boundaries).
    #[regex(r"\r?\n")]
    Newline,

    /// A comment starting with semicolon.
    /// The slice includes the semicolon.
    #[regex(r";[^\n\r]*")]
    Comment(&'src str),

    /// Shebang line at start of file (e.g., #!/usr/bin/env bean-web).
    /// Treated as a comment-like directive to skip.
    #[regex(r"#![^\n\r]*")]
    Shebang(&'src str),

    /// Emacs org-mode directive (e.g., "#+STARTUP: showall").
    /// These are Emacs configuration lines that should be skipped.
    #[regex(r"#\+[^\n\r]*")]
    EmacsDirective(&'src str),

    /// A metadata key (identifier followed by colon).
    /// Examples: filename:, lineno:, custom-key:, nameOnCard:
    /// The slice includes the trailing colon. Keys can use camelCase or `snake_case`.
    #[regex(r"[a-zA-Z][a-zA-Z0-9_-]*:")]
    MetaKey(&'src str),

    /// Indentation token (inserted by post-processing, not by Logos).
    /// Contains the number of leading spaces.
    /// This is a placeholder - actual indentation detection happens in [`tokenize`].
    Indent(usize),

    /// Deep indentation (4+ spaces) - used for posting-level metadata.
    DeepIndent(usize),

    /// Error token for unrecognized input.
    /// Contains the invalid source text for better error messages.
    Error(&'src str),
}

impl Token<'_> {
    /// Returns true if this is a transaction flag (`*`, `!`, or another single-character flag).
    pub const fn is_txn_flag(&self) -> bool {
        matches!(self, Self::Star | Self::Pending | Self::Flag(_))
    }

    /// Returns true if this is a keyword that starts a directive.
    pub const fn is_directive_keyword(&self) -> bool {
        matches!(
            self,
            Self::Txn
                | Self::Balance
                | Self::Open
                | Self::Close
                | Self::Commodity
                | Self::Pad
                | Self::Event
                | Self::Query
                | Self::Note
                | Self::Document
                | Self::Price
                | Self::Custom
                | Self::Option_
                | Self::Include
                | Self::Plugin
                | Self::Pushtag
                | Self::Poptag
                | Self::Pushmeta
                | Self::Popmeta
        )
    }
}

impl fmt::Display for Token<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Date(s) => write!(f, "{s}"),
            Self::Number(s) => write!(f, "{s}"),
            Self::String(s) => write!(f, "{s}"),
            Self::Account(s) => write!(f, "{s}"),
            Self::Currency(s) => write!(f, "{s}"),
            Self::Tag(s) => write!(f, "{s}"),
            Self::Link(s) => write!(f, "{s}"),
            Self::Txn => write!(f, "txn"),
            Self::Balance => write!(f, "balance"),
            Self::Open => write!(f, "open"),
            Self::Close => write!(f, "close"),
            Self::Commodity => write!(f, "commodity"),
            Self::Pad => write!(f, "pad"),
            Self::Event => write!(f, "event"),
            Self::Query => write!(f, "query"),
            Self::Note => write!(f, "note"),
            Self::Document => write!(f, "document"),
            Self::Price => write!(f, "price"),
            Self::Custom => write!(f, "custom"),
            Self::Option_ => write!(f, "option"),
            Self::Include => write!(f, "include"),
            Self::Plugin => write!(f, "plugin"),
            Self::Pushtag => write!(f, "pushtag"),
            Self::Poptag => write!(f, "poptag"),
            Self::Pushmeta => write!(f, "pushmeta"),
            Self::Popmeta => write!(f, "popmeta"),
            Self::True => write!(f, "TRUE"),
            Self::False => write!(f, "FALSE"),
            Self::Null => write!(f, "NULL"),
            Self::LDoubleBrace => write!(f, "{{{{"),
            Self::RDoubleBrace => write!(f, "}}}}"),
            Self::LBraceHash => write!(f, "{{#"),
            Self::LBrace => write!(f, "{{"),
            Self::RBrace => write!(f, "}}"),
            Self::LParen => write!(f, "("),
            Self::RParen => write!(f, ")"),
            Self::AtAt => write!(f, "@@"),
            Self::At => write!(f, "@"),
            Self::Colon => write!(f, ":"),
            Self::Comma => write!(f, ","),
            Self::Tilde => write!(f, "~"),
            Self::Plus => write!(f, "+"),
            Self::Minus => write!(f, "-"),
            Self::Star => write!(f, "*"),
            Self::Slash => write!(f, "/"),
            Self::Pending => write!(f, "!"),
            Self::Flag(s) => write!(f, "{s}"),
            Self::Newline => write!(f, "\\n"),
            Self::Comment(s) => write!(f, "{s}"),
            Self::Shebang(s) => write!(f, "{s}"),
            Self::EmacsDirective(s) => write!(f, "{s}"),
            Self::MetaKey(s) => write!(f, "{s}"),
            Self::Indent(n) => write!(f, "<indent:{n}>"),
            Self::DeepIndent(n) => write!(f, "<deep-indent:{n}>"),
            Self::Error(s) => write!(f, "{s}"),
        }
    }
}

/// Tokenize source code into a vector of (Token, Span) pairs.
///
/// This function:
/// 1. Runs the Logos lexer for fast tokenization
/// 2. Post-processes to detect indentation at line starts
/// 3. Handles lexer errors by producing Error tokens
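///
/// # Example
///
/// A minimal sketch of the expected output shape (marked `ignore` here because
/// the full crate path to this module is not shown in this file):
///
/// ```ignore
/// let tokens = tokenize("2024-01-15 txn");
/// // The date literal comes first, followed by the `txn` keyword.
/// assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
/// assert!(matches!(tokens[1].0, Token::Txn));
/// ```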
pub fn tokenize(source: &str) -> Vec<(Token<'_>, Span)> {
    let mut tokens = Vec::new();
    let mut lexer = Token::lexer(source);
    let mut at_line_start = true;
    let mut last_newline_end = 0usize;

    while let Some(result) = lexer.next() {
        let span = lexer.span();

        match result {
            Ok(Token::Newline) => {
                tokens.push((Token::Newline, span.clone().into()));
                at_line_start = true;
                last_newline_end = span.end;
            }
            Ok(token) => {
                // Check for indentation at line start
                if at_line_start && span.start > last_newline_end {
                    // Count leading whitespace between last newline and this token
                    // Tabs count as indentation (treat 1 tab as 4 spaces for counting purposes)
                    let leading = &source[last_newline_end..span.start];
                    let mut space_count = 0;
                    let mut char_count = 0;
                    for c in leading.chars() {
                        match c {
                            ' ' => {
                                space_count += 1;
                                char_count += 1;
                            }
                            '\t' => {
                                space_count += 4; // Treat tab as 4 spaces
                                char_count += 1;
                            }
                            _ => break,
                        }
                    }
                    if space_count >= 2 {
                        let indent_start = last_newline_end;
                        let indent_end = last_newline_end + char_count;
                        // Use DeepIndent for 4+ spaces (posting metadata level)
                        let indent_token = if space_count >= 4 {
                            Token::DeepIndent(space_count)
                        } else {
                            Token::Indent(space_count)
                        };
                        tokens.push((
                            indent_token,
                            Span {
                                start: indent_start,
                                end: indent_end,
                            },
                        ));
                    }
                }
                at_line_start = false;
                tokens.push((token, span.into()));
            }
            Err(()) => {
                // Lexer error - produce an Error token with the invalid source text
                at_line_start = false;
                let invalid_text = &source[span.clone()];
                tokens.push((Token::Error(invalid_text), span.into()));
            }
        }
    }

    tokens
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_date() {
        let tokens = tokenize("2024-01-15");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = tokenize("1234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("1234.56")));

        let tokens = tokenize("-1,234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("-1,234.56")));
    }

    #[test]
    fn test_tokenize_account() {
        let tokens = tokenize("Assets:Bank:Checking");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Account("Assets:Bank:Checking")
        ));
    }

    #[test]
    fn test_tokenize_currency() {
        let tokens = tokenize("USD");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("USD")));
    }
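
    // Additional check (not in the original test set) for the `/`-prefixed
    // options/futures form documented on the Currency variant; it assumes the
    // longer Currency match wins over the single-character Slash token.
    #[test]
    fn test_tokenize_slash_prefixed_currency() {
        let tokens = tokenize("/LOX21_211204_P100.25");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Currency("/LOX21_211204_P100.25")
        ));
    }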

    #[test]
    fn test_tokenize_string() {
        let tokens = tokenize(r#""Hello, World!""#);
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::String(r#""Hello, World!""#)));
    }

    #[test]
    fn test_tokenize_keywords() {
        let tokens = tokenize("txn balance open close");
        assert_eq!(tokens.len(), 4);
        assert!(matches!(tokens[0].0, Token::Txn));
        assert!(matches!(tokens[1].0, Token::Balance));
        assert!(matches!(tokens[2].0, Token::Open));
        assert!(matches!(tokens[3].0, Token::Close));
    }

    #[test]
    fn test_tokenize_tag_and_link() {
        let tokens = tokenize("#my-tag ^my-link");
        assert_eq!(tokens.len(), 2);
        assert!(matches!(tokens[0].0, Token::Tag("#my-tag")));
        assert!(matches!(tokens[1].0, Token::Link("^my-link")));
    }

    #[test]
    fn test_tokenize_comment() {
        let tokens = tokenize("; This is a comment");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Comment("; This is a comment")));
    }

    #[test]
    fn test_tokenize_indentation() {
        let tokens = tokenize("txn\n  Assets:Bank 100 USD");
        // Should have: Txn, Newline, Indent, Account, Number, Currency
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Indent(_))));
    }
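
    // Additional indentation checks mirroring the post-processing in `tokenize`:
    // four leading spaces should produce DeepIndent, and a single tab is counted
    // as four spaces, so it should produce DeepIndent as well.
    #[test]
    fn test_tokenize_deep_indentation() {
        let tokens = tokenize("txn\n    Assets:Bank 100 USD");
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::DeepIndent(4))));

        let tokens = tokenize("txn\n\tAssets:Bank 100 USD");
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::DeepIndent(4))));
    }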

    #[test]
    fn test_tokenize_transaction_line() {
        let source = "2024-01-15 * \"Grocery Store\" #food\n  Expenses:Food 50.00 USD";
        let tokens = tokenize(source);

        // Check key tokens are present
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Date(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Star)));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::String(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Tag(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Newline)));
        assert!(
            tokens
                .iter()
                .any(|(t, _)| matches!(t, Token::Indent(_) | Token::DeepIndent(_)))
        );
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Account(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Number(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Currency(_))));
    }

    #[test]
    fn test_tokenize_metadata_key() {
        let tokens = tokenize("filename:");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::MetaKey("filename:")));
    }

    #[test]
    fn test_tokenize_punctuation() {
        let tokens = tokenize("{ } @ @@ , ~");
        let token_types: Vec<_> = tokens.iter().map(|(t, _)| t.clone()).collect();
        assert!(token_types.contains(&Token::LBrace));
        assert!(token_types.contains(&Token::RBrace));
        assert!(token_types.contains(&Token::At));
        assert!(token_types.contains(&Token::AtAt));
        assert!(token_types.contains(&Token::Comma));
        assert!(token_types.contains(&Token::Tilde));
    }
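
    // Extra coverage (added) for the comment-like line tokens and the manual
    // Error token. The expected results assume Logos' longest-match rule
    // prefers Shebang/EmacsDirective over the shorter Flag and Tag patterns.
    #[test]
    fn test_tokenize_shebang_and_emacs_directive() {
        let tokens = tokenize("#!/usr/bin/env bean-web");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Shebang("#!/usr/bin/env bean-web")));

        let tokens = tokenize("#+STARTUP: showall");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::EmacsDirective("#+STARTUP: showall")));
    }

    #[test]
    fn test_tokenize_error_token() {
        // `|` is not matched by any pattern, so the lexer reports an error and
        // `tokenize` wraps the offending slice in Token::Error.
        let tokens = tokenize("|");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Error("|")));
    }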
}