Skip to main content

rustledger_parser/
logos_lexer.rs

//! SIMD-accelerated lexer using Logos.
//!
//! This module provides a fast tokenizer for Beancount syntax using the Logos crate,
//! which generates a DFA-based lexer with SIMD optimizations where available.
5
6use logos::Logos;
7use std::fmt;
8use std::ops::Range;
9
// The leading-BOM strip happens at the `parse()` entry boundary (see
// `crate::bom::strip_leading`). By the time the lexer runs, the source
// is BOM-free at byte 0 by construction. Any U+FEFF the lexer
// encounters is therefore mid-file and unrecognized — logos's default
// error path emits a `Token::Error` for it, and the parser's existing
// error classifier (which searches `error_text` for U+FEFF) surfaces
// the dedicated `ParseErrorKind::BomInDirectiveBody` diagnostic.
//
// There is no BOM-aware lexer callback, no `Token::Bom` variant, and no
// BOM regex in the `Token` enum. The `Err(()) => ...` arm in
// `tokenize_inner` DOES contain one mid-file-BOM special case, though:
// it preserves `at_line_start` and advances `last_newline_end` past the
// leading BOM bytes in the error span, so indented content on the same
// logical line still emits an `Indent` token. That logic lives in the
// `apply_err_layout_transparency` helper below and is unit-tested
// directly (including the multi-BOM coalesced-Err case that logos
// doesn't produce from real input today but might in the future).
27
/// Half-open byte range `[start, end)` locating a token in the source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Byte offset of the first byte covered by the span.
    pub start: usize,
    /// Byte offset one past the last byte covered by the span.
    pub end: usize,
}

/// Builds a [`Span`] from a standard half-open range, keeping both bounds as-is.
impl From<Range<usize>> for Span {
    fn from(Range { start, end }: Range<usize>) -> Self {
        Self { start, end }
    }
}

/// Turns a [`Span`] back into the equivalent `start..end` range
/// (handy for slicing the source string).
impl From<Span> for Range<usize> {
    fn from(Span { start, end }: Span) -> Self {
        start..end
    }
}
51
/// Token types produced by the Logos lexer.
///
/// Variants that carry a `&'src str` borrow the matched slice straight
/// from the source text, so no token owns any string data.
///
/// Horizontal whitespace is emitted as a first-class [`Token::Whitespace`]
/// token (it was previously skipped via `#[logos(skip r"[ \t]+")]`). The
/// existing [`tokenize`] entry point filters whitespace out for
/// backward compatibility with the AST-style parser; the newer
/// [`tokenize_lossless`] entry point keeps it so the CST can
/// reconstruct the source byte-for-byte. Both paths share the same Logos
/// implementation — there is exactly one tokenization pass per file.
///
/// [`Token::Indent`], [`Token::DeepIndent`] and [`Token::Error`] have no
/// Logos pattern: they are synthesized by the post-processing behind
/// [`tokenize`] / [`tokenize_lossless`].
#[derive(Logos, Debug, Clone, PartialEq, Eq)]
pub enum Token<'src> {
    /// Horizontal whitespace (`[ \t]+`). Significant for the CST and
    /// for the existing indent post-processing in [`tokenize`]; both
    /// callers handle this variant.
    #[regex(r"[ \t]+")]
    Whitespace(&'src str),
    // ===== Literals =====
    /// A date in YYYY-MM-DD, YYYY-M-D, YYYY/MM/DD, or YYYY/M/D format.
    /// Single-digit month and day are accepted (e.g., 2024-1-5).
    /// The pattern only checks the digit layout; it does not range-check
    /// the month or day.
    #[regex(r"\d{4}[-/]\d{1,2}[-/]\d{1,2}")]
    Date(&'src str),

    /// A number with optional thousands separators and decimals.
    /// Examples: 123, 1,234.56, 1234.5678, 1. (trailing decimal)
    /// Negative numbers are handled as unary minus (`-` token + number)
    /// to allow subtraction expressions like `3-2` to parse correctly.
    /// Python beancount v3 requires an integer part before the decimal point.
    /// Leading decimals like `.50` are rejected per the beancount v3 spec.
    #[regex(r"(\d{1,3}(,\d{3})*|\d+)(\.\d*)?")]
    Number(&'src str),

    /// A double-quoted string (handles backslash escape sequences).
    /// The slice includes the surrounding quotes.
    #[regex(r#""([^"\\]|\\.)*""#)]
    String(&'src str),

    /// An account name like Assets:Bank:Checking, Капитал:Retained-Earnings,
    /// or 资产:银行:支票.
    ///
    /// The first component starts with an uppercase letter (`\p{Lu}`), a
    /// letter without case like CJK ideographs (`\p{Lo}`), or a titlecase
    /// letter (`\p{Lt}`). Sub-components may also start with an ASCII digit.
    /// Subsequent characters can be any Unicode letter, ASCII digit, or hyphen.
    /// At least one `:`-separated sub-component is required.
    ///
    /// Note: The beancount v3 spec restricts the first character to ASCII
    /// `[A-Z]`, but this is an artifact of the C flex lexer's poor Unicode
    /// support, not a meaningful language design choice (see
    /// beancount/beancount#161, #398, #733).
    ///
    /// The account type prefix is validated later against options (`name_assets`, etc.).
    #[regex(r"[\p{Lu}\p{Lo}\p{Lt}][\p{L}0-9-]*(:([\p{Lu}\p{Lo}\p{Lt}0-9][\p{L}0-9-]*)+)+")]
    Account(&'src str),

    /// A currency/commodity code like USD, EUR, AAPL, BTC, or single-char tickers like T, V, F.
    /// Starts with an uppercase ASCII letter; may continue with uppercase letters, digits,
    /// apostrophes, dots, underscores, and hyphens.
    /// Single-character currencies (e.g., T for AT&T, V for Visa) are valid NYSE/NASDAQ tickers.
    /// Note: Single-char currencies are disambiguated from transaction flags in the parser.
    /// Also supports `/` prefix for options/futures contracts (e.g., `/ESM24`, `/LOX21_211204_P100.25`).
    /// The `/` prefix requires an uppercase letter first to avoid matching `/1.14` as currency.
    /// Priority 3 ensures Currency wins over Flag for single uppercase letters.
    #[regex(r"/[A-Z][A-Z0-9'._-]*|[A-Z][A-Z0-9'._-]*", priority = 3)]
    Currency(&'src str),

    /// A tag like #tag-name. The slice includes the leading `#`.
    #[regex(r"#[a-zA-Z0-9-_/.]+")]
    Tag(&'src str),

    /// A link like ^link-name. The slice includes the leading `^`.
    #[regex(r"\^[a-zA-Z0-9-_/.]+")]
    Link(&'src str),

    // ===== Keywords =====
    // Using #[token] for exact matches (higher priority than regex)
    /// The `txn` keyword for transactions.
    #[token("txn")]
    Txn,
    /// The `balance` directive keyword.
    #[token("balance")]
    Balance,
    /// The `open` directive keyword.
    #[token("open")]
    Open,
    /// The `close` directive keyword.
    #[token("close")]
    Close,
    /// The `commodity` directive keyword.
    #[token("commodity")]
    Commodity,
    /// The `pad` directive keyword.
    #[token("pad")]
    Pad,
    /// The `event` directive keyword.
    #[token("event")]
    Event,
    /// The `query` directive keyword.
    #[token("query")]
    Query,
    /// The `note` directive keyword.
    #[token("note")]
    Note,
    /// The `document` directive keyword.
    #[token("document")]
    Document,
    /// The `price` directive keyword.
    #[token("price")]
    Price,
    /// The `custom` directive keyword.
    #[token("custom")]
    Custom,
    /// The `option` directive keyword (trailing underscore avoids clashing
    /// with the name `Option`).
    #[token("option")]
    Option_,
    /// The `include` directive keyword.
    #[token("include")]
    Include,
    /// The `plugin` directive keyword.
    #[token("plugin")]
    Plugin,
    /// The `pushtag` directive keyword.
    #[token("pushtag")]
    Pushtag,
    /// The `poptag` directive keyword.
    #[token("poptag")]
    Poptag,
    /// The `pushmeta` directive keyword.
    #[token("pushmeta")]
    Pushmeta,
    /// The `popmeta` directive keyword.
    #[token("popmeta")]
    Popmeta,
    /// The `TRUE` boolean literal (also True, true).
    #[token("TRUE")]
    #[token("True")]
    #[token("true")]
    True,
    /// The `FALSE` boolean literal (also False, false).
    #[token("FALSE")]
    #[token("False")]
    #[token("false")]
    False,
    /// The `NULL` literal.
    #[token("NULL")]
    Null,

    // ===== Punctuation =====
    // Multi-character tokens are listed before their single-character prefixes.
    /// Double left brace `{{` for cost specifications (legacy total cost).
    #[token("{{")]
    LDoubleBrace,
    /// Double right brace `}}` for cost specifications.
    #[token("}}")]
    RDoubleBrace,
    /// Left brace with hash `{#` for total cost (new syntax).
    #[token("{#")]
    LBraceHash,
    /// Left brace `{` for cost specifications.
    #[token("{")]
    LBrace,
    /// Right brace `}` for cost specifications.
    #[token("}")]
    RBrace,
    /// Left parenthesis `(` for expressions.
    #[token("(")]
    LParen,
    /// Right parenthesis `)` for expressions.
    #[token(")")]
    RParen,
    /// Double at-sign `@@` for total cost.
    #[token("@@")]
    AtAt,
    /// At-sign `@` for unit cost.
    #[token("@")]
    At,
    /// Colon `:` separator.
    #[token(":")]
    Colon,
    /// Comma `,` separator.
    #[token(",")]
    Comma,
    /// Tilde `~` for tolerance.
    #[token("~")]
    Tilde,
    /// Pipe `|` for deprecated payee/narration separator.
    #[token("|")]
    Pipe,
    /// Plus `+` operator.
    #[token("+")]
    Plus,
    /// Minus `-` operator.
    #[token("-")]
    Minus,
    /// Star `*` for cleared transactions and multiplication.
    #[token("*")]
    Star,
    /// Slash `/` for division.
    #[token("/")]
    Slash,

    // ===== Transaction Flags =====
    /// Pending flag `!` for incomplete transactions.
    #[token("!")]
    Pending,

    /// Other transaction flags: P S T C U R M ? &
    /// Note: `#` is lexed as [`Token::Hash`] and `%` starts a
    /// [`Token::PercentComment`], so neither is produced as a `Flag`.
    #[regex(r"[PSTCURM?&]")]
    Flag(&'src str),

    // ===== Structural =====
    /// Newline, either `\n` or `\r\n` (significant in Beancount for
    /// directive boundaries).
    #[regex(r"\r?\n")]
    Newline,

    /// A comment starting with semicolon.
    /// The slice includes the semicolon and runs up to, but not including,
    /// the next `\r` or `\n`.
    #[regex(r";[^\n\r]*", allow_greedy = true)]
    Comment(&'src str),

    /// Hash token `#` used as separator in cost specs: `{per_unit # total currency}`
    /// Note: In Python beancount, `#` is only a comment at the START of a line.
    /// Mid-line `# text` is NOT a comment - it's either a cost separator or syntax error.
    /// Start-of-line hash comments are handled in post-processing (tokenize function),
    /// which replaces them with a [`Token::Comment`].
    #[token("#")]
    Hash,

    /// A percent comment (ledger-style).
    /// Python beancount accepts % as a comment character for ledger compatibility.
    /// The slice includes the `%` and runs to the end of the line.
    #[regex(r"%[^\n\r]*", allow_greedy = true)]
    PercentComment(&'src str),

    /// Shebang line at start of file (e.g., #!/usr/bin/env bean-web).
    /// Treated as a comment-like directive to skip.
    /// The pattern itself matches `#!` plus the rest of the line wherever it occurs.
    #[regex(r"#![^\n\r]*", allow_greedy = true)]
    Shebang(&'src str),

    /// Emacs org-mode directive (e.g., "#+STARTUP: showall").
    /// These are Emacs configuration lines that should be skipped.
    #[regex(r"#\+[^\n\r]*", allow_greedy = true)]
    EmacsDirective(&'src str),

    /// A metadata key (identifier followed by colon).
    /// Examples: filename:, lineno:, custom-key:, nameOnCard:
    /// The slice includes the trailing colon. Keys must start with a lowercase ASCII letter
    /// per the beancount v3 spec. Keys starting with uppercase are rejected.
    #[regex(r"[a-z][a-zA-Z0-9_-]*:")]
    MetaKey(&'src str),

    /// Indentation token (inserted by post-processing, not by Logos).
    /// Carries the indentation width, where the post-processing counts each
    /// leading space as 1 and each leading tab as 4. Emitted for widths 1 and 2.
    /// This is a placeholder - actual indentation detection happens in [`tokenize`].
    Indent(usize),

    /// Deep indentation (width 3 or more, counted as for [`Token::Indent`]) -
    /// used for posting-level metadata.
    DeepIndent(usize),

    /// Error token for unrecognized input (no Logos pattern; built from
    /// the lexer's error result during post-processing).
    /// Contains the invalid source text for better error messages.
    Error(&'src str),
}
311
312impl Token<'_> {
313    /// Returns true if this is a transaction flag (* or !).
314    /// Single-character currencies (e.g., T, P, C) can also be used as flags.
315    pub const fn is_txn_flag(&self) -> bool {
316        match self {
317            Self::Star | Self::Pending | Self::Flag(_) | Self::Hash => true,
318            // Single-char currencies can be used as transaction flags
319            Self::Currency(s) => s.len() == 1,
320            _ => false,
321        }
322    }
323
324    /// Returns true if this is a keyword that starts a directive.
325    pub const fn is_directive_keyword(&self) -> bool {
326        matches!(
327            self,
328            Self::Txn
329                | Self::Balance
330                | Self::Open
331                | Self::Close
332                | Self::Commodity
333                | Self::Pad
334                | Self::Event
335                | Self::Query
336                | Self::Note
337                | Self::Document
338                | Self::Price
339                | Self::Custom
340                | Self::Option_
341                | Self::Include
342                | Self::Plugin
343                | Self::Pushtag
344                | Self::Poptag
345                | Self::Pushmeta
346                | Self::Popmeta
347        )
348    }
349}
350
351impl fmt::Display for Token<'_> {
352    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
353        match self {
354            Self::Date(s) => write!(f, "{s}"),
355            Self::Number(s) => write!(f, "{s}"),
356            Self::String(s) => write!(f, "{s}"),
357            Self::Account(s) => write!(f, "{s}"),
358            Self::Currency(s) => write!(f, "{s}"),
359            Self::Tag(s) => write!(f, "{s}"),
360            Self::Link(s) => write!(f, "{s}"),
361            Self::Txn => write!(f, "txn"),
362            Self::Balance => write!(f, "balance"),
363            Self::Open => write!(f, "open"),
364            Self::Close => write!(f, "close"),
365            Self::Commodity => write!(f, "commodity"),
366            Self::Pad => write!(f, "pad"),
367            Self::Event => write!(f, "event"),
368            Self::Query => write!(f, "query"),
369            Self::Note => write!(f, "note"),
370            Self::Document => write!(f, "document"),
371            Self::Price => write!(f, "price"),
372            Self::Custom => write!(f, "custom"),
373            Self::Option_ => write!(f, "option"),
374            Self::Include => write!(f, "include"),
375            Self::Plugin => write!(f, "plugin"),
376            Self::Pushtag => write!(f, "pushtag"),
377            Self::Poptag => write!(f, "poptag"),
378            Self::Pushmeta => write!(f, "pushmeta"),
379            Self::Popmeta => write!(f, "popmeta"),
380            Self::True => write!(f, "TRUE"),
381            Self::False => write!(f, "FALSE"),
382            Self::Null => write!(f, "NULL"),
383            Self::LDoubleBrace => write!(f, "{{{{"),
384            Self::RDoubleBrace => write!(f, "}}}}"),
385            Self::LBraceHash => write!(f, "{{#"),
386            Self::LBrace => write!(f, "{{"),
387            Self::RBrace => write!(f, "}}"),
388            Self::LParen => write!(f, "("),
389            Self::RParen => write!(f, ")"),
390            Self::AtAt => write!(f, "@@"),
391            Self::At => write!(f, "@"),
392            Self::Colon => write!(f, ":"),
393            Self::Comma => write!(f, ","),
394            Self::Tilde => write!(f, "~"),
395            Self::Pipe => write!(f, "|"),
396            Self::Plus => write!(f, "+"),
397            Self::Minus => write!(f, "-"),
398            Self::Star => write!(f, "*"),
399            Self::Slash => write!(f, "/"),
400            Self::Pending => write!(f, "!"),
401            Self::Flag(s) => write!(f, "{s}"),
402            Self::Whitespace(s) => write!(f, "{s}"),
403            Self::Newline => write!(f, "\\n"),
404            Self::Comment(s) => write!(f, "{s}"),
405            Self::Hash => write!(f, "#"),
406            Self::PercentComment(s) => write!(f, "{s}"),
407            Self::Shebang(s) => write!(f, "{s}"),
408            Self::EmacsDirective(s) => write!(f, "{s}"),
409            Self::MetaKey(s) => write!(f, "{s}"),
410            Self::Indent(n) => write!(f, "<indent:{n}>"),
411            Self::DeepIndent(n) => write!(f, "<deep-indent:{n}>"),
412            Self::Error(s) => {
413                // Strip any embedded U+FEFF bytes (a mid-file BOM
414                // captured into a lexer error span) so diagnostics
415                // rendering this token stay human-readable. LSP problem
416                // panels, CLI stderr, and GitHub-rendered bug reports
417                // all silently drop or strip literal BOM bytes — the
418                // `<BOM>` placeholder makes the failure mode visible.
419                //
420                // Streamed (rather than `s.replace(...)`) so this
421                // Display impl is zero-allocation. LSP problem panels
422                // re-render diagnostics on every keystroke during
423                // interactive editing; a `String` allocation per
424                // render showed up in flame graphs for files with
425                // many BOM-containing Token::Error tokens. The fast
426                // path (no BOM in `s`) is one `f.write_str(s)` call.
427                if s.contains(crate::bom::BOM_CHAR) {
428                    let mut chunks = s.split(crate::bom::BOM_CHAR);
429                    // Interleave chunks with "<BOM>" between them.
430                    // `split` yields N+1 chunks for N matches, so the
431                    // first chunk is emitted as-is and each subsequent
432                    // chunk gets a `<BOM>` prefix. Final output is
433                    // chunk0 + "<BOM>" + chunk1 + "<BOM>" + chunkN —
434                    // matching the (allocating) `s.replace(...)`
435                    // behavior exactly.
436                    if let Some(first) = chunks.next() {
437                        f.write_str(first)?;
438                    }
439                    for chunk in chunks {
440                        f.write_str("<BOM>")?;
441                        f.write_str(chunk)?;
442                    }
443                    Ok(())
444                } else {
445                    f.write_str(s)
446                }
447            }
448        }
449    }
450}
451
452/// Apply mid-file BOM layout-transparency rules to lexer-state from
453/// inside the `Err` arm of `tokenize`.
454///
455/// A mid-file BOM (U+FEFF) is layout-transparent: it produces an
456/// error diagnostic via the parser's classifier, but must NOT clobber
457/// `at_line_start` or move `last_newline_end` past the BOM, otherwise
458/// the next token on the same logical line (e.g. an indented posting
459/// from a concatenated Windows file) loses its indent classification
460/// and the parser mistypes it. Leading-BOM is handled at the
461/// `crate::parse` boundary and never reaches this code path; only
462/// mid-file BOMs that survived the strip do.
463///
464/// We use `trim_start_matches` (rather than `starts_with` + a single
465/// `BOM_LEN` advance) so a multi-BOM run — e.g., a hypothetical
466/// coalesced `\u{FEFF}\u{FEFF}` Err span from a triple-concatenated
467/// Windows file — is ENTIRELY layout-transparent. Advancing
468/// `last_newline_end` past only the first BOM but then clobbering
469/// `at_line_start` because of the second BOM would cascade into
470/// misclassifying the next real token. The contract is: every BOM
471/// byte is layout-transparent; `at_line_start` is preserved iff the
472/// entire error span is BOM bytes; `last_newline_end` advances past
473/// the full run of leading BOMs.
474///
475/// Extracted as a private helper so the multi-BOM defensive code path
476/// can be unit-tested independently of logos's emission strategy.
477/// Today logos emits one Err per unrecognized char, so the coalesced
478/// path is unreachable from real input; the unit tests at the bottom
479/// of this file feed the helper synthetic `invalid_text` values that
480/// exercise the coalesced case directly.
481fn apply_err_layout_transparency(
482    invalid_text: &str,
483    span_start: usize,
484    at_line_start: &mut bool,
485    last_newline_end: &mut usize,
486) {
487    // Round-17 fix: the contract documented above says "every BOM
488    // byte is layout-transparent" — i.e., a span like
489    // `\u{FEFF}@@\u{FEFF}` should classify its non-BOM bytes for the
490    // at_line_start decision, not its BOM bytes. The previous impl
491    // only inspected the LEADING run of BOMs and clobbered
492    // `at_line_start` for any non-empty tail. That sub-case worked
493    // because a coalesced span starting with BOM + non-BOM tail
494    // really does break the indent contract. But a coalesced span
495    // like `@@\u{FEFF}` (non-BOM head followed by BOM tail) would
496    // also clobber — the BOM in the tail is layout-transparent per
497    // contract, but the head is real content so the clobber is
498    // already correct. The genuinely-wrong case (currently
499    // unreachable but reachable under a future logos upgrade that
500    // coalesces error sequences) is when the ENTIRE span is BOMs,
501    // possibly interleaved with whitespace: those should be fully
502    // layout-transparent. We now extract the LEADING run of BOM
503    // bytes for `last_newline_end` advancement, and consult the
504    // FULL invalid_text minus all BOM bytes for the at_line_start
505    // decision.
506    let after_leading_bom = invalid_text.trim_start_matches(crate::bom::BOM_CHAR);
507    let leading_bom_bytes = invalid_text.len() - after_leading_bom.len();
508    if leading_bom_bytes > 0 && *at_line_start && span_start == *last_newline_end {
509        *last_newline_end = span_start + leading_bom_bytes;
510    }
511
512    // Any non-BOM byte ANYWHERE in the span is "real content" for
513    // indent purposes. An all-BOM span (possibly interleaving BOMs
514    // at any position) leaves `at_line_start` untouched. The
515    // previous `is_empty()` check on JUST the after-leading-BOM
516    // tail had a latent gap for a coalesced `@<BOM>` span: the
517    // leading run is empty, so the `else` arm clobbered — which
518    // happens to be correct for that case, but the path was
519    // accidental rather than principled. Walking the whole span
520    // makes the rule explicit.
521    let has_non_bom_byte = invalid_text.chars().any(|c| c != crate::bom::BOM_CHAR);
522    if has_non_bom_byte {
523        *at_line_start = false;
524    }
525}
526
527/// Tokenize source code into a vector of (Token, Span) pairs for the
528/// AST-style parser.
529///
530/// Filters out [`Token::Whitespace`] (mid-line horizontal whitespace)
531/// but otherwise emits everything the lexer produces, with
532/// post-processing for line-start `#` comments and indentation.
533/// Callers that need a fully-lossless token stream (the CST builder)
534/// use [`tokenize_lossless`] instead.
535pub fn tokenize(source: &str) -> Vec<(Token<'_>, Span)> {
536    tokenize_inner(source, /* keep_whitespace = */ false)
537}
538
539/// Tokenize source code losslessly: every byte of `source` appears in
540/// exactly one emitted `(Token, Span)` entry. This is the input to
541/// the CST builder.
542///
543/// Differs from [`tokenize`] in that [`Token::Whitespace`] tokens are
544/// preserved (the AST-style parser drops them; the CST keeps them so
545/// the round-trip stays byte-identical).
546pub fn tokenize_lossless(source: &str) -> Vec<(Token<'_>, Span)> {
547    tokenize_inner(source, /* keep_whitespace = */ true)
548}
549
550fn tokenize_inner(source: &str, keep_whitespace: bool) -> Vec<(Token<'_>, Span)> {
551    let mut tokens = Vec::new();
552    let mut lexer = Token::lexer(source);
553    let mut at_line_start = true;
554    let mut last_newline_end = 0usize;
555
556    while let Some(result) = lexer.next() {
557        let span = lexer.span();
558
559        if !keep_whitespace && matches!(result, Ok(Token::Whitespace(_))) {
560            // AST-path drops mid-line whitespace; the CST path keeps
561            // it. Layout-relevant whitespace (start-of-line indentation,
562            // BOM error spans) is handled by the dedicated arms below
563            // regardless of which path we are on.
564            continue;
565        }
566
567        match result {
568            Ok(Token::Newline) => {
569                tokens.push((Token::Newline, span.clone().into()));
570                at_line_start = true;
571                last_newline_end = span.end;
572            }
573            Ok(Token::Hash) if at_line_start && span.start == last_newline_end => {
574                // Hash at very start of line (no indentation) is a comment
575                // Find end of line and create a comment token for the whole line
576                let comment_start = span.start;
577                let line_end = source[span.end..]
578                    .find('\n')
579                    .map_or(source.len(), |i| span.end + i);
580                let comment_text = &source[comment_start..line_end];
581                tokens.push((
582                    Token::Comment(comment_text),
583                    Span {
584                        start: comment_start,
585                        end: line_end,
586                    },
587                ));
588                // Skip lexer tokens until we reach the newline
589                while let Some(peek_result) = lexer.next() {
590                    let peek_span = lexer.span();
591                    let peek_end = peek_span.end;
592                    if peek_result == Ok(Token::Newline) {
593                        tokens.push((Token::Newline, peek_span.into()));
594                        at_line_start = true;
595                        last_newline_end = peek_end;
596                        break;
597                    }
598                    // Skip other tokens on the comment line
599                }
600            }
601            Ok(token) => {
602                // Check for indentation at line start
603                if at_line_start && span.start > last_newline_end {
604                    // Count leading whitespace between last newline and this token
605                    // Tabs count as indentation (treat 1 tab as 4 spaces for counting purposes)
606                    let leading = &source[last_newline_end..span.start];
607                    let mut space_count = 0;
608                    let mut char_count = 0;
609                    for c in leading.chars() {
610                        match c {
611                            ' ' => {
612                                space_count += 1;
613                                char_count += 1;
614                            }
615                            '\t' => {
616                                space_count += 4; // Treat tab as 4 spaces
617                                char_count += 1;
618                            }
619                            _ => break,
620                        }
621                    }
622                    // Python beancount accepts 1+ space for metadata indentation
623                    if space_count >= 1 {
624                        let indent_start = last_newline_end;
625                        let indent_end = last_newline_end + char_count;
626                        // Use DeepIndent for 3+ spaces (posting metadata level).
627                        // Python beancount allows flexible indentation where posting
628                        // metadata just needs to be more indented than the posting.
629                        // Common patterns: 2-space posting / 4-space meta, or
630                        // 1-space posting / 3-space meta (as in beancount_reds_plugins).
631                        let indent_token = if space_count >= 3 {
632                            Token::DeepIndent(space_count)
633                        } else {
634                            Token::Indent(space_count)
635                        };
636                        tokens.push((
637                            indent_token,
638                            Span {
639                                start: indent_start,
640                                end: indent_end,
641                            },
642                        ));
643                    }
644                }
645                at_line_start = false;
646                tokens.push((token, span.into()));
647            }
648            Err(()) => {
649                // Lexer error - produce an Error token with the invalid source text.
650                let invalid_text = &source[span.clone()];
651                apply_err_layout_transparency(
652                    invalid_text,
653                    span.start,
654                    &mut at_line_start,
655                    &mut last_newline_end,
656                );
657                tokens.push((Token::Error(invalid_text), span.into()));
658            }
659        }
660    }
661
662    tokens
663}
664
665#[cfg(test)]
666mod tests {
667    use super::*;
668
669    #[test]
670    fn test_tokenize_date() {
671        let tokens = tokenize("2024-01-15");
672        assert_eq!(tokens.len(), 1);
673        assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
674    }
675
676    #[test]
677    fn test_tokenize_date_single_digit_month() {
678        // Single-digit month should be tokenized as Date
679        let tokens = tokenize("2024-1-15");
680        assert_eq!(tokens.len(), 1);
681        assert!(matches!(tokens[0].0, Token::Date("2024-1-15")));
682    }
683
684    #[test]
685    fn test_tokenize_date_single_digit_day() {
686        // Single-digit day should be tokenized as Date
687        let tokens = tokenize("2024-01-5");
688        assert_eq!(tokens.len(), 1);
689        assert!(matches!(tokens[0].0, Token::Date("2024-01-5")));
690    }
691
692    #[test]
693    fn test_tokenize_date_single_digit_month_and_day() {
694        // Single-digit month and day should be tokenized as Date
695        let tokens = tokenize("2024-1-1");
696        assert_eq!(tokens.len(), 1);
697        assert!(matches!(tokens[0].0, Token::Date("2024-1-1")));
698    }
699
700    #[test]
701    fn test_tokenize_date_slash_separator_single_digit() {
702        // Slash separator with single-digit parts
703        let tokens = tokenize("2024/1/5");
704        assert_eq!(tokens.len(), 1);
705        assert!(matches!(tokens[0].0, Token::Date("2024/1/5")));
706    }
707
708    #[test]
709    fn test_tokenize_number() {
710        let tokens = tokenize("1234.56");
711        assert_eq!(tokens.len(), 1);
712        assert!(matches!(tokens[0].0, Token::Number("1234.56")));
713
714        // Negative numbers are now Minus + Number (enables subtraction expressions)
715        let tokens = tokenize("-1,234.56");
716        assert_eq!(tokens.len(), 2);
717        assert!(matches!(tokens[0].0, Token::Minus));
718        assert!(matches!(tokens[1].0, Token::Number("1,234.56")));
719    }
720
721    #[test]
722    fn test_tokenize_account() {
723        let tokens = tokenize("Assets:Bank:Checking");
724        assert_eq!(tokens.len(), 1);
725        assert!(matches!(
726            tokens[0].0,
727            Token::Account("Assets:Bank:Checking")
728        ));
729    }
730
#[test]
fn test_tokenize_account_unicode() {
    // Account components may begin with a Unicode uppercase letter or
    // a CJK character; emoji and other symbols are rejected.

    // Emoji trailing an otherwise valid ASCII component: not an Account,
    // and the stream must contain at least one Error token.
    let with_emoji = tokenize("Assets:CORP✨");
    let emoji_accepted = matches!(&with_emoji[0].0, Token::Account("Assets:CORP✨"));
    assert!(
        !emoji_accepted,
        "Unicode emoji in account name should not tokenize as a valid Account"
    );
    let saw_error = with_emoji
        .iter()
        .any(|(tok, _)| matches!(tok, Token::Error(_)));
    assert!(
        saw_error,
        "Unicode emoji should produce at least one Error token"
    );

    // Sub-component that starts with CJK ideographs (\p{Lo}): accepted.
    let cjk_start = tokenize("Assets:沪深300");
    let cjk_start_ok = matches!(&cjk_start[0].0, Token::Account("Assets:沪深300"));
    assert!(
        cjk_start_ok,
        "CJK characters at the start of a sub-component should tokenize as Account"
    );

    // Sub-component written entirely in CJK: accepted.
    let cjk_component = tokenize("Assets:日本銀行");
    let cjk_component_ok = matches!(&cjk_component[0].0, Token::Account("Assets:日本銀行"));
    assert!(
        cjk_component_ok,
        "CJK sub-component should tokenize as Account"
    );

    // Account type starting with a Cyrillic uppercase letter (\p{Lu}): accepted.
    let cyrillic = tokenize("Капитал:Retained");
    let cyrillic_ok = matches!(&cyrillic[0].0, Token::Account("Капитал:Retained"));
    assert!(
        cyrillic_ok,
        "Cyrillic-starting account should tokenize as Account"
    );

    // Every component in CJK: accepted.
    let all_cjk = tokenize("资产:银行:支票");
    let all_cjk_ok = matches!(&all_cjk[0].0, Token::Account("资产:银行:支票"));
    assert!(all_cjk_ok, "Fully CJK account should tokenize as Account");
}
775
/// Regression test for issues #736/#739: a sub-component that begins
/// with an ASCII letter may continue with non-ASCII Unicode letters
/// (valid per the beancount v3 spec), and the whole name must lex as
/// a single `Account` token.
#[test]
fn test_tokenize_account_unicode_letters_after_ascii_start() {
    // Accented Latin capital (É) inside a component with an ASCII start.
    let tokens = tokenize("Assets:Banque-Épargne");
    let accepted = matches!(&tokens[0].0, Token::Account("Assets:Banque-Épargne"));
    assert!(
        accepted,
        "accented Latin letter after ASCII start should tokenize as Account, got: {tokens:?}"
    );

    // German umlaut (ü) following an ASCII first letter.
    let tokens = tokenize("Assets:Müller");
    let accepted = matches!(&tokens[0].0, Token::Account("Assets:Müller"));
    assert!(
        accepted,
        "German umlaut after ASCII start should tokenize as Account, got: {tokens:?}"
    );

    // CJK letters appended to an ASCII prefix.
    let tokens = tokenize("Assets:CorpJP日本");
    let accepted = matches!(&tokens[0].0, Token::Account("Assets:CorpJP日本"));
    assert!(
        accepted,
        "CJK letters after ASCII start should tokenize as Account, got: {tokens:?}"
    );
}
801
#[test]
fn test_tokenize_currency() {
    // A three-letter uppercase code lexes as one Currency token.
    let lexed = tokenize("USD");
    assert_eq!(lexed.len(), 1);
    let (only, _) = &lexed[0];
    assert!(matches!(only, Token::Currency("USD")));
}
808
#[test]
fn test_tokenize_single_char_currency() {
    // One-letter tickers exist on NYSE/NASDAQ (T = AT&T, V = Visa,
    // F = Ford, X = US Steel); each must lex as a lone Currency token
    // whose text is the ticker itself.
    for ticker in ["T", "V", "F"] {
        let lexed = tokenize(ticker);
        assert_eq!(lexed.len(), 1);
        assert!(matches!(&lexed[0].0, Token::Currency(sym) if *sym == ticker));
    }
}
824
#[test]
fn test_single_char_currency_is_txn_flag() {
    // A one-character Currency is reported as a possible transaction flag…
    let single = Token::Currency("T");
    assert!(single.is_txn_flag());

    // …whereas a longer Currency is not.
    let multi = Token::Currency("USD");
    assert!(!multi.is_txn_flag());
}
835
#[test]
fn test_tokenize_string() {
    // The String token's text includes the surrounding double quotes.
    let quoted = r#""Hello, World!""#;
    let lexed = tokenize(quoted);
    assert_eq!(lexed.len(), 1);
    assert!(matches!(&lexed[0].0, Token::String(text) if *text == quoted));
}
842
#[test]
fn test_tokenize_keywords() {
    // Four space-separated directive keywords yield four keyword
    // tokens, in input order.
    let lexed = tokenize("txn balance open close");
    assert_eq!(lexed.len(), 4);
    let kinds: Vec<_> = lexed.iter().map(|(tok, _)| tok).collect();
    assert!(matches!(kinds[0], Token::Txn));
    assert!(matches!(kinds[1], Token::Balance));
    assert!(matches!(kinds[2], Token::Open));
    assert!(matches!(kinds[3], Token::Close));
}
852
#[test]
fn test_tokenize_tag_and_link() {
    // Tag and Link tokens keep their `#` / `^` sigils in the token text.
    let lexed = tokenize("#my-tag ^my-link");
    assert_eq!(lexed.len(), 2);
    let (tag, _) = &lexed[0];
    let (link, _) = &lexed[1];
    assert!(matches!(tag, Token::Tag("#my-tag")));
    assert!(matches!(link, Token::Link("^my-link")));
}
860
#[test]
fn test_tokenize_comment() {
    // A `;` comment lexes as one Comment token whose text starts at
    // the semicolon and runs to the end of the input.
    let line = "; This is a comment";
    let lexed = tokenize(line);
    assert_eq!(lexed.len(), 1);
    assert!(matches!(&lexed[0].0, Token::Comment(text) if *text == line));
}
867
#[test]
fn test_tokenize_indentation() {
    // Expected stream per the intent of this test: Txn, Newline, Indent,
    // Account, Number, Currency. Only the presence of an Indent token
    // is actually asserted.
    let lexed = tokenize("txn\n  Assets:Bank 100 USD");
    let saw_indent = lexed
        .iter()
        .any(|(tok, _)| matches!(tok, Token::Indent(_)));
    assert!(saw_indent);
}
874
/// Rendering a `Token::Error` through `Display` must not emit a raw
/// U+FEFF: an embedded BOM is shown as the visible placeholder `<BOM>`
/// so diagnostics stay human-readable. A leading BOM is stripped at the
/// `crate::parse` boundary (see `crate::bom`), so this only matters for
/// mid-file BOMs that land in the lexer's default error path.
#[test]
fn test_display_token_error_strips_embedded_bom() {
    let rendered = Token::Error("foo\u{FEFF}bar").to_string();
    assert_eq!(rendered, "foo<BOM>bar");
    let leaks_raw_bom = rendered.contains(crate::bom::BOM_CHAR);
    assert!(!leaks_raw_bom);
}
888
/// Lexer-side pin for mid-file BOM handling. The `Token` enum has no
/// BOM rule, so a U+FEFF that is not at byte 0 is an unrecognized
/// input and must surface as a `Token::Error` whose text contains the
/// BOM character. Turning that into the dedicated diagnostic is the
/// parser's job (its error classifier searches the error text for
/// U+FEFF) and is not checked here.
#[test]
fn test_tokenize_mid_file_bom_falls_into_error_path() {
    // `tokenize` is invoked directly rather than through `parse`, and
    // the BOM is placed after the first line, so the byte-0 strip done
    // by `parse` is not involved.
    let source = "2024-01-01 open Assets:Bank USD\n\u{FEFF}";
    let tokens = tokenize(source);
    let has_bom_in_error = tokens
        .iter()
        .any(|(t, _)| matches!(t, Token::Error(s) if s.contains(crate::bom::BOM_CHAR)));
    assert!(
        has_bom_in_error,
        "mid-file BOM should fall into `Token::Error`, got: {tokens:?}"
    );
}
916
/// A mid-file BOM sitting at the start of a line must be transparent
/// to layout: indented content that follows it on the same line (the
/// `cat windows-a.bean windows-b.bean` concatenation case) still gets
/// its `Token::Indent`.
///
/// If the BOM error were treated like any other lex error, the
/// line-start state would be cleared, no Indent would be emitted, and
/// the parser would see the indented line as a top-level directive,
/// producing cascading errors instead of the targeted BOM diagnostic.
#[test]
fn test_mid_file_bom_at_line_start_preserves_following_indent() {
    // Shape: directive, newline, BOM, then a two-space-indented
    // metadata line. Calling `tokenize` directly bypasses the
    // strip-at-entry in `parse`, so the BOM really is mid-file.
    let source = "2024-01-01 open Assets:Bank USD\n\u{FEFF}  meta-key: \"v\"\n";
    let tokens = tokenize(source);

    // 1. The BOM itself is reported as an Error token holding exactly the BOM.
    let has_bom_error = tokens
        .iter()
        .any(|(t, _)| matches!(t, Token::Error(s) if *s == crate::bom::BOM));
    assert!(
        has_bom_error,
        "expected Token::Error(\"\\u{{FEFF}}\") in stream, got: {tokens:?}"
    );

    // 2. The two-space indent after the BOM is emitted as Indent(2).
    let has_indent_2 = tokens.iter().any(|(t, _)| matches!(t, Token::Indent(2)));
    assert!(
        has_indent_2,
        "mid-file BOM at line start must not swallow the following Indent; got: {tokens:?}"
    );

    // 3. The metadata key on that same line lexes normally.
    let has_meta_key = tokens
        .iter()
        .any(|(t, _)| matches!(t, Token::MetaKey("meta-key:")));
    assert!(
        has_meta_key,
        "expected MetaKey after BOM-prefixed indent, got: {tokens:?}"
    );
}
964
/// Two BOMs back to back at the start of a line (as a
/// triple-concatenated Windows file would produce) are each reported
/// as their own `Token::Error`, and together they remain transparent
/// to layout: the indented metadata that follows on the same line
/// still yields its `Indent` and `MetaKey` tokens.
#[test]
fn test_consecutive_mid_file_boms_preserve_layout() {
    let source = "2024-01-01 open Assets:Bank USD\n\u{FEFF}\u{FEFF}  meta-key: \"v\"\n";
    let tokens = tokenize(source);

    // Count the Error tokens whose text is exactly one BOM; expect two.
    let mut bom_error_count: usize = 0;
    for (t, _) in tokens.iter() {
        if matches!(t, Token::Error(s) if *s == crate::bom::BOM) {
            bom_error_count += 1;
        }
    }
    assert_eq!(
        bom_error_count, 2,
        "expected 2 Token::Error(BOM) tokens, got: {tokens:?}"
    );

    // The indent on the BOM-prefixed line must still be emitted.
    let has_indent_2 = tokens.iter().any(|(t, _)| matches!(t, Token::Indent(2)));
    assert!(
        has_indent_2,
        "consecutive mid-file BOMs at line start must not swallow following indent; \
         got: {tokens:?}"
    );

    // The metadata key after the indent lexes normally.
    let has_meta_key = tokens
        .iter()
        .any(|(t, _)| matches!(t, Token::MetaKey("meta-key:")));
    assert!(
        has_meta_key,
        "expected MetaKey after consecutive-BOM-prefixed indent, got: {tokens:?}"
    );
}
1002
1003    // ===== Direct tests of `apply_err_layout_transparency` =====
1004    //
1005    // These tests exercise the helper independently of logos's
1006    // emission strategy. Today logos emits one Err per unrecognized
1007    // char, so the multi-BOM-in-one-Err code path (the
1008    // `trim_start_matches` loop's motivating case) is unreachable
1009    // from real input. The tests below feed the helper synthetic
1010    // invalid_text values so the defensive code is actually
1011    // validated rather than documentation-only.
1012
/// One error span holding two BOMs, starting exactly where the line
/// starts: the line-start flag stays set and `last_newline_end` moves
/// forward by the length of both BOMs, not just the first.
#[test]
fn err_layout_transparency_coalesced_double_bom_at_line_start() {
    let line_begin = 10;
    let mut still_at_line_start = true;
    let mut newline_end = line_begin;
    apply_err_layout_transparency(
        "\u{FEFF}\u{FEFF}",
        line_begin,
        &mut still_at_line_start,
        &mut newline_end,
    );
    assert!(
        still_at_line_start,
        "all-BOM error span must preserve at_line_start"
    );
    assert_eq!(
        newline_end,
        10 + 2 * crate::bom::BOM_LEN,
        "last_newline_end must advance past BOTH BOMs, not just the first"
    );
}
1038
/// An error span of two BOMs followed by ordinary text: the text
/// clears the line-start flag, while `last_newline_end` still advances
/// over the BOM prefix and stops before the text.
#[test]
fn err_layout_transparency_coalesced_bom_with_trailing_content() {
    let line_begin = 10;
    let mut still_at_line_start = true;
    let mut newline_end = line_begin;
    apply_err_layout_transparency(
        "\u{FEFF}\u{FEFF}xyz",
        line_begin,
        &mut still_at_line_start,
        &mut newline_end,
    );
    assert!(
        !still_at_line_start,
        "trailing non-BOM content must clobber at_line_start"
    );
    assert_eq!(
        newline_end,
        10 + 2 * crate::bom::BOM_LEN,
        "last_newline_end advances past leading BOMs, NOT past trailing content"
    );
}
1064
/// An error span with no BOM at all gets the ordinary treatment: the
/// line-start flag is cleared and `last_newline_end` is left alone.
#[test]
fn err_layout_transparency_non_bom_clobbers() {
    let span_start = 10;
    let mut still_at_line_start = true;
    let mut newline_end = 10;
    apply_err_layout_transparency(
        "garbage",
        span_start,
        &mut still_at_line_start,
        &mut newline_end,
    );
    assert!(!still_at_line_start);
    assert_eq!(newline_end, 10, "non-BOM error must not advance");
}
1075
/// An all-BOM error span that appears mid-line (the line-start flag is
/// already false on entry) changes nothing: the flag stays false and
/// `last_newline_end` keeps its value.
#[test]
fn err_layout_transparency_all_bom_not_at_line_start_is_noop() {
    // Mid-line state: flag already cleared, span begins after the line start.
    let mut still_at_line_start = false;
    let mut newline_end = 10;
    apply_err_layout_transparency(
        "\u{FEFF}\u{FEFF}",
        20,
        &mut still_at_line_start,
        &mut newline_end,
    );
    assert!(!still_at_line_start);
    assert_eq!(newline_end, 10, "guard prevents stale advance");
}
1094
/// Companion to the not-at-line-start case. The advance of
/// `last_newline_end` is guarded by two conditions (`at_line_start`
/// and `span_start == last_newline_end`). That case makes the first
/// condition false; this one keeps `at_line_start` true and makes the
/// second false by starting the span away from `last_newline_end`.
///
/// Covering each condition separately means a refactor that turned the
/// `&&` into `||` would be caught.
#[test]
fn err_layout_transparency_all_bom_span_mismatch_is_noop() {
    // Flag is set, but the span starts at 20 while the recorded line
    // start is 10, so no advance may happen.
    let mut still_at_line_start = true;
    let mut newline_end = 10;
    apply_err_layout_transparency(
        "\u{FEFF}\u{FEFF}",
        20,
        &mut still_at_line_start,
        &mut newline_end,
    );
    assert!(
        still_at_line_start,
        "all-BOM error span must preserve at_line_start regardless of span-vs-last-newline match"
    );
    assert_eq!(
        newline_end, 10,
        "span_start != last_newline_end must prevent stale advance"
    );
}
1129
/// Round-17/18 contract: every BOM in a coalesced error span is
/// layout-transparent, wherever it sits in the span. Any non-BOM
/// content clears the line-start flag; only a span made up entirely of
/// BOMs preserves it.
///
/// This test pins the preservation half: an all-BOM span that starts
/// exactly at `last_newline_end` keeps the flag set and advances
/// `last_newline_end` over the whole run. The neighbouring tests cover
/// spans with a non-BOM head, tail, or middle.
#[test]
fn err_layout_transparency_bom_only_in_any_arrangement_preserves() {
    let invalid_text = "\u{FEFF}\u{FEFF}";
    // Span start equals the recorded line start, so the advance applies.
    let span_start = 10;
    let mut still_at_line_start = true;
    let mut newline_end = 10;
    apply_err_layout_transparency(
        invalid_text,
        span_start,
        &mut still_at_line_start,
        &mut newline_end,
    );
    assert!(still_at_line_start, "all-BOM span preserves at_line_start");
    assert_eq!(
        newline_end, 16,
        "leading BOM run advances last_newline_end past both BOM bytes \
         (each BOM is 3 UTF-8 bytes)"
    );
}
1165
/// A span that opens with non-BOM text (`@@`) clears the line-start
/// flag even though a BOM follows. This pins existing behaviour so a
/// future BOM-only trim cannot overlook non-BOM bytes at the head.
#[test]
fn err_layout_transparency_non_bom_head_clobbers() {
    let invalid_text = "@@\u{FEFF}";
    let span_start = 10;
    let mut still_at_line_start = true;
    let mut newline_end = 0;
    apply_err_layout_transparency(
        invalid_text,
        span_start,
        &mut still_at_line_start,
        &mut newline_end,
    );
    assert!(
        !still_at_line_start,
        "non-BOM head ('@@') clobbers at_line_start regardless of trailing BOM"
    );
}
1179
/// A span that opens with a BOM and ends with non-BOM text (`@@`): the
/// tail clears the line-start flag, while `last_newline_end` still
/// advances over the single leading BOM (10 to 13).
#[test]
fn err_layout_transparency_bom_head_non_bom_tail_clobbers() {
    let invalid_text = "\u{FEFF}@@";
    let span_start = 10;
    let mut still_at_line_start = true;
    let mut newline_end = 10;
    apply_err_layout_transparency(
        invalid_text,
        span_start,
        &mut still_at_line_start,
        &mut newline_end,
    );
    assert!(
        !still_at_line_start,
        "non-BOM tail ('@@') clobbers at_line_start even though span starts with BOM"
    );
    assert_eq!(
        newline_end, 13,
        "leading BOM run STILL advances last_newline_end past the BOM"
    );
}
1197
/// Non-BOM text (`@@`) sandwiched between two BOMs clears the
/// line-start flag, and `last_newline_end` advances over the leading
/// BOM only (10 to 13). This is the shape the round-17 contract names
/// explicitly: the whole span is inspected for non-BOM content, not
/// just what follows the leading BOM run.
#[test]
fn err_layout_transparency_bom_flanking_non_bom_clobbers() {
    let invalid_text = "\u{FEFF}@@\u{FEFF}";
    let span_start = 10;
    let mut still_at_line_start = true;
    let mut newline_end = 10;
    apply_err_layout_transparency(
        invalid_text,
        span_start,
        &mut still_at_line_start,
        &mut newline_end,
    );
    assert!(
        !still_at_line_start,
        "non-BOM middle ('@@') clobbers at_line_start"
    );
    assert_eq!(
        newline_end, 13,
        "leading BOM run advances last_newline_end past the leading BOM only"
    );
}
1223
#[test]
fn test_tokenize_transaction_line() {
    // A transaction header followed by one indented posting.
    let source = "2024-01-15 * \"Grocery Store\" #food\n  Expenses:Food 50.00 USD";
    let lexed = tokenize(source);

    // Presence checks only: each expected token kind must occur at
    // least once somewhere in the stream.
    let has_date = lexed.iter().any(|(tok, _)| matches!(tok, Token::Date(_)));
    assert!(has_date);
    let has_star = lexed.iter().any(|(tok, _)| matches!(tok, Token::Star));
    assert!(has_star);
    let has_string = lexed.iter().any(|(tok, _)| matches!(tok, Token::String(_)));
    assert!(has_string);
    let has_tag = lexed.iter().any(|(tok, _)| matches!(tok, Token::Tag(_)));
    assert!(has_tag);
    let has_newline = lexed.iter().any(|(tok, _)| matches!(tok, Token::Newline));
    assert!(has_newline);
    // Either indent flavour satisfies the indentation check.
    let has_any_indent = lexed
        .iter()
        .any(|(tok, _)| matches!(tok, Token::Indent(_) | Token::DeepIndent(_)));
    assert!(has_any_indent);
    let has_account = lexed.iter().any(|(tok, _)| matches!(tok, Token::Account(_)));
    assert!(has_account);
    let has_number = lexed.iter().any(|(tok, _)| matches!(tok, Token::Number(_)));
    assert!(has_number);
    let has_currency = lexed.iter().any(|(tok, _)| matches!(tok, Token::Currency(_)));
    assert!(has_currency);
}
1244
#[test]
fn test_tokenize_metadata_key() {
    // The MetaKey token text includes the trailing colon.
    let lexed = tokenize("filename:");
    assert_eq!(lexed.len(), 1);
    let (only, _) = &lexed[0];
    assert!(matches!(only, Token::MetaKey("filename:")));
}
1251
#[test]
fn test_tokenize_punctuation() {
    // Each punctuation token must occur somewhere in the stream; order
    // and count are not checked. `@` and `@@` must both be present as
    // distinct tokens.
    let lexed = tokenize("{ } @ @@ , ~");
    let expected_kinds = [
        Token::LBrace,
        Token::RBrace,
        Token::At,
        Token::AtAt,
        Token::Comma,
        Token::Tilde,
    ];
    for expected in expected_kinds {
        assert!(lexed.iter().any(|(tok, _)| *tok == expected));
    }
}
1263}