rustledger_parser/logos_lexer.rs
1//! SIMD-accelerated lexer using Logos.
2//!
3//! This module provides a fast tokenizer for Beancount syntax using the Logos crate,
4//! which generates a DFA-based lexer with SIMD optimizations where available.
5
6use logos::Logos;
7use std::fmt;
8use std::ops::Range;
9
10// The leading-BOM strip happens at the `parse()` entry boundary (see
11// `crate::bom::strip_leading`). By the time the lexer runs, the source
12// is BOM-free at byte 0 by construction. Any U+FEFF byte the lexer
13// encounters is therefore mid-file and unrecognized — logos's default
14// error path emits a `Token::Error` for it, and the parser's existing
15// error classifier (which searches `error_text` for U+FEFF) surfaces
16// the dedicated `ParseErrorKind::BomInDirectiveBody` diagnostic.
17//
18// No BOM-aware lexer callback, no `Token::Bom` variant, and no
19// BOM regex in the Token enum — but the `Err(()) => ...` arm in
20// `tokenize` DOES contain one mid-file-BOM special case: it preserves
21// `at_line_start` and advances `last_newline_end` past leading BOM
22// bytes in the error span, so indented content on the same logical
23// line still emits an `Indent` token. That logic lives in the
24// `apply_err_layout_transparency` helper below and is unit-tested
25// directly (including the multi-BOM coalesced-Err case that logos
26// doesn't produce from real input today but might in the future).
27
/// A span in the source code, expressed as a half-open byte range
/// `start..end` (byte offsets, not character indices).
///
/// Converts to and from [`Range<usize>`] through the `From` impls in this
/// module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Start byte offset (inclusive).
    pub start: usize,
    /// End byte offset (exclusive).
    pub end: usize,
}
36
37impl From<Range<usize>> for Span {
38 fn from(range: Range<usize>) -> Self {
39 Self {
40 start: range.start,
41 end: range.end,
42 }
43 }
44}
45
46impl From<Span> for Range<usize> {
47 fn from(span: Span) -> Self {
48 span.start..span.end
49 }
50}
51
/// Token types produced by the Logos lexer.
///
/// Horizontal whitespace is emitted as a first-class [`Token::Whitespace`]
/// token (was previously skipped via `#[logos(skip r"[ \t]+")]`). The
/// existing [`tokenize`] entry point filters whitespace out for
/// backward-compat with the AST-style parser; the new
/// [`tokenize_lossless`] entry point keeps them so the CST can
/// reconstruct source byte-for-byte. Both paths share the same Logos
/// implementation — there is exactly one tokenization pass per file.
///
/// [`Token::Indent`], [`Token::DeepIndent`] and [`Token::Error`] carry no
/// Logos attribute: they are never produced by the generated lexer itself,
/// only by the post-processing in `tokenize_inner`.
#[derive(Logos, Debug, Clone, PartialEq, Eq)]
pub enum Token<'src> {
    /// Horizontal whitespace (`[ \t]+`). Significant for the CST and
    /// for the existing indent post-processing in [`tokenize`]; both
    /// callers handle this variant.
    #[regex(r"[ \t]+")]
    Whitespace(&'src str),
    // ===== Literals =====
    /// A date in YYYY-MM-DD, YYYY-M-D, YYYY/MM/DD, or YYYY/M/D format.
    /// Single-digit month and day are accepted (e.g., 2024-1-5).
    /// Only the shape is checked here; the regex does not validate that
    /// the month/day values form a real calendar date.
    #[regex(r"\d{4}[-/]\d{1,2}[-/]\d{1,2}")]
    Date(&'src str),

    /// A number with optional thousands separators and decimals.
    /// Examples: 123, 1,234.56, 1234.5678, 1. (trailing decimal)
    /// Negative numbers are handled as unary minus (`-` token + number)
    /// to allow subtraction expressions like `3-2` to parse correctly.
    /// Python beancount v3 requires an integer part before the decimal point.
    /// Leading decimals like `.50` are rejected per the beancount v3 spec.
    #[regex(r"(\d{1,3}(,\d{3})*|\d+)(\.\d*)?")]
    Number(&'src str),

    /// A double-quoted string (handles escape sequences).
    /// The slice includes the quotes. The character class does not exclude
    /// line breaks, so a string may span multiple lines.
    #[regex(r#""([^"\\]|\\.)*""#)]
    String(&'src str),

    /// An account name like Assets:Bank:Checking, Капитал:Retained-Earnings,
    /// or 资产:银行:支票.
    ///
    /// The first component starts with an uppercase letter (`\p{Lu}`), a
    /// letter without case like CJK ideographs (`\p{Lo}`), or a titlecase
    /// letter (`\p{Lt}`). Sub-components may also start with an ASCII digit.
    /// Subsequent characters can be any Unicode letter, ASCII digit, or hyphen.
    /// At least one `:`-separated sub-component is required.
    ///
    /// Note: The beancount v3 spec restricts the first character to ASCII
    /// `[A-Z]`, but this is an artifact of the C flex lexer's poor Unicode
    /// support, not a meaningful language design choice (see
    /// beancount/beancount#161, #398, #733).
    ///
    /// The account type prefix is validated later against options (`name_assets`, etc.).
    #[regex(r"[\p{Lu}\p{Lo}\p{Lt}][\p{L}0-9-]*(:([\p{Lu}\p{Lo}\p{Lt}0-9][\p{L}0-9-]*)+)+")]
    Account(&'src str),

    /// A currency/commodity code like USD, EUR, AAPL, BTC, or single-char tickers like T, V, F.
    /// Uppercase letters, can contain digits, apostrophes, dots, underscores, hyphens.
    /// Single-character currencies (e.g., T for AT&T, V for Visa) are valid NYSE/NASDAQ tickers.
    /// Note: Single-char currencies are disambiguated from transaction flags in the parser.
    /// Also supports `/` prefix for options/futures contracts (e.g., `/ESM24`, `/LOX21_211204_P100.25`).
    /// The `/` prefix requires an uppercase letter first to avoid matching `/1.14` as currency.
    /// Priority 3 ensures Currency wins over Flag for single uppercase letters.
    #[regex(r"/[A-Z][A-Z0-9'._-]*|[A-Z][A-Z0-9'._-]*", priority = 3)]
    Currency(&'src str),

    /// A tag like #tag-name. The slice includes the leading `#`.
    #[regex(r"#[a-zA-Z0-9-_/.]+")]
    Tag(&'src str),

    /// A link like ^link-name. The slice includes the leading `^`.
    #[regex(r"\^[a-zA-Z0-9-_/.]+")]
    Link(&'src str),

    // ===== Keywords =====
    // Using #[token] for exact matches (higher priority than regex)
    /// The `txn` keyword for transactions.
    #[token("txn")]
    Txn,
    /// The `balance` directive keyword.
    #[token("balance")]
    Balance,
    /// The `open` directive keyword.
    #[token("open")]
    Open,
    /// The `close` directive keyword.
    #[token("close")]
    Close,
    /// The `commodity` directive keyword.
    #[token("commodity")]
    Commodity,
    /// The `pad` directive keyword.
    #[token("pad")]
    Pad,
    /// The `event` directive keyword.
    #[token("event")]
    Event,
    /// The `query` directive keyword.
    #[token("query")]
    Query,
    /// The `note` directive keyword.
    #[token("note")]
    Note,
    /// The `document` directive keyword.
    #[token("document")]
    Document,
    /// The `price` directive keyword.
    #[token("price")]
    Price,
    /// The `custom` directive keyword.
    #[token("custom")]
    Custom,
    /// The `option` directive keyword.
    /// (Named `Option_` to avoid clashing with `std::option::Option`.)
    #[token("option")]
    Option_,
    /// The `include` directive keyword.
    #[token("include")]
    Include,
    /// The `plugin` directive keyword.
    #[token("plugin")]
    Plugin,
    /// The `pushtag` directive keyword.
    #[token("pushtag")]
    Pushtag,
    /// The `poptag` directive keyword.
    #[token("poptag")]
    Poptag,
    /// The `pushmeta` directive keyword.
    #[token("pushmeta")]
    Pushmeta,
    /// The `popmeta` directive keyword.
    #[token("popmeta")]
    Popmeta,
    /// The `TRUE` boolean literal (also True, true).
    #[token("TRUE")]
    #[token("True")]
    #[token("true")]
    True,
    /// The `FALSE` boolean literal (also False, false).
    #[token("FALSE")]
    #[token("False")]
    #[token("false")]
    False,
    /// The `NULL` literal.
    #[token("NULL")]
    Null,

    // ===== Punctuation =====
    // Longer tokens are listed before their prefixes (`{{` before `{`).
    // NOTE(review): Logos is documented to prefer the longest match, so the
    // listing order is presumably for readability only — confirm before
    // relying on declaration order for disambiguation.
    /// Double left brace `{{` for cost specifications (legacy total cost).
    #[token("{{")]
    LDoubleBrace,
    /// Double right brace `}}` for cost specifications.
    #[token("}}")]
    RDoubleBrace,
    /// Left brace with hash `{#` for total cost (new syntax).
    #[token("{#")]
    LBraceHash,
    /// Left brace `{` for cost specifications.
    #[token("{")]
    LBrace,
    /// Right brace `}` for cost specifications.
    #[token("}")]
    RBrace,
    /// Left parenthesis `(` for expressions.
    #[token("(")]
    LParen,
    /// Right parenthesis `)` for expressions.
    #[token(")")]
    RParen,
    /// Double at-sign `@@` for total cost.
    #[token("@@")]
    AtAt,
    /// At-sign `@` for unit cost.
    #[token("@")]
    At,
    /// Colon `:` separator.
    #[token(":")]
    Colon,
    /// Comma `,` separator.
    #[token(",")]
    Comma,
    /// Tilde `~` for tolerance.
    #[token("~")]
    Tilde,
    /// Pipe `|` for deprecated payee/narration separator.
    #[token("|")]
    Pipe,
    /// Plus `+` operator.
    #[token("+")]
    Plus,
    /// Minus `-` operator.
    #[token("-")]
    Minus,
    /// Star `*` for cleared transactions and multiplication.
    #[token("*")]
    Star,
    /// Slash `/` for division.
    #[token("/")]
    Slash,

    // ===== Transaction Flags =====
    /// Pending flag `!` for incomplete transactions.
    #[token("!")]
    Pending,

    /// Other transaction flags: P S T C U R M ? &
    ///
    /// Because `Currency` is given a higher priority for single uppercase
    /// letters, a lone `P`, `S`, `T`, ... is lexed as `Currency` rather than
    /// `Flag`; [`Token::is_txn_flag`] accepts both.
    ///
    /// `%` is lexed as `PercentComment`, and a `#` at the very start of a
    /// line is turned into a comment by the post-processing in
    /// `tokenize_inner`, so neither is part of this regex.
    #[regex(r"[PSTCURM?&]")]
    Flag(&'src str),

    // ===== Structural =====
    /// Newline (significant in Beancount for directive boundaries).
    /// Matches both `\n` and `\r\n`.
    #[regex(r"\r?\n")]
    Newline,

    /// A comment starting with semicolon.
    /// The slice includes the semicolon and stops before the line break.
    /// Line-start `#` comments are also reported with this variant by the
    /// post-processing in `tokenize_inner`.
    #[regex(r";[^\n\r]*", allow_greedy = true)]
    Comment(&'src str),

    /// Hash token `#` used as separator in cost specs: `{per_unit # total currency}`
    /// Note: In Python beancount, `#` is only a comment at the START of a line.
    /// Mid-line `# text` is NOT a comment - it's either a cost separator or syntax error.
    /// Start-of-line hash comments are handled in post-processing (tokenize function).
    #[token("#")]
    Hash,

    /// A percent comment (ledger-style).
    /// Python beancount accepts % as a comment character for ledger compatibility.
    /// The slice includes the `%` and stops before the line break.
    #[regex(r"%[^\n\r]*", allow_greedy = true)]
    PercentComment(&'src str),

    /// Shebang line at start of file (e.g., #!/usr/bin/env bean-web).
    /// Treated as a comment-like directive to skip.
    /// The regex itself matches `#!` anywhere, not only on the first line.
    #[regex(r"#![^\n\r]*", allow_greedy = true)]
    Shebang(&'src str),

    /// Emacs org-mode directive (e.g., "#+STARTUP: showall").
    /// These are Emacs configuration lines that should be skipped.
    #[regex(r"#\+[^\n\r]*", allow_greedy = true)]
    EmacsDirective(&'src str),

    /// A metadata key (identifier followed by colon).
    /// Examples: filename:, lineno:, custom-key:, nameOnCard:
    /// The slice includes the trailing colon. Keys must start with a lowercase ASCII letter
    /// per the beancount v3 spec. Keys starting with uppercase are rejected.
    #[regex(r"[a-z][a-zA-Z0-9_-]*:")]
    MetaKey(&'src str),

    /// Indentation token (inserted by post-processing, not by Logos).
    /// Contains the indentation width, where each leading space counts as 1
    /// and each leading tab counts as 4. Used for widths of 1 or 2; wider
    /// indentation is reported as [`Token::DeepIndent`].
    /// This is a placeholder - actual indentation detection happens in [`tokenize`].
    Indent(usize),

    /// Deep indentation (width of 3 or more, tabs counting as 4) - used for
    /// posting-level metadata. Inserted by post-processing, like
    /// [`Token::Indent`].
    DeepIndent(usize),

    /// Error token for unrecognized input.
    /// Contains the invalid source text for better error messages.
    Error(&'src str),
}
311
312impl Token<'_> {
313 /// Returns true if this is a transaction flag (* or !).
314 /// Single-character currencies (e.g., T, P, C) can also be used as flags.
315 pub const fn is_txn_flag(&self) -> bool {
316 match self {
317 Self::Star | Self::Pending | Self::Flag(_) | Self::Hash => true,
318 // Single-char currencies can be used as transaction flags
319 Self::Currency(s) => s.len() == 1,
320 _ => false,
321 }
322 }
323
324 /// Returns true if this is a keyword that starts a directive.
325 pub const fn is_directive_keyword(&self) -> bool {
326 matches!(
327 self,
328 Self::Txn
329 | Self::Balance
330 | Self::Open
331 | Self::Close
332 | Self::Commodity
333 | Self::Pad
334 | Self::Event
335 | Self::Query
336 | Self::Note
337 | Self::Document
338 | Self::Price
339 | Self::Custom
340 | Self::Option_
341 | Self::Include
342 | Self::Plugin
343 | Self::Pushtag
344 | Self::Poptag
345 | Self::Pushmeta
346 | Self::Popmeta
347 )
348 }
349}
350
351impl fmt::Display for Token<'_> {
352 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
353 match self {
354 Self::Date(s) => write!(f, "{s}"),
355 Self::Number(s) => write!(f, "{s}"),
356 Self::String(s) => write!(f, "{s}"),
357 Self::Account(s) => write!(f, "{s}"),
358 Self::Currency(s) => write!(f, "{s}"),
359 Self::Tag(s) => write!(f, "{s}"),
360 Self::Link(s) => write!(f, "{s}"),
361 Self::Txn => write!(f, "txn"),
362 Self::Balance => write!(f, "balance"),
363 Self::Open => write!(f, "open"),
364 Self::Close => write!(f, "close"),
365 Self::Commodity => write!(f, "commodity"),
366 Self::Pad => write!(f, "pad"),
367 Self::Event => write!(f, "event"),
368 Self::Query => write!(f, "query"),
369 Self::Note => write!(f, "note"),
370 Self::Document => write!(f, "document"),
371 Self::Price => write!(f, "price"),
372 Self::Custom => write!(f, "custom"),
373 Self::Option_ => write!(f, "option"),
374 Self::Include => write!(f, "include"),
375 Self::Plugin => write!(f, "plugin"),
376 Self::Pushtag => write!(f, "pushtag"),
377 Self::Poptag => write!(f, "poptag"),
378 Self::Pushmeta => write!(f, "pushmeta"),
379 Self::Popmeta => write!(f, "popmeta"),
380 Self::True => write!(f, "TRUE"),
381 Self::False => write!(f, "FALSE"),
382 Self::Null => write!(f, "NULL"),
383 Self::LDoubleBrace => write!(f, "{{{{"),
384 Self::RDoubleBrace => write!(f, "}}}}"),
385 Self::LBraceHash => write!(f, "{{#"),
386 Self::LBrace => write!(f, "{{"),
387 Self::RBrace => write!(f, "}}"),
388 Self::LParen => write!(f, "("),
389 Self::RParen => write!(f, ")"),
390 Self::AtAt => write!(f, "@@"),
391 Self::At => write!(f, "@"),
392 Self::Colon => write!(f, ":"),
393 Self::Comma => write!(f, ","),
394 Self::Tilde => write!(f, "~"),
395 Self::Pipe => write!(f, "|"),
396 Self::Plus => write!(f, "+"),
397 Self::Minus => write!(f, "-"),
398 Self::Star => write!(f, "*"),
399 Self::Slash => write!(f, "/"),
400 Self::Pending => write!(f, "!"),
401 Self::Flag(s) => write!(f, "{s}"),
402 Self::Whitespace(s) => write!(f, "{s}"),
403 Self::Newline => write!(f, "\\n"),
404 Self::Comment(s) => write!(f, "{s}"),
405 Self::Hash => write!(f, "#"),
406 Self::PercentComment(s) => write!(f, "{s}"),
407 Self::Shebang(s) => write!(f, "{s}"),
408 Self::EmacsDirective(s) => write!(f, "{s}"),
409 Self::MetaKey(s) => write!(f, "{s}"),
410 Self::Indent(n) => write!(f, "<indent:{n}>"),
411 Self::DeepIndent(n) => write!(f, "<deep-indent:{n}>"),
412 Self::Error(s) => {
413 // Strip any embedded U+FEFF bytes (a mid-file BOM
414 // captured into a lexer error span) so diagnostics
415 // rendering this token stay human-readable. LSP problem
416 // panels, CLI stderr, and GitHub-rendered bug reports
417 // all silently drop or strip literal BOM bytes — the
418 // `<BOM>` placeholder makes the failure mode visible.
419 //
420 // Streamed (rather than `s.replace(...)`) so this
421 // Display impl is zero-allocation. LSP problem panels
422 // re-render diagnostics on every keystroke during
423 // interactive editing; a `String` allocation per
424 // render showed up in flame graphs for files with
425 // many BOM-containing Token::Error tokens. The fast
426 // path (no BOM in `s`) is one `f.write_str(s)` call.
427 if s.contains(crate::bom::BOM_CHAR) {
428 let mut chunks = s.split(crate::bom::BOM_CHAR);
429 // Interleave chunks with "<BOM>" between them.
430 // `split` yields N+1 chunks for N matches, so the
431 // first chunk is emitted as-is and each subsequent
432 // chunk gets a `<BOM>` prefix. Final output is
433 // chunk0 + "<BOM>" + chunk1 + "<BOM>" + chunkN —
434 // matching the (allocating) `s.replace(...)`
435 // behavior exactly.
436 if let Some(first) = chunks.next() {
437 f.write_str(first)?;
438 }
439 for chunk in chunks {
440 f.write_str("<BOM>")?;
441 f.write_str(chunk)?;
442 }
443 Ok(())
444 } else {
445 f.write_str(s)
446 }
447 }
448 }
449 }
450}
451
452/// Apply mid-file BOM layout-transparency rules to lexer-state from
453/// inside the `Err` arm of `tokenize`.
454///
455/// A mid-file BOM (U+FEFF) is layout-transparent: it produces an
456/// error diagnostic via the parser's classifier, but must NOT clobber
457/// `at_line_start` or move `last_newline_end` past the BOM, otherwise
458/// the next token on the same logical line (e.g. an indented posting
459/// from a concatenated Windows file) loses its indent classification
460/// and the parser mistypes it. Leading-BOM is handled at the
461/// `crate::parse` boundary and never reaches this code path; only
462/// mid-file BOMs that survived the strip do.
463///
464/// We use `trim_start_matches` (rather than `starts_with` + a single
465/// `BOM_LEN` advance) so a multi-BOM run — e.g., a hypothetical
466/// coalesced `\u{FEFF}\u{FEFF}` Err span from a triple-concatenated
467/// Windows file — is ENTIRELY layout-transparent. Advancing
468/// `last_newline_end` past only the first BOM but then clobbering
469/// `at_line_start` because of the second BOM would cascade into
470/// misclassifying the next real token. The contract is: every BOM
471/// byte is layout-transparent; `at_line_start` is preserved iff the
472/// entire error span is BOM bytes; `last_newline_end` advances past
473/// the full run of leading BOMs.
474///
475/// Extracted as a private helper so the multi-BOM defensive code path
476/// can be unit-tested independently of logos's emission strategy.
477/// Today logos emits one Err per unrecognized char, so the coalesced
478/// path is unreachable from real input; the unit tests at the bottom
479/// of this file feed the helper synthetic `invalid_text` values that
480/// exercise the coalesced case directly.
481fn apply_err_layout_transparency(
482 invalid_text: &str,
483 span_start: usize,
484 at_line_start: &mut bool,
485 last_newline_end: &mut usize,
486) {
487 // Round-17 fix: the contract documented above says "every BOM
488 // byte is layout-transparent" — i.e., a span like
489 // `\u{FEFF}@@\u{FEFF}` should classify its non-BOM bytes for the
490 // at_line_start decision, not its BOM bytes. The previous impl
491 // only inspected the LEADING run of BOMs and clobbered
492 // `at_line_start` for any non-empty tail. That sub-case worked
493 // because a coalesced span starting with BOM + non-BOM tail
494 // really does break the indent contract. But a coalesced span
495 // like `@@\u{FEFF}` (non-BOM head followed by BOM tail) would
496 // also clobber — the BOM in the tail is layout-transparent per
497 // contract, but the head is real content so the clobber is
498 // already correct. The genuinely-wrong case (currently
499 // unreachable but reachable under a future logos upgrade that
500 // coalesces error sequences) is when the ENTIRE span is BOMs,
501 // possibly interleaved with whitespace: those should be fully
502 // layout-transparent. We now extract the LEADING run of BOM
503 // bytes for `last_newline_end` advancement, and consult the
504 // FULL invalid_text minus all BOM bytes for the at_line_start
505 // decision.
506 let after_leading_bom = invalid_text.trim_start_matches(crate::bom::BOM_CHAR);
507 let leading_bom_bytes = invalid_text.len() - after_leading_bom.len();
508 if leading_bom_bytes > 0 && *at_line_start && span_start == *last_newline_end {
509 *last_newline_end = span_start + leading_bom_bytes;
510 }
511
512 // Any non-BOM byte ANYWHERE in the span is "real content" for
513 // indent purposes. An all-BOM span (possibly interleaving BOMs
514 // at any position) leaves `at_line_start` untouched. The
515 // previous `is_empty()` check on JUST the after-leading-BOM
516 // tail had a latent gap for a coalesced `@<BOM>` span: the
517 // leading run is empty, so the `else` arm clobbered — which
518 // happens to be correct for that case, but the path was
519 // accidental rather than principled. Walking the whole span
520 // makes the rule explicit.
521 let has_non_bom_byte = invalid_text.chars().any(|c| c != crate::bom::BOM_CHAR);
522 if has_non_bom_byte {
523 *at_line_start = false;
524 }
525}
526
527/// Tokenize source code into a vector of (Token, Span) pairs for the
528/// AST-style parser.
529///
530/// Filters out [`Token::Whitespace`] (mid-line horizontal whitespace)
531/// but otherwise emits everything the lexer produces, with
532/// post-processing for line-start `#` comments and indentation.
533/// Callers that need a fully-lossless token stream (the CST builder)
534/// use [`tokenize_lossless`] instead.
535pub fn tokenize(source: &str) -> Vec<(Token<'_>, Span)> {
536 tokenize_inner(source, /* keep_whitespace = */ false)
537}
538
539/// Tokenize source code losslessly: every byte of `source` appears in
540/// exactly one emitted `(Token, Span)` entry. This is the input to
541/// the CST builder.
542///
543/// Differs from [`tokenize`] in that [`Token::Whitespace`] tokens are
544/// preserved (the AST-style parser drops them; the CST keeps them so
545/// the round-trip stays byte-identical).
546pub fn tokenize_lossless(source: &str) -> Vec<(Token<'_>, Span)> {
547 tokenize_inner(source, /* keep_whitespace = */ true)
548}
549
550fn tokenize_inner(source: &str, keep_whitespace: bool) -> Vec<(Token<'_>, Span)> {
551 let mut tokens = Vec::new();
552 let mut lexer = Token::lexer(source);
553 let mut at_line_start = true;
554 let mut last_newline_end = 0usize;
555
556 while let Some(result) = lexer.next() {
557 let span = lexer.span();
558
559 if !keep_whitespace && matches!(result, Ok(Token::Whitespace(_))) {
560 // AST-path drops mid-line whitespace; the CST path keeps
561 // it. Layout-relevant whitespace (start-of-line indentation,
562 // BOM error spans) is handled by the dedicated arms below
563 // regardless of which path we are on.
564 continue;
565 }
566
567 match result {
568 Ok(Token::Newline) => {
569 tokens.push((Token::Newline, span.clone().into()));
570 at_line_start = true;
571 last_newline_end = span.end;
572 }
573 Ok(Token::Hash) if at_line_start && span.start == last_newline_end => {
574 // Hash at very start of line (no indentation) is a comment
575 // Find end of line and create a comment token for the whole line
576 let comment_start = span.start;
577 let line_end = source[span.end..]
578 .find('\n')
579 .map_or(source.len(), |i| span.end + i);
580 let comment_text = &source[comment_start..line_end];
581 tokens.push((
582 Token::Comment(comment_text),
583 Span {
584 start: comment_start,
585 end: line_end,
586 },
587 ));
588 // Skip lexer tokens until we reach the newline
589 while let Some(peek_result) = lexer.next() {
590 let peek_span = lexer.span();
591 let peek_end = peek_span.end;
592 if peek_result == Ok(Token::Newline) {
593 tokens.push((Token::Newline, peek_span.into()));
594 at_line_start = true;
595 last_newline_end = peek_end;
596 break;
597 }
598 // Skip other tokens on the comment line
599 }
600 }
601 Ok(token) => {
602 // Check for indentation at line start
603 if at_line_start && span.start > last_newline_end {
604 // Count leading whitespace between last newline and this token
605 // Tabs count as indentation (treat 1 tab as 4 spaces for counting purposes)
606 let leading = &source[last_newline_end..span.start];
607 let mut space_count = 0;
608 let mut char_count = 0;
609 for c in leading.chars() {
610 match c {
611 ' ' => {
612 space_count += 1;
613 char_count += 1;
614 }
615 '\t' => {
616 space_count += 4; // Treat tab as 4 spaces
617 char_count += 1;
618 }
619 _ => break,
620 }
621 }
622 // Python beancount accepts 1+ space for metadata indentation
623 if space_count >= 1 {
624 let indent_start = last_newline_end;
625 let indent_end = last_newline_end + char_count;
626 // Use DeepIndent for 3+ spaces (posting metadata level).
627 // Python beancount allows flexible indentation where posting
628 // metadata just needs to be more indented than the posting.
629 // Common patterns: 2-space posting / 4-space meta, or
630 // 1-space posting / 3-space meta (as in beancount_reds_plugins).
631 let indent_token = if space_count >= 3 {
632 Token::DeepIndent(space_count)
633 } else {
634 Token::Indent(space_count)
635 };
636 tokens.push((
637 indent_token,
638 Span {
639 start: indent_start,
640 end: indent_end,
641 },
642 ));
643 }
644 }
645 at_line_start = false;
646 tokens.push((token, span.into()));
647 }
648 Err(()) => {
649 // Lexer error - produce an Error token with the invalid source text.
650 let invalid_text = &source[span.clone()];
651 apply_err_layout_transparency(
652 invalid_text,
653 span.start,
654 &mut at_line_start,
655 &mut last_newline_end,
656 );
657 tokens.push((Token::Error(invalid_text), span.into()));
658 }
659 }
660 }
661
662 tokens
663}
664
665#[cfg(test)]
666mod tests {
667 use super::*;
668
669 #[test]
670 fn test_tokenize_date() {
671 let tokens = tokenize("2024-01-15");
672 assert_eq!(tokens.len(), 1);
673 assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
674 }
675
676 #[test]
677 fn test_tokenize_date_single_digit_month() {
678 // Single-digit month should be tokenized as Date
679 let tokens = tokenize("2024-1-15");
680 assert_eq!(tokens.len(), 1);
681 assert!(matches!(tokens[0].0, Token::Date("2024-1-15")));
682 }
683
684 #[test]
685 fn test_tokenize_date_single_digit_day() {
686 // Single-digit day should be tokenized as Date
687 let tokens = tokenize("2024-01-5");
688 assert_eq!(tokens.len(), 1);
689 assert!(matches!(tokens[0].0, Token::Date("2024-01-5")));
690 }
691
692 #[test]
693 fn test_tokenize_date_single_digit_month_and_day() {
694 // Single-digit month and day should be tokenized as Date
695 let tokens = tokenize("2024-1-1");
696 assert_eq!(tokens.len(), 1);
697 assert!(matches!(tokens[0].0, Token::Date("2024-1-1")));
698 }
699
700 #[test]
701 fn test_tokenize_date_slash_separator_single_digit() {
702 // Slash separator with single-digit parts
703 let tokens = tokenize("2024/1/5");
704 assert_eq!(tokens.len(), 1);
705 assert!(matches!(tokens[0].0, Token::Date("2024/1/5")));
706 }
707
708 #[test]
709 fn test_tokenize_number() {
710 let tokens = tokenize("1234.56");
711 assert_eq!(tokens.len(), 1);
712 assert!(matches!(tokens[0].0, Token::Number("1234.56")));
713
714 // Negative numbers are now Minus + Number (enables subtraction expressions)
715 let tokens = tokenize("-1,234.56");
716 assert_eq!(tokens.len(), 2);
717 assert!(matches!(tokens[0].0, Token::Minus));
718 assert!(matches!(tokens[1].0, Token::Number("1,234.56")));
719 }
720
721 #[test]
722 fn test_tokenize_account() {
723 let tokens = tokenize("Assets:Bank:Checking");
724 assert_eq!(tokens.len(), 1);
725 assert!(matches!(
726 tokens[0].0,
727 Token::Account("Assets:Bank:Checking")
728 ));
729 }
730
731 #[test]
732 fn test_tokenize_account_unicode() {
733 // Unicode uppercase letters and CJK characters are valid at the
734 // start of account components. Emoji and symbols are not.
735
736 // Non-letter (emoji) after valid ASCII start — still invalid
737 let tokens = tokenize("Assets:CORP✨");
738 assert!(
739 !matches!(tokens[0].0, Token::Account("Assets:CORP✨")),
740 "Unicode emoji in account name should not tokenize as a valid Account"
741 );
742 assert!(
743 tokens.iter().any(|(t, _)| matches!(t, Token::Error(_))),
744 "Unicode emoji should produce at least one Error token"
745 );
746
747 // CJK sub-component start — now valid (CJK ideographs are \p{Lo})
748 let tokens = tokenize("Assets:沪深300");
749 assert!(
750 matches!(tokens[0].0, Token::Account("Assets:沪深300")),
751 "CJK characters at the start of a sub-component should tokenize as Account"
752 );
753
754 // Full CJK sub-component — valid
755 let tokens = tokenize("Assets:日本銀行");
756 assert!(
757 matches!(tokens[0].0, Token::Account("Assets:日本銀行")),
758 "CJK sub-component should tokenize as Account"
759 );
760
761 // Cyrillic account type — valid (Cyrillic uppercase is \p{Lu})
762 let tokens = tokenize("Капитал:Retained");
763 assert!(
764 matches!(tokens[0].0, Token::Account("Капитал:Retained")),
765 "Cyrillic-starting account should tokenize as Account"
766 );
767
768 // Fully CJK account — valid
769 let tokens = tokenize("资产:银行:支票");
770 assert!(
771 matches!(tokens[0].0, Token::Account("资产:银行:支票")),
772 "Fully CJK account should tokenize as Account"
773 );
774 }
775
776 /// Regression for issue #736/#739: Unicode letters AFTER an ASCII start
777 /// in account sub-components are valid per the beancount v3 spec.
778 #[test]
779 fn test_tokenize_account_unicode_letters_after_ascii_start() {
780 // French: É after ASCII start
781 let tokens = tokenize("Assets:Banque-Épargne");
782 assert!(
783 matches!(tokens[0].0, Token::Account("Assets:Banque-Épargne")),
784 "accented Latin letter after ASCII start should tokenize as Account, got: {tokens:?}"
785 );
786
787 // German: ü after ASCII start
788 let tokens = tokenize("Assets:Müller");
789 assert!(
790 matches!(tokens[0].0, Token::Account("Assets:Müller")),
791 "German umlaut after ASCII start should tokenize as Account, got: {tokens:?}"
792 );
793
794 // Mixed CJK after ASCII start — letters are allowed
795 let tokens = tokenize("Assets:CorpJP日本");
796 assert!(
797 matches!(tokens[0].0, Token::Account("Assets:CorpJP日本")),
798 "CJK letters after ASCII start should tokenize as Account, got: {tokens:?}"
799 );
800 }
801
802 #[test]
803 fn test_tokenize_currency() {
804 let tokens = tokenize("USD");
805 assert_eq!(tokens.len(), 1);
806 assert!(matches!(tokens[0].0, Token::Currency("USD")));
807 }
808
809 #[test]
810 fn test_tokenize_single_char_currency() {
811 // Single-char NYSE/NASDAQ tickers: T (AT&T), V (Visa), F (Ford), X (US Steel)
812 let tokens = tokenize("T");
813 assert_eq!(tokens.len(), 1);
814 assert!(matches!(tokens[0].0, Token::Currency("T")));
815
816 let tokens = tokenize("V");
817 assert_eq!(tokens.len(), 1);
818 assert!(matches!(tokens[0].0, Token::Currency("V")));
819
820 let tokens = tokenize("F");
821 assert_eq!(tokens.len(), 1);
822 assert!(matches!(tokens[0].0, Token::Currency("F")));
823 }
824
825 #[test]
826 fn test_single_char_currency_is_txn_flag() {
827 // Single-char currencies should be recognized as potential transaction flags
828 let token = Token::Currency("T");
829 assert!(token.is_txn_flag());
830
831 // Multi-char currencies should NOT be transaction flags
832 let token = Token::Currency("USD");
833 assert!(!token.is_txn_flag());
834 }
835
836 #[test]
837 fn test_tokenize_string() {
838 let tokens = tokenize(r#""Hello, World!""#);
839 assert_eq!(tokens.len(), 1);
840 assert!(matches!(tokens[0].0, Token::String(r#""Hello, World!""#)));
841 }
842
843 #[test]
844 fn test_tokenize_keywords() {
845 let tokens = tokenize("txn balance open close");
846 assert_eq!(tokens.len(), 4);
847 assert!(matches!(tokens[0].0, Token::Txn));
848 assert!(matches!(tokens[1].0, Token::Balance));
849 assert!(matches!(tokens[2].0, Token::Open));
850 assert!(matches!(tokens[3].0, Token::Close));
851 }
852
853 #[test]
854 fn test_tokenize_tag_and_link() {
855 let tokens = tokenize("#my-tag ^my-link");
856 assert_eq!(tokens.len(), 2);
857 assert!(matches!(tokens[0].0, Token::Tag("#my-tag")));
858 assert!(matches!(tokens[1].0, Token::Link("^my-link")));
859 }
860
861 #[test]
862 fn test_tokenize_comment() {
863 let tokens = tokenize("; This is a comment");
864 assert_eq!(tokens.len(), 1);
865 assert!(matches!(tokens[0].0, Token::Comment("; This is a comment")));
866 }
867
868 #[test]
869 fn test_tokenize_indentation() {
870 let tokens = tokenize("txn\n Assets:Bank 100 USD");
871 // Should have: Txn, Newline, Indent, Account, Number, Currency
872 assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Indent(_))));
873 }
874
/// `Token::Error`'s Display impl never emits a raw U+FEFF: an embedded
/// BOM is rendered as the visible `<BOM>` placeholder, so a mid-file
/// BOM captured into a lexer error span still produces a
/// human-readable diagnostic. The leading-BOM case is handled at the
/// `crate::parse` boundary (see `crate::bom`), so this defensive
/// measure only matters for mid-file BOMs that fall into the lexer's
/// default error path.
#[test]
fn test_display_token_error_strips_embedded_bom() {
    let payload = "foo\u{FEFF}bar";
    let rendered = Token::Error(payload).to_string();
    // The BOM is replaced by a placeholder, not silently dropped…
    assert_eq!(rendered, "foo<BOM>bar");
    // …and no raw BOM character survives into the rendered text.
    assert!(!rendered.contains(crate::bom::BOM_CHAR));
}
888
/// Lexer-side pin for a U+FEFF that is not at strict byte 0. The Token
/// enum has no BOM rule, so the character is unrecognized and Logos's
/// default error path reports it as `Token::Error`. The parser's error
/// classifier (which searches `error_text` for U+FEFF) is responsible
/// for the dedicated diagnostic; this test only requires that some
/// `Token::Error` in the stream contains the BOM.
#[test]
fn test_tokenize_mid_file_bom_falls_into_error_path() {
    // `tokenize` is called directly rather than through `parse`, which
    // would strip a byte-0 BOM. The BOM is placed after a full
    // directive line so it is mid-source.
    let source = "2024-01-01 open Assets:Bank USD\n\u{FEFF}";
    let tokens = tokenize(source);

    let has_bom_in_error = tokens
        .iter()
        .any(|(t, _)| matches!(t, Token::Error(s) if s.contains(crate::bom::BOM_CHAR)));
    assert!(
        has_bom_in_error,
        "mid-file BOM should fall into `Token::Error`, got: {tokens:?}"
    );
}
916
/// Layout-transparency contract for a mid-file BOM: a BOM at line
/// start followed by indented content (the
/// `cat windows-a.bean windows-b.bean` concatenation case) must NOT
/// swallow the indent of the token that follows it. The Err arm in
/// `tokenize` recognizes `Token::Error("\u{FEFF}")`, preserves
/// `at_line_start`, and advances `last_newline_end`, so the next real
/// token still gets its `Token::Indent` emission.
///
/// Without that special case the Err arm would set `at_line_start =
/// false` as for any other lex error, no Indent token would be
/// produced, and the parser would misclassify the indented line as a
/// top-level directive — cascading errors instead of the targeted BOM
/// diagnostic.
#[test]
fn test_mid_file_bom_at_line_start_preserves_following_indent() {
    // A directive, a newline, a mid-file BOM, then a metadata line
    // indented by exactly TWO spaces (the assertion below expects
    // `Indent(2)`). `tokenize` is called directly, bypassing parse's
    // strip-at-entry, so the BOM stays mid-file.
    let source = "2024-01-01 open Assets:Bank USD\n\u{FEFF}  meta-key: \"v\"\n";
    let tokens = tokenize(source);

    // The BOM itself must be reported as a one-character error token.
    let has_bom_error = tokens
        .iter()
        .any(|(t, _)| matches!(t, Token::Error(s) if *s == crate::bom::BOM));
    assert!(
        has_bom_error,
        "expected Token::Error(\"\\u{{FEFF}}\") in stream, got: {tokens:?}"
    );

    // Critically: the two-space indent must survive as Token::Indent(2)
    // rather than being absorbed by the error handling.
    let has_indent_2 = tokens.iter().any(|(t, _)| matches!(t, Token::Indent(2)));
    assert!(
        has_indent_2,
        "mid-file BOM at line start must not swallow the following Indent; got: {tokens:?}"
    );

    // And the metadata key tokenizes normally on the same line.
    assert!(
        tokens
            .iter()
            .any(|(t, _)| matches!(t, Token::MetaKey("meta-key:"))),
        "expected MetaKey after BOM-prefixed indent, got: {tokens:?}"
    );
}
964
/// Consecutive BOMs at line start (this test expects each to arrive as
/// its own `Token::Error`) must ALL be layout-transparent. A
/// triple-concatenated Windows file can produce `\n\u{FEFF}\u{FEFF}`
/// at line start; when indented content follows, the `Indent` for that
/// line must still be emitted. If the second BOM failed to advance
/// `last_newline_end`, or clobbered `at_line_start`, the indent walk
/// on the next real token would break.
#[test]
fn test_consecutive_mid_file_boms_preserve_layout() {
    // Two BOMs followed by a metadata line indented by exactly TWO
    // spaces, matching the `Indent(2)` expectation below.
    let source = "2024-01-01 open Assets:Bank USD\n\u{FEFF}\u{FEFF}  meta-key: \"v\"\n";
    let tokens = tokenize(source);

    // Both BOMs should appear as separate Token::Error entries.
    let bom_error_count = tokens
        .iter()
        .filter(|(t, _)| matches!(t, Token::Error(s) if *s == crate::bom::BOM))
        .count();
    assert_eq!(
        bom_error_count, 2,
        "expected 2 Token::Error(BOM) tokens, got: {tokens:?}"
    );

    // The indent on the line containing the BOMs must survive.
    let has_indent_2 = tokens.iter().any(|(t, _)| matches!(t, Token::Indent(2)));
    assert!(
        has_indent_2,
        "consecutive mid-file BOMs at line start must not swallow following indent; \
         got: {tokens:?}"
    );

    // And the metadata key still tokenizes normally afterwards.
    assert!(
        tokens
            .iter()
            .any(|(t, _)| matches!(t, Token::MetaKey("meta-key:"))),
        "expected MetaKey after consecutive-BOM-prefixed indent, got: {tokens:?}"
    );
}
1002
1003 // ===== Direct tests of `apply_err_layout_transparency` =====
1004 //
1005 // These tests exercise the helper independently of logos's
1006 // emission strategy. Today logos emits one Err per unrecognized
1007 // char, so the multi-BOM-in-one-Err code path (the
1008 // `trim_start_matches` loop's motivating case) is unreachable
1009 // from real input. The tests below feed the helper synthetic
1010 // invalid_text values so the defensive code is actually
1011 // validated rather than documentation-only.
1012
/// Two BOMs delivered in a single error span that begins exactly at
/// the line start: the helper must keep `at_line_start` set and move
/// `last_newline_end` past BOTH BOMs.
#[test]
fn err_layout_transparency_coalesced_double_bom_at_line_start() {
    let start = 10;
    let (mut line_start, mut newline_end) = (true, 10);

    apply_err_layout_transparency("\u{FEFF}\u{FEFF}", start, &mut line_start, &mut newline_end);

    assert!(
        line_start,
        "all-BOM error span must preserve at_line_start"
    );
    let expected_end = 10 + 2 * crate::bom::BOM_LEN;
    assert_eq!(
        newline_end, expected_end,
        "last_newline_end must advance past BOTH BOMs, not just the first"
    );
}
1038
/// A BOM run followed by real content in the same error span: the
/// trailing content clears `at_line_start`, while `last_newline_end`
/// advances over the BOM prefix only.
#[test]
fn err_layout_transparency_coalesced_bom_with_trailing_content() {
    let start = 10;
    let (mut line_start, mut newline_end) = (true, 10);

    apply_err_layout_transparency(
        "\u{FEFF}\u{FEFF}xyz",
        start,
        &mut line_start,
        &mut newline_end,
    );

    assert!(
        !line_start,
        "trailing non-BOM content must clobber at_line_start"
    );
    let expected_end = 10 + 2 * crate::bom::BOM_LEN;
    assert_eq!(
        newline_end, expected_end,
        "last_newline_end advances past leading BOMs, NOT past trailing content"
    );
}
1064
/// An error span with no BOM at all: `at_line_start` is cleared and
/// `last_newline_end` is left untouched.
#[test]
fn err_layout_transparency_non_bom_clobbers() {
    let (mut line_start, mut newline_end) = (true, 10);

    apply_err_layout_transparency("garbage", 10, &mut line_start, &mut newline_end);

    assert!(!line_start);
    assert_eq!(newline_end, 10, "non-BOM error must not advance");
}
1075
/// An all-BOM error span that appears mid-line (content already seen,
/// so `at_line_start` is false on entry): neither piece of state may
/// change.
#[test]
fn err_layout_transparency_all_bom_not_at_line_start_is_noop() {
    // Mid-line: the flag is already false before the call.
    let mut line_start = false;
    let mut newline_end = 10;

    apply_err_layout_transparency("\u{FEFF}\u{FEFF}", 20, &mut line_start, &mut newline_end);

    assert!(!line_start);
    assert_eq!(newline_end, 10, "guard prevents stale advance");
}
1094
/// Companion to `err_layout_transparency_all_bom_not_at_line_start_is_noop`.
/// The advance is guarded by `at_line_start && span_start ==
/// last_newline_end`; that sibling test fails the first clause, while
/// THIS test satisfies the first clause and fails the second (the span
/// does not begin at `last_newline_end`).
///
/// Exercising each clause on its own means a refactor that flipped
/// `&&` to `||` would be caught, since either clause alone must be
/// enough to suppress the advance.
#[test]
fn err_layout_transparency_all_bom_span_mismatch_is_noop() {
    // Clause one holds: we are at a line start…
    let mut line_start = true;
    // …but clause two does not: the span opens at 20 while the last
    // newline ended at 10, so no advance may happen.
    let mut newline_end = 10;

    apply_err_layout_transparency("\u{FEFF}\u{FEFF}", 20, &mut line_start, &mut newline_end);

    assert!(
        line_start,
        "all-BOM error span must preserve at_line_start regardless of span-vs-last-newline match"
    );
    assert_eq!(
        newline_end, 10,
        "span_start != last_newline_end must prevent stale advance"
    );
}
1129
/// Round-17/18 contract, preservation half: an error span consisting
/// solely of BOMs is layout-transparent. `at_line_start` stays set
/// and, because the span begins exactly at `last_newline_end`,
/// `last_newline_end` advances past the entire BOM run.
///
/// This test covers only the all-BOM shape. Spans that mix BOM and
/// non-BOM text (non-BOM head, non-BOM tail, non-BOM in the middle)
/// are pinned by the sibling `*_clobbers` tests.
#[test]
fn err_layout_transparency_bom_only_in_any_arrangement_preserves() {
    let mut at_line_start = true;
    let mut last_newline_end = 10;
    apply_err_layout_transparency(
        "\u{FEFF}\u{FEFF}",
        10, // span_start == last_newline_end → advance fires
        &mut at_line_start,
        &mut last_newline_end,
    );
    assert!(at_line_start, "all-BOM span preserves at_line_start");
    // Expected offset is derived from BOM_LEN, as in the other helper
    // tests, rather than hard-coding the resulting byte position.
    assert_eq!(
        last_newline_end,
        10 + 2 * crate::bom::BOM_LEN,
        "leading BOM run advances last_newline_end past both BOM bytes \
         (each BOM is 3 UTF-8 bytes)"
    );
}
1165
/// An error span that STARTS with non-BOM text clears `at_line_start`
/// even when a BOM follows. Pinned so that a BOM-only trim which
/// overlooks non-BOM head bytes cannot be reintroduced unnoticed.
#[test]
fn err_layout_transparency_non_bom_head_clobbers() {
    let (mut line_start, mut newline_end) = (true, 0);

    apply_err_layout_transparency("@@\u{FEFF}", 10, &mut line_start, &mut newline_end);

    assert!(
        !line_start,
        "non-BOM head ('@@') clobbers at_line_start regardless of trailing BOM"
    );
}
1179
/// BOM head followed by a non-BOM tail: the tail clears
/// `at_line_start`, yet the leading BOM still moves
/// `last_newline_end` forward by one BOM. Pinned so the whole-span
/// inspection cannot regress to looking at the leading run only.
#[test]
fn err_layout_transparency_bom_head_non_bom_tail_clobbers() {
    let mut at_line_start = true;
    let mut last_newline_end = 10;
    apply_err_layout_transparency("\u{FEFF}@@", 10, &mut at_line_start, &mut last_newline_end);
    assert!(
        !at_line_start,
        "non-BOM tail ('@@') clobbers at_line_start even though span starts with BOM"
    );
    // One leading BOM → advance by exactly BOM_LEN from the span start.
    assert_eq!(
        last_newline_end,
        10 + crate::bom::BOM_LEN,
        "leading BOM run STILL advances last_newline_end past the BOM"
    );
}
1197
/// Non-BOM text sandwiched between two BOMs: the middle text clears
/// `at_line_start`, and only the LEADING BOM contributes to the
/// `last_newline_end` advance (the trailing BOM does not). This is the
/// shape the round-17 contract explicitly calls out, so it is pinned
/// directly rather than left to follow from the other cases.
#[test]
fn err_layout_transparency_bom_flanking_non_bom_clobbers() {
    let mut at_line_start = true;
    let mut last_newline_end = 10;
    apply_err_layout_transparency(
        "\u{FEFF}@@\u{FEFF}",
        10,
        &mut at_line_start,
        &mut last_newline_end,
    );
    assert!(
        !at_line_start,
        "non-BOM middle ('@@') clobbers at_line_start"
    );
    // Only the first BOM is part of the leading run.
    assert_eq!(
        last_newline_end,
        10 + crate::bom::BOM_LEN,
        "leading BOM run advances last_newline_end past the leading BOM only"
    );
}
1223
/// Smoke test over a two-line transaction: every token category the
/// input contains must show up at least once in the stream.
#[test]
fn test_tokenize_transaction_line() {
    let source = "2024-01-15 * \"Grocery Store\" #food\n Expenses:Food 50.00 USD";
    let lexed = tokenize(source);

    // One pass over the stream, recording which categories were seen.
    let (mut date, mut star, mut string, mut tag, mut newline) =
        (false, false, false, false, false);
    let (mut indent, mut account, mut number, mut currency) = (false, false, false, false);
    for (kind, _) in lexed.iter() {
        match kind {
            Token::Date(_) => date = true,
            Token::Star => star = true,
            Token::String(_) => string = true,
            Token::Tag(_) => tag = true,
            Token::Newline => newline = true,
            // Either indent flavour satisfies the indentation check.
            Token::Indent(_) | Token::DeepIndent(_) => indent = true,
            Token::Account(_) => account = true,
            Token::Number(_) => number = true,
            Token::Currency(_) => currency = true,
            _ => {}
        }
    }

    assert!(date);
    assert!(star);
    assert!(string);
    assert!(tag);
    assert!(newline);
    assert!(indent);
    assert!(account);
    assert!(number);
    assert!(currency);
}
1244
/// An identifier immediately followed by `:` lexes to a single
/// `MetaKey` token that keeps the trailing colon.
#[test]
fn test_tokenize_metadata_key() {
    let lexed = tokenize("filename:");
    assert_eq!(lexed.len(), 1);
    let (only, _) = &lexed[0];
    assert!(matches!(only, Token::MetaKey("filename:")));
}
1251
/// Each punctuation symbol in the input must appear in the token
/// stream. Tokens are inspected by reference, so nothing is cloned or
/// collected just to test membership.
#[test]
fn test_tokenize_punctuation() {
    let tokens = tokenize("{ } @ @@ , ~");
    assert!(tokens.iter().any(|(t, _)| matches!(t, Token::LBrace)));
    assert!(tokens.iter().any(|(t, _)| matches!(t, Token::RBrace)));
    assert!(tokens.iter().any(|(t, _)| matches!(t, Token::At)));
    assert!(tokens.iter().any(|(t, _)| matches!(t, Token::AtAt)));
    assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Comma)));
    assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Tilde)));
}
1263}