rustledger_parser/cst/
parser.rs

1//! CST builders: phase 1 flat ([`parse_flat`]) + phase 2.1-2.4
2//! structured ([`parse_structured`]).
3//!
4//! Both walk the lossless token stream and emit a `GreenNode` whose
5//! `text()` is byte-identical to the input source. They differ in
6//! what they wrap:
7//!
8//! - [`parse_flat`] (phase 1) puts every token as a direct child of
9//!   a single `SOURCE_FILE` node. Useful for round-trip-only tests
10//!   and the kind-sequence corpus baseline.
11//! - [`parse_structured`] recognizes:
12//!   - **Phase 2.1a**: 14 single-line directive shapes —
13//!     `OPEN`/`CLOSE`/`BALANCE`/`PAD`/`EVENT`/`QUERY`/`NOTE`/
14//!     `DOCUMENT`/`PRICE`/`COMMODITY` (dated) +
15//!     `PUSHTAG`/`POPTAG`/`PUSHMETA`/`POPMETA` (top-level keyword).
16//!   - **Phase 2.1b**: `TRANSACTION` — DATE + `STAR` / `PENDING_KW`
17//!     (`!`) / `FLAG` / `TXN_KW`, multi-line scope through the last
18//!     indented sub-line (postings, metadata, indented comments).
19//!
20//!   Each wraps in its specific node kind per the Directive-
21//!   Terminator Rule (see [`crate::cst::trivia`]).
22//!
23//!   - **Phase 2.3**: edge directives —
24//!     `OPTION_DIRECTIVE` / `INCLUDE_DIRECTIVE` /
25//!     `PLUGIN_DIRECTIVE` (top-level keyword) +
26//!     `CUSTOM_DIRECTIVE` (dated with arbitrary trailing value
27//!     list). Body / metadata shape is identical to PR 2.1a's
28//!     dated and standalone-keyword directives — only the header
29//!     keyword recognition is new.
30//!
31//!   - **Phase 2.4**: error recovery — unrecognized / malformed
32//!     top-level lines are wrapped in `ERROR_NODE` (terminated by
33//!     NEWLINE or EOF per rule 5). Same trivia attachment policy
34//!     as recognized directives (rule 2): pending leading trivia
35//!     attaches inside the `ERROR_NODE` when it's not the very
36//!     first content in the file. AMOUNT now also wraps full
37//!     arithmetic expressions (`[sign] (NUMBER | PAREN_EXPR)
38//!     ([WS] op [WS] (NUMBER | PAREN_EXPR))* [WS CURRENCY]`),
39//!     closing the deferred 2.2c.1 divergence with Python
40//!     beancount on `10+5 USD`-shape amounts.
41//!
42//! Phase 2.2a adds `META_ENTRY` sub-node structure around indented
43//! `WS META_KEY ... (NEWLINE | EOF)` sub-lines inside any directive
44//! or transaction (per rule 5 of `cst::trivia`, an unterminated
45//! final sub-line at EOF still gets wrapped). Phase 2.2b adds
46//! `POSTING` sub-node structure around each `WS [(FLAG | STAR |
47//! PENDING_KW | HASH | single-char CURRENCY) WS] ACCOUNT ...`
48//! posting line inside `TRANSACTION` (the flag arm mirrors
49//! `parse_flag` in the legacy AST parser and `identify_directive`'s
50//! transaction-trigger arm; single-char `CURRENCY` covers letters
51//! like `T`/`V`/`F`/`X` that win the lexer's priority-3 Currency-
52//! vs-Flag tie-break). Posting-attached metadata (`META_ENTRY` sub-
53//! lines following the posting, indented `>=` the posting) becomes a
54//! child of that `POSTING`. Phase 2.2c adds `AMOUNT` / `COST_SPEC` /
55//! `PRICE_ANNOTATION` inside `POSTING`. Phase 5 deletes
56//! `parse_flat` once `parse_structured` covers every byte in
57//! every corpus file.
58
59use std::ops::Range;
60
61use rowan::GreenNodeBuilder;
62
63use crate::cst::lossless_tokens::lossless_kind_tokens;
64use crate::cst::syntax_kind::{SyntaxKind, SyntaxNode};
65
66/// Parse `source` to a flat lossless CST.
67///
68/// The returned node's text serialization equals `source` byte-for-
69/// byte for every UTF-8 input. Every token is a direct child of
70/// `SOURCE_FILE`; no structural directive wrapping.
71#[must_use]
72pub fn parse_flat(source: &str) -> SyntaxNode {
73    let mut builder = GreenNodeBuilder::new();
74    builder.start_node(SyntaxKind::SOURCE_FILE.into());
75    for (kind, range) in lossless_kind_tokens(source) {
76        builder.token(kind.into(), &source[range]);
77    }
78    builder.finish_node();
79    SyntaxNode::new_root(builder.finish())
80}
81
82/// Parse `source` to a structured lossless CST.
83///
84/// Recognizes the 14 single-line directive shapes (PR 2.1a) plus
85/// `TRANSACTION` (PR 2.1b) plus the 4 edge directives `OPTION` /
86/// `INCLUDE` / `PLUGIN` / `CUSTOM` (PR 2.3), and wraps each in its
87/// specific node kind. Trivia attaches per the Directive-
88/// Terminator Rule.
89///
90/// Unrecognized / malformed top-level lines are wrapped in an
91/// `ERROR_NODE` (PR 2.4) — same trivia attachment policy as
92/// recognized directives and the same rule-5 unterminated-at-EOF
93/// behavior. Round-trip byte-identical for every UTF-8 input.
94#[must_use]
95pub fn parse_structured(source: &str) -> SyntaxNode {
96    let tokens: Vec<(SyntaxKind, Range<usize>)> = lossless_kind_tokens(source);
97    let mut builder = GreenNodeBuilder::new();
98    builder.start_node(SyntaxKind::SOURCE_FILE.into());
99
100    let mut pending_leading: Vec<(SyntaxKind, Range<usize>)> = Vec::new();
101    let mut seen_first_content = false;
102    let mut i = 0;
103
104    while i < tokens.len() {
105        let (kind, ref range) = tokens[i];
106        if kind.is_trivia() {
107            pending_leading.push((kind, range.clone()));
108            i += 1;
109            continue;
110        }
111
112        // Non-trivia at the top level. Identify what kind of line
113        // starts here. Both branches share the same trivia-
114        // attachment + node-emission shape: drain pending trivia
115        // around `start_node(kind)` per rule 2 (the FIRST
116        // non-trivia content's pending trivia attaches under
117        // SOURCE_FILE; subsequent runs attach INSIDE the new
118        // node), emit the body, then `finish_node()`.
119        let node_kind = identify_directive(&tokens, i).unwrap_or(SyntaxKind::ERROR_NODE);
120        if seen_first_content {
121            builder.start_node(node_kind.into());
122            emit_tokens(&mut builder, source, std::mem::take(&mut pending_leading));
123        } else {
124            emit_tokens(&mut builder, source, std::mem::take(&mut pending_leading));
125            builder.start_node(node_kind.into());
126        }
127        seen_first_content = true;
128        i = match node_kind {
129            SyntaxKind::TRANSACTION => emit_transaction_body(&mut builder, source, &tokens, i),
130            SyntaxKind::ERROR_NODE => emit_through_terminator(&mut builder, source, &tokens, i),
131            // Recognized directive (PR 2.1a / 2.3 single-line shapes):
132            // header + optional indented META_ENTRY sub-lines.
133            _ => emit_directive_body(&mut builder, source, &tokens, i),
134        };
135        builder.finish_node();
136    }
137
138    // File-trailing trivia: drain any pending under SOURCE_FILE.
139    emit_tokens(&mut builder, source, std::mem::take(&mut pending_leading));
140
141    builder.finish_node();
142    SyntaxNode::new_root(builder.finish())
143}
144
145/// Emit a sequence of `(kind, range)` tokens into the builder.
146fn emit_tokens(
147    builder: &mut GreenNodeBuilder<'_>,
148    source: &str,
149    tokens: impl IntoIterator<Item = (SyntaxKind, Range<usize>)>,
150) {
151    for (kind, range) in tokens {
152        builder.token(kind.into(), &source[range]);
153    }
154}
155
156/// Consume `tokens[i..]` into `builder` up to and including the
157/// next `NEWLINE` token (or EOF). Returns the new index `i`.
158fn emit_through_terminator(
159    builder: &mut GreenNodeBuilder<'_>,
160    source: &str,
161    tokens: &[(SyntaxKind, Range<usize>)],
162    mut i: usize,
163) -> usize {
164    while i < tokens.len() {
165        let (kind, ref range) = tokens[i];
166        builder.token(kind.into(), &source[range.clone()]);
167        i += 1;
168        if kind == SyntaxKind::NEWLINE {
169            break;
170        }
171    }
172    i
173}
174
175/// Consume one indented sub-line of a directive or transaction
176/// body, wrapping it in a `META_ENTRY` node iff it's metadata
177/// (i.e., starts `WS META_KEY ...`).
178///
179/// Phase 2.2a structural wrapping: each metadata sub-line becomes
180/// its own `META_ENTRY` node containing the indent `WHITESPACE`,
181/// the `META_KEY`, the rest of the line's content tokens, and —
182/// when present — the terminator `NEWLINE`. An UNTERMINATED final
183/// metadata sub-line at EOF (per rule 5 of `cst::trivia`) is still
184/// wrapped: its `META_ENTRY` simply ends at the last content token
185/// with no `NEWLINE` child. Token kinds inside the `META_ENTRY`
186/// stay flat — phase 3's typed-AST surface will expose `key()` and
187/// `value()` accessors that walk these children. Indented
188/// `;`-comments flow through as flat children, NOT wrapped in
189/// `META_ENTRY`. POSTING lines are recognized earlier in
190/// `emit_transaction_body` and never reach this helper.
191fn emit_body_sub_line(
192    builder: &mut GreenNodeBuilder<'_>,
193    source: &str,
194    tokens: &[(SyntaxKind, Range<usize>)],
195    i: usize,
196) -> usize {
197    if starts_meta_sub_line(tokens, i) {
198        builder.start_node(SyntaxKind::META_ENTRY.into());
199        let next = emit_through_terminator(builder, source, tokens, i);
200        builder.finish_node();
201        next
202    } else {
203        emit_through_terminator(builder, source, tokens, i)
204    }
205}
206
207/// Returns true iff `tokens[i..]` starts an indented `WS META_KEY ...`
208/// metadata sub-line.
209///
210/// **Single source of truth** for the `WS + META_KEY` recognition
211/// pattern. Used by both `emit_body_sub_line` (decides whether to
212/// open a `META_ENTRY` node around the sub-line) and
213/// `is_indented_directive_continuation`'s `META_KEY` arm (decides
214/// whether the directive body should keep consuming). Routing both
215/// call sites through one helper prevents the predicate-pair drift
216/// hazard where one widens (e.g. admits a different indent token)
217/// without the other and the parser starts consuming sub-lines
218/// without wrapping them, or wrapping sub-lines that the body loop
219/// never reaches.
220fn starts_meta_sub_line(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
221    matches!(tokens.get(i), Some((SyntaxKind::WHITESPACE, _)))
222        && matches!(tokens.get(i + 1), Some((SyntaxKind::META_KEY, _)))
223}
224
225/// Consume the header line through its terminator NEWLINE, then
226/// keep consuming any indented metadata sub-lines OR indented
227/// `;`/`%` comment lines that follow at the same logical block.
228///
229/// The Directive-Terminator Rule (see `cst::trivia`) declares that
230/// a directive carrying metadata spans multiple lines: its last
231/// content token is the last content token of its LAST sub-line,
232/// not the header. Stopping at the header NEWLINE would orphan
233/// metadata under `SOURCE_FILE` and silently violate the rule. PR
234/// 2.1a wraps the full multi-line span; PR 2.2 will introduce a
235/// `META_ENTRY` sub-node around each `WHITESPACE META_KEY ...
236/// NEWLINE` run inside.
237///
238/// A continuation sub-line is recognized as `WHITESPACE` (the
239/// indent) followed by either:
240/// - `META_KEY` — the standard metadata sub-line, or
241/// - any comment-class trivia token (per [`is_comment_token`]: `;`,
242///   `%`, `#!`, `#+`) — an indented documentation comment between
243///   metadata entries (a common Beancount idiom; keeping it inside
244///   the directive prevents subsequent metadata from getting
245///   orphaned to `SOURCE_FILE`).
246///
247/// Anything else — a blank line, a non-indented top-level token,
248/// EOF — terminates the directive. Blank-line separated metadata
249/// blocks are currently a known limitation: a `\n` between two
250/// metadata entries closes the directive and orphans the second
251/// entry. PR 2.2's grammar will likely subsume this when it
252/// introduces `META_ENTRY` structure.
253fn emit_directive_body(
254    builder: &mut GreenNodeBuilder<'_>,
255    source: &str,
256    tokens: &[(SyntaxKind, Range<usize>)],
257    mut i: usize,
258) -> usize {
259    i = emit_through_terminator(builder, source, tokens, i);
260    // PROSPECTIVELY scan the upcoming indented-content block for
261    // any `WS META_KEY`. If the block contains metadata, any
262    // indented comments anywhere in it — including BEFORE the
263    // first META_KEY (the "doc-comment-for-the-following-field"
264    // idiom) — are continuations that belong inside the directive.
265    // If the block contains NO metadata, an indented comment is
266    // inter-directive trivia (rule 2) or file-trailing (rule 4)
267    // and must not be absorbed. Per-line bookkeeping was tried in
268    // v4 but couldn't see the META_KEY that came AFTER a leading
269    // comment, so a comment-before-first-metadata silently closed
270    // the directive and orphaned the metadata.
271    let block_has_meta = upcoming_indented_block_has_meta(tokens, i);
272    while is_indented_directive_continuation(tokens, i, block_has_meta) {
273        i = emit_body_sub_line(builder, source, tokens, i);
274    }
275    i
276}
277
278/// Consume the transaction header through its terminator NEWLINE,
279/// then keep consuming ANY indented sub-line (postings, metadata,
280/// indented comments — any line starting with `WHITESPACE`
281/// followed by a non-`NEWLINE` token).
282///
283/// **Phase 2.2b attributes metadata by indent depth.** Beancount
284/// distinguishes TRANSACTION-level metadata (at the transaction's
285/// standard indent, typically two spaces, before any posting OR
286/// interspersed between postings at that same indent) from
287/// POSTING-attached metadata (at a DEEPER indent following a
288/// posting line). The transaction-level case stays a direct child
289/// of `TRANSACTION`; the posting-attached case becomes a child of
290/// the preceding `POSTING` node.
291///
292/// State machine: walk the body lines while tracking the indent
293/// width of the most-recently-opened `POSTING` (if any). For each
294/// sub-line:
295///
296/// - **Posting line** (`WS [(FLAG | STAR | PENDING_KW | HASH |
297///   single-char CURRENCY) WS] ACCOUNT ...`, full flag set per
298///   [`starts_posting_sub_line`]):
299///   close the open POSTING if any, then open a new POSTING and
300///   consume the line. **Sibling POSTING indents are not required
301///   to be uniform**: a transaction with postings at different
302///   indent depths produces sibling POSTING nodes whose
303///   `open_posting_indent` reflects each one's own header indent.
304///   Subsequent metadata then attributes against the
305///   most-recently-opened POSTING's indent, which means
306///   metadata can attribute differently depending on which
307///   posting precedes it. Beancount's grammar uses uniform
308///   indentation by convention, so this is a defensive (not
309///   primary) shape; pinned by
310///   `postings_at_increasing_indents_produce_siblings_and_meta_attributes_to_latest`.
311/// - **Metadata sub-line** (`WS META_KEY ...`): if a POSTING is
312///   open AND this line's indent is `>=` the POSTING's indent, emit
313///   the `META_ENTRY` INSIDE the POSTING. Otherwise (no open POSTING,
314///   or strictly shallower indent), close any open POSTING and emit
315///   the `META_ENTRY` at TRANSACTION level. The `>=` (not `>`) match
316///   mirrors Beancount, which attributes metadata to the preceding
317///   posting by POSITION, so same-indent `key: value` is posting
318///   metadata.
319/// - **Indented comment line** (`WS COMMENT` / `WS PERCENT_COMMENT`):
320///   apply the same indent-attribution rule as metadata. If the
321///   comment is strictly more indented than the open POSTING, it
322///   stays INSIDE the POSTING (preserving the doc-comment-for-
323///   following-posting-metadata idiom — a deeper-indented `; doc`
324///   followed by deeper-indented `key: value` should both belong
325///   to the same posting). Otherwise close any open POSTING and
326///   emit the comment flat at TRANSACTION level (matches the
327///   `posting_with_indented_comment_between_postings_terminates_posting`
328///   test, where the comment is at the SAME indent as the postings
329///   and is therefore transaction-level inter-posting trivia).
330/// - **Any other indented content** (`WS STRING`, `WS NUMBER`,
331///   unrecognized shape): close any open POSTING and emit the line
332///   flat at TRANSACTION level. We don't know what to do with it
333///   structurally; flat-passthrough preserves bytes.
334///
335/// Indent width is measured as the BYTE LENGTH of the leading
336/// `WHITESPACE` token — sufficient when the source uses uniform
337/// spaces (the standard Beancount convention). **Known divergence
338/// from the legacy AST parser**: the legacy lexer's `Indent(N)` /
339/// `DeepIndent(N)` variants (`logos_lexer.rs:615-616`) count tabs
340/// as 4 spaces, so a tab-indented posting followed by space-
341/// indented metadata is compared by VISUAL columns there but by
342/// BYTE COUNT here. The two paths can disagree on mixed-indent
343/// files. No test corpus file currently triggers the divergence in
344/// posting-attached-metadata position; if one shows up, switching
345/// `indent_width` to a column-aware count is the fix.
346///
347/// Compared with `emit_directive_body` (which only continues on
348/// `WS META_KEY` and gated `WS COMMENT`), transactions have a
349/// looser body shape. PR 2.2c will introduce `AMOUNT` /
350/// `COST_SPEC` / `PRICE_ANNOTATION` sub-nodes INSIDE `POSTING`;
351/// for now the POSTING's content tokens (account, amount,
352/// currency, etc.) stay flat children of POSTING.
353///
354/// Termination: a blank line (NEWLINE alone, or WHITESPACE then
355/// NEWLINE), any non-indented top-level token, or EOF. Any open
356/// POSTING is closed before returning.
357fn emit_transaction_body(
358    builder: &mut GreenNodeBuilder<'_>,
359    source: &str,
360    tokens: &[(SyntaxKind, Range<usize>)],
361    mut i: usize,
362) -> usize {
363    i = emit_through_terminator(builder, source, tokens, i);
364
365    let mut open_posting_indent: Option<usize> = None;
366
367    while is_indented_transaction_body_line(tokens, i) {
368        let sub_line_indent = indent_width(tokens, i);
369
370        if starts_posting_sub_line(tokens, i) {
371            if open_posting_indent.is_some() {
372                builder.finish_node();
373            }
374            builder.start_node(SyntaxKind::POSTING.into());
375            open_posting_indent = Some(sub_line_indent);
376            i = emit_posting_line(builder, source, tokens, i);
377        } else if starts_meta_sub_line(tokens, i) {
378            // Beancount attributes metadata by POSITION: a `key: value`
379            // line following a posting attaches to that posting, even
380            // at the SAME indent (`attach_on_equal = true`).
381            close_open_posting_unless_attached(
382                builder,
383                &mut open_posting_indent,
384                sub_line_indent,
385                true,
386            );
387            i = emit_body_sub_line(builder, source, tokens, i);
388        } else if starts_indented_comment(tokens, i) {
389            // Comments use the STRICT (`>`) rule: deeper-indented
390            // comments stay INSIDE the open POSTING; same-or-shallower
391            // comments close the POSTING and emit flat at TRANSACTION
392            // level. Comments are AST-invisible, so this only affects
393            // formatter emission placement.
394            close_open_posting_unless_attached(
395                builder,
396                &mut open_posting_indent,
397                sub_line_indent,
398                false,
399            );
400            i = emit_through_terminator(builder, source, tokens, i);
401        } else {
402            // Catch-all: any other indented content (e.g., `WS
403            // STRING`, `WS NUMBER`, or unrecognized shapes that
404            // future error-recovery work might surface). Close any
405            // open POSTING and emit flat at TRANSACTION level. PR
406            // 2.2c (AMOUNT / COST_SPEC / PRICE_ANNOTATION) lives
407            // INSIDE a `POSTING` and reaches the parser through
408            // `starts_posting_sub_line`, never this branch — but
409            // if a future continuation form (e.g., multi-line
410            // postings) gets added, this branch is where it would
411            // need to be teased apart from genuine other content.
412            if open_posting_indent.is_some() {
413                builder.finish_node();
414                open_posting_indent = None;
415            }
416            i = emit_through_terminator(builder, source, tokens, i);
417        }
418    }
419
420    if open_posting_indent.is_some() {
421        builder.finish_node();
422    }
423
424    i
425}
426
427/// Consume a posting sub-line through its terminator NEWLINE (or
428/// EOF), wrapping the `AMOUNT`, `COST_SPEC`, and `PRICE_ANNOTATION`
429/// sub-structures inside the already-open `POSTING` node.
430///
431/// Preconditions: the caller has opened a `POSTING` node and is
432/// positioned at the first token of the posting line (`WS`).
433/// `starts_posting_sub_line(tokens, i)` must hold.
434///
435/// Body shape (after the `WS [(flag) WS] ACCOUNT` prefix):
436///
437/// - `AMOUNT` is the units amount: `[(MINUS | PLUS)] NUMBER
438///   [WS CURRENCY]`, or a bare `CURRENCY`. Mirrors the legacy AST
439///   `parse_incomplete_amount`: NUMBER + optional CURRENCY, or
440///   CURRENCY alone. Wrapping skips intervening `WHITESPACE`
441///   between AMOUNT and CURRENCY so the sub-node owns both.
442/// - `COST_SPEC` is a bracketed cost annotation, opened by
443///   `L_BRACE` (per-unit), `L_BRACE_HASH` (per-unit + total), or
444///   `L_DOUBLE_BRACE` (total-only), and closed by the matching
445///   `R_BRACE` / `R_DOUBLE_BRACE`. Contents stay flat children;
446///   phase 3 typed-AST will surface accessors. Per rule 5 of
447///   `cst::trivia`, an unclosed brace at EOF still gets wrapped
448///   (the `COST_SPEC` simply has no matching close-brace child).
449/// - `PRICE_ANNOTATION` is opened by `AT` (per-unit price) or
450///   `AT_AT` (total price). Its trailing amount is recursively
451///   wrapped in `AMOUNT` so the structure mirrors the units-amount
452///   case: `PRICE_ANNOTATION(AT [WS AMOUNT])`. The typed-AST
453///   decodes per-unit-vs-total by the opener token kind, then
454///   walks the `AMOUNT` child for the number/currency.
455///
456/// Canonical order on a well-formed posting line is `ACCOUNT
457/// [AMOUNT] [COST_SPEC] [PRICE_ANNOTATION]`. The state machine
458/// here is order-independent at the recognition level (each sub-
459/// structure wraps when its opener token is encountered), so a
460/// malformed posting with reordered or duplicated sub-structures
461/// still round-trips byte-identically — duplicates each get their
462/// own wrapper.
463///
464/// Trailing tokens (`WHITESPACE`, `COMMENT`, `PERCENT_COMMENT`,
465/// `NEWLINE`) that follow the last recognized sub-structure stay
466/// as flat children of `POSTING`.
467fn emit_posting_line(
468    builder: &mut GreenNodeBuilder<'_>,
469    source: &str,
470    tokens: &[(SyntaxKind, Range<usize>)],
471    mut i: usize,
472) -> usize {
473    // Emit the indent `WHITESPACE`.
474    if let Some((SyntaxKind::WHITESPACE, range)) = tokens.get(i) {
475        builder.token(SyntaxKind::WHITESPACE.into(), &source[range.clone()]);
476        i += 1;
477    }
478
479    // Optional flag (`FLAG` / `STAR` / `PENDING_KW` / `HASH` /
480    // single-char `CURRENCY`) + separating `WHITESPACE`. Mirrors
481    // `starts_posting_sub_line`'s flag arm.
482    let next = tokens.get(i).map(|(k, _)| *k);
483    let is_flag = match next {
484        Some(SyntaxKind::FLAG | SyntaxKind::STAR | SyntaxKind::PENDING_KW | SyntaxKind::HASH) => {
485            true
486        }
487        Some(SyntaxKind::CURRENCY) => tokens[i].1.len() == 1,
488        _ => false,
489    };
490    if is_flag {
491        // Emit flag + WHITESPACE pair.
492        if let Some((kind, range)) = tokens.get(i) {
493            builder.token((*kind).into(), &source[range.clone()]);
494            i += 1;
495        }
496        if let Some((SyntaxKind::WHITESPACE, range)) = tokens.get(i) {
497            builder.token(SyntaxKind::WHITESPACE.into(), &source[range.clone()]);
498            i += 1;
499        }
500    }
501
502    // Emit the required ACCOUNT.
503    if let Some((SyntaxKind::ACCOUNT, range)) = tokens.get(i) {
504        builder.token(SyntaxKind::ACCOUNT.into(), &source[range.clone()]);
505        i += 1;
506    }
507
508    // Scan post-ACCOUNT tokens, wrapping AMOUNT / COST_SPEC /
509    // PRICE_ANNOTATION as openers appear. Anything else flows as
510    // flat children of POSTING.
511    while i < tokens.len() {
512        let (kind, range) = (tokens[i].0, tokens[i].1.clone());
513        if kind == SyntaxKind::NEWLINE {
514            builder.token(kind.into(), &source[range]);
515            i += 1;
516            break;
517        }
518        if starts_amount(tokens, i) {
519            i = emit_amount(builder, source, tokens, i);
520            continue;
521        }
522        if matches!(
523            kind,
524            SyntaxKind::L_BRACE | SyntaxKind::L_BRACE_HASH | SyntaxKind::L_DOUBLE_BRACE,
525        ) {
526            i = emit_cost_spec(builder, source, tokens, i);
527            continue;
528        }
529        if matches!(kind, SyntaxKind::AT | SyntaxKind::AT_AT) {
530            i = emit_price_annotation(builder, source, tokens, i);
531            continue;
532        }
533        // Flat passthrough (WHITESPACE, COMMENT, PERCENT_COMMENT,
534        // anything else).
535        builder.token(kind.into(), &source[range]);
536        i += 1;
537    }
538
539    i
540}
541
542/// Returns true iff `tokens[i..]` starts an AMOUNT-shape token
543/// run: an arithmetic-expression operand (`NUMBER`, `L_PAREN`, or
544/// signed variants), or a bare `CURRENCY`. Used by
545/// `emit_posting_line` to gate whether to open an `AMOUNT` wrapper.
546fn starts_amount(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
547    match tokens.get(i).map(|(k, _)| *k) {
548        Some(SyntaxKind::NUMBER | SyntaxKind::CURRENCY | SyntaxKind::L_PAREN) => true,
549        Some(SyntaxKind::MINUS | SyntaxKind::PLUS) => matches!(
550            tokens.get(i + 1).map(|(k, _)| *k),
551            Some(SyntaxKind::NUMBER | SyntaxKind::L_PAREN),
552        ),
553        _ => false,
554    }
555}
556
557/// Returns true iff `tokens[i]` is an arithmetic operator
558/// (`PLUS` / `MINUS` / `STAR` / `SLASH`).
559const fn is_arith_op(kind: SyntaxKind) -> bool {
560    matches!(
561        kind,
562        SyntaxKind::PLUS | SyntaxKind::MINUS | SyntaxKind::STAR | SyntaxKind::SLASH,
563    )
564}
565
566/// Emit an `AMOUNT` node containing the units amount.
567///
568/// Recognizes Python beancount's `parse_expr` grammar shape:
569/// `[sign] operand ([WS] op [WS] [sign] operand)* [WS CURRENCY]`,
570/// where `operand` is `NUMBER` or a parenthesized sub-expression
571/// `L_PAREN expr R_PAREN`. Also accepts a bare `CURRENCY`
572/// (currency-only amount). Closes the PR 2.2c.1 deferred
573/// divergence: `bean-check` accepts `10+5 USD`, `-10+5 USD`, and
574/// `-(10+5) USD`; this helper now wraps them as a single `AMOUNT`
575/// node containing the full expression tokens flat (sign + operands
576/// + operators + currency).
577///
578/// Stops at the first token that doesn't fit the grammar (e.g.,
579/// `L_BRACE` cost-spec opener, `AT` price opener, `NEWLINE`,
580/// `COMMENT`, etc.). Returns the new index.
581fn emit_amount(
582    builder: &mut GreenNodeBuilder<'_>,
583    source: &str,
584    tokens: &[(SyntaxKind, Range<usize>)],
585    mut i: usize,
586) -> usize {
587    builder.start_node(SyntaxKind::AMOUNT.into());
588
589    // Currency-only amount: bare `CURRENCY` and nothing more.
590    if matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::CURRENCY))
591        && !starts_amount_operand(tokens, i + 1)
592    {
593        let range = tokens[i].1.clone();
594        builder.token(SyntaxKind::CURRENCY.into(), &source[range]);
595        i += 1;
596        builder.finish_node();
597        return i;
598    }
599
600    // Optional leading sign.
601    if matches!(
602        tokens.get(i).map(|(k, _)| *k),
603        Some(SyntaxKind::MINUS | SyntaxKind::PLUS),
604    ) {
605        let (kind, range) = (tokens[i].0, tokens[i].1.clone());
606        builder.token(kind.into(), &source[range]);
607        i += 1;
608    }
609
610    // First operand.
611    i = emit_amount_operand(builder, source, tokens, i);
612
613    // Tail: zero or more `[WS] op [WS] [sign] operand` runs. Each
614    // iteration commits the WS / op / WS / sign tokens BEFORE
615    // dispatching the operand emission. Lookahead-only: do NOT
616    // consume any token until the full op-operand prefix is
617    // confirmed, so a trailing single WHITESPACE before CURRENCY
618    // (the canonical `100 USD` shape) isn't accidentally consumed
619    // as a leading op-prefix.
620    loop {
621        let mut j = i;
622        if matches!(tokens.get(j).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE)) {
623            j += 1;
624        }
625        let Some((op_kind, _)) = tokens.get(j) else {
626            break;
627        };
628        if !is_arith_op(*op_kind) {
629            break;
630        }
631        let op_kind = *op_kind;
632        j += 1;
633        if matches!(tokens.get(j).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE)) {
634            j += 1;
635        }
636        // Optional sign before next operand.
637        let signed = matches!(
638            tokens.get(j).map(|(k, _)| *k),
639            Some(SyntaxKind::MINUS | SyntaxKind::PLUS),
640        );
641        let operand_start = if signed { j + 1 } else { j };
642        if !starts_amount_operand(tokens, operand_start) {
643            break;
644        }
645        // Commit tokens [i..j) (WS? op WS?) into AMOUNT.
646        while i < j {
647            let (kind, range) = (tokens[i].0, tokens[i].1.clone());
648            // Sanity: the only non-op tokens we should be committing
649            // here are WHITESPACE. The op token itself was already
650            // verified.
651            debug_assert!(
652                kind == SyntaxKind::WHITESPACE || kind == op_kind || is_arith_op(kind),
653                "unexpected token kind {kind:?} during op-prefix commit",
654            );
655            builder.token(kind.into(), &source[range]);
656            i += 1;
657        }
658        if signed {
659            let (kind, range) = (tokens[i].0, tokens[i].1.clone());
660            builder.token(kind.into(), &source[range]);
661            i += 1;
662        }
663        i = emit_amount_operand(builder, source, tokens, i);
664    }
665
666    // Optional trailing CURRENCY, either directly adjacent (`100USD`,
667    // `(10+5)USD`) or separated by WHITESPACE (`100 USD`).
668    if matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE))
669        && matches!(
670            tokens.get(i + 1).map(|(k, _)| *k),
671            Some(SyntaxKind::CURRENCY),
672        )
673    {
674        let ws_range = tokens[i].1.clone();
675        builder.token(SyntaxKind::WHITESPACE.into(), &source[ws_range]);
676        i += 1;
677        let cur_range = tokens[i].1.clone();
678        builder.token(SyntaxKind::CURRENCY.into(), &source[cur_range]);
679        i += 1;
680    } else if matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::CURRENCY)) {
681        let cur_range = tokens[i].1.clone();
682        builder.token(SyntaxKind::CURRENCY.into(), &source[cur_range]);
683        i += 1;
684    }
685
686    builder.finish_node();
687    i
688}
689
690/// Returns true iff `tokens[i]` starts an arithmetic-expression
691/// operand (a bare `NUMBER` or a parenthesized sub-expression
692/// opener `L_PAREN`). Used by `emit_amount` to gate operand
693/// emission inside the op-loop tail.
694fn starts_amount_operand(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
695    matches!(
696        tokens.get(i).map(|(k, _)| *k),
697        Some(SyntaxKind::NUMBER | SyntaxKind::L_PAREN),
698    )
699}
700
701/// Emit one operand of an arithmetic expression: either a bare
702/// `NUMBER` or a parenthesized `L_PAREN expr R_PAREN` sub-
703/// expression. The sub-expression's content tokens stay flat
704/// children of the surrounding `AMOUNT` node (no separate
705/// `EXPR` / `PAREN_GROUP` wrapping for now). Per rule 5, an
706/// unclosed paren at EOF or NEWLINE stops without emitting a
707/// closing paren — round-trip preserves bytes.
708fn emit_amount_operand(
709    builder: &mut GreenNodeBuilder<'_>,
710    source: &str,
711    tokens: &[(SyntaxKind, Range<usize>)],
712    mut i: usize,
713) -> usize {
714    match tokens.get(i).map(|(k, _)| *k) {
715        Some(SyntaxKind::NUMBER) => {
716            let range = tokens[i].1.clone();
717            builder.token(SyntaxKind::NUMBER.into(), &source[range]);
718            i += 1;
719        }
720        Some(SyntaxKind::L_PAREN) => {
721            // Emit opener.
722            let range = tokens[i].1.clone();
723            builder.token(SyntaxKind::L_PAREN.into(), &source[range]);
724            i += 1;
725            // Consume balanced content until matching R_PAREN.
726            // Track nesting depth so `((1+2))` works. Stop at
727            // NEWLINE / EOF (rule 5 unterminated case).
728            let mut depth = 1usize;
729            while depth > 0 {
730                let Some((kind, range)) = tokens.get(i) else {
731                    break;
732                };
733                let (kind, range) = (*kind, range.clone());
734                if kind == SyntaxKind::NEWLINE {
735                    break;
736                }
737                builder.token(kind.into(), &source[range]);
738                i += 1;
739                match kind {
740                    SyntaxKind::L_PAREN => depth += 1,
741                    SyntaxKind::R_PAREN => depth -= 1,
742                    _ => {}
743                }
744            }
745        }
746        _ => {}
747    }
748    i
749}
750
751/// Emit a `COST_SPEC` node spanning `L_BRACE` / `L_BRACE_HASH` /
752/// `L_DOUBLE_BRACE` ... matching `R_BRACE` / `R_DOUBLE_BRACE`. Per
753/// rule 5 (unterminated final directive), an unclosed brace at
754/// EOF or hitting a NEWLINE still gets wrapped — the `COST_SPEC`
755/// simply has no matching close-brace child. Contents stay flat
756/// children of `COST_SPEC`.
757fn emit_cost_spec(
758    builder: &mut GreenNodeBuilder<'_>,
759    source: &str,
760    tokens: &[(SyntaxKind, Range<usize>)],
761    mut i: usize,
762) -> usize {
763    builder.start_node(SyntaxKind::COST_SPEC.into());
764
765    // Emit opening brace token.
766    if let Some((kind, range)) = tokens.get(i) {
767        builder.token((*kind).into(), &source[range.clone()]);
768        i += 1;
769    }
770
771    // Emit content tokens up to and including the matching close
772    // brace, or until NEWLINE / EOF (unclosed-brace case).
773    while i < tokens.len() {
774        let (kind, range) = (tokens[i].0, tokens[i].1.clone());
775        if kind == SyntaxKind::NEWLINE {
776            // Unclosed brace: stop BEFORE the NEWLINE so the
777            // NEWLINE remains a sibling of COST_SPEC (the
778            // posting-line terminator), not a child.
779            break;
780        }
781        builder.token(kind.into(), &source[range]);
782        i += 1;
783        if matches!(kind, SyntaxKind::R_BRACE | SyntaxKind::R_DOUBLE_BRACE) {
784            break;
785        }
786    }
787
788    builder.finish_node();
789    i
790}
791
792/// Emit a `PRICE_ANNOTATION` node opened by `AT` or `AT_AT`,
793/// optionally followed by `WS` and a nested `AMOUNT`. The nested
794/// `AMOUNT` mirrors the units-amount wrapping above; the typed-AST
795/// decodes per-unit-vs-total by inspecting the opener token kind
796/// (`AT` vs `AT_AT`) and walks the `AMOUNT` child for the number
797/// and currency. Avoids absorbing a trailing-only `WHITESPACE`
798/// before a comment or `NEWLINE` (only swallows WS that precedes
799/// an actual amount start).
800fn emit_price_annotation(
801    builder: &mut GreenNodeBuilder<'_>,
802    source: &str,
803    tokens: &[(SyntaxKind, Range<usize>)],
804    mut i: usize,
805) -> usize {
806    builder.start_node(SyntaxKind::PRICE_ANNOTATION.into());
807
808    // Emit the `AT` / `AT_AT` opener.
809    if let Some((kind, range)) = tokens.get(i) {
810        builder.token((*kind).into(), &source[range.clone()]);
811        i += 1;
812    }
813
814    // Optional intervening WHITESPACE, but only if an amount
815    // follows; trailing-only WS belongs as a sibling of
816    // PRICE_ANNOTATION, not a child.
817    let ws_then_amount = matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE),)
818        && starts_amount(tokens, i + 1);
819    if ws_then_amount {
820        let ws_range = tokens[i].1.clone();
821        builder.token(SyntaxKind::WHITESPACE.into(), &source[ws_range]);
822        i += 1;
823    }
824    if starts_amount(tokens, i) {
825        i = emit_amount(builder, source, tokens, i);
826    }
827
828    builder.finish_node();
829    i
830}
831
832/// Close any currently-open POSTING node IF the next sub-line at
833/// `sub_line_indent` should NOT be attached to it. Shared between the
834/// `META_ENTRY` and indented-comment branches of
835/// `emit_transaction_body`, which differ ONLY in their same-indent
836/// tie-break (`attach_on_equal`).
837///
838/// `attach_on_equal` selects the attachment threshold:
839///
840/// - **Metadata (`true`)**: a `key: value` sub-line attaches when it
841///   is indented `>=` the open POSTING. This matches Beancount, whose
842///   grammar attributes metadata by POSITION (any `key_value` line
843///   following a posting, before the next posting, attaches to that
844///   posting) rather than by relative indent — so the common
845///   `key: value` at the SAME column as the posting (e.g. the
846///   `effective_date:` idiom) is posting metadata, not transaction
847///   metadata. Pinned by
848///   `same_indent_metadata_attaches_to_preceding_posting`.
849/// - **Indented comment (`false`)**: a `; doc` / `% doc` sub-line
850///   attaches only when STRICTLY more indented (`>`). A same-indent
851///   comment closes the POSTING and emits as transaction-level
852///   inter-posting trivia. Comments are AST-invisible, so this
853///   threshold only affects CST/formatter emission placement; it is
854///   pinned by
855///   `posting_with_indented_comment_between_postings_terminates_posting`
856///   and must stay strict to preserve that formatter contract.
857///
858/// A sub-line below the attachment threshold closes the POSTING.
859/// Called with `open_posting_indent = None` is a no-op (no POSTING to
860/// close).
861fn close_open_posting_unless_attached(
862    builder: &mut GreenNodeBuilder<'_>,
863    open_posting_indent: &mut Option<usize>,
864    sub_line_indent: usize,
865    attach_on_equal: bool,
866) {
867    let attach = open_posting_indent.is_some_and(|p_indent| {
868        if attach_on_equal {
869            sub_line_indent >= p_indent
870        } else {
871            sub_line_indent > p_indent
872        }
873    });
874    if !attach && open_posting_indent.is_some() {
875        builder.finish_node();
876        *open_posting_indent = None;
877    }
878}
879
880/// Returns true iff `tokens[i..]` starts a posting sub-line:
881/// `WHITESPACE` (the indent) followed by `ACCOUNT`, or by an
882/// optional flag (`FLAG` / `STAR` / `PENDING_KW` / `HASH` /
883/// single-char `CURRENCY`) plus another `WHITESPACE` then
884/// `ACCOUNT`. Mirrors the legacy AST parser's `parse_posting` shape
885/// (`parser.rs:866-880`): indent, optional flag, then a required
886/// account. The flag set MUST stay in sync with `parse_flag` in the
887/// legacy parser (`Token::Star | Pending | Flag(_) | Hash` plus
888/// single-char `Currency`) and with `identify_directive`'s
889/// transaction-trigger arm above; drift would silently leave
890/// HASH-flagged or single-char-CURRENCY-flagged posting lines flat
891/// under `TRANSACTION` instead of wrapped in `POSTING`. The single-
892/// char `CURRENCY`-as-flag arm exists because the lexer's priority-3
893/// Currency-vs-Flag tie-break makes letters like `T`/`V`/`F`/`X`
894/// tokenize as `CURRENCY`, but they still function as posting flags
895/// by Beancount convention.
896fn starts_posting_sub_line(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
897    if !matches!(tokens.get(i), Some((SyntaxKind::WHITESPACE, _))) {
898        return false;
899    }
900    if matches!(tokens.get(i + 1), Some((SyntaxKind::ACCOUNT, _))) {
901        return true;
902    }
903    let has_flag = match tokens.get(i + 1) {
904        Some((
905            SyntaxKind::FLAG | SyntaxKind::STAR | SyntaxKind::PENDING_KW | SyntaxKind::HASH,
906            _,
907        )) => true,
908        Some((SyntaxKind::CURRENCY, range)) => range.len() == 1,
909        _ => false,
910    };
911    if !has_flag {
912        return false;
913    }
914    matches!(tokens.get(i + 2), Some((SyntaxKind::WHITESPACE, _)))
915        && matches!(tokens.get(i + 3), Some((SyntaxKind::ACCOUNT, _)))
916}
917
918/// Byte length of the leading `WHITESPACE` token at `tokens[i]`,
919/// or 0 if there is no leading whitespace. Used by
920/// `emit_transaction_body` to decide whether a metadata or
921/// comment sub-line's indent is strictly deeper than the
922/// surrounding POSTING's indent (the posting-attached-metadata /
923/// posting-attached-comment rule).
924///
925/// **Known divergence from the legacy AST parser**: the legacy
926/// lexer's `Indent(N)` / `DeepIndent(N)` variants
927/// (`logos_lexer.rs:615-616`) count tabs as 4 spaces, but this
928/// helper returns raw bytes. Mixed tab+space indentation can
929/// therefore produce different attribution between the two paths.
930/// Acceptable for now because (a) Beancount idiom is uniform
931/// spaces, (b) no corpus file currently triggers the divergence in
932/// posting-attached-metadata position, and (c) the CST round-trip
933/// is byte-identical regardless of how `indent_width` classifies.
934/// If a file shows up, switch to a column-aware count.
935fn indent_width(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> usize {
936    match tokens.get(i) {
937        Some((SyntaxKind::WHITESPACE, range)) => range.len(),
938        _ => 0,
939    }
940}
941
942/// Returns true iff `kind` is one of the four comment-class trivia
943/// token kinds: `COMMENT` (`;`), `PERCENT_COMMENT` (`%`), `SHEBANG`
944/// (`#!`), or `EMACS_DIRECTIVE` (`#+`). Mirrors the comment subset
945/// of `SyntaxKind::is_trivia()` and is the single source of truth
946/// for the three call sites that need to decide whether a token
947/// "is a comment" for body-continuation / indent-attribution
948/// purposes (`starts_indented_comment`,
949/// `upcoming_indented_block_has_meta`,
950/// `is_indented_directive_continuation`). A new comment-class
951/// token would otherwise require three coordinated edits;
952/// `is_comment_token_covers_all_comment_class_trivia` in this
953/// module's tests asserts membership stays in sync with `is_trivia`.
954///
955/// **Known CST/AST divergence**: The legacy AST parser's
956/// `parse_posting_metadata` / `parse_transaction_directive` paths
957/// in `crates/rustledger-parser/src/parser.rs` only treat
958/// `Token::Comment` and `Token::PercentComment` as in-body trivia
959/// for transaction / directive bodies. `Token::Shebang` and
960/// `Token::EmacsDirective` are processed only at top level
961/// (`parse_directive` dispatch). So a deeper-indented `#+STARTUP:
962/// overview` between two postings is INSIDE the POSTING for the
963/// CST but TERMINATES the transaction for the AST. Phase-isolated
964/// in practice: the loader, LSP, validator, query, booking, and
965/// CLI all run through the AST path; the only current
966/// `parse_structured` consumers are this crate's corpus baseline
967/// test and `examples/dump_top_level_directives.rs`. Phase 5
968/// deletes `parse_flat` and the AST; that reconciliation should
969/// adopt the CST behavior (consistent with `is_trivia()`'s
970/// classification of all four comment-class tokens) rather than
971/// the AST behavior (an indented comment-class line silently
972/// terminating the directive is the surprising outcome).
973const fn is_comment_token(kind: SyntaxKind) -> bool {
974    matches!(
975        kind,
976        SyntaxKind::COMMENT
977            | SyntaxKind::PERCENT_COMMENT
978            | SyntaxKind::SHEBANG
979            | SyntaxKind::EMACS_DIRECTIVE,
980    )
981}
982
983/// Returns true iff `tokens[i..]` starts an indented comment line:
984/// `WHITESPACE` (the indent) followed by a comment-class token (per
985/// [`is_comment_token`]). Used by `emit_transaction_body` to apply
986/// the same indent-attribution rule to comments that it applies to
987/// metadata.
988fn starts_indented_comment(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
989    matches!(tokens.get(i), Some((SyntaxKind::WHITESPACE, _)))
990        && matches!(tokens.get(i + 1), Some((k, _)) if is_comment_token(*k))
991}
992
993/// Returns true iff `tokens[i..]` starts an indented line with
994/// actual content: `WHITESPACE` followed by ANY non-`NEWLINE`
995/// token. A blank line (`NEWLINE` alone, or `WHITESPACE NEWLINE`)
996/// or EOF terminates the transaction body.
997///
998/// **Deliberate divergence from rule 4 of `cst::trivia`:** unlike
999/// the single-line-directive body, a TRANSACTION body absorbs an
1000/// indented trailing `;`-comment AT EOF (file-trailing-ish) into
1001/// the directive. Rationale: documentation comments interleaved
1002/// with postings are a Beancount idiom, and forcing the body to
1003/// "back-track" the last comment if it's trailing would require
1004/// look-ahead the per-line predicate can't do without extra state.
1005/// Pinned by `transaction_trailing_indented_comment_at_eof_stays_inside`.
1006fn is_indented_transaction_body_line(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
1007    if !matches!(tokens.get(i), Some((SyntaxKind::WHITESPACE, _))) {
1008        return false;
1009    }
1010    !matches!(tokens.get(i + 1), Some((SyntaxKind::NEWLINE, _)) | None)
1011}
1012
1013/// Scan forward through any indented `WS META_KEY` sub-lines or
1014/// `WS <comment>` sub-lines (per [`is_comment_token`]) starting at
1015/// `tokens[i..]`, returning `true` iff at least one of them is a
1016/// metadata (`WS META_KEY`) sub-line. Stops at the first line that
1017/// is neither metadata nor an indented comment (blank line,
1018/// non-indented top-level content, EOF).
1019fn upcoming_indented_block_has_meta(tokens: &[(SyntaxKind, Range<usize>)], mut i: usize) -> bool {
1020    loop {
1021        let head = tokens.get(i).map(|(k, _)| *k);
1022        let next = tokens.get(i + 1).map(|(k, _)| *k);
1023        match (head, next) {
1024            (Some(SyntaxKind::WHITESPACE), Some(SyntaxKind::META_KEY)) => return true,
1025            (Some(SyntaxKind::WHITESPACE), Some(k)) if is_comment_token(k) => {
1026                // Skip past this indented-comment line.
1027                while i < tokens.len() && tokens[i].0 != SyntaxKind::NEWLINE {
1028                    i += 1;
1029                }
1030                if i >= tokens.len() {
1031                    return false;
1032                }
1033                i += 1; // past the NEWLINE
1034            }
1035            _ => return false,
1036        }
1037    }
1038}
1039
1040/// Returns true iff `tokens[i..]` starts an indented line that
1041/// CONTINUES the current multi-line directive: `WHITESPACE` (the
1042/// indent) followed by content that visually "belongs to" the
1043/// metadata block.
1044///
1045/// Recognizes:
1046/// - `WS META_KEY` — always a continuation regardless of context.
1047/// - `WS <comment>` (per [`is_comment_token`]) — a continuation iff
1048///   the surrounding indented block contains ANY `WS META_KEY` (the
1049///   `block_has_meta` argument). This prevents absorbing indented
1050///   comments that follow a header-only directive (rule 2 / rule
1051///   4 cases) while still keeping documentation comments BEFORE
1052///   the first metadata entry inside the directive.
1053///
1054/// All other shapes (blank `\n`, non-indented content, EOF)
1055/// terminate the directive.
1056fn is_indented_directive_continuation(
1057    tokens: &[(SyntaxKind, Range<usize>)],
1058    i: usize,
1059    block_has_meta: bool,
1060) -> bool {
1061    // The META_KEY arm routes through `starts_meta_sub_line` so the
1062    // continuation predicate and the wrapping predicate
1063    // (`emit_body_sub_line`) cannot drift.
1064    if starts_meta_sub_line(tokens, i) {
1065        return true;
1066    }
1067    if !matches!(tokens.get(i), Some((SyntaxKind::WHITESPACE, _))) {
1068        return false;
1069    }
1070    match tokens.get(i + 1) {
1071        Some((k, _)) if is_comment_token(*k) => block_has_meta,
1072        _ => false,
1073    }
1074}
1075
1076/// Given the token slice and the index of a non-trivia token,
1077/// decide whether it starts a recognized top-level directive of
1078/// any kind. Returns the directive `SyntaxKind` if yes, `None`
1079/// otherwise (random content that doesn't fit a known shape — the
1080/// caller wraps such content in an `ERROR_NODE` per PR 2.4).
1081///
1082/// Beancount directive line shapes recognized here:
1083///
1084/// - `DATE WHITESPACE <KEYWORD> ...`: OPEN / CLOSE / BALANCE / PAD
1085///   / EVENT / QUERY / NOTE / DOCUMENT / PRICE / COMMODITY (PR
1086///   2.1a) + CUSTOM (PR 2.3)
1087/// - `DATE WHITESPACE <txn-trigger> ...`: TRANSACTION (PR 2.1b),
1088///   where `<txn-trigger>` is one of `STAR` / `PENDING_KW` (`!`)
1089///   / `FLAG` / `HASH` / `TXN_KW` / `STRING` ("implied" txn form
1090///   with no explicit flag) / single-char `CURRENCY` (ticker
1091///   letters). Mirrors `parse_dated_directive` in the legacy AST
1092///   parser at parser.rs:1707-1715.
1093/// - `<KEYWORD> ...` (no leading date): PUSHTAG / POPTAG /
1094///   PUSHMETA / POPMETA (PR 2.1a) + OPTION / INCLUDE / PLUGIN
1095///   (PR 2.3)
1096fn identify_directive(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> Option<SyntaxKind> {
1097    let (head, _) = tokens.get(i)?;
1098    match *head {
1099        // Top-level keyword directives — no leading date.
1100        SyntaxKind::PUSHTAG_KW => Some(SyntaxKind::PUSHTAG_DIRECTIVE),
1101        SyntaxKind::POPTAG_KW => Some(SyntaxKind::POPTAG_DIRECTIVE),
1102        SyntaxKind::PUSHMETA_KW => Some(SyntaxKind::PUSHMETA_DIRECTIVE),
1103        SyntaxKind::POPMETA_KW => Some(SyntaxKind::POPMETA_DIRECTIVE),
1104
1105        // Phase 2.3: edge directives (option / include / plugin).
1106        // These are top-level keyword directives — like
1107        // pushtag/poptag/pushmeta/popmeta above — so the same
1108        // single-line directive body shape applies. Their full
1109        // header is consumed by `emit_through_terminator`; trailing
1110        // indented metadata lines (a rare but legal Beancount idiom
1111        // for option / include / plugin) are absorbed by
1112        // `emit_directive_body`'s look-ahead, same as the other
1113        // top-level-keyword directives.
1114        SyntaxKind::OPTION_KW => Some(SyntaxKind::OPTION_DIRECTIVE),
1115        SyntaxKind::INCLUDE_KW => Some(SyntaxKind::INCLUDE_DIRECTIVE),
1116        SyntaxKind::PLUGIN_KW => Some(SyntaxKind::PLUGIN_DIRECTIVE),
1117
1118        // Dated directives — peek past SAME-LINE whitespace for the
1119        // keyword. Only WHITESPACE separates content tokens within a
1120        // directive's header line; a NEWLINE means we crossed into
1121        // the next line and the DATE/keyword pair is NOT a single
1122        // directive. Skipping `is_trivia()` (which includes NEWLINE
1123        // and COMMENT) would wrongly identify malformed `DATE\nopen ...`
1124        // as OPEN_DIRECTIVE while `emit_through_terminator` only
1125        // captures the first line, leaving the keyword orphaned.
1126        SyntaxKind::DATE => {
1127            let mut j = i + 1;
1128            while j < tokens.len() && tokens[j].0 == SyntaxKind::WHITESPACE {
1129                j += 1;
1130            }
1131            let (next, _) = tokens.get(j)?;
1132            match *next {
1133                SyntaxKind::OPEN_KW => Some(SyntaxKind::OPEN_DIRECTIVE),
1134                SyntaxKind::CLOSE_KW => Some(SyntaxKind::CLOSE_DIRECTIVE),
1135                SyntaxKind::BALANCE_KW => Some(SyntaxKind::BALANCE_DIRECTIVE),
1136                SyntaxKind::PAD_KW => Some(SyntaxKind::PAD_DIRECTIVE),
1137                SyntaxKind::EVENT_KW => Some(SyntaxKind::EVENT_DIRECTIVE),
1138                SyntaxKind::QUERY_KW => Some(SyntaxKind::QUERY_DIRECTIVE),
1139                SyntaxKind::NOTE_KW => Some(SyntaxKind::NOTE_DIRECTIVE),
1140                SyntaxKind::DOCUMENT_KW => Some(SyntaxKind::DOCUMENT_DIRECTIVE),
1141                SyntaxKind::PRICE_KW => Some(SyntaxKind::PRICE_DIRECTIVE),
1142                SyntaxKind::COMMODITY_KW => Some(SyntaxKind::COMMODITY_DIRECTIVE),
1143                // Phase 2.3: CUSTOM is a dated directive with a
1144                // type-name STRING followed by an arbitrary value
1145                // list (STRING / ACCOUNT / amount / DATE / CURRENCY
1146                // / BOOL_TRUE / BOOL_FALSE). The header consumption
1147                // is identical to the other dated single-line
1148                // directives; only the value list is open-ended,
1149                // which is fine for the CST since the trailing
1150                // tokens stay flat.
1151                SyntaxKind::CUSTOM_KW => Some(SyntaxKind::CUSTOM_DIRECTIVE),
1152                // Transaction triggers after the DATE. Beancount
1153                // accepts:
1154                // - `*` (STAR) for completed transactions
1155                // - `!` (PENDING_KW) for incomplete/warning
1156                // - letter flags P/S/T/C/U/R/M/?/& (FLAG)
1157                // - `#` (HASH) promoted to a flag in this position
1158                //   (cf. `Token::is_txn_flag` and the AST parser's
1159                //   `parse_flag` accepting Hash)
1160                // - the explicit `txn` keyword (TXN_KW)
1161                // - a bare STRING ("implied transaction": the AST
1162                //   parser at parser.rs:1713 dispatches
1163                //   `Token::String(_)` to `parse_transaction_directive`
1164                //   with an implied `*` flag; common shorthand
1165                //   form in real ledgers like
1166                //   `2024-01-15 "Coffee"`)
1167                SyntaxKind::STAR
1168                | SyntaxKind::PENDING_KW
1169                | SyntaxKind::FLAG
1170                | SyntaxKind::HASH
1171                | SyntaxKind::TXN_KW
1172                | SyntaxKind::STRING => Some(SyntaxKind::TRANSACTION),
1173                // Single-character CURRENCY: NYSE/NASDAQ-style
1174                // ticker letters (T, V, F, X, ...) double as
1175                // transaction flags. The lexer prioritizes
1176                // CURRENCY over FLAG for single uppercase letters
1177                // (logos_lexer Currency priority 3); the AST parser
1178                // (`parse_flag` arm `Token::Currency(s) if s.len() == 1`)
1179                // mirrors this. We do the same to stay consistent
1180                // with the established lexer/parser contract.
1181                SyntaxKind::CURRENCY if tokens[j].1.len() == 1 => Some(SyntaxKind::TRANSACTION),
1182                // Anything else: unknown shape.
1183                _ => None,
1184            }
1185        }
1186        _ => None,
1187    }
1188}
1189
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that both CST builders are lossless for `source`: the
    /// tree produced by `parse_flat` and the one produced by
    /// `parse_structured` must each reproduce the input text exactly.
    /// The assertion labels say which builder broke the round-trip.
    fn assert_round_trips(source: &str) {
        let tree = parse_flat(source);
        assert_eq!(
            tree.text().to_string(),
            source,
            "parse_flat did not round-trip the source",
        );
        let structured = parse_structured(source);
        assert_eq!(
            structured.text().to_string(),
            source,
            "parse_structured did not round-trip the source",
        );
    }

    /// Drift guard: `is_comment_token` and `is_trivia` must agree on
    /// what counts as comment-class trivia. Enforces two invariants:
    ///
    /// 1. `is_trivia() ⊆ is_comment_token ∪ non_comment_trivia`:
    ///    every trivia kind is either a comment or in the explicit
    ///    whitespace-class allow-list. Catches a new lexer-level
    ///    addition to `is_trivia()` that's silently forgotten in
    ///    `is_comment_token`.
    /// 2. `is_comment_token ⊆ is_trivia()`: every kind
    ///    `is_comment_token` says yes to is actually trivia. Catches
    ///    a future edit to `is_comment_token`'s match arm that
    ///    accidentally pulls in a non-trivia content token,
    ///    silently extending indent-attribution to real content
    ///    inside POSTING / directive bodies.
    ///
    /// On failure (1), if the new trivia kind is neither comment-
    /// class nor whitespace-class (e.g., some future
    /// `SECTION_HEADER` that should NOT be absorbed as a
    /// continuation), don't reflexively add it to either set —
    /// revisit whether the body-continuation predicates need a
    /// different abstraction (`is_body_continuation_trivia` or
    /// similar) and propagate the choice to the three call sites.
    #[test]
    fn is_comment_token_covers_all_comment_class_trivia() {
        let non_comment_trivia = [SyntaxKind::BOM, SyntaxKind::WHITESPACE, SyntaxKind::NEWLINE];

        let mut trivia_missed_from_comment: Vec<SyntaxKind> = Vec::new();
        let mut comment_not_trivia: Vec<SyntaxKind> = Vec::new();
        // Walk every possible raw discriminant; values that do not
        // convert to a `SyntaxKind` are skipped.
        for d in 0u16..=u16::MAX {
            let Ok(kind) = SyntaxKind::try_from(d) else {
                continue;
            };
            // Invariant 1: trivia (minus whitespace allow-list) ⊆ comment.
            if kind.is_trivia() && !non_comment_trivia.contains(&kind) && !is_comment_token(kind) {
                trivia_missed_from_comment.push(kind);
            }
            // Invariant 2: comment ⊆ trivia.
            if is_comment_token(kind) && !kind.is_trivia() {
                comment_not_trivia.push(kind);
            }
        }
        // Note on the message below: a `\` line continuation keeps
        // the characters before it and drops the next line's leading
        // whitespace, so a hyphenated word must not be split across
        // the break ("allow- \" would render as "allow- list").
        assert!(
            trivia_missed_from_comment.is_empty(),
            "trivia kinds present in is_trivia() but missing from \
             is_comment_token: {trivia_missed_from_comment:?}. Three \
             options: (a) add them to is_comment_token if they are \
             comment-class; (b) extend the non_comment_trivia \
             allow-list in this test if they are whitespace-class; \
             (c) if they are neither, revisit whether the \
             body-continuation predicates need a different \
             abstraction and propagate the decision to the three \
             call sites.",
        );
        assert!(
            comment_not_trivia.is_empty(),
            "is_comment_token claims these kinds are comments but \
             is_trivia() disagrees: {comment_not_trivia:?}. Either \
             add them to is_trivia() (if they really are trivia) or \
             remove them from is_comment_token (if they are content \
             tokens that should not be absorbed as comment \
             continuations).",
        );
    }

    #[test]
    fn empty_source() {
        assert_round_trips("");
    }

    #[test]
    fn whitespace_only() {
        assert_round_trips("   \t  ");
    }

    #[test]
    fn bom_round_trips() {
        assert_round_trips("\u{FEFF}2024-01-01 open Assets:Bank\n");
    }

    #[test]
    fn full_directive_round_trips() {
        assert_round_trips(
            "2024-01-01 open Assets:Bank USD\n\
             2024-01-15 * \"Coffee\"\n  \
               Assets:Bank  -5.00 USD\n  \
               Expenses:Food\n",
        );
    }

    #[test]
    fn line_comment_round_trips() {
        assert_round_trips("; preamble\n2024-01-01 open Assets:Bank\n");
    }

    #[test]
    fn no_trailing_newline_round_trips() {
        assert_round_trips("2024-01-01 open Assets:Bank");
    }

    /// The root node is `SOURCE_FILE` for both builders, even when
    /// the input is empty.
    #[test]
    fn root_kind_is_source_file() {
        let tree = parse_flat("");
        assert_eq!(tree.kind(), SyntaxKind::SOURCE_FILE);
        let structured = parse_structured("");
        assert_eq!(structured.kind(), SyntaxKind::SOURCE_FILE);
    }
}
rustledger_parser/cst/parser.rs

rustledger_parser/cst/
parser.rs