rustledger_parser/cst/parser.rs
1//! CST builders: phase 1 flat ([`parse_flat`]) + phase 2.1-2.4
2//! structured ([`parse_structured`]).
3//!
4//! Both walk the lossless token stream and emit a `GreenNode` whose
5//! `text()` is byte-identical to the input source. They differ in
6//! what they wrap:
7//!
8//! - [`parse_flat`] (phase 1) puts every token as a direct child of
9//! a single `SOURCE_FILE` node. Useful for round-trip-only tests
10//! and the kind-sequence corpus baseline.
11//! - [`parse_structured`] recognizes:
12//! - **Phase 2.1a**: 14 single-line directive shapes —
13//! `OPEN`/`CLOSE`/`BALANCE`/`PAD`/`EVENT`/`QUERY`/`NOTE`/
14//! `DOCUMENT`/`PRICE`/`COMMODITY` (dated) +
15//! `PUSHTAG`/`POPTAG`/`PUSHMETA`/`POPMETA` (top-level keyword).
16//! - **Phase 2.1b**: `TRANSACTION` — DATE + `STAR` / `PENDING_KW`
17//! (`!`) / `FLAG` / `TXN_KW`, multi-line scope through the last
18//! indented sub-line (postings, metadata, indented comments).
19//!
20//! Each wraps in its specific node kind per the Directive-
21//! Terminator Rule (see [`crate::cst::trivia`]).
22//!
23//! - **Phase 2.3**: edge directives —
24//! `OPTION_DIRECTIVE` / `INCLUDE_DIRECTIVE` /
25//! `PLUGIN_DIRECTIVE` (top-level keyword) +
26//! `CUSTOM_DIRECTIVE` (dated with arbitrary trailing value
27//! list). Body / metadata shape is identical to PR 2.1a's
28//! dated and standalone-keyword directives — only the header
29//! keyword recognition is new.
30//!
31//! - **Phase 2.4**: error recovery — unrecognized / malformed
32//! top-level lines are wrapped in `ERROR_NODE` (terminated by
33//! NEWLINE or EOF per rule 5). Same trivia attachment policy
34//! as recognized directives (rule 2): pending leading trivia
35//! attaches inside the `ERROR_NODE` when it's not the very
36//! first content in the file. AMOUNT now also wraps full
37//! arithmetic expressions (`[sign] (NUMBER | PAREN_EXPR)
38//! ([WS] op [WS] (NUMBER | PAREN_EXPR))* [WS CURRENCY]`),
39//! closing the deferred 2.2c.1 divergence with Python
40//! beancount on `10+5 USD`-shape amounts.
41//!
42//! Phase 2.2a adds `META_ENTRY` sub-node structure around indented
43//! `WS META_KEY ... (NEWLINE | EOF)` sub-lines inside any directive
44//! or transaction (per rule 5 of `cst::trivia`, an unterminated
45//! final sub-line at EOF still gets wrapped). Phase 2.2b adds
46//! `POSTING` sub-node structure around each `WS [(FLAG | STAR |
47//! PENDING_KW | HASH | single-char CURRENCY) WS] ACCOUNT ...`
48//! posting line inside `TRANSACTION` (the flag arm mirrors
49//! `parse_flag` in the legacy AST parser and `identify_directive`'s
50//! transaction-trigger arm; single-char `CURRENCY` covers letters
51//! like `T`/`V`/`F`/`X` that win the lexer's priority-3 Currency-
52//! vs-Flag tie-break). Posting-attached metadata (`META_ENTRY` sub-
53//! lines following the posting, indented `>=` the posting) becomes a
54//! child of that `POSTING`. Phase 2.2c adds `AMOUNT` / `COST_SPEC` /
55//! `PRICE_ANNOTATION` inside `POSTING`. Phase 5 deletes
56//! `parse_flat` once `parse_structured` covers every byte in
57//! every corpus file.
58
59use std::ops::Range;
60
61use rowan::GreenNodeBuilder;
62
63use crate::cst::lossless_tokens::lossless_kind_tokens;
64use crate::cst::syntax_kind::{SyntaxKind, SyntaxNode};
65
66/// Parse `source` to a flat lossless CST.
67///
68/// The returned node's text serialization equals `source` byte-for-
69/// byte for every UTF-8 input. Every token is a direct child of
70/// `SOURCE_FILE`; no structural directive wrapping.
71#[must_use]
72pub fn parse_flat(source: &str) -> SyntaxNode {
73 let mut builder = GreenNodeBuilder::new();
74 builder.start_node(SyntaxKind::SOURCE_FILE.into());
75 for (kind, range) in lossless_kind_tokens(source) {
76 builder.token(kind.into(), &source[range]);
77 }
78 builder.finish_node();
79 SyntaxNode::new_root(builder.finish())
80}
81
82/// Parse `source` to a structured lossless CST.
83///
84/// Recognizes the 14 single-line directive shapes (PR 2.1a) plus
85/// `TRANSACTION` (PR 2.1b) plus the 4 edge directives `OPTION` /
86/// `INCLUDE` / `PLUGIN` / `CUSTOM` (PR 2.3), and wraps each in its
87/// specific node kind. Trivia attaches per the Directive-
88/// Terminator Rule.
89///
90/// Unrecognized / malformed top-level lines are wrapped in an
91/// `ERROR_NODE` (PR 2.4) — same trivia attachment policy as
92/// recognized directives and the same rule-5 unterminated-at-EOF
93/// behavior. Round-trip byte-identical for every UTF-8 input.
94#[must_use]
95pub fn parse_structured(source: &str) -> SyntaxNode {
96 let tokens: Vec<(SyntaxKind, Range<usize>)> = lossless_kind_tokens(source);
97 let mut builder = GreenNodeBuilder::new();
98 builder.start_node(SyntaxKind::SOURCE_FILE.into());
99
100 let mut pending_leading: Vec<(SyntaxKind, Range<usize>)> = Vec::new();
101 let mut seen_first_content = false;
102 let mut i = 0;
103
104 while i < tokens.len() {
105 let (kind, ref range) = tokens[i];
106 if kind.is_trivia() {
107 pending_leading.push((kind, range.clone()));
108 i += 1;
109 continue;
110 }
111
112 // Non-trivia at the top level. Identify what kind of line
113 // starts here. Both branches share the same trivia-
114 // attachment + node-emission shape: drain pending trivia
115 // around `start_node(kind)` per rule 2 (the FIRST
116 // non-trivia content's pending trivia attaches under
117 // SOURCE_FILE; subsequent runs attach INSIDE the new
118 // node), emit the body, then `finish_node()`.
119 let node_kind = identify_directive(&tokens, i).unwrap_or(SyntaxKind::ERROR_NODE);
120 if seen_first_content {
121 builder.start_node(node_kind.into());
122 emit_tokens(&mut builder, source, std::mem::take(&mut pending_leading));
123 } else {
124 emit_tokens(&mut builder, source, std::mem::take(&mut pending_leading));
125 builder.start_node(node_kind.into());
126 }
127 seen_first_content = true;
128 i = match node_kind {
129 SyntaxKind::TRANSACTION => emit_transaction_body(&mut builder, source, &tokens, i),
130 SyntaxKind::ERROR_NODE => emit_through_terminator(&mut builder, source, &tokens, i),
131 // Recognized directive (PR 2.1a / 2.3 single-line shapes):
132 // header + optional indented META_ENTRY sub-lines.
133 _ => emit_directive_body(&mut builder, source, &tokens, i),
134 };
135 builder.finish_node();
136 }
137
138 // File-trailing trivia: drain any pending under SOURCE_FILE.
139 emit_tokens(&mut builder, source, std::mem::take(&mut pending_leading));
140
141 builder.finish_node();
142 SyntaxNode::new_root(builder.finish())
143}
144
145/// Emit a sequence of `(kind, range)` tokens into the builder.
146fn emit_tokens(
147 builder: &mut GreenNodeBuilder<'_>,
148 source: &str,
149 tokens: impl IntoIterator<Item = (SyntaxKind, Range<usize>)>,
150) {
151 for (kind, range) in tokens {
152 builder.token(kind.into(), &source[range]);
153 }
154}
155
156/// Consume `tokens[i..]` into `builder` up to and including the
157/// next `NEWLINE` token (or EOF). Returns the new index `i`.
158fn emit_through_terminator(
159 builder: &mut GreenNodeBuilder<'_>,
160 source: &str,
161 tokens: &[(SyntaxKind, Range<usize>)],
162 mut i: usize,
163) -> usize {
164 while i < tokens.len() {
165 let (kind, ref range) = tokens[i];
166 builder.token(kind.into(), &source[range.clone()]);
167 i += 1;
168 if kind == SyntaxKind::NEWLINE {
169 break;
170 }
171 }
172 i
173}
174
175/// Consume one indented sub-line of a directive or transaction
176/// body, wrapping it in a `META_ENTRY` node iff it's metadata
177/// (i.e., starts `WS META_KEY ...`).
178///
179/// Phase 2.2a structural wrapping: each metadata sub-line becomes
180/// its own `META_ENTRY` node containing the indent `WHITESPACE`,
181/// the `META_KEY`, the rest of the line's content tokens, and —
182/// when present — the terminator `NEWLINE`. An UNTERMINATED final
183/// metadata sub-line at EOF (per rule 5 of `cst::trivia`) is still
184/// wrapped: its `META_ENTRY` simply ends at the last content token
185/// with no `NEWLINE` child. Token kinds inside the `META_ENTRY`
186/// stay flat — phase 3's typed-AST surface will expose `key()` and
187/// `value()` accessors that walk these children. Indented
188/// `;`-comments flow through as flat children, NOT wrapped in
189/// `META_ENTRY`. POSTING lines are recognized earlier in
190/// `emit_transaction_body` and never reach this helper.
191fn emit_body_sub_line(
192 builder: &mut GreenNodeBuilder<'_>,
193 source: &str,
194 tokens: &[(SyntaxKind, Range<usize>)],
195 i: usize,
196) -> usize {
197 if starts_meta_sub_line(tokens, i) {
198 builder.start_node(SyntaxKind::META_ENTRY.into());
199 let next = emit_through_terminator(builder, source, tokens, i);
200 builder.finish_node();
201 next
202 } else {
203 emit_through_terminator(builder, source, tokens, i)
204 }
205}
206
207/// Returns true iff `tokens[i..]` starts an indented `WS META_KEY ...`
208/// metadata sub-line.
209///
210/// **Single source of truth** for the `WS + META_KEY` recognition
211/// pattern. Used by both `emit_body_sub_line` (decides whether to
212/// open a `META_ENTRY` node around the sub-line) and
213/// `is_indented_directive_continuation`'s `META_KEY` arm (decides
214/// whether the directive body should keep consuming). Routing both
215/// call sites through one helper prevents the predicate-pair drift
216/// hazard where one widens (e.g. admits a different indent token)
217/// without the other and the parser starts consuming sub-lines
218/// without wrapping them, or wrapping sub-lines that the body loop
219/// never reaches.
220fn starts_meta_sub_line(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
221 matches!(tokens.get(i), Some((SyntaxKind::WHITESPACE, _)))
222 && matches!(tokens.get(i + 1), Some((SyntaxKind::META_KEY, _)))
223}
224
225/// Consume the header line through its terminator NEWLINE, then
226/// keep consuming any indented metadata sub-lines OR indented
227/// `;`/`%` comment lines that follow at the same logical block.
228///
229/// The Directive-Terminator Rule (see `cst::trivia`) declares that
230/// a directive carrying metadata spans multiple lines: its last
231/// content token is the last content token of its LAST sub-line,
232/// not the header. Stopping at the header NEWLINE would orphan
233/// metadata under `SOURCE_FILE` and silently violate the rule. PR
234/// 2.1a wraps the full multi-line span; PR 2.2 will introduce a
235/// `META_ENTRY` sub-node around each `WHITESPACE META_KEY ...
236/// NEWLINE` run inside.
237///
238/// A continuation sub-line is recognized as `WHITESPACE` (the
239/// indent) followed by either:
240/// - `META_KEY` — the standard metadata sub-line, or
241/// - any comment-class trivia token (per [`is_comment_token`]: `;`,
242/// `%`, `#!`, `#+`) — an indented documentation comment between
243/// metadata entries (a common Beancount idiom; keeping it inside
244/// the directive prevents subsequent metadata from getting
245/// orphaned to `SOURCE_FILE`).
246///
247/// Anything else — a blank line, a non-indented top-level token,
248/// EOF — terminates the directive. Blank-line separated metadata
249/// blocks are currently a known limitation: a `\n` between two
250/// metadata entries closes the directive and orphans the second
251/// entry. PR 2.2's grammar will likely subsume this when it
252/// introduces `META_ENTRY` structure.
253fn emit_directive_body(
254 builder: &mut GreenNodeBuilder<'_>,
255 source: &str,
256 tokens: &[(SyntaxKind, Range<usize>)],
257 mut i: usize,
258) -> usize {
259 i = emit_through_terminator(builder, source, tokens, i);
260 // PROSPECTIVELY scan the upcoming indented-content block for
261 // any `WS META_KEY`. If the block contains metadata, any
262 // indented comments anywhere in it — including BEFORE the
263 // first META_KEY (the "doc-comment-for-the-following-field"
264 // idiom) — are continuations that belong inside the directive.
265 // If the block contains NO metadata, an indented comment is
266 // inter-directive trivia (rule 2) or file-trailing (rule 4)
267 // and must not be absorbed. Per-line bookkeeping was tried in
268 // v4 but couldn't see the META_KEY that came AFTER a leading
269 // comment, so a comment-before-first-metadata silently closed
270 // the directive and orphaned the metadata.
271 let block_has_meta = upcoming_indented_block_has_meta(tokens, i);
272 while is_indented_directive_continuation(tokens, i, block_has_meta) {
273 i = emit_body_sub_line(builder, source, tokens, i);
274 }
275 i
276}
277
278/// Consume the transaction header through its terminator NEWLINE,
279/// then keep consuming ANY indented sub-line (postings, metadata,
280/// indented comments — any line starting with `WHITESPACE`
281/// followed by a non-`NEWLINE` token).
282///
283/// **Phase 2.2b attributes metadata by indent depth.** Beancount
284/// distinguishes TRANSACTION-level metadata (at the transaction's
285/// standard indent, typically two spaces, before any posting OR
286/// interspersed between postings at that same indent) from
287/// POSTING-attached metadata (at a DEEPER indent following a
288/// posting line). The transaction-level case stays a direct child
289/// of `TRANSACTION`; the posting-attached case becomes a child of
290/// the preceding `POSTING` node.
291///
292/// State machine: walk the body lines while tracking the indent
293/// width of the most-recently-opened `POSTING` (if any). For each
294/// sub-line:
295///
296/// - **Posting line** (`WS [(FLAG | STAR | PENDING_KW | HASH |
297/// single-char CURRENCY) WS] ACCOUNT ...`, full flag set per
298/// [`starts_posting_sub_line`]):
299/// close the open POSTING if any, then open a new POSTING and
300/// consume the line. **Sibling POSTING indents are not required
301/// to be uniform**: a transaction with postings at different
302/// indent depths produces sibling POSTING nodes whose
303/// `open_posting_indent` reflects each one's own header indent.
304/// Subsequent metadata then attributes against the
305/// most-recently-opened POSTING's indent, which means
306/// metadata can attribute differently depending on which
307/// posting precedes it. Beancount's grammar uses uniform
308/// indentation by convention, so this is a defensive (not
309/// primary) shape; pinned by
310/// `postings_at_increasing_indents_produce_siblings_and_meta_attributes_to_latest`.
311/// - **Metadata sub-line** (`WS META_KEY ...`): if a POSTING is
312/// open AND this line's indent is `>=` the POSTING's indent, emit
313/// the `META_ENTRY` INSIDE the POSTING. Otherwise (no open POSTING,
314/// or strictly shallower indent), close any open POSTING and emit
315/// the `META_ENTRY` at TRANSACTION level. The `>=` (not `>`) match
316/// mirrors Beancount, which attributes metadata to the preceding
317/// posting by POSITION, so same-indent `key: value` is posting
318/// metadata.
319/// - **Indented comment line** (`WS COMMENT` / `WS PERCENT_COMMENT`):
320/// apply the same indent-attribution rule as metadata. If the
321/// comment is strictly more indented than the open POSTING, it
322/// stays INSIDE the POSTING (preserving the doc-comment-for-
323/// following-posting-metadata idiom — a deeper-indented `; doc`
324/// followed by deeper-indented `key: value` should both belong
325/// to the same posting). Otherwise close any open POSTING and
326/// emit the comment flat at TRANSACTION level (matches the
327/// `posting_with_indented_comment_between_postings_terminates_posting`
328/// test, where the comment is at the SAME indent as the postings
329/// and is therefore transaction-level inter-posting trivia).
330/// - **Any other indented content** (`WS STRING`, `WS NUMBER`,
331/// unrecognized shape): close any open POSTING and emit the line
332/// flat at TRANSACTION level. We don't know what to do with it
333/// structurally; flat-passthrough preserves bytes.
334///
335/// Indent width is measured as the BYTE LENGTH of the leading
336/// `WHITESPACE` token — sufficient when the source uses uniform
337/// spaces (the standard Beancount convention). **Known divergence
338/// from the legacy AST parser**: the legacy lexer's `Indent(N)` /
339/// `DeepIndent(N)` variants (`logos_lexer.rs:615-616`) count tabs
340/// as 4 spaces, so a tab-indented posting followed by space-
341/// indented metadata is compared by VISUAL columns there but by
342/// BYTE COUNT here. The two paths can disagree on mixed-indent
343/// files. No test corpus file currently triggers the divergence in
344/// posting-attached-metadata position; if one shows up, switching
345/// `indent_width` to a column-aware count is the fix.
346///
347/// Compared with `emit_directive_body` (which only continues on
348/// `WS META_KEY` and gated `WS COMMENT`), transactions have a
349/// looser body shape. PR 2.2c will introduce `AMOUNT` /
350/// `COST_SPEC` / `PRICE_ANNOTATION` sub-nodes INSIDE `POSTING`;
351/// for now the POSTING's content tokens (account, amount,
352/// currency, etc.) stay flat children of POSTING.
353///
354/// Termination: a blank line (NEWLINE alone, or WHITESPACE then
355/// NEWLINE), any non-indented top-level token, or EOF. Any open
356/// POSTING is closed before returning.
357fn emit_transaction_body(
358 builder: &mut GreenNodeBuilder<'_>,
359 source: &str,
360 tokens: &[(SyntaxKind, Range<usize>)],
361 mut i: usize,
362) -> usize {
363 i = emit_through_terminator(builder, source, tokens, i);
364
365 let mut open_posting_indent: Option<usize> = None;
366
367 while is_indented_transaction_body_line(tokens, i) {
368 let sub_line_indent = indent_width(tokens, i);
369
370 if starts_posting_sub_line(tokens, i) {
371 if open_posting_indent.is_some() {
372 builder.finish_node();
373 }
374 builder.start_node(SyntaxKind::POSTING.into());
375 open_posting_indent = Some(sub_line_indent);
376 i = emit_posting_line(builder, source, tokens, i);
377 } else if starts_meta_sub_line(tokens, i) {
378 // Beancount attributes metadata by POSITION: a `key: value`
379 // line following a posting attaches to that posting, even
380 // at the SAME indent (`attach_on_equal = true`).
381 close_open_posting_unless_attached(
382 builder,
383 &mut open_posting_indent,
384 sub_line_indent,
385 true,
386 );
387 i = emit_body_sub_line(builder, source, tokens, i);
388 } else if starts_indented_comment(tokens, i) {
389 // Comments use the STRICT (`>`) rule: deeper-indented
390 // comments stay INSIDE the open POSTING; same-or-shallower
391 // comments close the POSTING and emit flat at TRANSACTION
392 // level. Comments are AST-invisible, so this only affects
393 // formatter emission placement.
394 close_open_posting_unless_attached(
395 builder,
396 &mut open_posting_indent,
397 sub_line_indent,
398 false,
399 );
400 i = emit_through_terminator(builder, source, tokens, i);
401 } else {
402 // Catch-all: any other indented content (e.g., `WS
403 // STRING`, `WS NUMBER`, or unrecognized shapes that
404 // future error-recovery work might surface). Close any
405 // open POSTING and emit flat at TRANSACTION level. PR
406 // 2.2c (AMOUNT / COST_SPEC / PRICE_ANNOTATION) lives
407 // INSIDE a `POSTING` and reaches the parser through
408 // `starts_posting_sub_line`, never this branch — but
409 // if a future continuation form (e.g., multi-line
410 // postings) gets added, this branch is where it would
411 // need to be teased apart from genuine other content.
412 if open_posting_indent.is_some() {
413 builder.finish_node();
414 open_posting_indent = None;
415 }
416 i = emit_through_terminator(builder, source, tokens, i);
417 }
418 }
419
420 if open_posting_indent.is_some() {
421 builder.finish_node();
422 }
423
424 i
425}
426
427/// Consume a posting sub-line through its terminator NEWLINE (or
428/// EOF), wrapping the `AMOUNT`, `COST_SPEC`, and `PRICE_ANNOTATION`
429/// sub-structures inside the already-open `POSTING` node.
430///
431/// Preconditions: the caller has opened a `POSTING` node and is
432/// positioned at the first token of the posting line (`WS`).
433/// `starts_posting_sub_line(tokens, i)` must hold.
434///
435/// Body shape (after the `WS [(flag) WS] ACCOUNT` prefix):
436///
437/// - `AMOUNT` is the units amount: `[(MINUS | PLUS)] NUMBER
438/// [WS CURRENCY]`, or a bare `CURRENCY`. Mirrors the legacy AST
439/// `parse_incomplete_amount`: NUMBER + optional CURRENCY, or
440/// CURRENCY alone. Wrapping skips intervening `WHITESPACE`
441/// between AMOUNT and CURRENCY so the sub-node owns both.
442/// - `COST_SPEC` is a bracketed cost annotation, opened by
443/// `L_BRACE` (per-unit), `L_BRACE_HASH` (per-unit + total), or
444/// `L_DOUBLE_BRACE` (total-only), and closed by the matching
445/// `R_BRACE` / `R_DOUBLE_BRACE`. Contents stay flat children;
446/// phase 3 typed-AST will surface accessors. Per rule 5 of
447/// `cst::trivia`, an unclosed brace at EOF still gets wrapped
448/// (the `COST_SPEC` simply has no matching close-brace child).
449/// - `PRICE_ANNOTATION` is opened by `AT` (per-unit price) or
450/// `AT_AT` (total price). Its trailing amount is recursively
451/// wrapped in `AMOUNT` so the structure mirrors the units-amount
452/// case: `PRICE_ANNOTATION(AT [WS AMOUNT])`. The typed-AST
453/// decodes per-unit-vs-total by the opener token kind, then
454/// walks the `AMOUNT` child for the number/currency.
455///
456/// Canonical order on a well-formed posting line is `ACCOUNT
457/// [AMOUNT] [COST_SPEC] [PRICE_ANNOTATION]`. The state machine
458/// here is order-independent at the recognition level (each sub-
459/// structure wraps when its opener token is encountered), so a
460/// malformed posting with reordered or duplicated sub-structures
461/// still round-trips byte-identically — duplicates each get their
462/// own wrapper.
463///
464/// Trailing tokens (`WHITESPACE`, `COMMENT`, `PERCENT_COMMENT`,
465/// `NEWLINE`) that follow the last recognized sub-structure stay
466/// as flat children of `POSTING`.
467fn emit_posting_line(
468 builder: &mut GreenNodeBuilder<'_>,
469 source: &str,
470 tokens: &[(SyntaxKind, Range<usize>)],
471 mut i: usize,
472) -> usize {
473 // Emit the indent `WHITESPACE`.
474 if let Some((SyntaxKind::WHITESPACE, range)) = tokens.get(i) {
475 builder.token(SyntaxKind::WHITESPACE.into(), &source[range.clone()]);
476 i += 1;
477 }
478
479 // Optional flag (`FLAG` / `STAR` / `PENDING_KW` / `HASH` /
480 // single-char `CURRENCY`) + separating `WHITESPACE`. Mirrors
481 // `starts_posting_sub_line`'s flag arm.
482 let next = tokens.get(i).map(|(k, _)| *k);
483 let is_flag = match next {
484 Some(SyntaxKind::FLAG | SyntaxKind::STAR | SyntaxKind::PENDING_KW | SyntaxKind::HASH) => {
485 true
486 }
487 Some(SyntaxKind::CURRENCY) => tokens[i].1.len() == 1,
488 _ => false,
489 };
490 if is_flag {
491 // Emit flag + WHITESPACE pair.
492 if let Some((kind, range)) = tokens.get(i) {
493 builder.token((*kind).into(), &source[range.clone()]);
494 i += 1;
495 }
496 if let Some((SyntaxKind::WHITESPACE, range)) = tokens.get(i) {
497 builder.token(SyntaxKind::WHITESPACE.into(), &source[range.clone()]);
498 i += 1;
499 }
500 }
501
502 // Emit the required ACCOUNT.
503 if let Some((SyntaxKind::ACCOUNT, range)) = tokens.get(i) {
504 builder.token(SyntaxKind::ACCOUNT.into(), &source[range.clone()]);
505 i += 1;
506 }
507
508 // Scan post-ACCOUNT tokens, wrapping AMOUNT / COST_SPEC /
509 // PRICE_ANNOTATION as openers appear. Anything else flows as
510 // flat children of POSTING.
511 while i < tokens.len() {
512 let (kind, range) = (tokens[i].0, tokens[i].1.clone());
513 if kind == SyntaxKind::NEWLINE {
514 builder.token(kind.into(), &source[range]);
515 i += 1;
516 break;
517 }
518 if starts_amount(tokens, i) {
519 i = emit_amount(builder, source, tokens, i);
520 continue;
521 }
522 if matches!(
523 kind,
524 SyntaxKind::L_BRACE | SyntaxKind::L_BRACE_HASH | SyntaxKind::L_DOUBLE_BRACE,
525 ) {
526 i = emit_cost_spec(builder, source, tokens, i);
527 continue;
528 }
529 if matches!(kind, SyntaxKind::AT | SyntaxKind::AT_AT) {
530 i = emit_price_annotation(builder, source, tokens, i);
531 continue;
532 }
533 // Flat passthrough (WHITESPACE, COMMENT, PERCENT_COMMENT,
534 // anything else).
535 builder.token(kind.into(), &source[range]);
536 i += 1;
537 }
538
539 i
540}
541
542/// Returns true iff `tokens[i..]` starts an AMOUNT-shape token
543/// run: an arithmetic-expression operand (`NUMBER`, `L_PAREN`, or
544/// signed variants), or a bare `CURRENCY`. Used by
545/// `emit_posting_line` to gate whether to open an `AMOUNT` wrapper.
546fn starts_amount(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
547 match tokens.get(i).map(|(k, _)| *k) {
548 Some(SyntaxKind::NUMBER | SyntaxKind::CURRENCY | SyntaxKind::L_PAREN) => true,
549 Some(SyntaxKind::MINUS | SyntaxKind::PLUS) => matches!(
550 tokens.get(i + 1).map(|(k, _)| *k),
551 Some(SyntaxKind::NUMBER | SyntaxKind::L_PAREN),
552 ),
553 _ => false,
554 }
555}
556
557/// Returns true iff `tokens[i]` is an arithmetic operator
558/// (`PLUS` / `MINUS` / `STAR` / `SLASH`).
559const fn is_arith_op(kind: SyntaxKind) -> bool {
560 matches!(
561 kind,
562 SyntaxKind::PLUS | SyntaxKind::MINUS | SyntaxKind::STAR | SyntaxKind::SLASH,
563 )
564}
565
566/// Emit an `AMOUNT` node containing the units amount.
567///
568/// Recognizes Python beancount's `parse_expr` grammar shape:
569/// `[sign] operand ([WS] op [WS] [sign] operand)* [WS CURRENCY]`,
570/// where `operand` is `NUMBER` or a parenthesized sub-expression
571/// `L_PAREN expr R_PAREN`. Also accepts a bare `CURRENCY`
572/// (currency-only amount). Closes the PR 2.2c.1 deferred
573/// divergence: `bean-check` accepts `10+5 USD`, `-10+5 USD`, and
574/// `-(10+5) USD`; this helper now wraps them as a single `AMOUNT`
575/// node containing the full expression tokens flat (sign + operands
576/// + operators + currency).
577///
578/// Stops at the first token that doesn't fit the grammar (e.g.,
579/// `L_BRACE` cost-spec opener, `AT` price opener, `NEWLINE`,
580/// `COMMENT`, etc.). Returns the new index.
581fn emit_amount(
582 builder: &mut GreenNodeBuilder<'_>,
583 source: &str,
584 tokens: &[(SyntaxKind, Range<usize>)],
585 mut i: usize,
586) -> usize {
587 builder.start_node(SyntaxKind::AMOUNT.into());
588
589 // Currency-only amount: bare `CURRENCY` and nothing more.
590 if matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::CURRENCY))
591 && !starts_amount_operand(tokens, i + 1)
592 {
593 let range = tokens[i].1.clone();
594 builder.token(SyntaxKind::CURRENCY.into(), &source[range]);
595 i += 1;
596 builder.finish_node();
597 return i;
598 }
599
600 // Optional leading sign.
601 if matches!(
602 tokens.get(i).map(|(k, _)| *k),
603 Some(SyntaxKind::MINUS | SyntaxKind::PLUS),
604 ) {
605 let (kind, range) = (tokens[i].0, tokens[i].1.clone());
606 builder.token(kind.into(), &source[range]);
607 i += 1;
608 }
609
610 // First operand.
611 i = emit_amount_operand(builder, source, tokens, i);
612
613 // Tail: zero or more `[WS] op [WS] [sign] operand` runs. Each
614 // iteration commits the WS / op / WS / sign tokens BEFORE
615 // dispatching the operand emission. Lookahead-only: do NOT
616 // consume any token until the full op-operand prefix is
617 // confirmed, so a trailing single WHITESPACE before CURRENCY
618 // (the canonical `100 USD` shape) isn't accidentally consumed
619 // as a leading op-prefix.
620 loop {
621 let mut j = i;
622 if matches!(tokens.get(j).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE)) {
623 j += 1;
624 }
625 let Some((op_kind, _)) = tokens.get(j) else {
626 break;
627 };
628 if !is_arith_op(*op_kind) {
629 break;
630 }
631 let op_kind = *op_kind;
632 j += 1;
633 if matches!(tokens.get(j).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE)) {
634 j += 1;
635 }
636 // Optional sign before next operand.
637 let signed = matches!(
638 tokens.get(j).map(|(k, _)| *k),
639 Some(SyntaxKind::MINUS | SyntaxKind::PLUS),
640 );
641 let operand_start = if signed { j + 1 } else { j };
642 if !starts_amount_operand(tokens, operand_start) {
643 break;
644 }
645 // Commit tokens [i..j) (WS? op WS?) into AMOUNT.
646 while i < j {
647 let (kind, range) = (tokens[i].0, tokens[i].1.clone());
648 // Sanity: the only non-op tokens we should be committing
649 // here are WHITESPACE. The op token itself was already
650 // verified.
651 debug_assert!(
652 kind == SyntaxKind::WHITESPACE || kind == op_kind || is_arith_op(kind),
653 "unexpected token kind {kind:?} during op-prefix commit",
654 );
655 builder.token(kind.into(), &source[range]);
656 i += 1;
657 }
658 if signed {
659 let (kind, range) = (tokens[i].0, tokens[i].1.clone());
660 builder.token(kind.into(), &source[range]);
661 i += 1;
662 }
663 i = emit_amount_operand(builder, source, tokens, i);
664 }
665
666 // Optional trailing CURRENCY, either directly adjacent (`100USD`,
667 // `(10+5)USD`) or separated by WHITESPACE (`100 USD`).
668 if matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE))
669 && matches!(
670 tokens.get(i + 1).map(|(k, _)| *k),
671 Some(SyntaxKind::CURRENCY),
672 )
673 {
674 let ws_range = tokens[i].1.clone();
675 builder.token(SyntaxKind::WHITESPACE.into(), &source[ws_range]);
676 i += 1;
677 let cur_range = tokens[i].1.clone();
678 builder.token(SyntaxKind::CURRENCY.into(), &source[cur_range]);
679 i += 1;
680 } else if matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::CURRENCY)) {
681 let cur_range = tokens[i].1.clone();
682 builder.token(SyntaxKind::CURRENCY.into(), &source[cur_range]);
683 i += 1;
684 }
685
686 builder.finish_node();
687 i
688}
689
690/// Returns true iff `tokens[i]` starts an arithmetic-expression
691/// operand (a bare `NUMBER` or a parenthesized sub-expression
692/// opener `L_PAREN`). Used by `emit_amount` to gate operand
693/// emission inside the op-loop tail.
694fn starts_amount_operand(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
695 matches!(
696 tokens.get(i).map(|(k, _)| *k),
697 Some(SyntaxKind::NUMBER | SyntaxKind::L_PAREN),
698 )
699}
700
701/// Emit one operand of an arithmetic expression: either a bare
702/// `NUMBER` or a parenthesized `L_PAREN expr R_PAREN` sub-
703/// expression. The sub-expression's content tokens stay flat
704/// children of the surrounding `AMOUNT` node (no separate
705/// `EXPR` / `PAREN_GROUP` wrapping for now). Per rule 5, an
706/// unclosed paren at EOF or NEWLINE stops without emitting a
707/// closing paren — round-trip preserves bytes.
708fn emit_amount_operand(
709 builder: &mut GreenNodeBuilder<'_>,
710 source: &str,
711 tokens: &[(SyntaxKind, Range<usize>)],
712 mut i: usize,
713) -> usize {
714 match tokens.get(i).map(|(k, _)| *k) {
715 Some(SyntaxKind::NUMBER) => {
716 let range = tokens[i].1.clone();
717 builder.token(SyntaxKind::NUMBER.into(), &source[range]);
718 i += 1;
719 }
720 Some(SyntaxKind::L_PAREN) => {
721 // Emit opener.
722 let range = tokens[i].1.clone();
723 builder.token(SyntaxKind::L_PAREN.into(), &source[range]);
724 i += 1;
725 // Consume balanced content until matching R_PAREN.
726 // Track nesting depth so `((1+2))` works. Stop at
727 // NEWLINE / EOF (rule 5 unterminated case).
728 let mut depth = 1usize;
729 while depth > 0 {
730 let Some((kind, range)) = tokens.get(i) else {
731 break;
732 };
733 let (kind, range) = (*kind, range.clone());
734 if kind == SyntaxKind::NEWLINE {
735 break;
736 }
737 builder.token(kind.into(), &source[range]);
738 i += 1;
739 match kind {
740 SyntaxKind::L_PAREN => depth += 1,
741 SyntaxKind::R_PAREN => depth -= 1,
742 _ => {}
743 }
744 }
745 }
746 _ => {}
747 }
748 i
749}
750
751/// Emit a `COST_SPEC` node spanning `L_BRACE` / `L_BRACE_HASH` /
752/// `L_DOUBLE_BRACE` ... matching `R_BRACE` / `R_DOUBLE_BRACE`. Per
753/// rule 5 (unterminated final directive), an unclosed brace at
754/// EOF or hitting a NEWLINE still gets wrapped — the `COST_SPEC`
755/// simply has no matching close-brace child. Contents stay flat
756/// children of `COST_SPEC`.
757fn emit_cost_spec(
758 builder: &mut GreenNodeBuilder<'_>,
759 source: &str,
760 tokens: &[(SyntaxKind, Range<usize>)],
761 mut i: usize,
762) -> usize {
763 builder.start_node(SyntaxKind::COST_SPEC.into());
764
765 // Emit opening brace token.
766 if let Some((kind, range)) = tokens.get(i) {
767 builder.token((*kind).into(), &source[range.clone()]);
768 i += 1;
769 }
770
771 // Emit content tokens up to and including the matching close
772 // brace, or until NEWLINE / EOF (unclosed-brace case).
773 while i < tokens.len() {
774 let (kind, range) = (tokens[i].0, tokens[i].1.clone());
775 if kind == SyntaxKind::NEWLINE {
776 // Unclosed brace: stop BEFORE the NEWLINE so the
777 // NEWLINE remains a sibling of COST_SPEC (the
778 // posting-line terminator), not a child.
779 break;
780 }
781 builder.token(kind.into(), &source[range]);
782 i += 1;
783 if matches!(kind, SyntaxKind::R_BRACE | SyntaxKind::R_DOUBLE_BRACE) {
784 break;
785 }
786 }
787
788 builder.finish_node();
789 i
790}
791
792/// Emit a `PRICE_ANNOTATION` node opened by `AT` or `AT_AT`,
793/// optionally followed by `WS` and a nested `AMOUNT`. The nested
794/// `AMOUNT` mirrors the units-amount wrapping above; the typed-AST
795/// decodes per-unit-vs-total by inspecting the opener token kind
796/// (`AT` vs `AT_AT`) and walks the `AMOUNT` child for the number
797/// and currency. Avoids absorbing a trailing-only `WHITESPACE`
798/// before a comment or `NEWLINE` (only swallows WS that precedes
799/// an actual amount start).
800fn emit_price_annotation(
801 builder: &mut GreenNodeBuilder<'_>,
802 source: &str,
803 tokens: &[(SyntaxKind, Range<usize>)],
804 mut i: usize,
805) -> usize {
806 builder.start_node(SyntaxKind::PRICE_ANNOTATION.into());
807
808 // Emit the `AT` / `AT_AT` opener.
809 if let Some((kind, range)) = tokens.get(i) {
810 builder.token((*kind).into(), &source[range.clone()]);
811 i += 1;
812 }
813
814 // Optional intervening WHITESPACE, but only if an amount
815 // follows; trailing-only WS belongs as a sibling of
816 // PRICE_ANNOTATION, not a child.
817 let ws_then_amount = matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE),)
818 && starts_amount(tokens, i + 1);
819 if ws_then_amount {
820 let ws_range = tokens[i].1.clone();
821 builder.token(SyntaxKind::WHITESPACE.into(), &source[ws_range]);
822 i += 1;
823 }
824 if starts_amount(tokens, i) {
825 i = emit_amount(builder, source, tokens, i);
826 }
827
828 builder.finish_node();
829 i
830}
831
832/// Close any currently-open POSTING node IF the next sub-line at
833/// `sub_line_indent` should NOT be attached to it. Shared between the
834/// `META_ENTRY` and indented-comment branches of
835/// `emit_transaction_body`, which differ ONLY in their same-indent
836/// tie-break (`attach_on_equal`).
837///
838/// `attach_on_equal` selects the attachment threshold:
839///
840/// - **Metadata (`true`)**: a `key: value` sub-line attaches when it
841/// is indented `>=` the open POSTING. This matches Beancount, whose
842/// grammar attributes metadata by POSITION (any `key_value` line
843/// following a posting, before the next posting, attaches to that
844/// posting) rather than by relative indent — so the common
845/// `key: value` at the SAME column as the posting (e.g. the
846/// `effective_date:` idiom) is posting metadata, not transaction
847/// metadata. Pinned by
848/// `same_indent_metadata_attaches_to_preceding_posting`.
849/// - **Indented comment (`false`)**: a `; doc` / `% doc` sub-line
850/// attaches only when STRICTLY more indented (`>`). A same-indent
851/// comment closes the POSTING and emits as transaction-level
852/// inter-posting trivia. Comments are AST-invisible, so this
853/// threshold only affects CST/formatter emission placement; it is
854/// pinned by
855/// `posting_with_indented_comment_between_postings_terminates_posting`
856/// and must stay strict to preserve that formatter contract.
857///
858/// A sub-line below the attachment threshold closes the POSTING.
859/// Called with `open_posting_indent = None` is a no-op (no POSTING to
860/// close).
861fn close_open_posting_unless_attached(
862 builder: &mut GreenNodeBuilder<'_>,
863 open_posting_indent: &mut Option<usize>,
864 sub_line_indent: usize,
865 attach_on_equal: bool,
866) {
867 let attach = open_posting_indent.is_some_and(|p_indent| {
868 if attach_on_equal {
869 sub_line_indent >= p_indent
870 } else {
871 sub_line_indent > p_indent
872 }
873 });
874 if !attach && open_posting_indent.is_some() {
875 builder.finish_node();
876 *open_posting_indent = None;
877 }
878}
879
880/// Returns true iff `tokens[i..]` starts a posting sub-line:
881/// `WHITESPACE` (the indent) followed by `ACCOUNT`, or by an
882/// optional flag (`FLAG` / `STAR` / `PENDING_KW` / `HASH` /
883/// single-char `CURRENCY`) plus another `WHITESPACE` then
884/// `ACCOUNT`. Mirrors the legacy AST parser's `parse_posting` shape
885/// (`parser.rs:866-880`): indent, optional flag, then a required
886/// account. The flag set MUST stay in sync with `parse_flag` in the
887/// legacy parser (`Token::Star | Pending | Flag(_) | Hash` plus
888/// single-char `Currency`) and with `identify_directive`'s
889/// transaction-trigger arm above; drift would silently leave
890/// HASH-flagged or single-char-CURRENCY-flagged posting lines flat
891/// under `TRANSACTION` instead of wrapped in `POSTING`. The single-
892/// char `CURRENCY`-as-flag arm exists because the lexer's priority-3
893/// Currency-vs-Flag tie-break makes letters like `T`/`V`/`F`/`X`
894/// tokenize as `CURRENCY`, but they still function as posting flags
895/// by Beancount convention.
896fn starts_posting_sub_line(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
897 if !matches!(tokens.get(i), Some((SyntaxKind::WHITESPACE, _))) {
898 return false;
899 }
900 if matches!(tokens.get(i + 1), Some((SyntaxKind::ACCOUNT, _))) {
901 return true;
902 }
903 let has_flag = match tokens.get(i + 1) {
904 Some((
905 SyntaxKind::FLAG | SyntaxKind::STAR | SyntaxKind::PENDING_KW | SyntaxKind::HASH,
906 _,
907 )) => true,
908 Some((SyntaxKind::CURRENCY, range)) => range.len() == 1,
909 _ => false,
910 };
911 if !has_flag {
912 return false;
913 }
914 matches!(tokens.get(i + 2), Some((SyntaxKind::WHITESPACE, _)))
915 && matches!(tokens.get(i + 3), Some((SyntaxKind::ACCOUNT, _)))
916}
917
918/// Byte length of the leading `WHITESPACE` token at `tokens[i]`,
919/// or 0 if there is no leading whitespace. Used by
920/// `emit_transaction_body` to decide whether a metadata or
921/// comment sub-line's indent is strictly deeper than the
922/// surrounding POSTING's indent (the posting-attached-metadata /
923/// posting-attached-comment rule).
924///
925/// **Known divergence from the legacy AST parser**: the legacy
926/// lexer's `Indent(N)` / `DeepIndent(N)` variants
927/// (`logos_lexer.rs:615-616`) count tabs as 4 spaces, but this
928/// helper returns raw bytes. Mixed tab+space indentation can
929/// therefore produce different attribution between the two paths.
930/// Acceptable for now because (a) Beancount idiom is uniform
931/// spaces, (b) no corpus file currently triggers the divergence in
932/// posting-attached-metadata position, and (c) the CST round-trip
933/// is byte-identical regardless of how `indent_width` classifies.
934/// If a file shows up, switch to a column-aware count.
935fn indent_width(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> usize {
936 match tokens.get(i) {
937 Some((SyntaxKind::WHITESPACE, range)) => range.len(),
938 _ => 0,
939 }
940}
941
942/// Returns true iff `kind` is one of the four comment-class trivia
943/// token kinds: `COMMENT` (`;`), `PERCENT_COMMENT` (`%`), `SHEBANG`
944/// (`#!`), or `EMACS_DIRECTIVE` (`#+`). Mirrors the comment subset
945/// of `SyntaxKind::is_trivia()` and is the single source of truth
946/// for the three call sites that need to decide whether a token
947/// "is a comment" for body-continuation / indent-attribution
948/// purposes (`starts_indented_comment`,
949/// `upcoming_indented_block_has_meta`,
950/// `is_indented_directive_continuation`). A new comment-class
951/// token would otherwise require three coordinated edits;
952/// `is_comment_token_covers_all_comment_class_trivia` in this
953/// module's tests asserts membership stays in sync with `is_trivia`.
954///
955/// **Known CST/AST divergence**: The legacy AST parser's
956/// `parse_posting_metadata` / `parse_transaction_directive` paths
957/// in `crates/rustledger-parser/src/parser.rs` only treat
958/// `Token::Comment` and `Token::PercentComment` as in-body trivia
959/// for transaction / directive bodies. `Token::Shebang` and
960/// `Token::EmacsDirective` are processed only at top level
961/// (`parse_directive` dispatch). So a deeper-indented `#+STARTUP:
962/// overview` between two postings is INSIDE the POSTING for the
963/// CST but TERMINATES the transaction for the AST. Phase-isolated
964/// in practice: the loader, LSP, validator, query, booking, and
965/// CLI all run through the AST path; the only current
966/// `parse_structured` consumers are this crate's corpus baseline
967/// test and `examples/dump_top_level_directives.rs`. Phase 5
968/// deletes `parse_flat` and the AST; that reconciliation should
969/// adopt the CST behavior (consistent with `is_trivia()`'s
970/// classification of all four comment-class tokens) rather than
971/// the AST behavior (an indented comment-class line silently
972/// terminating the directive is the surprising outcome).
973const fn is_comment_token(kind: SyntaxKind) -> bool {
974 matches!(
975 kind,
976 SyntaxKind::COMMENT
977 | SyntaxKind::PERCENT_COMMENT
978 | SyntaxKind::SHEBANG
979 | SyntaxKind::EMACS_DIRECTIVE,
980 )
981}
982
983/// Returns true iff `tokens[i..]` starts an indented comment line:
984/// `WHITESPACE` (the indent) followed by a comment-class token (per
985/// [`is_comment_token`]). Used by `emit_transaction_body` to apply
986/// the same indent-attribution rule to comments that it applies to
987/// metadata.
988fn starts_indented_comment(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
989 matches!(tokens.get(i), Some((SyntaxKind::WHITESPACE, _)))
990 && matches!(tokens.get(i + 1), Some((k, _)) if is_comment_token(*k))
991}
992
993/// Returns true iff `tokens[i..]` starts an indented line with
994/// actual content: `WHITESPACE` followed by ANY non-`NEWLINE`
995/// token. A blank line (`NEWLINE` alone, or `WHITESPACE NEWLINE`)
996/// or EOF terminates the transaction body.
997///
998/// **Deliberate divergence from rule 4 of `cst::trivia`:** unlike
999/// the single-line-directive body, a TRANSACTION body absorbs an
1000/// indented trailing `;`-comment AT EOF (file-trailing-ish) into
1001/// the directive. Rationale: documentation comments interleaved
1002/// with postings are a Beancount idiom, and forcing the body to
1003/// "back-track" the last comment if it's trailing would require
1004/// look-ahead the per-line predicate can't do without extra state.
1005/// Pinned by `transaction_trailing_indented_comment_at_eof_stays_inside`.
1006fn is_indented_transaction_body_line(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
1007 if !matches!(tokens.get(i), Some((SyntaxKind::WHITESPACE, _))) {
1008 return false;
1009 }
1010 !matches!(tokens.get(i + 1), Some((SyntaxKind::NEWLINE, _)) | None)
1011}
1012
1013/// Scan forward through any indented `WS META_KEY` sub-lines or
1014/// `WS <comment>` sub-lines (per [`is_comment_token`]) starting at
1015/// `tokens[i..]`, returning `true` iff at least one of them is a
1016/// metadata (`WS META_KEY`) sub-line. Stops at the first line that
1017/// is neither metadata nor an indented comment (blank line,
1018/// non-indented top-level content, EOF).
1019fn upcoming_indented_block_has_meta(tokens: &[(SyntaxKind, Range<usize>)], mut i: usize) -> bool {
1020 loop {
1021 let head = tokens.get(i).map(|(k, _)| *k);
1022 let next = tokens.get(i + 1).map(|(k, _)| *k);
1023 match (head, next) {
1024 (Some(SyntaxKind::WHITESPACE), Some(SyntaxKind::META_KEY)) => return true,
1025 (Some(SyntaxKind::WHITESPACE), Some(k)) if is_comment_token(k) => {
1026 // Skip past this indented-comment line.
1027 while i < tokens.len() && tokens[i].0 != SyntaxKind::NEWLINE {
1028 i += 1;
1029 }
1030 if i >= tokens.len() {
1031 return false;
1032 }
1033 i += 1; // past the NEWLINE
1034 }
1035 _ => return false,
1036 }
1037 }
1038}
1039
1040/// Returns true iff `tokens[i..]` starts an indented line that
1041/// CONTINUES the current multi-line directive: `WHITESPACE` (the
1042/// indent) followed by content that visually "belongs to" the
1043/// metadata block.
1044///
1045/// Recognizes:
1046/// - `WS META_KEY` — always a continuation regardless of context.
1047/// - `WS <comment>` (per [`is_comment_token`]) — a continuation iff
1048/// the surrounding indented block contains ANY `WS META_KEY` (the
1049/// `block_has_meta` argument). This prevents absorbing indented
1050/// comments that follow a header-only directive (rule 2 / rule
1051/// 4 cases) while still keeping documentation comments BEFORE
1052/// the first metadata entry inside the directive.
1053///
1054/// All other shapes (blank `\n`, non-indented content, EOF)
1055/// terminate the directive.
1056fn is_indented_directive_continuation(
1057 tokens: &[(SyntaxKind, Range<usize>)],
1058 i: usize,
1059 block_has_meta: bool,
1060) -> bool {
1061 // The META_KEY arm routes through `starts_meta_sub_line` so the
1062 // continuation predicate and the wrapping predicate
1063 // (`emit_body_sub_line`) cannot drift.
1064 if starts_meta_sub_line(tokens, i) {
1065 return true;
1066 }
1067 if !matches!(tokens.get(i), Some((SyntaxKind::WHITESPACE, _))) {
1068 return false;
1069 }
1070 match tokens.get(i + 1) {
1071 Some((k, _)) if is_comment_token(*k) => block_has_meta,
1072 _ => false,
1073 }
1074}
1075
1076/// Given the token slice and the index of a non-trivia token,
1077/// decide whether it starts a recognized top-level directive of
1078/// any kind. Returns the directive `SyntaxKind` if yes, `None`
1079/// otherwise (random content that doesn't fit a known shape — the
1080/// caller wraps such content in an `ERROR_NODE` per PR 2.4).
1081///
1082/// Beancount directive line shapes recognized here:
1083///
1084/// - `DATE WHITESPACE <KEYWORD> ...`: OPEN / CLOSE / BALANCE / PAD
1085/// / EVENT / QUERY / NOTE / DOCUMENT / PRICE / COMMODITY (PR
1086/// 2.1a) + CUSTOM (PR 2.3)
1087/// - `DATE WHITESPACE <txn-trigger> ...`: TRANSACTION (PR 2.1b),
1088/// where `<txn-trigger>` is one of `STAR` / `PENDING_KW` (`!`)
1089/// / `FLAG` / `HASH` / `TXN_KW` / `STRING` ("implied" txn form
1090/// with no explicit flag) / single-char `CURRENCY` (ticker
1091/// letters). Mirrors `parse_dated_directive` in the legacy AST
1092/// parser at parser.rs:1707-1715.
1093/// - `<KEYWORD> ...` (no leading date): PUSHTAG / POPTAG /
1094/// PUSHMETA / POPMETA (PR 2.1a) + OPTION / INCLUDE / PLUGIN
1095/// (PR 2.3)
1096fn identify_directive(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> Option<SyntaxKind> {
1097 let (head, _) = tokens.get(i)?;
1098 match *head {
1099 // Top-level keyword directives — no leading date.
1100 SyntaxKind::PUSHTAG_KW => Some(SyntaxKind::PUSHTAG_DIRECTIVE),
1101 SyntaxKind::POPTAG_KW => Some(SyntaxKind::POPTAG_DIRECTIVE),
1102 SyntaxKind::PUSHMETA_KW => Some(SyntaxKind::PUSHMETA_DIRECTIVE),
1103 SyntaxKind::POPMETA_KW => Some(SyntaxKind::POPMETA_DIRECTIVE),
1104
1105 // Phase 2.3: edge directives (option / include / plugin).
1106 // These are top-level keyword directives — like
1107 // pushtag/poptag/pushmeta/popmeta above — so the same
1108 // single-line directive body shape applies. Their full
1109 // header is consumed by `emit_through_terminator`; trailing
1110 // indented metadata lines (a rare but legal Beancount idiom
1111 // for option / include / plugin) are absorbed by
1112 // `emit_directive_body`'s look-ahead, same as the other
1113 // top-level-keyword directives.
1114 SyntaxKind::OPTION_KW => Some(SyntaxKind::OPTION_DIRECTIVE),
1115 SyntaxKind::INCLUDE_KW => Some(SyntaxKind::INCLUDE_DIRECTIVE),
1116 SyntaxKind::PLUGIN_KW => Some(SyntaxKind::PLUGIN_DIRECTIVE),
1117
1118 // Dated directives — peek past SAME-LINE whitespace for the
1119 // keyword. Only WHITESPACE separates content tokens within a
1120 // directive's header line; a NEWLINE means we crossed into
1121 // the next line and the DATE/keyword pair is NOT a single
1122 // directive. Skipping `is_trivia()` (which includes NEWLINE
1123 // and COMMENT) would wrongly identify malformed `DATE\nopen ...`
1124 // as OPEN_DIRECTIVE while `emit_through_terminator` only
1125 // captures the first line, leaving the keyword orphaned.
1126 SyntaxKind::DATE => {
1127 let mut j = i + 1;
1128 while j < tokens.len() && tokens[j].0 == SyntaxKind::WHITESPACE {
1129 j += 1;
1130 }
1131 let (next, _) = tokens.get(j)?;
1132 match *next {
1133 SyntaxKind::OPEN_KW => Some(SyntaxKind::OPEN_DIRECTIVE),
1134 SyntaxKind::CLOSE_KW => Some(SyntaxKind::CLOSE_DIRECTIVE),
1135 SyntaxKind::BALANCE_KW => Some(SyntaxKind::BALANCE_DIRECTIVE),
1136 SyntaxKind::PAD_KW => Some(SyntaxKind::PAD_DIRECTIVE),
1137 SyntaxKind::EVENT_KW => Some(SyntaxKind::EVENT_DIRECTIVE),
1138 SyntaxKind::QUERY_KW => Some(SyntaxKind::QUERY_DIRECTIVE),
1139 SyntaxKind::NOTE_KW => Some(SyntaxKind::NOTE_DIRECTIVE),
1140 SyntaxKind::DOCUMENT_KW => Some(SyntaxKind::DOCUMENT_DIRECTIVE),
1141 SyntaxKind::PRICE_KW => Some(SyntaxKind::PRICE_DIRECTIVE),
1142 SyntaxKind::COMMODITY_KW => Some(SyntaxKind::COMMODITY_DIRECTIVE),
1143 // Phase 2.3: CUSTOM is a dated directive with a
1144 // type-name STRING followed by an arbitrary value
1145 // list (STRING / ACCOUNT / amount / DATE / CURRENCY
1146 // / BOOL_TRUE / BOOL_FALSE). The header consumption
1147 // is identical to the other dated single-line
1148 // directives; only the value list is open-ended,
1149 // which is fine for the CST since the trailing
1150 // tokens stay flat.
1151 SyntaxKind::CUSTOM_KW => Some(SyntaxKind::CUSTOM_DIRECTIVE),
1152 // Transaction triggers after the DATE. Beancount
1153 // accepts:
1154 // - `*` (STAR) for completed transactions
1155 // - `!` (PENDING_KW) for incomplete/warning
1156 // - letter flags P/S/T/C/U/R/M/?/& (FLAG)
1157 // - `#` (HASH) promoted to a flag in this position
1158 // (cf. `Token::is_txn_flag` and the AST parser's
1159 // `parse_flag` accepting Hash)
1160 // - the explicit `txn` keyword (TXN_KW)
1161 // - a bare STRING ("implied transaction": the AST
1162 // parser at parser.rs:1713 dispatches
1163 // `Token::String(_)` to `parse_transaction_directive`
1164 // with an implied `*` flag; common shorthand
1165 // form in real ledgers like
1166 // `2024-01-15 "Coffee"`)
1167 SyntaxKind::STAR
1168 | SyntaxKind::PENDING_KW
1169 | SyntaxKind::FLAG
1170 | SyntaxKind::HASH
1171 | SyntaxKind::TXN_KW
1172 | SyntaxKind::STRING => Some(SyntaxKind::TRANSACTION),
1173 // Single-character CURRENCY: NYSE/NASDAQ-style
1174 // ticker letters (T, V, F, X, ...) double as
1175 // transaction flags. The lexer prioritizes
1176 // CURRENCY over FLAG for single uppercase letters
1177 // (logos_lexer Currency priority 3); the AST parser
1178 // (`parse_flag` arm `Token::Currency(s) if s.len() == 1`)
1179 // mirrors this. We do the same to stay consistent
1180 // with the established lexer/parser contract.
1181 SyntaxKind::CURRENCY if tokens[j].1.len() == 1 => Some(SyntaxKind::TRANSACTION),
1182 // Anything else: unknown shape.
1183 _ => None,
1184 }
1185 }
1186 _ => None,
1187 }
1188}
1189
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that both builders reproduce `source` byte-for-byte.
    fn assert_round_trips(source: &str) {
        let flat = parse_flat(source);
        assert_eq!(flat.text().to_string(), source);
        let structured = parse_structured(source);
        assert_eq!(structured.text().to_string(), source);
    }

    /// Drift guard keeping `is_comment_token` and `is_trivia` in
    /// agreement about comment-class trivia. Two invariants are
    /// checked over every `u16` that converts to a `SyntaxKind`:
    ///
    /// 1. `is_trivia() ⊆ is_comment_token ∪ non_comment_trivia` —
    ///    each trivia kind is either a comment or listed in the
    ///    explicit whitespace-class allow-list. This catches a new
    ///    lexer-level addition to `is_trivia()` that was silently
    ///    left out of `is_comment_token`.
    /// 2. `is_comment_token ⊆ is_trivia()` — each kind accepted by
    ///    `is_comment_token` really is trivia. This catches an edit
    ///    to `is_comment_token` that accidentally admits a content
    ///    token and thereby extends indent-attribution to real
    ///    content inside POSTING / directive bodies.
    ///
    /// If (1) fails for a kind that is neither comment-class nor
    /// whitespace-class (say a future `SECTION_HEADER` that should
    /// NOT be absorbed as a continuation), do not reflexively add
    /// it to either set. Reconsider whether the body-continuation
    /// predicates need a different abstraction
    /// (`is_body_continuation_trivia` or similar) and carry the
    /// decision to the three call sites.
    #[test]
    fn is_comment_token_covers_all_comment_class_trivia() {
        let non_comment_trivia = [SyntaxKind::BOM, SyntaxKind::WHITESPACE, SyntaxKind::NEWLINE];

        // Every discriminant that maps to a real `SyntaxKind`.
        let all_kinds = || (0u16..=u16::MAX).filter_map(|raw| SyntaxKind::try_from(raw).ok());

        // Invariant 1: trivia (minus whitespace allow-list) ⊆ comment.
        let trivia_missed_from_comment: Vec<SyntaxKind> = all_kinds()
            .filter(|kind| {
                kind.is_trivia() && !non_comment_trivia.contains(kind) && !is_comment_token(*kind)
            })
            .collect();
        // Invariant 2: comment ⊆ trivia.
        let comment_not_trivia: Vec<SyntaxKind> = all_kinds()
            .filter(|kind| is_comment_token(*kind) && !kind.is_trivia())
            .collect();

        assert!(
            trivia_missed_from_comment.is_empty(),
            "trivia kinds present in is_trivia() but missing from \
             is_comment_token: {trivia_missed_from_comment:?}. Three \
             options: (a) add them to is_comment_token if they are \
             comment-class; (b) extend the non_comment_trivia allow- \
             list in this test if they are whitespace-class; (c) if \
             they are neither, revisit whether the body-continuation \
             predicates need a different abstraction and propagate \
             the decision to the three call sites.",
        );
        assert!(
            comment_not_trivia.is_empty(),
            "is_comment_token claims these kinds are comments but \
             is_trivia() disagrees: {comment_not_trivia:?}. Either \
             add them to is_trivia() (if they really are trivia) or \
             remove them from is_comment_token (if they are content \
             tokens that should not be absorbed as comment \
             continuations).",
        );
    }

    /// The empty input round-trips.
    #[test]
    fn empty_source() {
        assert_round_trips("");
    }

    /// Input made only of spaces and a tab round-trips.
    #[test]
    fn whitespace_only() {
        assert_round_trips(" \t ");
    }

    /// A leading byte-order mark is preserved.
    #[test]
    fn bom_round_trips() {
        assert_round_trips("\u{FEFF}2024-01-01 open Assets:Bank\n");
    }

    /// An `open` directive followed by a two-posting transaction
    /// round-trips.
    #[test]
    fn full_directive_round_trips() {
        assert_round_trips(
            "2024-01-01 open Assets:Bank USD\n\
             2024-01-15 * \"Coffee\"\n \
             Assets:Bank -5.00 USD\n \
             Expenses:Food\n",
        );
    }

    /// A `;` comment line ahead of a directive round-trips.
    #[test]
    fn line_comment_round_trips() {
        assert_round_trips("; preamble\n2024-01-01 open Assets:Bank\n");
    }

    /// A final line without a terminating newline round-trips.
    #[test]
    fn no_trailing_newline_round_trips() {
        assert_round_trips("2024-01-01 open Assets:Bank");
    }

    /// Both builders produce a `SOURCE_FILE` root, even for empty
    /// input.
    #[test]
    fn root_kind_is_source_file() {
        let flat = parse_flat("");
        assert_eq!(flat.kind(), SyntaxKind::SOURCE_FILE);
        let structured = parse_structured("");
        assert_eq!(structured.kind(), SyntaxKind::SOURCE_FILE);
    }
}