rustledger-parser 0.16.0

Beancount parser with error recovery and full syntax support
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
//! CST builders: phase 1 flat ([`parse_flat`]) + phase 2.1-2.4
//! structured ([`parse_structured`]).
//!
//! Both walk the lossless token stream and emit a `GreenNode` whose
//! `text()` is byte-identical to the input source. They differ in
//! what they wrap:
//!
//! - [`parse_flat`] (phase 1) puts every token as a direct child of
//!   a single `SOURCE_FILE` node. Useful for round-trip-only tests
//!   and the kind-sequence corpus baseline.
//! - [`parse_structured`] recognizes:
//!   - **Phase 2.1a**: 14 single-line directive shapes —
//!     `OPEN`/`CLOSE`/`BALANCE`/`PAD`/`EVENT`/`QUERY`/`NOTE`/
//!     `DOCUMENT`/`PRICE`/`COMMODITY` (dated) +
//!     `PUSHTAG`/`POPTAG`/`PUSHMETA`/`POPMETA` (top-level keyword).
//!   - **Phase 2.1b**: `TRANSACTION` — DATE + `STAR` / `PENDING_KW`
//!     (`!`) / `FLAG` / `TXN_KW`, multi-line scope through the last
//!     indented sub-line (postings, metadata, indented comments).
//!
//!   Each wraps in its specific node kind per the Directive-
//!   Terminator Rule (see [`crate::cst::trivia`]).
//!
//!   - **Phase 2.3**: edge directives —
//!     `OPTION_DIRECTIVE` / `INCLUDE_DIRECTIVE` /
//!     `PLUGIN_DIRECTIVE` (top-level keyword) +
//!     `CUSTOM_DIRECTIVE` (dated with arbitrary trailing value
//!     list). Body / metadata shape is identical to PR 2.1a's
//!     dated and standalone-keyword directives — only the header
//!     keyword recognition is new.
//!
//!   - **Phase 2.4**: error recovery — unrecognized / malformed
//!     top-level lines are wrapped in `ERROR_NODE` (terminated by
//!     NEWLINE or EOF per rule 5). Same trivia attachment policy
//!     as recognized directives (rule 2): pending leading trivia
//!     attaches inside the `ERROR_NODE` when it's not the very
//!     first content in the file. AMOUNT now also wraps full
//!     arithmetic expressions (`[sign] (NUMBER | PAREN_EXPR)
//!     ([WS] op [WS] (NUMBER | PAREN_EXPR))* [WS CURRENCY]`),
//!     closing the deferred 2.2c.1 divergence with Python
//!     beancount on `10+5 USD`-shape amounts.
//!
//! Phase 2.2a adds `META_ENTRY` sub-node structure around indented
//! `WS META_KEY ... (NEWLINE | EOF)` sub-lines inside any directive
//! or transaction (per rule 5 of `cst::trivia`, an unterminated
//! final sub-line at EOF still gets wrapped). Phase 2.2b adds
//! `POSTING` sub-node structure around each `WS [(FLAG | STAR |
//! PENDING_KW | HASH | single-char CURRENCY) WS] ACCOUNT ...`
//! posting line inside `TRANSACTION` (the flag arm mirrors
//! `parse_flag` in the legacy AST parser and `identify_directive`'s
//! transaction-trigger arm; single-char `CURRENCY` covers letters
//! like `T`/`V`/`F`/`X` that win the lexer's priority-3 Currency-
//! vs-Flag tie-break). Posting-attached metadata (`META_ENTRY` sub-
//! lines following the posting, indented `>=` the posting) becomes a
//! child of that `POSTING`. Phase 2.2c adds `AMOUNT` / `COST_SPEC` /
//! `PRICE_ANNOTATION` inside `POSTING`. Phase 5 deletes
//! `parse_flat` once `parse_structured` covers every byte in
//! every corpus file.

use std::ops::Range;

use rowan::GreenNodeBuilder;

use crate::cst::lossless_tokens::lossless_kind_tokens;
use crate::cst::syntax_kind::{SyntaxKind, SyntaxNode};

/// Build the flat (phase 1) lossless CST for `source`.
///
/// Every token yielded by `lossless_kind_tokens` is appended, in
/// order, as a direct child of a single `SOURCE_FILE` node; no
/// directive nodes are created. Each token's text is the slice of
/// `source` covered by its range, so the tree's text reproduces
/// `source` byte-for-byte as long as those ranges tile the input
/// (which this function relies on and does not re-check).
#[must_use]
pub fn parse_flat(source: &str) -> SyntaxNode {
    let mut green = GreenNodeBuilder::new();
    green.start_node(SyntaxKind::SOURCE_FILE.into());
    lossless_kind_tokens(source)
        .into_iter()
        .for_each(|(kind, span)| green.token(kind.into(), &source[span]));
    green.finish_node();
    SyntaxNode::new_root(green.finish())
}

/// Build the structured lossless CST for `source`.
///
/// The token stream is walked once at the top level. Trivia tokens
/// are buffered; each non-trivia token starts a node whose kind is
/// whatever `identify_directive` reports, falling back to
/// `ERROR_NODE` when it reports nothing (unrecognized / malformed
/// line). The node body is then emitted by the matching helper:
///
/// - `TRANSACTION` → `emit_transaction_body`
/// - `ERROR_NODE` → `emit_through_terminator` (one line, through
///   its `NEWLINE` or EOF)
/// - every other directive kind → `emit_directive_body`
///
/// Trivia placement: the trivia buffered before the FIRST node of
/// the file is emitted under `SOURCE_FILE`, ahead of the node. For
/// every later node the buffered trivia is emitted INSIDE the node,
/// before its body. Trivia left over after the last node is emitted
/// under `SOURCE_FILE`.
///
/// Every token is emitted exactly once with the text sliced from
/// `source`, so the tree's text round-trips the input.
#[must_use]
pub fn parse_structured(source: &str) -> SyntaxNode {
    let tokens: Vec<(SyntaxKind, Range<usize>)> = lossless_kind_tokens(source);
    let mut builder = GreenNodeBuilder::new();
    builder.start_node(SyntaxKind::SOURCE_FILE.into());

    // Trivia seen since the last emitted node, waiting for a home.
    let mut held_trivia: Vec<(SyntaxKind, Range<usize>)> = Vec::new();
    let mut is_first_node = true;
    let mut cursor = 0;

    while let Some(&(kind, ref span)) = tokens.get(cursor) {
        if kind.is_trivia() {
            held_trivia.push((kind, span.clone()));
            cursor += 1;
            continue;
        }

        // A non-trivia token opens a top-level node. Anything that
        // is not a recognized directive header becomes ERROR_NODE.
        let node_kind = identify_directive(&tokens, cursor).unwrap_or(SyntaxKind::ERROR_NODE);
        let leading = std::mem::take(&mut held_trivia);
        if is_first_node {
            // File-leading trivia stays a child of SOURCE_FILE.
            emit_tokens(&mut builder, source, leading);
            builder.start_node(node_kind.into());
            is_first_node = false;
        } else {
            // Later runs of trivia travel with the node they precede.
            builder.start_node(node_kind.into());
            emit_tokens(&mut builder, source, leading);
        }

        cursor = match node_kind {
            SyntaxKind::TRANSACTION => emit_transaction_body(&mut builder, source, &tokens, cursor),
            SyntaxKind::ERROR_NODE => emit_through_terminator(&mut builder, source, &tokens, cursor),
            // Any other recognized directive: header line plus its
            // optional indented sub-lines.
            _ => emit_directive_body(&mut builder, source, &tokens, cursor),
        };
        builder.finish_node();
    }

    // Whatever trivia trails the last node belongs to SOURCE_FILE.
    emit_tokens(&mut builder, source, held_trivia);

    builder.finish_node();
    SyntaxNode::new_root(builder.finish())
}

/// Append each `(kind, range)` pair to `builder` as a token, in
/// iteration order, using `source[range]` as the token text.
fn emit_tokens(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: impl IntoIterator<Item = (SyntaxKind, Range<usize>)>,
) {
    tokens
        .into_iter()
        .for_each(|(kind, span)| builder.token(kind.into(), &source[span]));
}

/// Emit one line's worth of tokens starting at index `i`.
///
/// The line runs up to and including the first `NEWLINE` token at
/// or after `i`; when there is none it runs to the end of `tokens`.
/// All tokens in that span are appended flat to `builder`. Returns
/// the index just past the emitted span (`i` itself when `i` is
/// already at or beyond the end of `tokens`).
fn emit_through_terminator(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: &[(SyntaxKind, Range<usize>)],
    i: usize,
) -> usize {
    let Some(rest) = tokens.get(i..) else {
        return i;
    };
    // Length of the line including its NEWLINE, or everything that
    // remains when the final line is unterminated.
    let line_len = rest
        .iter()
        .position(|(kind, _)| *kind == SyntaxKind::NEWLINE)
        .map_or(rest.len(), |newline_at| newline_at + 1);
    for (kind, span) in &rest[..line_len] {
        builder.token((*kind).into(), &source[span.clone()]);
    }
    i + line_len
}

/// Emit one indented body sub-line, wrapping it in `META_ENTRY`
/// when (and only when) it is a metadata line.
///
/// "Metadata line" is decided by `starts_meta_sub_line`, i.e. the
/// line begins `WHITESPACE META_KEY`. In that case the whole line —
/// indent, key, remaining tokens and the `NEWLINE` if one exists —
/// becomes the children of a new `META_ENTRY` node. Because the
/// line is consumed with `emit_through_terminator`, a final
/// metadata line that hits EOF without a `NEWLINE` is still wrapped;
/// the node just has no `NEWLINE` child. Tokens inside the node are
/// left flat.
///
/// Any other sub-line (for example an indented comment) is emitted
/// flat into whatever node is currently open.
///
/// Returns the index of the first token after the emitted line.
fn emit_body_sub_line(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: &[(SyntaxKind, Range<usize>)],
    i: usize,
) -> usize {
    let wrap_as_meta = starts_meta_sub_line(tokens, i);
    if wrap_as_meta {
        builder.start_node(SyntaxKind::META_ENTRY.into());
    }
    let after_line = emit_through_terminator(builder, source, tokens, i);
    if wrap_as_meta {
        builder.finish_node();
    }
    after_line
}

/// Reports whether the tokens at `i` and `i + 1` are `WHITESPACE`
/// followed by `META_KEY` — the opening of an indented metadata
/// sub-line. Returns `false` when either position is out of range.
///
/// This is the one place the `WS META_KEY` shape is spelled out.
/// Callers that need to ask "is this a metadata line?" (to decide
/// on `META_ENTRY` wrapping, or to decide whether a body keeps
/// going) should call this helper rather than re-match the tokens,
/// so the wrapping decision and the continuation decision cannot
/// drift apart.
fn starts_meta_sub_line(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
    matches!(
        tokens.get(i..),
        Some([(SyntaxKind::WHITESPACE, _), (SyntaxKind::META_KEY, _), ..])
    )
}

/// Emit a non-transaction directive: its header line, then every
/// indented continuation sub-line that belongs to it.
///
/// Per the Directive-Terminator Rule (see `cst::trivia`), a
/// directive that carries metadata spans several lines, and its
/// node must cover all of them — stopping at the header `NEWLINE`
/// would leave the metadata orphaned under `SOURCE_FILE`.
///
/// Steps:
/// 1. The header is emitted flat through its `NEWLINE` (or EOF).
/// 2. `upcoming_indented_block_has_meta` is asked ONCE, up front,
///    whether the indented block that follows contains any
///    `WS META_KEY` line.
/// 3. While `is_indented_directive_continuation` accepts the next
///    line (given that answer), the line is emitted through
///    `emit_body_sub_line`, which wraps metadata lines in
///    `META_ENTRY` and passes other lines through flat.
///
/// Why the look-ahead in step 2: an indented comment is only part
/// of the directive when the block also has metadata (e.g. a
/// doc-comment placed BEFORE the first key). With no metadata in
/// the block, an indented comment is inter-directive or
/// file-trailing trivia and must be left alone. Deciding line by
/// line cannot work, because a leading comment is seen before the
/// `META_KEY` that would justify absorbing it.
///
/// The body ends at the first line the continuation predicate
/// rejects — per the existing notes, a blank line, a non-indented
/// token, or EOF. Metadata that follows a blank line is therefore
/// not absorbed (known limitation).
///
/// Returns the index of the first token after the directive.
fn emit_directive_body(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: &[(SyntaxKind, Range<usize>)],
    i: usize,
) -> usize {
    let mut cursor = emit_through_terminator(builder, source, tokens, i);
    // Computed once for the whole block, not per line — see the
    // doc comment for why per-line bookkeeping is insufficient.
    let block_has_meta = upcoming_indented_block_has_meta(tokens, cursor);
    loop {
        if !is_indented_directive_continuation(tokens, cursor, block_has_meta) {
            return cursor;
        }
        cursor = emit_body_sub_line(builder, source, tokens, cursor);
    }
}

/// Emit a transaction: its header line, then every indented body
/// sub-line, grouping posting lines (and what attaches to them)
/// into `POSTING` nodes.
///
/// The header is emitted flat through its `NEWLINE` (or EOF). The
/// body loop then runs for as long as
/// `is_indented_transaction_body_line` accepts the next line — per
/// the existing notes that means it stops at a blank line, a
/// non-indented token, or EOF.
///
/// # State
///
/// A single `Option<usize>` records whether a `POSTING` node is
/// currently open on the builder and, if so, the indent of that
/// posting's own line. Each sub-line is classified in this order:
///
/// 1. **Posting line** (`starts_posting_sub_line`): any open
///    `POSTING` is closed, a new one is opened with this line's
///    indent recorded, and the line is handed to
///    `emit_posting_line` (which adds the `AMOUNT` / `COST_SPEC` /
///    `PRICE_ANNOTATION` structure). Sibling postings need not share
///    an indent; later attachment decisions always compare against
///    the MOST RECENTLY opened posting. Pinned by
///    `postings_at_increasing_indents_produce_siblings_and_meta_attributes_to_latest`.
/// 2. **Metadata line** (`starts_meta_sub_line`):
///    `close_open_posting_unless_attached` is called with the
///    attach-on-equal flag set to `true`, then the line is emitted
///    via `emit_body_sub_line` (so it is wrapped in `META_ENTRY`).
///    Intent, per the existing notes: metadata at an indent `>=` the
///    open posting's stays inside that `POSTING` (Beancount
///    attributes metadata by position); shallower metadata closes
///    the posting and lands at `TRANSACTION` level.
/// 3. **Indented comment** (`starts_indented_comment`): same helper
///    call but with the flag `false`, then the line is emitted
///    flat. Intent: only a comment STRICTLY deeper than the open
///    posting stays inside it; a same-or-shallower comment closes
///    the posting and becomes transaction-level trivia (see test
///    `posting_with_indented_comment_between_postings_terminates_posting`).
/// 4. **Anything else**: any open `POSTING` is closed and the line
///    is emitted flat at `TRANSACTION` level, purely to preserve
///    bytes.
///
/// Any `POSTING` still open when the loop ends is closed before
/// returning.
///
/// # Indent measurement
///
/// Indents come from `indent_width`. Per the existing notes this is
/// the BYTE length of the leading `WHITESPACE` token, whereas the
/// legacy lexer counts a tab as 4 columns — so the two parsers can
/// disagree on files that mix tabs and spaces. If that ever matters
/// in practice, making `indent_width` column-aware is the fix.
///
/// Returns the index of the first token after the transaction.
fn emit_transaction_body(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: &[(SyntaxKind, Range<usize>)],
    i: usize,
) -> usize {
    let mut cursor = emit_through_terminator(builder, source, tokens, i);

    // `Some(indent)` exactly while a POSTING node is open.
    let mut open_posting_indent: Option<usize> = None;

    while is_indented_transaction_body_line(tokens, cursor) {
        let line_indent = indent_width(tokens, cursor);

        if starts_posting_sub_line(tokens, cursor) {
            // A new posting always ends the previous one.
            if open_posting_indent.replace(line_indent).is_some() {
                builder.finish_node();
            }
            builder.start_node(SyntaxKind::POSTING.into());
            cursor = emit_posting_line(builder, source, tokens, cursor);
            continue;
        }

        let is_meta = starts_meta_sub_line(tokens, cursor);
        if is_meta || starts_indented_comment(tokens, cursor) {
            // Metadata attaches to the open posting even at EQUAL
            // indent (flag = true); comments only when STRICTLY
            // deeper (flag = false). Comments are AST-invisible, so
            // for them this only affects where a formatter would
            // place the line.
            close_open_posting_unless_attached(
                builder,
                &mut open_posting_indent,
                line_indent,
                is_meta,
            );
            cursor = if is_meta {
                emit_body_sub_line(builder, source, tokens, cursor)
            } else {
                emit_through_terminator(builder, source, tokens, cursor)
            };
            continue;
        }

        // Unclassified indented content (`WS STRING`, `WS NUMBER`,
        // ...): never part of a posting. If a multi-line posting
        // form is ever added, this is the branch that would have to
        // tell it apart from genuinely unknown content.
        if open_posting_indent.take().is_some() {
            builder.finish_node();
        }
        cursor = emit_through_terminator(builder, source, tokens, cursor);
    }

    if open_posting_indent.is_some() {
        builder.finish_node();
    }

    cursor
}

/// Emit one posting line into the `POSTING` node the caller has
/// already opened, adding `AMOUNT`, `COST_SPEC` and
/// `PRICE_ANNOTATION` sub-nodes where their opener tokens appear.
///
/// # Preconditions
///
/// `i` is the posting line's first token (its indent `WHITESPACE`)
/// and `starts_posting_sub_line(tokens, i)` holds. This function
/// neither opens nor closes the `POSTING` node.
///
/// # Prefix
///
/// Emitted flat, each piece only if present at the cursor:
/// the indent `WHITESPACE`; an optional flag (`FLAG`, `STAR`,
/// `PENDING_KW`, `HASH`, or a `CURRENCY` token exactly one byte
/// long) with the `WHITESPACE` after it; then the `ACCOUNT`.
///
/// # Remainder of the line
///
/// Tokens are scanned until a `NEWLINE` (emitted flat, ends the
/// line) or EOF. At each position, in this priority order:
///
/// - if `starts_amount` accepts the position (`NUMBER`, `L_PAREN`,
///   a sign followed by one of those, or a `CURRENCY`),
///   `emit_amount` takes over;
/// - `L_BRACE` / `L_BRACE_HASH` / `L_DOUBLE_BRACE` hands over to
///   `emit_cost_spec`;
/// - `AT` / `AT_AT` hands over to `emit_price_annotation`;
/// - any other token (`WHITESPACE`, comments, ...) is emitted as a
///   flat child of `POSTING`.
///
/// Recognition is driven purely by the opener at the cursor, not
/// by position in the canonical `ACCOUNT [AMOUNT] [COST_SPEC]
/// [PRICE_ANNOTATION]` order, so reordered or repeated
/// sub-structures on a malformed line each get their own wrapper
/// and no token is dropped.
///
/// Returns the index of the first token after the line.
fn emit_posting_line(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: &[(SyntaxKind, Range<usize>)],
    mut i: usize,
) -> usize {
    // Leading indent.
    if let Some((SyntaxKind::WHITESPACE, span)) = tokens.get(i) {
        builder.token(SyntaxKind::WHITESPACE.into(), &source[span.clone()]);
        i += 1;
    }

    // Optional posting flag. A one-byte CURRENCY counts as a flag:
    // it is a flag letter the lexer classified as a currency.
    // Mirrors the flag arm of `starts_posting_sub_line`.
    let has_flag = match tokens.get(i) {
        Some((
            SyntaxKind::FLAG | SyntaxKind::STAR | SyntaxKind::PENDING_KW | SyntaxKind::HASH,
            _,
        )) => true,
        Some((SyntaxKind::CURRENCY, span)) => span.len() == 1,
        _ => false,
    };
    if has_flag {
        // `has_flag` is only true when `tokens[i]` exists.
        let (flag_kind, flag_span) = &tokens[i];
        builder.token((*flag_kind).into(), &source[flag_span.clone()]);
        i += 1;
        // Separator between flag and account.
        if let Some((SyntaxKind::WHITESPACE, span)) = tokens.get(i) {
            builder.token(SyntaxKind::WHITESPACE.into(), &source[span.clone()]);
            i += 1;
        }
    }

    // The account name.
    if let Some((SyntaxKind::ACCOUNT, span)) = tokens.get(i) {
        builder.token(SyntaxKind::ACCOUNT.into(), &source[span.clone()]);
        i += 1;
    }

    // Everything after the account, up to the line terminator.
    while let Some((kind, span)) = tokens.get(i) {
        match *kind {
            SyntaxKind::NEWLINE => {
                builder.token(SyntaxKind::NEWLINE.into(), &source[span.clone()]);
                i += 1;
                break;
            }
            // Checked before the brace / `@` openers, in the same
            // priority order as the doc comment lists them.
            _ if starts_amount(tokens, i) => {
                i = emit_amount(builder, source, tokens, i);
            }
            SyntaxKind::L_BRACE | SyntaxKind::L_BRACE_HASH | SyntaxKind::L_DOUBLE_BRACE => {
                i = emit_cost_spec(builder, source, tokens, i);
            }
            SyntaxKind::AT | SyntaxKind::AT_AT => {
                i = emit_price_annotation(builder, source, tokens, i);
            }
            // Flat passthrough for whitespace, comments and
            // anything unrecognized.
            other => {
                builder.token(other.into(), &source[span.clone()]);
                i += 1;
            }
        }
    }

    i
}

/// Reports whether an `AMOUNT` can begin at index `i`.
///
/// Accepted openers:
/// - `NUMBER`, `L_PAREN`, or `CURRENCY` at `i`; or
/// - a sign (`MINUS` / `PLUS`) at `i` immediately followed by
///   `NUMBER` or `L_PAREN` at `i + 1`.
///
/// A lone sign, any other token, or an out-of-range `i` yields
/// `false`. `emit_posting_line` uses this to decide whether to hand
/// the cursor to `emit_amount`.
fn starts_amount(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
    matches!(
        tokens.get(i..),
        Some(
            [(SyntaxKind::NUMBER | SyntaxKind::CURRENCY | SyntaxKind::L_PAREN, _), ..]
                | [
                    (SyntaxKind::MINUS | SyntaxKind::PLUS, _),
                    (SyntaxKind::NUMBER | SyntaxKind::L_PAREN, _),
                    ..
                ]
        )
    )
}

/// Reports whether `kind` is one of the four arithmetic operator
/// token kinds: `STAR`, `SLASH`, `PLUS`, or `MINUS`.
const fn is_arith_op(kind: SyntaxKind) -> bool {
    matches!(
        kind,
        SyntaxKind::STAR | SyntaxKind::SLASH | SyntaxKind::PLUS | SyntaxKind::MINUS,
    )
}

/// Emit an `AMOUNT` node containing the units amount.
///
/// Recognizes Python beancount's `parse_expr` grammar shape:
/// `[sign] operand ([WS] op [WS] [sign] operand)* [WS CURRENCY]`,
/// where `operand` is `NUMBER` or a parenthesized sub-expression
/// `L_PAREN expr R_PAREN`. Also accepts a bare `CURRENCY`
/// (currency-only amount). Closes the PR 2.2c.1 deferred
/// divergence: `bean-check` accepts `10+5 USD`, `-10+5 USD`, and
/// `-(10+5) USD`; this helper now wraps them as a single `AMOUNT`
/// node containing the full expression tokens flat (sign + operands
/// + operators + currency).
///
/// Stops at the first token that doesn't fit the grammar (e.g.,
/// `L_BRACE` cost-spec opener, `AT` price opener, `NEWLINE`,
/// `COMMENT`, etc.). Returns the new index.
fn emit_amount(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: &[(SyntaxKind, Range<usize>)],
    mut i: usize,
) -> usize {
    builder.start_node(SyntaxKind::AMOUNT.into());

    // Currency-only amount: bare `CURRENCY` and nothing more.
    if matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::CURRENCY))
        && !starts_amount_operand(tokens, i + 1)
    {
        let range = tokens[i].1.clone();
        builder.token(SyntaxKind::CURRENCY.into(), &source[range]);
        i += 1;
        builder.finish_node();
        return i;
    }

    // Optional leading sign.
    if matches!(
        tokens.get(i).map(|(k, _)| *k),
        Some(SyntaxKind::MINUS | SyntaxKind::PLUS),
    ) {
        let (kind, range) = (tokens[i].0, tokens[i].1.clone());
        builder.token(kind.into(), &source[range]);
        i += 1;
    }

    // First operand.
    i = emit_amount_operand(builder, source, tokens, i);

    // Tail: zero or more `[WS] op [WS] [sign] operand` runs. Each
    // iteration commits the WS / op / WS / sign tokens BEFORE
    // dispatching the operand emission. Lookahead-only: do NOT
    // consume any token until the full op-operand prefix is
    // confirmed, so a trailing single WHITESPACE before CURRENCY
    // (the canonical `100 USD` shape) isn't accidentally consumed
    // as a leading op-prefix.
    loop {
        let mut j = i;
        if matches!(tokens.get(j).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE)) {
            j += 1;
        }
        let Some((op_kind, _)) = tokens.get(j) else {
            break;
        };
        if !is_arith_op(*op_kind) {
            break;
        }
        let op_kind = *op_kind;
        j += 1;
        if matches!(tokens.get(j).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE)) {
            j += 1;
        }
        // Optional sign before next operand.
        let signed = matches!(
            tokens.get(j).map(|(k, _)| *k),
            Some(SyntaxKind::MINUS | SyntaxKind::PLUS),
        );
        let operand_start = if signed { j + 1 } else { j };
        if !starts_amount_operand(tokens, operand_start) {
            break;
        }
        // Commit tokens [i..j) (WS? op WS?) into AMOUNT.
        while i < j {
            let (kind, range) = (tokens[i].0, tokens[i].1.clone());
            // Sanity: the only non-op tokens we should be committing
            // here are WHITESPACE. The op token itself was already
            // verified.
            debug_assert!(
                kind == SyntaxKind::WHITESPACE || kind == op_kind || is_arith_op(kind),
                "unexpected token kind {kind:?} during op-prefix commit",
            );
            builder.token(kind.into(), &source[range]);
            i += 1;
        }
        if signed {
            let (kind, range) = (tokens[i].0, tokens[i].1.clone());
            builder.token(kind.into(), &source[range]);
            i += 1;
        }
        i = emit_amount_operand(builder, source, tokens, i);
    }

    // Optional trailing CURRENCY, either directly adjacent (`100USD`,
    // `(10+5)USD`) or separated by WHITESPACE (`100 USD`).
    if matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE))
        && matches!(
            tokens.get(i + 1).map(|(k, _)| *k),
            Some(SyntaxKind::CURRENCY),
        )
    {
        let ws_range = tokens[i].1.clone();
        builder.token(SyntaxKind::WHITESPACE.into(), &source[ws_range]);
        i += 1;
        let cur_range = tokens[i].1.clone();
        builder.token(SyntaxKind::CURRENCY.into(), &source[cur_range]);
        i += 1;
    } else if matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::CURRENCY)) {
        let cur_range = tokens[i].1.clone();
        builder.token(SyntaxKind::CURRENCY.into(), &source[cur_range]);
        i += 1;
    }

    builder.finish_node();
    i
}

/// Reports whether `tokens[i]` can open an arithmetic-expression
/// operand, i.e. it is a bare `NUMBER` or the `L_PAREN` that opens a
/// parenthesized sub-expression. An out-of-range `i` yields `false`.
/// `emit_amount` consults this before committing an operator run.
fn starts_amount_operand(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
    tokens
        .get(i)
        .is_some_and(|(kind, _)| matches!(kind, SyntaxKind::NUMBER | SyntaxKind::L_PAREN))
}

/// Emit a single arithmetic-expression operand starting at
/// `tokens[i]` and return the index just past it.
///
/// An operand is either one `NUMBER` token or a parenthesized
/// `L_PAREN expr R_PAREN` group. The group's tokens are emitted flat
/// into whatever node is currently open on `builder` (the surrounding
/// `AMOUNT`); no `EXPR` / `PAREN_GROUP` wrapper is created for now.
/// Per rule 5, a paren left unclosed at `NEWLINE` or EOF simply stops
/// there without a closing paren, so the round trip stays
/// byte-exact. Any other token kind (or an out-of-range `i`) emits
/// nothing and returns `i` unchanged.
fn emit_amount_operand(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: &[(SyntaxKind, Range<usize>)],
    mut i: usize,
) -> usize {
    let Some((first, first_range)) = tokens.get(i) else {
        return i;
    };
    match first {
        SyntaxKind::NUMBER => {
            builder.token(SyntaxKind::NUMBER.into(), &source[first_range.clone()]);
            i += 1;
        }
        SyntaxKind::L_PAREN => {
            // Opening paren.
            builder.token(SyntaxKind::L_PAREN.into(), &source[first_range.clone()]);
            i += 1;
            // Emit everything up to and including the matching
            // R_PAREN. `open_parens` tracks nesting so `((1+2))`
            // closes at the right place; a NEWLINE or the end of the
            // token slice ends the scan early (unterminated case).
            let mut open_parens = 1usize;
            let same_line = tokens[i..]
                .iter()
                .take_while(|(kind, _)| *kind != SyntaxKind::NEWLINE);
            for (kind, range) in same_line {
                builder.token((*kind).into(), &source[range.clone()]);
                i += 1;
                match kind {
                    SyntaxKind::L_PAREN => open_parens += 1,
                    SyntaxKind::R_PAREN => open_parens -= 1,
                    _ => {}
                }
                if open_parens == 0 {
                    break;
                }
            }
        }
        _ => {}
    }
    i
}

/// Wrap a cost specification in a `COST_SPEC` node and return the
/// index of the first token after it.
///
/// The node starts at the opener at `tokens[i]` (`L_BRACE`,
/// `L_BRACE_HASH`, or `L_DOUBLE_BRACE`) and runs through the first
/// `R_BRACE` / `R_DOUBLE_BRACE` that follows. Everything in between
/// stays a flat child of `COST_SPEC`. Per rule 5 (unterminated final
/// directive), a brace still open at `NEWLINE` or EOF is wrapped
/// anyway; the node then simply has no close-brace child.
fn emit_cost_spec(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: &[(SyntaxKind, Range<usize>)],
    mut i: usize,
) -> usize {
    builder.start_node(SyntaxKind::COST_SPEC.into());

    // The opening brace is whatever token sits at `i`.
    if let Some((opener, opener_range)) = tokens.get(i) {
        builder.token((*opener).into(), &source[opener_range.clone()]);
        i += 1;
    }

    // Body tokens, up to and including the closing brace.
    while let Some((kind, range)) = tokens.get(i) {
        if *kind == SyntaxKind::NEWLINE {
            // Unclosed brace: leave the NEWLINE outside so it stays
            // the posting-line terminator, a sibling of COST_SPEC
            // rather than one of its children.
            break;
        }
        builder.token((*kind).into(), &source[range.clone()]);
        i += 1;
        if matches!(kind, SyntaxKind::R_BRACE | SyntaxKind::R_DOUBLE_BRACE) {
            break;
        }
    }

    builder.finish_node();
    i
}

/// Wrap a price annotation in a `PRICE_ANNOTATION` node and return
/// the index of the first token after it.
///
/// The node holds the `AT` / `AT_AT` opener at `tokens[i]`, then,
/// when an amount follows, one optional `WHITESPACE` and a nested
/// `AMOUNT` built the same way as the units amount. The typed AST
/// tells per-unit from total price by the opener kind (`AT` vs
/// `AT_AT`) and reads number and currency from the `AMOUNT` child.
/// Whitespace that is merely trailing (before a comment or `NEWLINE`)
/// is deliberately left outside the node.
fn emit_price_annotation(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: &[(SyntaxKind, Range<usize>)],
    mut i: usize,
) -> usize {
    builder.start_node(SyntaxKind::PRICE_ANNOTATION.into());

    // The `AT` / `AT_AT` opener.
    if let Some((opener, opener_range)) = tokens.get(i) {
        builder.token((*opener).into(), &source[opener_range.clone()]);
        i += 1;
    }

    // Swallow a WHITESPACE only when an amount begins right after it;
    // otherwise it stays a sibling of PRICE_ANNOTATION.
    let ws_before_amount = tokens
        .get(i)
        .filter(|(kind, _)| *kind == SyntaxKind::WHITESPACE && starts_amount(tokens, i + 1));
    if let Some((_, ws_range)) = ws_before_amount {
        builder.token(SyntaxKind::WHITESPACE.into(), &source[ws_range.clone()]);
        i += 1;
    }
    if starts_amount(tokens, i) {
        i = emit_amount(builder, source, tokens, i);
    }

    builder.finish_node();
    i
}

/// Finish the currently open POSTING node unless the upcoming
/// sub-line, indented by `sub_line_indent`, attaches to it.
///
/// The `META_ENTRY` and indented-comment branches of
/// `emit_transaction_body` share this helper; they differ only in how
/// a sub-line at the SAME indent as the posting is treated, which
/// `attach_on_equal` selects:
///
/// - **Metadata (`true`)**: a `key: value` sub-line attaches when its
///   indent is `>=` the posting's. Beancount's grammar attributes
///   metadata by POSITION (any `key_value` line after a posting and
///   before the next one belongs to that posting), not by relative
///   indent, so the common same-column `key: value` (e.g. the
///   `effective_date:` idiom) is posting metadata rather than
///   transaction metadata. Pinned by
///   `same_indent_metadata_attaches_to_preceding_posting`.
/// - **Indented comment (`false`)**: a `; doc` / `% doc` sub-line
///   attaches only when STRICTLY deeper (`>`). At the same indent the
///   POSTING is closed and the comment is emitted as
///   transaction-level inter-posting trivia. Comments are invisible
///   to the AST, so this threshold only moves CST/formatter emission;
///   it is pinned by
///   `posting_with_indented_comment_between_postings_terminates_posting`
///   and has to stay strict to keep that formatter contract.
///
/// When the sub-line does not attach, the node is finished and
/// `open_posting_indent` is reset to `None`. With
/// `open_posting_indent == None` there is nothing to close and the
/// call does nothing.
fn close_open_posting_unless_attached(
    builder: &mut GreenNodeBuilder<'_>,
    open_posting_indent: &mut Option<usize>,
    sub_line_indent: usize,
    attach_on_equal: bool,
) {
    let Some(posting_indent) = *open_posting_indent else {
        return;
    };
    let attached = sub_line_indent > posting_indent
        || (attach_on_equal && sub_line_indent == posting_indent);
    if !attached {
        builder.finish_node();
        *open_posting_indent = None;
    }
}

/// Reports whether a posting sub-line begins at `tokens[i]`.
///
/// Two shapes qualify:
///
/// - `WHITESPACE ACCOUNT` (indent, then the account), or
/// - `WHITESPACE <flag> WHITESPACE ACCOUNT`, where `<flag>` is
///   `FLAG`, `STAR`, `PENDING_KW`, `HASH`, or a single-character
///   `CURRENCY`.
///
/// This mirrors the legacy AST parser's `parse_posting` shape
/// (`parser.rs:866-880`): indent, optional flag, required account.
/// The flag set MUST stay in sync with the legacy parser's
/// `parse_flag` (`Token::Star | Pending | Flag(_) | Hash` plus
/// single-char `Currency`) and with the transaction-trigger arm of
/// `identify_directive`; if they drift, HASH-flagged or
/// single-char-CURRENCY-flagged posting lines would silently stay
/// flat under `TRANSACTION` instead of being wrapped in `POSTING`.
/// The single-char `CURRENCY` case is needed because the lexer's
/// priority-3 Currency-vs-Flag tie-break tokenizes letters such as
/// `T`/`V`/`F`/`X` as `CURRENCY`, although by Beancount convention
/// they still act as posting flags.
fn starts_posting_sub_line(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
    // Every posting sub-line opens with its indent.
    let Some([(SyntaxKind::WHITESPACE, _), rest @ ..]) = tokens.get(i..) else {
        return false;
    };
    // Unflagged posting: the account follows the indent directly.
    if matches!(rest, [(SyntaxKind::ACCOUNT, _), ..]) {
        return true;
    }
    // Flagged posting: `<flag> WHITESPACE ACCOUNT`.
    let [(flag, flag_range), (SyntaxKind::WHITESPACE, _), (SyntaxKind::ACCOUNT, _), ..] = rest
    else {
        return false;
    };
    match flag {
        SyntaxKind::FLAG | SyntaxKind::STAR | SyntaxKind::PENDING_KW | SyntaxKind::HASH => true,
        // Only a one-byte CURRENCY can stand in for a flag.
        SyntaxKind::CURRENCY => flag_range.len() == 1,
        _ => false,
    }
}

/// Width, in bytes, of the `WHITESPACE` token at `tokens[i]`; `0`
/// when that token is not whitespace or `i` is out of range.
/// `emit_transaction_body` uses the result to compare a metadata or
/// comment sub-line's indent against the indent of the surrounding
/// POSTING (the posting-attached-metadata / posting-attached-comment
/// rule).
///
/// **Known divergence from the legacy AST parser**: the legacy
/// lexer's `Indent(N)` / `DeepIndent(N)` variants
/// (`logos_lexer.rs:615-616`) count a tab as 4 spaces, whereas this
/// helper reports raw bytes, so mixed tab+space indentation can be
/// attributed differently by the two paths. This is accepted for now
/// because (a) idiomatic Beancount indents with uniform spaces,
/// (b) no corpus file currently hits the divergence in
/// posting-attached-metadata position, and (c) the CST round trip is
/// byte-identical whatever this function returns. Switch to a
/// column-aware count if such a file turns up.
fn indent_width(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> usize {
    tokens
        .get(i)
        .filter(|(kind, _)| *kind == SyntaxKind::WHITESPACE)
        .map_or(0, |(_, range)| range.len())
}

/// Reports whether `kind` belongs to the comment-class trivia:
/// `COMMENT` (`;`), `PERCENT_COMMENT` (`%`), `SHEBANG` (`#!`), or
/// `EMACS_DIRECTIVE` (`#+`).
///
/// This is the comment subset of `SyntaxKind::is_trivia()` and the
/// single source of truth for the three call sites that must decide
/// whether a token "is a comment" for body-continuation and
/// indent-attribution purposes (`starts_indented_comment`,
/// `upcoming_indented_block_has_meta`,
/// `is_indented_directive_continuation`). Without it, adding a
/// comment-class token would take three coordinated edits. The test
/// `is_comment_token_covers_all_comment_class_trivia` in this
/// module's tests checks that membership stays in sync with
/// `is_trivia`.
///
/// **Known CST/AST divergence**: in the legacy AST parser
/// (`crates/rustledger-parser/src/parser.rs`), the
/// `parse_posting_metadata` / `parse_transaction_directive` paths
/// treat only `Token::Comment` and `Token::PercentComment` as in-body
/// trivia of transaction / directive bodies; `Token::Shebang` and
/// `Token::EmacsDirective` are handled at top level only
/// (`parse_directive` dispatch). A deeper-indented
/// `#+STARTUP: overview` between two postings is therefore INSIDE the
/// POSTING for the CST but TERMINATES the transaction for the AST.
/// The two are phase-isolated in practice: loader, LSP, validator,
/// query, booking, and CLI all go through the AST path, and the only
/// current `parse_structured` consumers are this crate's corpus
/// baseline test and `examples/dump_top_level_directives.rs`. Phase 5
/// removes `parse_flat` and the AST; that reconciliation should keep
/// the CST behavior (which agrees with `is_trivia()` classifying all
/// four kinds as trivia) rather than the AST behavior, where an
/// indented comment-class line silently ending the directive is the
/// surprising outcome.
const fn is_comment_token(kind: SyntaxKind) -> bool {
    matches!(
        kind,
        SyntaxKind::SHEBANG
            | SyntaxKind::EMACS_DIRECTIVE
            | SyntaxKind::COMMENT
            | SyntaxKind::PERCENT_COMMENT
    )
}

/// Reports whether an indented comment line begins at `tokens[i]`:
/// a `WHITESPACE` indent immediately followed by a comment-class
/// token (see [`is_comment_token`]). `emit_transaction_body` relies
/// on it so comments get the same indent-attribution treatment as
/// metadata.
fn starts_indented_comment(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
    matches!(
        tokens.get(i..),
        Some([(SyntaxKind::WHITESPACE, _), (next, _), ..]) if is_comment_token(*next)
    )
}

/// Reports whether `tokens[i..]` opens an indented line that carries
/// content: a `WHITESPACE` token followed by ANY token other than
/// `NEWLINE`. Blank lines (`NEWLINE` alone or `WHITESPACE NEWLINE`)
/// and EOF return `false`, which ends the transaction body.
///
/// **Deliberate divergence from rule 4 of `cst::trivia`:** unlike a
/// single-line directive's body, a TRANSACTION body keeps an indented
/// trailing `;`-comment at EOF inside the directive. Documentation
/// comments interleaved with postings are a Beancount idiom, and
/// making the body "back-track" a final trailing comment would need
/// look-ahead that this per-line predicate cannot do without extra
/// state. Pinned by
/// `transaction_trailing_indented_comment_at_eof_stays_inside`.
fn is_indented_transaction_body_line(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
    match tokens.get(i..) {
        // Indent plus at least one more token: content unless that
        // token is the line terminator.
        Some([(SyntaxKind::WHITESPACE, _), (next, _), ..]) => *next != SyntaxKind::NEWLINE,
        // No indent, or the indent is the last token before EOF.
        _ => false,
    }
}

/// Look ahead from `tokens[i..]` over consecutive indented comment
/// lines (`WS <comment>`, see [`is_comment_token`]) and report
/// whether a metadata line (`WS META_KEY`) is reached.
///
/// Returns `true` as soon as a `WS META_KEY` line starts. Returns
/// `false` at the first line that is neither metadata nor an indented
/// comment (blank line, non-indented top-level content), and also
/// when the tokens run out, including a final comment line that has
/// no terminating `NEWLINE`.
fn upcoming_indented_block_has_meta(tokens: &[(SyntaxKind, Range<usize>)], mut i: usize) -> bool {
    loop {
        match tokens.get(i..) {
            Some([(SyntaxKind::WHITESPACE, _), (SyntaxKind::META_KEY, _), ..]) => return true,
            Some([(SyntaxKind::WHITESPACE, _), (next, _), ..]) if is_comment_token(*next) => {
                // Jump to the start of the line after this comment.
                let Some(newline_offset) = tokens[i..]
                    .iter()
                    .position(|(kind, _)| *kind == SyntaxKind::NEWLINE)
                else {
                    return false;
                };
                i += newline_offset + 1;
            }
            _ => return false,
        }
    }
}

/// Reports whether the indented line starting at `tokens[i]`
/// CONTINUES the current multi-line directive, i.e. whether it
/// visually "belongs to" the directive's metadata block.
///
/// Two shapes continue the directive:
/// - `WS META_KEY`: always, whatever the context.
/// - `WS <comment>` (see [`is_comment_token`]): only when the
///   surrounding indented block contains at least one `WS META_KEY`
///   line, which the caller reports through `block_has_meta`. This
///   keeps indented comments after a header-only directive outside
///   (rule 2 / rule 4 cases) while documentation comments placed
///   BEFORE the first metadata entry stay inside.
///
/// Anything else (blank `\n`, non-indented content, EOF) ends the
/// directive.
fn is_indented_directive_continuation(
    tokens: &[(SyntaxKind, Range<usize>)],
    i: usize,
    block_has_meta: bool,
) -> bool {
    // Metadata detection goes through `starts_meta_sub_line`, the
    // same predicate the wrapping side (`emit_body_sub_line`) uses,
    // so the two cannot drift apart.
    if starts_meta_sub_line(tokens, i) {
        return true;
    }
    block_has_meta
        && matches!(
            tokens.get(i..),
            Some([(SyntaxKind::WHITESPACE, _), (next, _), ..]) if is_comment_token(*next)
        )
}

/// Classify the non-trivia token at `tokens[i]` as the start of a
/// recognized top-level directive.
///
/// Returns the directive's `SyntaxKind`, or `None` when the tokens do
/// not fit a known shape (the caller wraps such content in an
/// `ERROR_NODE` per PR 2.4) or `i` is out of range.
///
/// Recognized Beancount line shapes:
///
/// - `<KEYWORD> ...` with no leading date: PUSHTAG / POPTAG /
///   PUSHMETA / POPMETA (PR 2.1a) and OPTION / INCLUDE / PLUGIN
///   (PR 2.3).
/// - `DATE WHITESPACE <KEYWORD> ...`: OPEN / CLOSE / BALANCE / PAD /
///   EVENT / QUERY / NOTE / DOCUMENT / PRICE / COMMODITY (PR 2.1a)
///   and CUSTOM (PR 2.3).
/// - `DATE WHITESPACE <txn-trigger> ...`: TRANSACTION (PR 2.1b). The
///   trigger is `STAR`, `PENDING_KW` (`!`), `FLAG`, `HASH`, `TXN_KW`,
///   a `STRING` (the "implied" form with no explicit flag), or a
///   single-character `CURRENCY` (ticker letters). Mirrors
///   `parse_dated_directive` in the legacy AST parser at
///   parser.rs:1707-1715.
fn identify_directive(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> Option<SyntaxKind> {
    let head = tokens.get(i)?.0;

    if head != SyntaxKind::DATE {
        // Top-level keyword directives: no leading date.
        return match head {
            SyntaxKind::PUSHTAG_KW => Some(SyntaxKind::PUSHTAG_DIRECTIVE),
            SyntaxKind::POPTAG_KW => Some(SyntaxKind::POPTAG_DIRECTIVE),
            SyntaxKind::PUSHMETA_KW => Some(SyntaxKind::PUSHMETA_DIRECTIVE),
            SyntaxKind::POPMETA_KW => Some(SyntaxKind::POPMETA_DIRECTIVE),
            // Phase 2.3 edge directives. Like the push/pop family
            // they are top-level keyword directives with the same
            // single-line body shape: `emit_through_terminator`
            // consumes the header, and `emit_directive_body`'s
            // look-ahead absorbs trailing indented metadata lines (a
            // rare but legal idiom for option / include / plugin).
            SyntaxKind::OPTION_KW => Some(SyntaxKind::OPTION_DIRECTIVE),
            SyntaxKind::INCLUDE_KW => Some(SyntaxKind::INCLUDE_DIRECTIVE),
            SyntaxKind::PLUGIN_KW => Some(SyntaxKind::PLUGIN_DIRECTIVE),
            _ => None,
        };
    }

    // Dated directive: find the first token after the DATE that is
    // not SAME-LINE whitespace. Only WHITESPACE is skipped on
    // purpose. Skipping everything `is_trivia()` covers (NEWLINE and
    // COMMENT included) would misread a malformed `DATE\nopen ...` as
    // an OPEN_DIRECTIVE even though `emit_through_terminator`
    // captures just the first line, orphaning the keyword.
    let (after_date, after_date_range) = tokens
        .get(i + 1..)?
        .iter()
        .find(|(kind, _)| *kind != SyntaxKind::WHITESPACE)?;

    let directive = match *after_date {
        SyntaxKind::OPEN_KW => SyntaxKind::OPEN_DIRECTIVE,
        SyntaxKind::CLOSE_KW => SyntaxKind::CLOSE_DIRECTIVE,
        SyntaxKind::BALANCE_KW => SyntaxKind::BALANCE_DIRECTIVE,
        SyntaxKind::PAD_KW => SyntaxKind::PAD_DIRECTIVE,
        SyntaxKind::EVENT_KW => SyntaxKind::EVENT_DIRECTIVE,
        SyntaxKind::QUERY_KW => SyntaxKind::QUERY_DIRECTIVE,
        SyntaxKind::NOTE_KW => SyntaxKind::NOTE_DIRECTIVE,
        SyntaxKind::DOCUMENT_KW => SyntaxKind::DOCUMENT_DIRECTIVE,
        SyntaxKind::PRICE_KW => SyntaxKind::PRICE_DIRECTIVE,
        SyntaxKind::COMMODITY_KW => SyntaxKind::COMMODITY_DIRECTIVE,
        // Phase 2.3: CUSTOM is dated, with a type-name STRING and
        // then an open-ended value list (STRING / ACCOUNT / amount /
        // DATE / CURRENCY / BOOL_TRUE / BOOL_FALSE). Its header is
        // consumed like any other dated single-line directive; the
        // open-ended list is fine for the CST because the trailing
        // tokens stay flat.
        SyntaxKind::CUSTOM_KW => SyntaxKind::CUSTOM_DIRECTIVE,
        // Transaction triggers after the DATE:
        // - `*` (STAR): completed transaction
        // - `!` (PENDING_KW): incomplete / warning
        // - letter flags P/S/T/C/U/R/M/?/& (FLAG)
        // - `#` (HASH), promoted to a flag in this position (cf.
        //   `Token::is_txn_flag` and the AST parser's `parse_flag`
        //   accepting Hash)
        // - the explicit `txn` keyword (TXN_KW)
        // - a bare STRING, the "implied transaction": the AST parser
        //   at parser.rs:1713 sends `Token::String(_)` to
        //   `parse_transaction_directive` with an implied `*` flag,
        //   a common shorthand in real ledgers such as
        //   `2024-01-15 "Coffee"`
        SyntaxKind::STAR
        | SyntaxKind::PENDING_KW
        | SyntaxKind::FLAG
        | SyntaxKind::HASH
        | SyntaxKind::TXN_KW
        | SyntaxKind::STRING => SyntaxKind::TRANSACTION,
        // Single-character CURRENCY: NYSE/NASDAQ-style ticker letters
        // (T, V, F, X, ...) double as transaction flags. The lexer
        // prefers CURRENCY over FLAG for single uppercase letters
        // (logos_lexer Currency priority 3), and the AST parser's
        // `parse_flag` arm `Token::Currency(s) if s.len() == 1`
        // follows suit; doing the same here keeps the established
        // lexer/parser contract.
        SyntaxKind::CURRENCY if after_date_range.len() == 1 => SyntaxKind::TRANSACTION,
        // Anything else: unknown shape.
        _ => return None,
    };
    Some(directive)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that both tree builders (`parse_flat` and
    /// `parse_structured`) reproduce `source` byte-for-byte through
    /// the tree's `text()`.
    fn assert_round_trips(source: &str) {
        let tree = parse_flat(source);
        assert_eq!(tree.text().to_string(), source);
        let structured = parse_structured(source);
        assert_eq!(structured.text().to_string(), source);
    }

    /// Drift guard: `is_comment_token` and `is_trivia` must agree on
    /// what counts as comment-class trivia. Enforces two invariants:
    ///
    /// 1. `is_trivia() ⊆ is_comment_token ∪ non_comment_trivia`:
    ///    every trivia kind is either a comment or in the explicit
    ///    whitespace-class allow-list. Catches a new lexer-level
    ///    addition to `is_trivia()` that's silently forgotten in
    ///    `is_comment_token`.
    /// 2. `is_comment_token ⊆ is_trivia()`: every kind
    ///    `is_comment_token` says yes to is actually trivia. Catches
    ///    a future edit to `is_comment_token`'s match arm that
    ///    accidentally pulls in a non-trivia content token,
    ///    silently extending indent-attribution to real content
    ///    inside POSTING / directive bodies.
    ///
    /// On failure (1), if the new trivia kind is neither comment-
    /// class nor whitespace-class (e.g., some future
    /// `SECTION_HEADER` that should NOT be absorbed as a
    /// continuation), don't reflexively add it to either set —
    /// revisit whether the body-continuation predicates need a
    /// different abstraction (`is_body_continuation_trivia` or
    /// similar) and propagate the choice to the three call sites.
    #[test]
    fn is_comment_token_covers_all_comment_class_trivia() {
        let non_comment_trivia = [SyntaxKind::BOM, SyntaxKind::WHITESPACE, SyntaxKind::NEWLINE];

        let mut trivia_missed_from_comment: Vec<SyntaxKind> = Vec::new();
        let mut comment_not_trivia: Vec<SyntaxKind> = Vec::new();
        // Walk every possible discriminant; values that do not map to
        // a `SyntaxKind` are skipped.
        for d in 0u16..=u16::MAX {
            let Ok(kind) = SyntaxKind::try_from(d) else {
                continue;
            };
            // Invariant 1: trivia (minus whitespace allow-list) ⊆ comment.
            if kind.is_trivia() && !non_comment_trivia.contains(&kind) && !is_comment_token(kind) {
                trivia_missed_from_comment.push(kind);
            }
            // Invariant 2: comment ⊆ trivia.
            if is_comment_token(kind) && !kind.is_trivia() {
                comment_not_trivia.push(kind);
            }
        }
        // NOTE: a `\` line continuation drops the newline and the next
        // line's leading whitespace but keeps any space before the
        // backslash, so a word hyphenated across lines must end in
        // `-\` (no space) to render as one word.
        assert!(
            trivia_missed_from_comment.is_empty(),
            "trivia kinds present in is_trivia() but missing from \
             is_comment_token: {trivia_missed_from_comment:?}. Three \
             options: (a) add them to is_comment_token if they are \
             comment-class; (b) extend the non_comment_trivia allow-\
             list in this test if they are whitespace-class; (c) if \
             they are neither, revisit whether the body-continuation \
             predicates need a different abstraction and propagate \
             the decision to the three call sites.",
        );
        assert!(
            comment_not_trivia.is_empty(),
            "is_comment_token claims these kinds are comments but \
             is_trivia() disagrees: {comment_not_trivia:?}. Either \
             add them to is_trivia() (if they really are trivia) or \
             remove them from is_comment_token (if they are content \
             tokens that should not be absorbed as comment \
             continuations).",
        );
    }

    #[test]
    fn empty_source() {
        assert_round_trips("");
    }

    #[test]
    fn whitespace_only() {
        assert_round_trips("   \t  ");
    }

    #[test]
    fn bom_round_trips() {
        assert_round_trips("\u{FEFF}2024-01-01 open Assets:Bank\n");
    }

    #[test]
    fn full_directive_round_trips() {
        assert_round_trips(
            "2024-01-01 open Assets:Bank USD\n\
             2024-01-15 * \"Coffee\"\n  \
               Assets:Bank  -5.00 USD\n  \
               Expenses:Food\n",
        );
    }

    #[test]
    fn line_comment_round_trips() {
        assert_round_trips("; preamble\n2024-01-01 open Assets:Bank\n");
    }

    #[test]
    fn no_trailing_newline_round_trips() {
        assert_round_trips("2024-01-01 open Assets:Bank");
    }

    /// Both builders must root their tree at `SOURCE_FILE`, even for
    /// empty input.
    #[test]
    fn root_kind_is_source_file() {
        let tree = parse_flat("");
        assert_eq!(tree.kind(), SyntaxKind::SOURCE_FILE);
        let structured = parse_structured("");
        assert_eq!(structured.kind(), SyntaxKind::SOURCE_FILE);
    }
}