rustledger-parser 0.16.0

Beancount parser with error recovery and full syntax support
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
//! Beancount parser built on a Logos lexer + structured CST.
//!
//! [`parse`] tokenizes via [`logos_lexer`], constructs a lossless
//! CST through [`parse_structured`], and walks it via the
//! converter in `cst::convert` to produce a [`ParseResult`] with
//! the typed AST plus errors, options, includes, plugins,
//! comments, and currency occurrences.
//!
//! # Features
//!
//! - Full Beancount syntax support (all 12 directive types)
//! - Error recovery (continues parsing after errors)
//! - Precise source locations for error reporting
//! - Support for includes, options, plugins
//!
//! # Example
//!
//! ```ignore
//! use rustledger_parser::parse;
//!
//! let source = r#"
//! 2024-01-15 * "Coffee Shop" "Morning coffee"
//!   Expenses:Food:Coffee  5.00 USD
//!   Assets:Cash
//! "#;
//!
//! let result = parse(source);
//! assert!(result.errors.is_empty());
//! assert_eq!(result.directives.len(), 1);
//! ```

#![forbid(unsafe_code)]
#![warn(missing_docs)]

pub mod bom;
pub mod cst;
mod diagnostics;
mod error;
pub mod logos_lexer;

/// Opinionated CST-backed formatter entries.
///
/// **Sole** import path for the formatter surface - `format_source`,
/// `format_source_with_parsed`, `try_format_source`, `format_node`,
/// `format_node_range`, `format_node_with_alignment`,
/// `format_node_range_with_alignment`, `PostingAlignment`,
/// `compute_alignment`, `canonicalize_directives`,
/// `CanonicalizeError`, `lf_to_crlf_outside_strings`,
/// `crlf_to_lf_outside_strings`, `cr_outside_strings_present`. The
/// flat crate-root re-exports were removed in round-5 and the
/// duplicate `crate::cst::format` path was sealed in round-6 of
/// the PR #1284 reviews, so a future deprecation can be done at
/// exactly one site.
pub mod format {
    pub use crate::cst::format::{
        CanonicalizeError, PostingAlignment, canonicalize_directives, compute_alignment,
        cr_outside_strings_present, crlf_to_lf_outside_strings, format_node, format_node_range,
        format_node_range_with_alignment, format_node_with_alignment, format_source,
        format_source_with_parsed, lf_to_crlf_outside_strings, try_format_source,
    };
}

pub use cst::{
    BeancountLanguage, SyntaxElement, SyntaxKind, SyntaxNode, SyntaxToken, lossless_kind_tokens,
    parse_flat, parse_structured, parse_via_cst,
};

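// Rowan types CST consumers need. (This comment documents the
// `pub use rowan::{...}` re-export a couple of lines below, not the
// `error` re-export that immediately follows it.) Flat re-exports
// at the crate root match the surrounding `SyntaxNode` /
// `SyntaxToken` shape -
// downstream `use rustledger_parser::{SyntaxNode, TextRange};`
// resolves both halves uniformly without a sub-module hop.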
//
// The set covers what LSP handlers need for tree walking:
// - `TextRange` / `TextSize`: byte-offset ranges on every node
// - `TokenAtOffset`: cursor-position lookup
// - `WalkEvent`: preorder / postorder traversal for folding-range
//   and semantic-tokens implementations
// - `NodeOrToken`: pattern-matching `SyntaxElement` children
// - `Direction`: sibling iteration
//
// `GreenNode` is deliberately NOT re-exported - it's the
// thread-safe storage backing for `SyntaxNode` but downstream
// consumers should walk via the cursor API, not the green tree.
//
// **Stability.** These types are versioned in lockstep with this
// crate, NOT with `rowan` directly. A rowan minor bump that
// changes any of these will require a coordinated bump of this
// crate so the re-export contract holds at THIS crate's semver.
pub use error::{ParseError, ParseErrorKind};
pub use rowan::{Direction, NodeOrToken, TextRange, TextSize, TokenAtOffset, WalkEvent};
pub use rustledger_core::{InternedStr, SYNTHESIZED_FILE_ID, Span, Spanned};

use rustledger_core::Directive;

/// Result of parsing a beancount file.
///
/// Marked `#[non_exhaustive]` so external consumers must go through
/// [`parse`] rather than constructing the struct by literal. Future
/// field additions (e.g., diagnostic metadata, source-map back-
/// references) then land as non-breaking changes.
#[derive(Debug)]
#[non_exhaustive]
pub struct ParseResult {
    /// Successfully parsed directives.
    pub directives: Vec<Spanned<Directive>>,
    /// Options found in the file.
    pub options: Vec<(String, String, Span)>,
    /// Include directives found.
    pub includes: Vec<(String, Span)>,
    /// Plugin directives found.
    pub plugins: Vec<(String, Option<String>, Span)>,
    /// Standalone comments found in the file.
    pub comments: Vec<Spanned<String>>,
    /// Parse errors encountered.
    pub errors: Vec<ParseError>,
    /// Deprecation warnings.
    pub warnings: Vec<ParseWarning>,
    /// Every `Currency` token the parser consumed, paired with its
    /// interned value and source-byte range.
    ///
    /// Source-position-aware tooling (LSP rename / references /
    /// document-highlight) walks this list to produce edits, locations,
    /// and highlights without resorting to string search of the source,
    /// which produces false positives in comments, payee strings,
    /// account-name segments, etc. The order matches source order
    /// because the parser fills it as tokens are consumed (and the
    /// parser is strictly forward-advancing, including on error
    /// recovery).
    ///
    /// **Error-recovery contract.** Tokens consumed during a
    /// directive that ultimately fails to parse remain in this list.
    /// Rationale: the lexer's classification of a token as a
    /// `Currency` is independent of whether the surrounding syntax is
    /// valid, and tooling that wants to rename or highlight a
    /// currency the user typed should follow that classification.
    /// Do not "clean up" partially-consumed entries after a parse
    /// failure - that would hide real currency identifiers from
    /// downstream tooling while the user is mid-edit.
    ///
    /// **`file_id` is always 0 in parser output.** The parser
    /// processes one file at a time and doesn't know its own file
    /// id. The loader sets the correct id on each entry via
    /// `.with_file_id(n)` when assembling a multi-file `SourceMap`,
    /// the same way it does for `directives`. Per-file consumers
    /// (today: every LSP handler) can ignore `file_id`; future
    /// multi-file consumers must remember to thread it through.
    pub currency_occurrences: Vec<Spanned<rustledger_core::Currency>>,
    /// Every `Account` token the parser consumed, paired with its
    /// interned value and source-byte range.
    ///
    /// Mirrors [`Self::currency_occurrences`] for the account
    /// shape. The CST conversion (`walk_descendants_once`) tracks
    /// every `ACCOUNT` token whose ancestors do NOT include an
    /// `ERROR_NODE`. The LSP rename handler (phase 5.4) walks
    /// this list to emit exact-span edits without resorting to
    /// per-directive substring search, which used to produce
    /// false positives wherever an account-name fragment appeared
    /// inside a payee string, a STRING-typed metadata value, or a
    /// comment. ACCOUNT-typed metadata values (e.g.
    /// `counterparty: Assets:Bank`) DO produce an `ACCOUNT` token
    /// at the lexer level and ARE included in this list - so a
    /// rename of `Assets:Bank` correctly rewrites that metadata
    /// value too.
    ///
    /// **Migration status (#1262 phase 5.4).** Only the LSP
    /// rename handler currently consumes this index. The sibling
    /// handlers `references`, `document_highlight`, and
    /// `linked_editing` still walk the typed AST with substring
    /// search for accounts (see those modules' rustdoc); migrating
    /// them to consume `account_occurrences` is tracked as a
    /// phase 5.5+ follow-up.
    ///
    /// **Error-recovery contract.** Two notions of "failing
    /// directive" need to be distinguished:
    ///
    /// - A directive that PARSES SYNTACTICALLY but whose
    ///   typed-AST conversion errors (e.g.,
    ///   [`crate::ParseErrorKind::InvalidBookingMethod`] on an
    ///   `open Assets:Bank "GARBAGE"`). The ACCOUNT node is
    ///   intact in the CST and NOT inside an `ERROR_NODE`. The
    ///   token IS tracked - tooling can still rename it during
    ///   the mid-edit state.
    /// - A directive so garbled that the CST wraps the region
    ///   in an `ERROR_NODE`. The ACCOUNT token is inside an
    ///   `ERROR_NODE` and is NOT tracked. This is deliberate -
    ///   the recovery boundary is fuzzy and including such
    ///   tokens would surface as confusing rename hits inside
    ///   garbage source.
    ///
    /// Note the asymmetry with [`Self::currency_occurrences`],
    /// whose contract keeps tokens from directives that fail to
    /// parse.
    ///
    /// # Limitations
    ///
    /// The list is undifferentiated: declarations (from
    /// open/close/balance/pad/note/document) and references
    /// (from posting accounts and ACCOUNT-typed metadata) are
    /// mixed together. There is no equivalent of the
    /// `commodity_declaration_spans` helper used for currencies
    /// (the account case has six declaration directive shapes vs.
    /// the single `Commodity` shape, so no symmetric helper
    /// exists yet). A future go-to-definition migration will need
    /// either a re-walk over `directives` or an additional
    /// `account_declarations: Vec<Span>` field.
    ///
    /// **`file_id` is always 0 in parser output** - same loader
    /// contract as `currency_occurrences`.
    pub account_occurrences: Vec<Spanned<rustledger_core::Account>>,
    /// `true` iff the parsed source began with a UTF-8 BOM (strict
    /// byte 0).
    ///
    /// This is the **single source of truth** for downstream consumers
    /// that need to know whether to preserve a leading BOM on output
    /// (notably `format_source`). Do NOT inspect the source bytes
    /// directly; the parser already handled the strip/detect logic in
    /// one place ([`crate::bom::strip_leading`]) and stored the result
    /// here. Reproducing the check elsewhere is exactly the contract-
    /// drift class of bug this field was introduced to eliminate.
    ///
    /// Span coordinates in this `ParseResult` are in the **original
    /// source frame** - i.e., if `has_leading_bom` is true, spans
    /// already include the 3-byte BOM offset and index directly into
    /// the caller's source. (The CST reachable through
    /// [`Self::syntax_root`] is the exception; see its
    /// "Byte-offset frame" note.)
    pub has_leading_bom: bool,
    /// The lossless CST root the converter walked to produce
    /// everything above. Stored as a [`rowan::GreenNode`], which
    /// is `Send + Sync` and reference-counted internally, so an
    /// `Arc<ParseResult>` (the shape the LSP caches per document)
    /// shares this handle across handler invocations without
    /// re-parsing.
    ///
    /// **Prefer [`Self::syntax_node`]** over reading this field
    /// directly. The method is the supported entry point: it
    /// returns a [`SyntaxNode`] (the cursor-API view), keeps the
    /// `rowan::GreenNode` type name out of consumer code, and
    /// shields callers from minor rowan upgrades that touch the
    /// `GreenNode` shape. The field is public for two reasons —
    /// the exhaustive destructure in
    /// [`__baseline_canonical_payload`] needs to bind it, and
    /// `Arc::clone`-style sharing patterns benefit from direct
    /// access — but downstream code should reach for the method.
    ///
    /// **Byte-offset frame: post-BOM.** The CST is built from
    /// the BOM-stripped source — the parser strips a strict-
    /// byte-0 UTF-8 BOM (see [`crate::bom::strip_leading`]) and
    /// feeds the stripped slice to `parse_structured`. So every
    /// `TextRange` / `TextSize` reachable through this tree is
    /// in the **post-BOM** byte frame: an offset of `0` here
    /// corresponds to byte `BOM_LEN == 3` of the original source
    /// when [`Self::has_leading_bom`] is `true`. This differs
    /// from the typed-AST fields above ([`Self::directives`],
    /// [`Self::currency_occurrences`], [`Self::account_occurrences`],
    /// [`Self::errors`], …), whose spans the converter
    /// pre-shifts back into the *original*-source frame so
    /// downstream consumers can index directly into the caller's
    /// source bytes. CST-walking consumers must apply the
    /// equivalent shift themselves: subtract `BOM_LEN` when
    /// translating an original-source offset down to a CST
    /// offset (e.g., `cst.token_at_offset(orig - BOM_LEN)`), and
    /// add `BOM_LEN` back when emitting an original-source
    /// position from a `TextRange`. The LSP `selection_range`
    /// handler does this — see its rustdoc and the
    /// `bom_prefixed_source_does_not_shift_ranges` regression
    /// test.
    ///
    /// **Canonical-payload exclusion.** This field is deliberately
    /// NOT fed into [`__baseline_canonical_payload`]. The green
    /// node is a redundant cache of the source bytes; the
    /// existing `directives` / `currency_occurrences` /
    /// `account_occurrences` / `errors` fields already capture
    /// everything downstream consumers track for drift detection.
    /// Adding the green node's `Debug` output would multiply
    /// the fingerprint size without surfacing any new drift
    /// signal. The corresponding `assert_field_in_hash` arm is
    /// also intentionally absent in `tests/corpus_baseline.rs`.
    /// A negative-form test (`canonical_payload_excludes_syntax_root`
    /// in this file) pins the exclusion: it confirms that mutating
    /// `syntax_root` while every other field is equal does NOT
    /// change the canonical payload bytes.
    pub syntax_root: rowan::GreenNode,
    /// File-wide alignment columns the formatter would use for
    /// this source — pre-computed at parse time so hot formatting
    /// paths skip the `O(N_postings)` per-call walk.
    ///
    /// `PostingAlignment` is `Copy`; pass it directly into the
    /// `_with_alignment` variants of the formatter
    /// ([`crate::format::format_node_with_alignment`],
    /// [`crate::format::format_node_range_with_alignment`],
    /// [`crate::format::format_source_with_parsed`]) to reuse this
    /// cached value. The LSP `format_document` /
    /// `range_formatting` fallback handlers, the FFI `format.source`
    /// endpoint, and the WASM `ParsedLedger::format` bridge all
    /// consume the cache to skip both the redundant parse and the
    /// redundant alignment walk.
    ///
    /// **Producer-only cache invariant.** This field is populated
    /// exactly once by `parse_via_cst`; the value is consistent with
    /// the `directives` / `syntax_root` fields *at parse time*.
    /// `ParseResult` exposes every cache input (`directives`,
    /// `syntax_root`) as `pub`, so technically a consumer with a
    /// `&mut ParseResult` can mutate one without refreshing the
    /// other — leaving `alignment` stale. That is OUT-OF-CONTRACT
    /// for this cache. Callers that mutate `ParseResult` directly
    /// must either (a) refresh `alignment` by calling
    /// `crate::format::compute_alignment(&SourceFile::cast(self.syntax_node()).unwrap())`,
    /// (b) avoid the `_with_alignment` formatter variants and use
    /// the bare ones (which re-compute), or (c) treat the
    /// `ParseResult` as immutable after construction (the common
    /// case — the LSP wraps it in `Arc<ParseResult>`).
    ///
    /// **Equivalence pinned.**
    /// `parse_result_alignment_cache::*` (7 fixtures) assert that
    /// `parse(s).alignment` equals
    /// `compute_alignment(&SourceFile::cast(parse(s).syntax_node()).unwrap())`
    /// across representative fixtures, so any future divergence
    /// (a converter change that forgets to refresh the cache, a
    /// `compute_alignment` change that breaks the contract)
    /// fails CI.
    ///
    /// **Canonical-payload exclusion.** Excluded from
    /// [`__baseline_canonical_payload`] for the same reason as
    /// `syntax_root`: it's a redundant derivation of `directives`
    /// content. If it WERE included, mutating it without changing
    /// `directives` would silently flip the corpus hash, and the
    /// hash would change for every source with a non-default
    /// alignment (i.e. essentially every real Beancount file).
    /// The exclusion is pinned by
    /// `canonical_payload_excludes_alignment`.
    pub alignment: crate::format::PostingAlignment,
}

impl ParseResult {
    /// Returns a cursor-API [`SyntaxNode`] rooted at the lossless CST
    /// stored in [`Self::syntax_root`].
    ///
    /// Each call clones the green-node handle and wraps it in a new
    /// root via `SyntaxNode::new_root`. The green node is
    /// reference-counted internally, so the clone is a refcount bump
    /// rather than a tree copy and is cheap enough to do per request.
    ///
    /// CST consumers should call this instead of touching
    /// [`Self::syntax_root`] themselves; that keeps the `rowan`
    /// dependency an implementation detail of this crate.
    #[must_use]
    pub fn syntax_node(&self) -> SyntaxNode {
        let green = self.syntax_root.clone();
        SyntaxNode::new_root(green)
    }
}

// Compile-time guard: `ParseResult` must stay `Send + Sync`.
//
// The LSP shares an `Arc<ParseResult>` between its main thread and
// its background worker (see `rustledger-lsp/src/main_loop.rs`).
// If a later field introduces a non-thread-safe type (an `Rc`, a
// `Cell`, some other `!Send`/`!Sync` handle), the breakage would
// otherwise first appear in the LSP build, far from the parser
// change responsible. Checking the bound here makes this crate's
// own build fail first, at the definition site.
const _: () = {
    const fn assert_send_sync<T: Send + Sync>() {}
    assert_send_sync::<ParseResult>();
};

/// A warning from the parser (non-fatal).
///
/// Warnings are collected in [`ParseResult::warnings`], separately
/// from the [`ParseError`]s in [`ParseResult::errors`].
#[derive(Debug, Clone)]
pub struct ParseWarning {
    /// The human-readable warning message.
    pub message: String,
    /// Location in the source that the warning refers to.
    pub span: Span,
}

impl ParseWarning {
    /// Builds a warning from its text and the source span it refers to.
    ///
    /// `message` accepts anything convertible into a `String`, so both
    /// string literals and owned strings can be passed directly.
    pub fn new(message: impl Into<String>, span: Span) -> Self {
        let message = message.into();
        Self { message, span }
    }
}

/// Parse beancount source code.
///
/// Routes through the CST-backed implementation
/// ([`parse_via_cst`]): a lossless Logos lexer feeds a structured
/// CST builder, and the converter in `crate::cst::convert` walks
/// the resulting tree to produce the [`ParseResult`].
///
/// This function does not return a `Result`: problems found while
/// parsing are reported through [`ParseResult::errors`] and
/// [`ParseResult::warnings`] alongside whatever was parsed.
///
/// # Arguments
///
/// * `source` - The beancount source code to parse
///
/// # Returns
///
/// A [`ParseResult`] containing directives, options, includes,
/// plugins, comments, errors, and warnings, plus the currency and
/// account occurrence indexes, the leading-BOM flag, the CST root,
/// and the cached posting alignment.
#[must_use]
pub fn parse(source: &str) -> ParseResult {
    // Thin public alias: all work happens in the CST-backed parser.
    parse_via_cst(source)
}

/// Parse beancount source code, returning only directives and errors.
///
/// This is a simpler interface when you don't need options/includes/plugins.
#[must_use]
pub fn parse_directives(source: &str) -> (Vec<Spanned<Directive>>, Vec<ParseError>) {
    let result = parse(source);
    (result.directives, result.errors)
}

/// Builds the canonical byte payload that the corpus baseline hashes
/// (#1262 phase 0). **Internal**: it is public only so the baseline
/// integration test can fingerprint a `ParseResult` without having to
/// name the struct's fields from outside the defining crate.
///
/// The payload is a sequence of `label:` sections separated by
/// newlines. Directives are rendered through `serde_json::to_value`
/// so that `FxHashMap` iteration order in metadata is normalized;
/// every other section is the field's `Debug` rendering, which is
/// deterministic for `Vec`-based types. If directive serialization
/// fails, the section holds `serialize-error:<error>` instead of JSON.
///
/// **Why this lives in `rustledger-parser` instead of the test:**
/// `ParseResult` is `#[non_exhaustive]`, so external crates (the
/// integration test included) cannot destructure it exhaustively.
/// Doing the destructure here makes the compiler reject any newly
/// added `ParseResult` field that this function has not been taught
/// about. Otherwise a new field could silently drop out of the
/// baseline fingerprint - the BOM-flag-omission class of bug the
/// round-3 review caught.
///
/// **Add a new field?** Give it a binding (NOT `_`) in the
/// destructure AND add a section for it. The compiler enforces the
/// binding; reviewers must enforce the section.
///
/// **Determinism precondition:** `serde_json::to_value` is only
/// sort-stable while `serde_json`'s `preserve_order` feature is
/// **off**. Cargo feature unification can turn it on workspace-wide;
/// the unit test `serde_json_object_is_sorted` in this crate's tests
/// catches that flip before the canonical hash silently desyncs.
#[doc(hidden)]
#[must_use]
pub fn __baseline_canonical_payload(result: &ParseResult) -> Vec<u8> {
    // Exhaustive on purpose (no `..`): a new `ParseResult` field must
    // fail to compile here until someone decides how it is hashed.
    let ParseResult {
        directives,
        options,
        includes,
        plugins,
        comments,
        errors,
        warnings,
        currency_occurrences,
        account_occurrences,
        has_leading_bom,
        syntax_root,
        alignment,
    } = result;

    // `syntax_root` and `alignment` are bound but deliberately kept
    // out of the payload: both are redundant derivations of data that
    // is already hashed (`syntax_root` of the source bytes captured by
    // `directives`/`occurrences`/`errors`; `alignment` of the posting
    // widths inside `directives`). The exclusions are pinned by the
    // `canonical_payload_excludes_syntax_root` and
    // `canonical_payload_excludes_alignment` test modules.
    let _ = (syntax_root, alignment);

    let directives_json = match serde_json::to_value(directives) {
        Ok(value) => value.to_string(),
        Err(e) => format!("serialize-error:{e}"),
    };

    // One entry per payload section, in emission order. Each label
    // after the first carries the leading newline that separates it
    // from the previous section.
    let sections = [
        ("directives:", directives_json),
        ("\noptions:", format!("{options:?}")),
        ("\nincludes:", format!("{includes:?}")),
        ("\nplugins:", format!("{plugins:?}")),
        ("\ncomments:", format!("{comments:?}")),
        ("\nerrors:", format!("{errors:?}")),
        ("\nwarnings:", format!("{warnings:?}")),
        ("\ncurrency_occurrences:", format!("{currency_occurrences:?}")),
        ("\naccount_occurrences:", format!("{account_occurrences:?}")),
        ("\nhas_leading_bom:", format!("{has_leading_bom:?}")),
    ];

    let mut payload: Vec<u8> = Vec::new();
    for (label, text) in &sections {
        payload.extend_from_slice(label.as_bytes());
        payload.extend_from_slice(text.as_bytes());
    }
    payload
}

#[cfg(test)]
mod canonical_payload_determinism {
    //! Tripwire for cargo feature unification quietly switching on
    //! `serde_json/preserve_order` across the workspace. With the
    //! feature OFF, `serde_json::Value::Object` is BTreeMap-backed and
    //! yields keys in sorted order; with it ON, the map is
    //! IndexMap-backed and keeps insertion order.
    //! `__baseline_canonical_payload` depends on the sorted behavior
    //! to cancel out `FxHashMap` iteration order in directive
    //! metadata. If some workspace crate enabled the feature, the
    //! canonical hashes would start varying with hashbrown state
    //! across machines - exactly what the canonicalization exists to
    //! prevent. Failing here names the cargo-feature cause up front
    //! rather than leaving a mysteriously drifting corpus baseline.
    use serde_json::json;

    #[test]
    fn serde_json_object_is_sorted() {
        // Keys are written as `b, a`: `preserve_order` would keep that
        // order, while default features render `a` first.
        let s = json!({ "b": 1, "a": 2 }).to_string();
        let keys_are_sorted = s.starts_with("{\"a\"");
        assert!(
            keys_are_sorted,
            "serde_json::Value::Object is not sorting keys (got {s}). \
             This means cargo feature unification turned on \
             serde_json/preserve_order somewhere in the workspace. \
             The corpus baseline's canonical hash assumes sorted \
             Object keys to neutralize FxHashMap iteration order in \
             directive metadata. Find the crate that enabled \
             `serde_json = {{ ..., features = [\"preserve_order\"] }}` \
             and remove it, or thread an alternative canonicalization \
             through __baseline_canonical_payload.",
        );
    }
}

#[cfg(test)]
mod cached_syntax_root_matches_fresh_parse {
    //! The `selection_range` handler (and any future CST-walking
    //! handler) consumes [`ParseResult::syntax_root`] instead of
    //! re-invoking [`crate::parse_structured`]. The safety
    //! argument is "the cached green root is the same tree the
    //! converter walked, which is the same tree a fresh
    //! `parse_structured` would return."
    //!
    //! Today that argument is trivially true because the cache is
    //! populated directly from the converter's `source_file`.
    //! But if a future change introduces post-conversion CST
    //! mutation (span rewrites, error-recovery splicing, trivia
    //! reattachment) the cached root would diverge from a fresh
    //! re-parse — silently, since nothing else compares the two
    //! trees. This test pins the invariant across a small fixture
    //! set: empty source, a single `open`, a mix of common
    //! directive kinds, error recovery, and a file with metadata
    //! and comments.
    //!
    //! Coverage gaps worth knowing about: the directive mix is not
    //! exhaustive, and no fixture contains a BOM (leading or
    //! mid-file), even though the helper strips a leading BOM before
    //! the fresh parse.
    use super::{cst::parse_structured, parse};

    /// Parses `source` via [`parse`] and via a fresh
    /// `parse_structured` call (on the source with any leading BOM
    /// stripped), then asserts that the cached green root equals the
    /// fresh one. `label` only identifies the fixture in the failure
    /// message.
    fn assert_round_trip(label: &str, source: &str) {
        let parsed = parse(source);
        // The fresh parse is fed the BOM-stripped text; the BOM flag
        // itself is irrelevant to the tree comparison.
        let (stripped, _bom) = crate::bom::strip_leading(source);
        let fresh = parse_structured(stripped).green().into_owned();
        assert_eq!(
            parsed.syntax_root, fresh,
            "cached syntax_root diverged from fresh parse_structured for {label}: \n\
             this means something is mutating the green tree between converter \
             capture and consumer access. The two are supposed to be identical."
        );
    }

    #[test]
    fn empty_source() {
        assert_round_trip("empty", "");
    }

    #[test]
    fn simple_directive() {
        assert_round_trip("open", "2024-01-01 open Assets:Bank USD\n");
    }

    /// A mix of directive kinds in one file: option, plugin, include,
    /// open, commodity, transaction, close, balance, pad, note, event
    /// and price.
    #[test]
    fn every_directive_shape() {
        assert_round_trip(
            "directive zoo",
            r#"option "title" "Test"
plugin "myplugin"
include "other.beancount"
2024-01-01 open Assets:Bank USD
2024-01-01 commodity USD
2024-06-15 * "Coffee"
  Assets:Bank  -5.00 USD
  Expenses:Food
2024-12-31 close Assets:Bank
2024-01-31 balance Assets:Bank 100 USD
2024-01-15 pad Assets:Bank Equity:Opening
2024-01-15 note Assets:Bank "deposit pending"
2024-01-15 event "location" "SF"
2024-01-15 price USD 1.00 EUR
"#,
        );
    }

    #[test]
    fn with_parse_errors() {
        // Trigger error recovery (unterminated string, garbled
        // directive) to ensure the post-pass `fixup_directive_spans`
        // and error-node wrapping don't drift between cache and
        // fresh re-parse.
        assert_round_trip(
            "broken",
            "2024-01-01 open Assets:Bank \"unterminated\n2024-01-02 garbage line here\n",
        );
    }

    /// Standalone comment, end-of-line comment, and metadata lines
    /// under both an `open` directive and a transaction.
    #[test]
    fn with_metadata_and_comments() {
        assert_round_trip(
            "metadata",
            r#"; standalone comment
2024-01-01 open Assets:Bank USD
  payee_account: Assets:Other
2024-06-15 * "Coffee"  ; eol comment
  memo: "morning"
  Assets:Bank  -5.00 USD
"#,
        );
    }
}

#[cfg(test)]
mod canonical_payload_excludes_syntax_root {
    //! Pins the deliberate exclusion of `ParseResult::syntax_root`
    //! from [`__baseline_canonical_payload`]. The exclusion is
    //! documented in three places (the field's rustdoc, the
    //! destructure comment in `__baseline_canonical_payload`, and
    //! the CHANGELOG entry under `[Unreleased] / Features`) but
    //! none of those are executable. A future contributor
    //! mechanically pattern-matching on "all fields get an arm"
    //! could add a `syntax_root` feed to the canonical payload —
    //! the corpus manifest would silently drift on every source
    //! that touched the green tree.
    //!
    //! This test mutates `syntax_root` while leaving every other
    //! field equal, checks that the mutation really changed the
    //! tree, and asserts the canonical payload bytes are unchanged.
    use super::{__baseline_canonical_payload, parse};

    #[test]
    fn mutating_syntax_root_does_not_change_canonical_payload() {
        let src_a = "2024-01-01 open Assets:Bank USD\n";
        let parsed_a = parse(src_a);

        // Start from a second parse of the same source so every field
        // matches `parsed_a`, then overwrite ONLY `syntax_root` with
        // the green root of an empty parse. Swapping the single field
        // (rather than comparing against `parse("")` wholesale) keeps
        // all other fields identical by construction.
        let mut mutated = parse(src_a);
        mutated.syntax_root = parse("").syntax_root;

        // Guard against a vacuous pass: if the two roots were equal
        // the payload comparison below would prove nothing.
        assert_ne!(
            parsed_a.syntax_root, mutated.syntax_root,
            "fixture precondition failed: the substituted `syntax_root` \
             equals the original, so this test cannot detect a \
             `syntax_root` feed in the canonical payload. Pick a \
             replacement tree that differs from the fixture's."
        );

        let payload_original = __baseline_canonical_payload(&parsed_a);
        let payload_mutated = __baseline_canonical_payload(&mutated);
        assert_eq!(
            payload_original, payload_mutated,
            "canonical payload changed after mutating only `syntax_root`. \
             Either the destructure in `__baseline_canonical_payload` \
             grew a `syntax_root` feed line (revert that — the field \
             is deliberately excluded; see its rustdoc), or another \
             field now reads from `syntax_root` indirectly. Either \
             way the corpus manifest is about to drift."
        );
    }
}

#[cfg(test)]
mod canonical_payload_excludes_alignment {
    //! Pins the deliberate exclusion of `ParseResult::alignment`
    //! from [`__baseline_canonical_payload`], using the same recipe
    //! as `canonical_payload_excludes_syntax_root`: overwrite the
    //! field, rebuild the payload, and require identical bytes.
    //!
    //! Feeding `alignment` into the canonical payload would change
    //! the corpus hash for every source whose postings determine
    //! non-default column widths — i.e. essentially every real
    //! Beancount file. The field is derived from `directives`
    //! content (already in the payload via the typed-AST hash), so
    //! it carries no independent drift signal.
    use super::{__baseline_canonical_payload, parse};
    use crate::cst::format::PostingAlignment;

    #[test]
    fn mutating_alignment_does_not_change_canonical_payload() {
        let src = "\
2024-01-15 * \"Coffee\"
  Assets:Bank  -5.00 USD
  Expenses:Food
";
        let baseline = parse(src);

        // Offsets far larger than anything this fixture could yield,
        // so the replacement value is guaranteed to differ from the
        // computed one.
        let shifted_col = baseline.alignment.number_col + 100;
        let shifted_width = baseline.alignment.number_width + 7;

        let mut tampered = parse(src);
        tampered.alignment = PostingAlignment {
            number_col: shifted_col,
            number_width: shifted_width,
        };

        let payload_original = __baseline_canonical_payload(&baseline);
        let payload_mutated = __baseline_canonical_payload(&tampered);
        assert_eq!(
            payload_original, payload_mutated,
            "canonical payload changed after mutating only `alignment`. \
             Either the destructure in `__baseline_canonical_payload` \
             grew an `alignment` feed line (revert that — the field \
             is deliberately excluded), or another field now reads \
             from `alignment` indirectly. Either way the corpus \
             manifest is about to drift across every source with \
             postings.",
        );
    }
}

#[cfg(test)]
mod parse_result_alignment_cache {
    //! Pins the equivalence between `ParseResult::alignment` (the
    //! pre-computed cache populated by `parse_via_cst`) and a fresh
    //! `compute_alignment` run over the same syntax tree. If the
    //! converter stops refreshing the cache, or `compute_alignment`
    //! changes in a way the cache no longer mirrors, these tests
    //! fail before the problem reaches the LSP.
    use super::parse;
    use crate::cst::{
        ast::{AstNode, SourceFile},
        format::compute_alignment,
    };

    /// Parses `source`, recomputes the alignment from the result's
    /// syntax node, and asserts it equals the cached
    /// `ParseResult::alignment`. `label` names the fixture in the
    /// failure message.
    fn assert_equivalent(label: &str, source: &str) {
        let parsed = parse(source);
        let root = SourceFile::cast(parsed.syntax_node())
            .expect("ParseResult::syntax_node() must be a SOURCE_FILE");
        let recomputed = compute_alignment(&root);
        assert_eq!(
            parsed.alignment, recomputed,
            "ParseResult::alignment cache diverged from a fresh \
             compute_alignment call for {label}: cache = {:?}, fresh = {:?}. \
             Either parse_via_cst forgot to call compute_alignment, or \
             compute_alignment's semantics changed without refreshing \
             the cache in the converter.",
            parsed.alignment, recomputed,
        );
    }

    #[test]
    fn empty_source() {
        let source = "";
        assert_equivalent("empty", source);
    }

    #[test]
    fn open_only_no_postings() {
        let source = "2024-01-01 open Assets:Bank USD\n";
        assert_equivalent("open only", source);
    }

    #[test]
    fn single_transaction() {
        let source = "\
2024-01-15 * \"Coffee\"
  Assets:Bank  -5.00 USD
  Expenses:Food
";
        assert_equivalent("single txn", source);
    }

    #[test]
    fn multi_transaction_varying_widths() {
        let source = "\
2024-01-15 * \"A\"
  Assets:Bank  -5.00 USD
  Expenses:Food
2024-02-15 * \"B\"
  Assets:Investment:Long:Path  -123456.78 USD
  Expenses:Tax  100.00 USD
";
        assert_equivalent("varying widths", source);
    }

    #[test]
    fn arithmetic_amounts() {
        let source = "\
2024-01-15 * \"Split\"
  Assets:Bank  -10.00 + 5.00 USD
  Expenses:Misc
";
        assert_equivalent("arithmetic amounts", source);
    }

    #[test]
    fn parse_errors() {
        // The cache must agree with a fresh call on files that fail
        // to parse cleanly too: the LSP fallback path reads the cache
        // while the file is broken, so equivalence under error
        // recovery is load-bearing.
        let source = "\
2024-01-15 * \"x\"
  Assets:Bank  -5.00 USD
}}}garbage
2024-02-15 * \"y\"
  Assets:Other  100.00 USD
";
        assert_equivalent("broken", source);
    }

    /// Mid-transaction recovery. The intended scenario: the WIDEST
    /// transaction's body breaks (a syntactically incomplete posting
    /// turns it into an `ERROR_NODE`), so its postings are EXCLUDED
    /// from `compute_alignment` because the wrapping node no longer
    /// passes the `ast::Directive::Transaction::cast` check inside
    /// the alignment walk. The cache then reflects only the
    /// transactions that parsed successfully, which is what the LSP
    /// fallback sees when format-on-type fires in a mid-edit broken
    /// state. What this test actually asserts is the producer-side
    /// invariant — cache equals fresh call — in that awkward
    /// transitional state.
    ///
    /// Note for users: while typing, the parser may alternately
    /// recover and break the wrapping Transaction, so the alignment
    /// columns may visibly shift between edits. Avoiding that would
    /// require speculatively recovering wide-account information
    /// from the broken transaction's source bytes — out of scope
    /// for the cache.
    #[test]
    fn mid_transaction_error_node() {
        // The first transaction uses a wide account
        // (Assets:Investment:Long:Path) but its posting line ends in
        // garbage, which recovery is expected to wrap into an
        // ERROR_NODE around the whole transaction. The second
        // transaction (narrow accounts) parses cleanly, so the cached
        // alignment should reflect only its widths.
        let source = "\
2024-01-15 * \"wide broken\"
  Assets:Investment:Long:Path  -123456.78 USD }}}
  Expenses:Tax
2024-02-15 * \"narrow clean\"
  Assets:Bank  -5.00 USD
  Expenses:Food
";
        assert_equivalent("mid-transaction breakage", source);
    }
}