Skip to main content

rustledger_parser/
lib.rs

1//! Beancount parser built on a Logos lexer + structured CST.
2//!
3//! [`parse`] tokenizes via [`logos_lexer`], constructs a lossless
4//! CST through [`parse_structured`], and walks it via the
5//! converter in `cst::convert` to produce a [`ParseResult`] with
6//! the typed AST plus errors, options, includes, plugins,
7//! comments, and currency occurrences.
8//!
9//! # Features
10//!
11//! - Full Beancount syntax support (all 12 directive types)
12//! - Error recovery (continues parsing after errors)
13//! - Precise source locations for error reporting
14//! - Support for includes, options, plugins
15//!
16//! # Example
17//!
18//! ```ignore
19//! use rustledger_parser::parse;
20//!
21//! let source = r#"
22//! 2024-01-15 * "Coffee Shop" "Morning coffee"
23//!   Expenses:Food:Coffee  5.00 USD
24//!   Assets:Cash
25//! "#;
26//!
27//! let result = parse(source);
28//! assert!(result.errors.is_empty());
29//! assert_eq!(result.directives.len(), 1);
30//! ```
31
32#![forbid(unsafe_code)]
33#![warn(missing_docs)]
34
35pub mod bom;
36pub mod cst;
37mod diagnostics;
38mod error;
39pub mod logos_lexer;
40
41/// Opinionated CST-backed formatter entries.
42///
43/// **Sole** import path for the formatter surface - `format_source`,
44/// `format_source_with_parsed`, `try_format_source`, `format_node`,
45/// `format_node_range`, `format_node_with_alignment`,
46/// `format_node_range_with_alignment`, `PostingAlignment`,
47/// `compute_alignment`, `canonicalize_directives`,
48/// `CanonicalizeError`, `lf_to_crlf_outside_strings`,
49/// `crlf_to_lf_outside_strings`, `cr_outside_strings_present`. The
50/// flat crate-root re-exports were removed in round-5 and the
51/// duplicate `crate::cst::format` path was sealed in round-6 of
52/// the PR #1284 reviews, so a future deprecation can be done at
53/// exactly one site.
54pub mod format {
55    pub use crate::cst::format::{
56        CanonicalizeError, PostingAlignment, canonicalize_directives, compute_alignment,
57        cr_outside_strings_present, crlf_to_lf_outside_strings, format_node, format_node_range,
58        format_node_range_with_alignment, format_node_with_alignment, format_source,
59        format_source_with_parsed, lf_to_crlf_outside_strings, try_format_source,
60    };
61}
62
63pub use cst::{
64    BeancountLanguage, SyntaxElement, SyntaxKind, SyntaxNode, SyntaxToken, lossless_kind_tokens,
65    parse_flat, parse_structured, parse_via_cst,
66};
67
68// Rowan types CST consumers need. Flat re-exports at the crate
69// root match the surrounding `SyntaxNode` / `SyntaxToken` shape -
70// downstream `use rustledger_parser::{SyntaxNode, TextRange};`
71// resolves both halves uniformly without a sub-module hop.
72//
73// The set covers what LSP handlers need for tree walking:
74// - `TextRange` / `TextSize`: byte-offset ranges on every node
75// - `TokenAtOffset`: cursor-position lookup
76// - `WalkEvent`: preorder / postorder traversal for folding-range
77//   and semantic-tokens implementations
78// - `NodeOrToken`: pattern-matching `SyntaxElement` children
79// - `Direction`: sibling iteration
80//
81// `GreenNode` is deliberately NOT re-exported - it's the
82// thread-safe storage backing for `SyntaxNode` but downstream
83// consumers should walk via the cursor API, not the green tree.
84//
85// **Stability.** These types are versioned in lockstep with this
86// crate, NOT with `rowan` directly. A rowan minor bump that
87// changes any of these will require a coordinated bump of this
88// crate so the re-export contract holds at THIS crate's semver.
89pub use error::{ParseError, ParseErrorKind};
90pub use rowan::{Direction, NodeOrToken, TextRange, TextSize, TokenAtOffset, WalkEvent};
91pub use rustledger_core::{InternedStr, SYNTHESIZED_FILE_ID, Span, Spanned};
92
93use rustledger_core::Directive;
94
/// Result of parsing a beancount file.
///
/// Marked `#[non_exhaustive]` so external consumers must go through
/// [`parse`] rather than constructing the struct by literal. Future
/// field additions (e.g., diagnostic metadata, source-map back-
/// references) then land as non-breaking changes.
#[derive(Debug)]
#[non_exhaustive]
pub struct ParseResult {
    /// Successfully parsed directives.
    pub directives: Vec<Spanned<Directive>>,
    /// Options found in the file.
    pub options: Vec<(String, String, Span)>,
    /// Include directives found.
    pub includes: Vec<(String, Span)>,
    /// Plugin directives found.
    pub plugins: Vec<(String, Option<String>, Span)>,
    /// Standalone comments found in the file.
    pub comments: Vec<Spanned<String>>,
    /// Parse errors encountered.
    pub errors: Vec<ParseError>,
    /// Deprecation warnings.
    pub warnings: Vec<ParseWarning>,
    /// Every `Currency` token the parser consumed, paired with its
    /// interned value and source-byte range.
    ///
    /// Source-position-aware tooling (LSP rename / references /
    /// document-highlight) walks this list to produce edits, locations,
    /// and highlights without resorting to string search of the source,
    /// which produces false positives in comments, payee strings,
    /// account-name segments, etc. The order matches source order
    /// because the parser fills it as tokens are consumed (and the
    /// parser is strictly forward-advancing, including on error
    /// recovery).
    ///
    /// **Error-recovery contract.** Tokens consumed during a
    /// directive that ultimately fails to parse remain in this list.
    /// Rationale: the lexer's classification of a token as a
    /// `Currency` is independent of whether the surrounding syntax is
    /// valid, and tooling that wants to rename or highlight a
    /// currency the user typed should follow that classification.
    /// Do not "clean up" partially-consumed entries after a parse
    /// failure - that would hide real currency identifiers from
    /// downstream tooling while the user is mid-edit.
    ///
    /// **`file_id` is always 0 in parser output.** The parser
    /// processes one file at a time and doesn't know its own file
    /// id. The loader sets the correct id on each entry via
    /// `.with_file_id(n)` when assembling a multi-file `SourceMap`,
    /// the same way it does for `directives`. Per-file consumers
    /// (today: every LSP handler) can ignore `file_id`; future
    /// multi-file consumers must remember to thread it through.
    pub currency_occurrences: Vec<Spanned<rustledger_core::Currency>>,
    /// Every `Account` token the parser consumed, paired with its
    /// interned value and source-byte range.
    ///
    /// Mirrors [`Self::currency_occurrences`] for the account
    /// shape. The CST conversion (`walk_descendants_once`) tracks
    /// every `ACCOUNT` token whose ancestors do NOT include an
    /// `ERROR_NODE`. The LSP rename handler (phase 5.4) walks
    /// this list to emit exact-span edits without resorting to
    /// per-directive substring search, which used to produce
    /// false positives wherever an account-name fragment appeared
    /// inside a payee string, a STRING-typed metadata value, or a
    /// comment. ACCOUNT-typed metadata values (e.g.
    /// `counterparty: Assets:Bank`) DO produce an `ACCOUNT` token
    /// at the lexer level and ARE included in this list - so a
    /// rename of `Assets:Bank` correctly rewrites that metadata
    /// value too.
    ///
    /// **Migration status (#1262 phase 5.4).** Only the LSP
    /// rename handler currently consumes this index. The sibling
    /// handlers `references`, `document_highlight`, and
    /// `linked_editing` still walk the typed AST with substring
    /// search for accounts (see those modules' rustdoc); migrating
    /// them to consume `account_occurrences` is tracked as a
    /// phase 5.5+ follow-up.
    ///
    /// **Error-recovery contract.** Two notions of "failing
    /// directive" need to be distinguished:
    ///
    /// - A directive that PARSES SYNTACTICALLY but whose
    ///   typed-AST conversion errors (e.g.,
    ///   [`crate::ParseErrorKind::InvalidBookingMethod`] on an
    ///   `open Assets:Bank "GARBAGE"`). The ACCOUNT node is
    ///   intact in the CST and NOT inside an `ERROR_NODE`. The
    ///   token IS tracked - tooling can still rename it during
    ///   the mid-edit state.
    /// - A directive so garbled that the CST wraps the region
    ///   in an `ERROR_NODE`. The ACCOUNT token is inside an
    ///   `ERROR_NODE` and is NOT tracked. This is deliberate -
    ///   the recovery boundary is fuzzy and including such
    ///   tokens would surface as confusing rename hits inside
    ///   garbage source.
    ///
    /// # Limitations
    ///
    /// The list is undifferentiated: declarations (from
    /// open/close/balance/pad/note/document) and references
    /// (from posting accounts and ACCOUNT-typed metadata) are
    /// mixed together. There is no equivalent of the
    /// `commodity_declaration_spans` helper used for currencies
    /// (the account case has six declaration directive shapes vs.
    /// the single `Commodity` shape, so no symmetric helper
    /// exists yet). A future go-to-definition migration will need
    /// either a re-walk over `directives` or an additional
    /// `account_declarations: Vec<Span>` field.
    ///
    /// **`file_id` is always 0 in parser output** - same loader
    /// contract as `currency_occurrences`.
    pub account_occurrences: Vec<Spanned<rustledger_core::Account>>,
    /// `true` iff the parsed source began with a UTF-8 BOM (strict
    /// byte 0).
    ///
    /// This is the **single source of truth** for downstream consumers
    /// that need to know whether to preserve a leading BOM on output
    /// (notably `format_source`). Do NOT inspect the source bytes
    /// directly; the parser already handled the strip/detect logic in
    /// one place ([`crate::bom::strip_leading`]) and stored the result
    /// here. Reproducing the check elsewhere is exactly the contract-
    /// drift class of bug this field was introduced to eliminate.
    ///
    /// Span coordinates in this `ParseResult` are in the **original
    /// source frame** - i.e., if `has_leading_bom` is true, spans
    /// already include the 3-byte BOM offset and index directly into
    /// the caller's source.
    pub has_leading_bom: bool,
    /// The lossless CST root the converter walked to produce
    /// everything above. Stored as a [`rowan::GreenNode`], which
    /// is `Send + Sync` and reference-counted internally, so an
    /// `Arc<ParseResult>` (the shape the LSP caches per document)
    /// shares this handle across handler invocations without
    /// re-parsing.
    ///
    /// **Prefer [`Self::syntax_node`]** over reading this field
    /// directly. The method is the supported entry point: it
    /// returns a [`SyntaxNode`] (the cursor-API view), keeps the
    /// `rowan::GreenNode` type name out of consumer code, and
    /// shields callers from minor rowan upgrades that touch the
    /// `GreenNode` shape. The field is public for two reasons —
    /// the exhaustive destructure in
    /// [`__baseline_canonical_payload`] needs to bind it, and
    /// `Arc::clone`-style sharing patterns benefit from direct
    /// access — but downstream code should reach for the method.
    ///
    /// **Byte-offset frame: post-BOM.** The CST is built from
    /// the BOM-stripped source — the parser strips a strict-
    /// byte-0 UTF-8 BOM (see [`crate::bom::strip_leading`]) and
    /// feeds the stripped slice to `parse_structured`. So every
    /// `TextRange` / `TextSize` reachable through this tree is
    /// in the **post-BOM** byte frame: an offset of `0` here
    /// corresponds to byte `BOM_LEN == 3` of the original source
    /// when [`Self::has_leading_bom`] is `true`. This differs
    /// from the typed-AST fields above ([`Self::directives`],
    /// [`Self::currency_occurrences`], [`Self::account_occurrences`],
    /// [`Self::errors`], …), whose spans the converter
    /// pre-shifts back into the *original*-source frame so
    /// downstream consumers can index directly into the caller's
    /// source bytes. CST-walking consumers must apply the
    /// equivalent shift themselves: subtract `BOM_LEN` when
    /// translating an original-source offset down to a CST
    /// offset (e.g., `cst.token_at_offset(orig - BOM_LEN)`), and
    /// add `BOM_LEN` back when emitting an original-source
    /// position from a `TextRange`. The LSP `selection_range`
    /// handler does this — see its rustdoc and the
    /// `bom_prefixed_source_does_not_shift_ranges` regression
    /// test.
    ///
    /// **Canonical-payload exclusion.** This field is deliberately
    /// NOT fed into [`__baseline_canonical_payload`]. The green
    /// node is a redundant cache of the source bytes; the
    /// existing `directives` / `currency_occurrences` /
    /// `account_occurrences` / `errors` fields already capture
    /// everything downstream consumers track for drift detection.
    /// Adding the green node's `Debug` output would multiply
    /// the fingerprint size without surfacing any new drift
    /// signal. The corresponding `assert_field_in_hash` arm is
    /// also intentionally absent in `tests/corpus_baseline.rs`.
    /// A negative-form test (the
    /// `canonical_payload_excludes_syntax_root` module in this
    /// file) pins the exclusion: it confirms that mutating
    /// `syntax_root` while every other field is equal does NOT
    /// change the canonical payload bytes.
    pub syntax_root: rowan::GreenNode,
    /// File-wide alignment columns the formatter would use for
    /// this source — pre-computed at parse time so hot formatting
    /// paths skip the `O(N_postings)` per-call walk.
    ///
    /// `PostingAlignment` is `Copy`; pass it directly into the
    /// `_with_alignment` variants of the formatter
    /// ([`crate::format::format_node_with_alignment`],
    /// [`crate::format::format_node_range_with_alignment`],
    /// [`crate::format::format_source_with_parsed`]) to reuse this
    /// cached value. The LSP `format_document` /
    /// `range_formatting` fallback handlers, the FFI `format.source`
    /// endpoint, and the WASM `ParsedLedger::format` bridge all
    /// consume the cache to skip both the redundant parse and the
    /// redundant alignment walk.
    ///
    /// **Producer-only cache invariant.** This field is populated
    /// exactly once by `parse_via_cst`; the value is consistent with
    /// the `directives` / `syntax_root` fields *at parse time*.
    /// `ParseResult` exposes every cache input (`directives`,
    /// `syntax_root`) as `pub`, so technically a consumer with a
    /// `&mut ParseResult` can mutate one without refreshing the
    /// other — leaving `alignment` stale. That is OUT-OF-CONTRACT
    /// for this cache. Callers that mutate `ParseResult` directly
    /// must either (a) refresh `alignment` by calling
    /// `crate::format::compute_alignment(&SourceFile::cast(self.syntax_node()))`,
    /// (b) avoid the `_with_alignment` formatter variants and use
    /// the bare ones (which re-compute), or (c) treat the
    /// `ParseResult` as immutable after construction (the common
    /// case — the LSP wraps it in `Arc<ParseResult>`).
    ///
    /// **Equivalence pinned.**
    /// `parse_result_alignment_cache::*` (7 fixtures) assert that
    /// `parse(s).alignment` equals
    /// `compute_alignment(&SourceFile::cast(parse(s).syntax_node()).unwrap())`
    /// across representative fixtures, so any future divergence
    /// (a converter change that forgets to refresh the cache, a
    /// `compute_alignment` change that breaks the contract)
    /// fails CI.
    ///
    /// **Canonical-payload exclusion.** Excluded from
    /// [`__baseline_canonical_payload`] for the same reason as
    /// `syntax_root`: it's a redundant derivation of `directives`
    /// content. If it were fed into the payload, a change to the
    /// alignment value alone (with `directives` unchanged) would
    /// silently flip the corpus hash, and the hash would depend
    /// on it for every source with a non-default alignment
    /// (i.e. essentially every real Beancount file). The
    /// exclusion is pinned by
    /// `canonical_payload_excludes_alignment`.
    pub alignment: crate::format::PostingAlignment,
}
328
329impl ParseResult {
330    /// Cursor-API view of the lossless CST that produced this
331    /// `ParseResult`. Equivalent to
332    /// `SyntaxNode::new_root(self.syntax_root.clone())`.
333    ///
334    /// Construction is an `Arc` bump (the green node's internal
335    /// refcount); cheap enough to call per request. This is the
336    /// supported entry point for CST consumers — prefer it over
337    /// reading [`Self::syntax_root`] directly, so the `rowan`
338    /// dependency stays an implementation detail.
339    #[must_use]
340    pub fn syntax_node(&self) -> SyntaxNode {
341        SyntaxNode::new_root(self.syntax_root.clone())
342    }
343}
344
345// Compile-time assertion: `ParseResult` is shared as
346// `Arc<ParseResult>` across the LSP's main thread and its
347// background worker (see `rustledger-lsp/src/main_loop.rs`).
348// A future field whose type is not `Send + Sync` (e.g. an `Rc`,
349// a `Cell`, or a non-thread-safe handle) would silently break
350// the LSP build at the call site, far from the parser change
351// that caused it. This assertion fences the invariant at the
352// definition site so the parser crate's own build fails first.
353const _: fn() = || {
354    const fn assert_send_sync<T: Send + Sync>() {}
355    assert_send_sync::<ParseResult>();
356};
357
/// A warning from the parser (non-fatal).
///
/// Warnings are collected in [`ParseResult::warnings`], separately
/// from the [`ParseError`]s in [`ParseResult::errors`].
#[derive(Debug, Clone)]
pub struct ParseWarning {
    /// The warning message.
    pub message: String,
    /// Location in source.
    pub span: Span,
}
366
367impl ParseWarning {
368    /// Create a new warning.
369    pub fn new(message: impl Into<String>, span: Span) -> Self {
370        Self {
371            message: message.into(),
372            span,
373        }
374    }
375}
376
/// Parse beancount source code.
///
/// Routes through the CST-backed implementation
/// ([`parse_via_cst`]): a lossless Logos lexer feeds a structured
/// CST builder, and the converter in `crate::cst::convert` walks
/// the resulting tree to produce the [`ParseResult`].
///
/// # Arguments
///
/// * `source` - The beancount source code to parse
///
/// # Returns
///
/// A [`ParseResult`] containing directives, options, includes,
/// plugins, comments, errors, and warnings, plus the currency and
/// account occurrence lists, the leading-BOM flag, the cached CST
/// root, and the pre-computed posting alignment. See the field
/// documentation on [`ParseResult`] for the contract of each.
#[must_use]
pub fn parse(source: &str) -> ParseResult {
    // Thin public alias: this function only delegates to `parse_via_cst`.
    parse_via_cst(source)
}
395
396/// Parse beancount source code, returning only directives and errors.
397///
398/// This is a simpler interface when you don't need options/includes/plugins.
399#[must_use]
400pub fn parse_directives(source: &str) -> (Vec<Spanned<Directive>>, Vec<ParseError>) {
401    let result = parse(source);
402    (result.directives, result.errors)
403}
404
405/// Canonical hash-payload serialization for the corpus baseline
406/// (#1262 phase 0). **Internal**: this exists only so the baseline
407/// integration test can hash a `ParseResult` without listing fields
408/// outside the defining crate.
409///
410/// Returns a byte string that uniquely identifies the `ParseResult`'s
411/// observable content. Directives route through `serde_json::to_value`
412/// to normalize the `FxHashMap` iteration order in metadata; all
413/// other fields use `Debug` formatting, which is deterministic for
414/// `Vec`-based types.
415///
416/// **Why this lives in `rustledger-parser` instead of the test:**
417/// `ParseResult` is `#[non_exhaustive]`, which blocks exhaustive
418/// destructuring from external crates (including the integration
419/// test). Performing the destructure here forces the compiler to
420/// flag any field added to `ParseResult` that the canonical
421/// serialization does not feed into its output. Without this, a new
422/// `ParseResult` field could silently exit the baseline fingerprint -
423/// the BOM-flag-omission class of bug the round-3 review caught.
424///
425/// **Add a new field?** Add a binding (NOT `_`) AND a hasher feed
426/// line to the destructure below. The compiler enforces the binding;
427/// reviewers must enforce the feed.
428///
429/// **Determinism precondition:** this routes directives through
430/// `serde_json::to_value`, which is only sort-stable when
431/// `serde_json`'s `preserve_order` feature is **off**. Cargo feature
432/// unification can flip this on workspace-wide; the unit test
433/// `serde_json_object_is_sorted` in this crate's tests catches that
434/// flip before the canonical hash silently desyncs.
435#[doc(hidden)]
436#[must_use]
437pub fn __baseline_canonical_payload(result: &ParseResult) -> Vec<u8> {
438    let ParseResult {
439        directives,
440        options,
441        includes,
442        plugins,
443        comments,
444        errors,
445        warnings,
446        currency_occurrences,
447        account_occurrences,
448        has_leading_bom,
449        syntax_root,
450        alignment,
451    } = result;
452    // Both `syntax_root` and `alignment` are redundant
453    // derivations of fields already in the canonical payload
454    // (`syntax_root` of the source bytes captured by
455    // `directives`/`occurrences`/`errors`; `alignment` of the
456    // posting widths inside `directives`). Bind them so the
457    // compiler still flags future field additions on this
458    // exhaustive destructure, but discard them from the canonical
459    // payload. Pinned by `canonical_payload_excludes_syntax_root`
460    // and `canonical_payload_excludes_alignment`.
461    let _ = syntax_root;
462    let _ = alignment;
463    let mut out: Vec<u8> = Vec::new();
464    let directives_json = serde_json::to_value(directives)
465        .map_or_else(|e| format!("serialize-error:{e}"), |v| v.to_string());
466    out.extend_from_slice(b"directives:");
467    out.extend_from_slice(directives_json.as_bytes());
468    out.extend_from_slice(b"\noptions:");
469    out.extend_from_slice(format!("{options:?}").as_bytes());
470    out.extend_from_slice(b"\nincludes:");
471    out.extend_from_slice(format!("{includes:?}").as_bytes());
472    out.extend_from_slice(b"\nplugins:");
473    out.extend_from_slice(format!("{plugins:?}").as_bytes());
474    out.extend_from_slice(b"\ncomments:");
475    out.extend_from_slice(format!("{comments:?}").as_bytes());
476    out.extend_from_slice(b"\nerrors:");
477    out.extend_from_slice(format!("{errors:?}").as_bytes());
478    out.extend_from_slice(b"\nwarnings:");
479    out.extend_from_slice(format!("{warnings:?}").as_bytes());
480    out.extend_from_slice(b"\ncurrency_occurrences:");
481    out.extend_from_slice(format!("{currency_occurrences:?}").as_bytes());
482    out.extend_from_slice(b"\naccount_occurrences:");
483    out.extend_from_slice(format!("{account_occurrences:?}").as_bytes());
484    out.extend_from_slice(b"\nhas_leading_bom:");
485    out.extend_from_slice(format!("{has_leading_bom:?}").as_bytes());
486    out
487}
488
#[cfg(test)]
mod canonical_payload_determinism {
    //! Tripwire for cargo feature unification turning on
    //! `serde_json/preserve_order` somewhere in the workspace.
    //!
    //! With `preserve_order` OFF, `serde_json::Value::Object` is
    //! BTreeMap-backed and emits keys sorted; with it ON, the map is
    //! IndexMap-backed and emits keys in insertion order.
    //! `__baseline_canonical_payload` depends on the sorted behavior
    //! to cancel out `FxHashMap` iteration order in directive
    //! metadata. If any workspace crate enabled the feature, the
    //! canonical hashes would start to vary with hashbrown state
    //! across machines - exactly what the canonicalization exists to
    //! prevent. This test fails immediately and names the
    //! cargo-feature cause, instead of leaving the corpus baseline to
    //! drift without explanation.

    #[test]
    fn serde_json_object_is_sorted() {
        // Keys are inserted as `b, a`: `preserve_order` would keep
        // that order, default features render `a` first.
        let rendered = serde_json::json!({ "b": 1, "a": 2 }).to_string();
        let keys_are_sorted = rendered.starts_with("{\"a\"");
        assert!(
            keys_are_sorted,
            "serde_json::Value::Object is not sorting keys (got {s}). \
             This means cargo feature unification turned on \
             serde_json/preserve_order somewhere in the workspace. \
             The corpus baseline's canonical hash assumes sorted \
             Object keys to neutralize FxHashMap iteration order in \
             directive metadata. Find the crate that enabled \
             `serde_json = {{ ..., features = [\"preserve_order\"] }}` \
             and remove it, or thread an alternative canonicalization \
             through __baseline_canonical_payload.",
            s = rendered,
        );
    }
}
524
#[cfg(test)]
mod cached_syntax_root_matches_fresh_parse {
    //! The `selection_range` handler (and any future CST-walking
    //! handler) consumes [`ParseResult::syntax_root`] instead of
    //! re-invoking [`crate::parse_structured`]. The safety
    //! argument is "the cached green root is the same tree the
    //! converter walked, which is the same tree a fresh
    //! `parse_structured` would return."
    //!
    //! Today that argument is trivially true because the cache is
    //! populated directly from the converter's `source_file`.
    //! But if a future change introduces post-conversion CST
    //! mutation (span rewrites, error-recovery splicing, trivia
    //! reattachment) the cached root would diverge from a fresh
    //! re-parse — silently, since nothing else compares the two
    //! trees. This test pins the invariant across a small fixture
    //! set covering empty source, every directive kind, error
    //! recovery, leading and mid-file BOMs, and metadata-bearing
    //! transactions.
    use super::{cst::parse_structured, parse};

    /// Parses `source` through the public [`parse`] entry point and
    /// through a direct `parse_structured` call on the BOM-stripped
    /// text, then asserts that both produce the same green root.
    /// `label` only names the fixture in the failure message.
    fn assert_round_trip(label: &str, source: &str) {
        let parsed = parse(source);
        // Strip with the same helper the fixture comparison has
        // always used, so the fresh tree is built from the same
        // text as the cached one.
        let (stripped, _bom) = crate::bom::strip_leading(source);
        let fresh = parse_structured(stripped).green().into_owned();
        assert_eq!(
            parsed.syntax_root, fresh,
            "cached syntax_root diverged from fresh parse_structured for {label}: \n\
             this means something is mutating the green tree between converter \
             capture and consumer access. The two are supposed to be identical."
        );
    }

    #[test]
    fn empty_source() {
        assert_round_trip("empty", "");
    }

    #[test]
    fn simple_directive() {
        assert_round_trip("open", "2024-01-01 open Assets:Bank USD\n");
    }

    #[test]
    fn every_directive_shape() {
        // One line per directive kind (plus option / plugin /
        // include). The ledger does not need to be semantically
        // valid: only the cached-vs-fresh tree equality is checked.
        assert_round_trip(
            "directive zoo",
            r#"option "title" "Test"
plugin "myplugin"
include "other.beancount"
2024-01-01 open Assets:Bank USD
2024-01-01 commodity USD
2024-06-15 * "Coffee"
  Assets:Bank  -5.00 USD
  Expenses:Food
2024-12-31 close Assets:Bank
2024-01-31 balance Assets:Bank 100 USD
2024-01-15 pad Assets:Bank Equity:Opening
2024-01-15 note Assets:Bank "deposit pending"
2024-01-15 document Assets:Bank "/tmp/statement.pdf"
2024-01-15 event "location" "SF"
2024-01-15 query "cash" "SELECT account WHERE account ~ 'Assets'"
2024-01-15 custom "budget" Expenses:Food 100.00 USD
2024-01-15 price USD 1.00 EUR
"#,
        );
    }

    #[test]
    fn with_parse_errors() {
        // Trigger error recovery (unterminated string, garbled
        // directive) to ensure the post-pass `fixup_directive_spans`
        // and error-node wrapping don't drift between cache and
        // fresh re-parse.
        assert_round_trip(
            "broken",
            "2024-01-01 open Assets:Bank \"unterminated\n2024-01-02 garbage line here\n",
        );
    }

    #[test]
    fn with_metadata_and_comments() {
        assert_round_trip(
            "metadata",
            r#"; standalone comment
2024-01-01 open Assets:Bank USD
  payee_account: Assets:Other
2024-06-15 * "Coffee"  ; eol comment
  memo: "morning"
  Assets:Bank  -5.00 USD
"#,
        );
    }

    #[test]
    fn with_leading_bom() {
        // U+FEFF at byte 0 is the UTF-8 BOM. Per the
        // `ParseResult::syntax_root` docs, the cached tree is built
        // from the BOM-stripped text.
        let source = "\u{feff}2024-01-01 open Assets:Bank USD\n";
        assert!(
            parse(source).has_leading_bom,
            "fixture is expected to exercise the leading-BOM strip path"
        );
        assert_round_trip("leading BOM", source);
    }

    #[test]
    fn with_mid_file_bom() {
        // A U+FEFF that is not at byte 0. Whatever the parser does
        // with it, the cached and fresh trees must still agree.
        assert_round_trip(
            "mid-file BOM",
            "2024-01-01 open Assets:Bank USD\n\u{feff}2024-01-02 close Assets:Bank\n",
        );
    }
}
615
#[cfg(test)]
mod canonical_payload_excludes_syntax_root {
    //! Pins the deliberate exclusion of `ParseResult::syntax_root`
    //! from [`__baseline_canonical_payload`]. The exclusion is
    //! documented in three places (the field's rustdoc, the
    //! destructure comment in `__baseline_canonical_payload`, and
    //! the CHANGELOG entry under `[Unreleased] / Features`) but
    //! none of those are executable. A future contributor
    //! mechanically pattern-matching on "all fields get an arm"
    //! could add a `syntax_root` feed to the canonical payload —
    //! the corpus manifest would silently drift on every source
    //! that touched the green tree.
    //!
    //! This test mutates `syntax_root` while leaving every other
    //! field equal, and asserts the canonical payload bytes are
    //! unchanged.
    use super::{__baseline_canonical_payload, parse};

    #[test]
    fn mutating_syntax_root_does_not_change_canonical_payload() {
        let src_a = "2024-01-01 open Assets:Bank USD\n";
        let parsed_a = parse(src_a);

        // Parse the same source again so every typed field matches
        // `parsed_a`, then overwrite ONLY `syntax_root` with the
        // green root of an empty source. Swapping the field by hand
        // keeps the test independent of the converter's behavior.
        let mut mutated = parse(src_a);
        mutated.syntax_root = parse("").syntax_root;

        // Guard against a vacuous pass: if the two roots were equal,
        // the payload comparison below would prove nothing.
        assert_ne!(
            parsed_a.syntax_root, mutated.syntax_root,
            "fixture bug: the replacement `syntax_root` equals the original, \
             so this test no longer mutates anything."
        );

        let payload_original = __baseline_canonical_payload(&parsed_a);
        let payload_mutated = __baseline_canonical_payload(&mutated);
        assert_eq!(
            payload_original, payload_mutated,
            "canonical payload changed after mutating only `syntax_root`. \
             Either the destructure in `__baseline_canonical_payload` \
             grew a `syntax_root` feed line (revert that — the field \
             is deliberately excluded; see its rustdoc), or another \
             field now reads from `syntax_root` indirectly. Either \
             way the corpus manifest is about to drift."
        );
    }
}
666
#[cfg(test)]
mod canonical_payload_excludes_alignment {
    //! Guards the intentional omission of `ParseResult::alignment`
    //! from [`__baseline_canonical_payload`]. The recipe mirrors
    //! `canonical_payload_excludes_syntax_root`: change the field,
    //! recompute the payload, and require that nothing moved.
    //!
    //! Feeding `alignment` into the canonical payload would shift
    //! the corpus hash for any source whose postings yield
    //! non-default column widths, which covers essentially every
    //! real Beancount file. The value is derived from `directives`
    //! content, and that content already reaches the payload via
    //! the typed-AST hash, so the field adds no drift signal of
    //! its own.
    use super::{__baseline_canonical_payload, parse};
    use crate::cst::format::PostingAlignment;

    #[test]
    fn mutating_alignment_does_not_change_canonical_payload() {
        let fixture = "\
2024-01-15 * \"Coffee\"
  Assets:Bank  -5.00 USD
  Expenses:Food
";
        let baseline = parse(fixture);
        let mut tampered = parse(fixture);

        // Fabricate an alignment that cannot coincide with the real
        // one: push `number_col` out by 100 and widen `number_width`
        // by 7. The fixture would never align this wide on its own,
        // so the tampered value is certain to differ from the cache.
        let cached = &baseline.alignment;
        tampered.alignment = PostingAlignment {
            number_col: cached.number_col + 100,
            number_width: cached.number_width + 7,
        };

        assert_eq!(
            __baseline_canonical_payload(&baseline),
            __baseline_canonical_payload(&tampered),
            "canonical payload changed after mutating only `alignment`. \
             Either the destructure in `__baseline_canonical_payload` \
             grew an `alignment` feed line (revert that — the field \
             is deliberately excluded), or another field now reads \
             from `alignment` indirectly. Either way the corpus \
             manifest is about to drift across every source with \
             postings.",
        );
    }
}
714
#[cfg(test)]
mod parse_result_alignment_cache {
    //! Locks in that `ParseResult::alignment` (the pre-computed
    //! cache filled in by `parse_via_cst`) always equals what a
    //! fresh `compute_alignment` call returns for the same syntax
    //! tree. If the converter stops refreshing the cache, or
    //! `compute_alignment` changes meaning underneath it, these
    //! tests fail before the mismatch can reach the LSP.
    use super::parse;
    use crate::cst::ast::{AstNode, SourceFile};
    use crate::cst::format::compute_alignment;

    /// Parses `source`, recomputes the alignment from the resulting
    /// syntax tree, and asserts that it equals the cached value on
    /// the parse result. `label` names the scenario in the failure
    /// message.
    fn assert_equivalent(label: &str, source: &str) {
        let parsed = parse(source);
        let root = SourceFile::cast(parsed.syntax_node())
            .expect("ParseResult::syntax_node() must be a SOURCE_FILE");
        let recomputed = compute_alignment(&root);
        assert_eq!(
            parsed.alignment, recomputed,
            "ParseResult::alignment cache diverged from a fresh \
             compute_alignment call for {label}: cache = {:?}, fresh = {:?}. \
             Either parse_via_cst forgot to call compute_alignment, or \
             compute_alignment's semantics changed without refreshing \
             the cache in the converter.",
            parsed.alignment, recomputed,
        );
    }

    #[test]
    fn empty_source() {
        let source = "";
        assert_equivalent("empty", source);
    }

    #[test]
    fn open_only_no_postings() {
        let source = "2024-01-01 open Assets:Bank USD\n";
        assert_equivalent("open only", source);
    }

    #[test]
    fn single_transaction() {
        let source = "\
2024-01-15 * \"Coffee\"
  Assets:Bank  -5.00 USD
  Expenses:Food
";
        assert_equivalent("single txn", source);
    }

    #[test]
    fn multi_transaction_varying_widths() {
        let source = "\
2024-01-15 * \"A\"
  Assets:Bank  -5.00 USD
  Expenses:Food
2024-02-15 * \"B\"
  Assets:Investment:Long:Path  -123456.78 USD
  Expenses:Tax  100.00 USD
";
        assert_equivalent("varying widths", source);
    }

    #[test]
    fn arithmetic_amounts() {
        let source = "\
2024-01-15 * \"Split\"
  Assets:Bank  -10.00 + 5.00 USD
  Expenses:Misc
";
        assert_equivalent("arithmetic amounts", source);
    }

    #[test]
    fn parse_errors() {
        // The equivalence has to survive error recovery too: the
        // LSP fallback path reads the cache while the file is
        // broken, so a mismatch on a parse-error input would be
        // just as harmful as one on a clean input.
        let source = "\
2024-01-15 * \"x\"
  Assets:Bank  -5.00 USD
}}}garbage
2024-02-15 * \"y\"
  Assets:Other  100.00 USD
";
        assert_equivalent("broken", source);
    }

    /// Breakage in the middle of a transaction. When the WIDEST
    /// transaction's body fails to parse (it turns into an
    /// `ERROR_NODE` because one posting is syntactically
    /// incomplete), `compute_alignment` leaves its postings out:
    /// the wrapping Transaction node no longer passes the
    /// `ast::Directive::Transaction::cast` check in the alignment
    /// walk. The cache therefore describes only the transactions
    /// that parsed successfully, which is what the LSP fallback
    /// sees when format-on-type fires on a half-edited file. This
    /// test pins the cache-equals-fresh-call equivalence so the
    /// producer-side invariant still holds in that transitional
    /// state.
    ///
    /// Note for users: while typing continues and the wrapping
    /// Transaction flips between recovered and broken from one
    /// edit to the next, the alignment columns can visibly move.
    /// Avoiding that would mean speculatively recovering
    /// wide-account information from the broken transaction's
    /// source bytes, which is out of scope for the cache.
    #[test]
    fn mid_transaction_error_node() {
        // The first transaction uses wide accounts
        // (Assets:Investment:Long:Path) but is broken: its posting
        // line ends in garbage, which recovery should wrap into an
        // ERROR_NODE spanning the whole transaction. The second
        // transaction has narrow accounts and parses cleanly, so the
        // cached alignment reflects only the narrow widths.
        let source = "\
2024-01-15 * \"wide broken\"
  Assets:Investment:Long:Path  -123456.78 USD }}}
  Expenses:Tax
2024-02-15 * \"narrow clean\"
  Assets:Bank  -5.00 USD
  Expenses:Food
";
        assert_equivalent("mid-transaction breakage", source);
    }
}