rustledger_parser/lib.rs
1//! Beancount parser built on a Logos lexer + structured CST.
2//!
3//! [`parse`] tokenizes via [`logos_lexer`], constructs a lossless
4//! CST through [`parse_structured`], and walks it via the
5//! converter in `cst::convert` to produce a [`ParseResult`] with
6//! the typed AST plus errors, options, includes, plugins,
7//! comments, and currency occurrences.
8//!
9//! # Features
10//!
11//! - Full Beancount syntax support (all 12 directive types)
12//! - Error recovery (continues parsing after errors)
13//! - Precise source locations for error reporting
14//! - Support for includes, options, plugins
15//!
16//! # Example
17//!
18//! ```ignore
19//! use rustledger_parser::parse;
20//!
21//! let source = r#"
22//! 2024-01-15 * "Coffee Shop" "Morning coffee"
23//! Expenses:Food:Coffee 5.00 USD
24//! Assets:Cash
25//! "#;
26//!
27//! let result = parse(source);
28//! assert!(result.errors.is_empty());
29//! assert_eq!(result.directives.len(), 1);
30//! ```
31
32#![forbid(unsafe_code)]
33#![warn(missing_docs)]
34
35pub mod bom;
36pub mod cst;
37mod diagnostics;
38mod error;
39pub mod logos_lexer;
40
41/// Opinionated CST-backed formatter entries.
42///
43/// **Sole** import path for the formatter surface - `format_source`,
44/// `format_source_with_parsed`, `try_format_source`, `format_node`,
45/// `format_node_range`, `format_node_with_alignment`,
46/// `format_node_range_with_alignment`, `PostingAlignment`,
47/// `compute_alignment`, `canonicalize_directives`,
48/// `CanonicalizeError`, `lf_to_crlf_outside_strings`,
49/// `crlf_to_lf_outside_strings`, `cr_outside_strings_present`. The
50/// flat crate-root re-exports were removed in round-5 and the
51/// duplicate `crate::cst::format` path was sealed in round-6 of
52/// the PR #1284 reviews, so a future deprecation can be done at
53/// exactly one site.
54pub mod format {
55 pub use crate::cst::format::{
56 CanonicalizeError, PostingAlignment, canonicalize_directives, compute_alignment,
57 cr_outside_strings_present, crlf_to_lf_outside_strings, format_node, format_node_range,
58 format_node_range_with_alignment, format_node_with_alignment, format_source,
59 format_source_with_parsed, lf_to_crlf_outside_strings, try_format_source,
60 };
61}
62
63pub use cst::{
64 BeancountLanguage, SyntaxElement, SyntaxKind, SyntaxNode, SyntaxToken, lossless_kind_tokens,
65 parse_flat, parse_structured, parse_via_cst,
66};
67
68// Rowan types CST consumers need. Flat re-exports at the crate
69// root match the surrounding `SyntaxNode` / `SyntaxToken` shape -
70// downstream `use rustledger_parser::{SyntaxNode, TextRange};`
71// resolves both halves uniformly without a sub-module hop.
72//
73// The set covers what LSP handlers need for tree walking:
74// - `TextRange` / `TextSize`: byte-offset ranges on every node
75// - `TokenAtOffset`: cursor-position lookup
76// - `WalkEvent`: preorder / postorder traversal for folding-range
77// and semantic-tokens implementations
78// - `NodeOrToken`: pattern-matching `SyntaxElement` children
79// - `Direction`: sibling iteration
80//
81// `GreenNode` is deliberately NOT re-exported - it's the
82// thread-safe storage backing for `SyntaxNode` but downstream
83// consumers should walk via the cursor API, not the green tree.
84//
85// **Stability.** These types are versioned in lockstep with this
86// crate, NOT with `rowan` directly. A rowan minor bump that
87// changes any of these will require a coordinated bump of this
88// crate so the re-export contract holds at THIS crate's semver.
89pub use error::{ParseError, ParseErrorKind};
90pub use rowan::{Direction, NodeOrToken, TextRange, TextSize, TokenAtOffset, WalkEvent};
91pub use rustledger_core::{InternedStr, SYNTHESIZED_FILE_ID, Span, Spanned};
92
93use rustledger_core::Directive;
94
95/// Result of parsing a beancount file.
96///
97/// Marked `#[non_exhaustive]` so external consumers must go through
98/// [`parse`] rather than constructing the struct by literal. Future
99/// field additions (e.g., diagnostic metadata, source-map back-
100/// references) then land as non-breaking changes.
101#[derive(Debug)]
102#[non_exhaustive]
103pub struct ParseResult {
104 /// Successfully parsed directives.
105 pub directives: Vec<Spanned<Directive>>,
106 /// Options found in the file.
107 pub options: Vec<(String, String, Span)>,
108 /// Include directives found.
109 pub includes: Vec<(String, Span)>,
110 /// Plugin directives found.
111 pub plugins: Vec<(String, Option<String>, Span)>,
112 /// Standalone comments found in the file.
113 pub comments: Vec<Spanned<String>>,
114 /// Parse errors encountered.
115 pub errors: Vec<ParseError>,
116 /// Deprecation warnings.
117 pub warnings: Vec<ParseWarning>,
118 /// Every `Currency` token the parser consumed, paired with its
119 /// interned value and source-byte range.
120 ///
121 /// Source-position-aware tooling (LSP rename / references /
122 /// document-highlight) walks this list to produce edits, locations,
123 /// and highlights without resorting to string search of the source,
124 /// which produces false positives in comments, payee strings,
125 /// account-name segments, etc. The order matches source order
126 /// because the parser fills it as tokens are consumed (and the
127 /// parser is strictly forward-advancing, including on error
128 /// recovery).
129 ///
130 /// **Error-recovery contract.** Tokens consumed during a
131 /// directive that ultimately fails to parse remain in this list.
132 /// Rationale: the lexer's classification of a token as a
133 /// `Currency` is independent of whether the surrounding syntax is
134 /// valid, and tooling that wants to rename or highlight a
135 /// currency the user typed should follow that classification.
136 /// Do not "clean up" partially-consumed entries after a parse
137 /// failure - that would hide real currency identifiers from
138 /// downstream tooling while the user is mid-edit.
139 ///
140 /// **`file_id` is always 0 in parser output.** The parser
141 /// processes one file at a time and doesn't know its own file
142 /// id. The loader sets the correct id on each entry via
143 /// `.with_file_id(n)` when assembling a multi-file `SourceMap`,
144 /// the same way it does for `directives`. Per-file consumers
145 /// (today: every LSP handler) can ignore `file_id`; future
146 /// multi-file consumers must remember to thread it through.
147 pub currency_occurrences: Vec<Spanned<rustledger_core::Currency>>,
148 /// Every `Account` token the parser consumed, paired with its
149 /// interned value and source-byte range.
150 ///
151 /// Mirrors [`Self::currency_occurrences`] for the account
152 /// shape. The CST conversion (`walk_descendants_once`) tracks
153 /// every `ACCOUNT` token whose ancestors do NOT include an
154 /// `ERROR_NODE`. The LSP rename handler (phase 5.4) walks
155 /// this list to emit exact-span edits without resorting to
156 /// per-directive substring search, which used to produce
157 /// false positives wherever an account-name fragment appeared
158 /// inside a payee string, a STRING-typed metadata value, or a
159 /// comment. ACCOUNT-typed metadata values (e.g.
160 /// `counterparty: Assets:Bank`) DO produce an `ACCOUNT` token
161 /// at the lexer level and ARE included in this list - so a
162 /// rename of `Assets:Bank` correctly rewrites that metadata
163 /// value too.
164 ///
165 /// **Migration status (#1262 phase 5.4).** Only the LSP
166 /// rename handler currently consumes this index. The sibling
167 /// handlers `references`, `document_highlight`, and
168 /// `linked_editing` still walk the typed AST with substring
169 /// search for accounts (see those modules' rustdoc); migrating
170 /// them to consume `account_occurrences` is tracked as a
171 /// phase 5.5+ follow-up.
172 ///
173 /// **Error-recovery contract.** Two notions of "failing
174 /// directive" need to be distinguished:
175 ///
176 /// - A directive that PARSES SYNTACTICALLY but whose
177 /// typed-AST conversion errors (e.g.,
178 /// [`crate::ParseErrorKind::InvalidBookingMethod`] on an
179 /// `open Assets:Bank "GARBAGE"`). The ACCOUNT node is
180 /// intact in the CST and NOT inside an `ERROR_NODE`. The
181 /// token IS tracked - tooling can still rename it during
182 /// the mid-edit state.
183 /// - A directive so garbled that the CST wraps the region
184 /// in an `ERROR_NODE`. The ACCOUNT token is inside an
185 /// `ERROR_NODE` and is NOT tracked. This is deliberate -
186 /// the recovery boundary is fuzzy and including such
187 /// tokens would surface as confusing rename hits inside
188 /// garbage source.
189 ///
190 /// # Limitations
191 ///
192 /// The list is undifferentiated: declarations (from
193 /// open/close/balance/pad/note/document) and references
194 /// (from posting accounts and ACCOUNT-typed metadata) are
195 /// mixed together. There is no equivalent of the
196 /// `commodity_declaration_spans` helper used for currencies
197 /// (the account case has six declaration directive shapes vs.
198 /// the single `Commodity` shape, so no symmetric helper
199 /// exists yet). A future go-to-definition migration will need
200 /// either a re-walk over `directives` or an additional
201 /// `account_declarations: Vec<Span>` field.
202 ///
203 /// **`file_id` is always 0 in parser output** - same loader
204 /// contract as `currency_occurrences`.
205 pub account_occurrences: Vec<Spanned<rustledger_core::Account>>,
206 /// `true` iff the parsed source began with a UTF-8 BOM (strict
207 /// byte 0).
208 ///
209 /// This is the **single source of truth** for downstream consumers
210 /// that need to know whether to preserve a leading BOM on output
211 /// (notably `format_source`). Do NOT inspect the source bytes
212 /// directly; the parser already handled the strip/detect logic in
213 /// one place ([`crate::bom::strip_leading`]) and stored the result
214 /// here. Reproducing the check elsewhere is exactly the contract-
215 /// drift class of bug this field was introduced to eliminate.
216 ///
217 /// Span coordinates in this `ParseResult` are in the **original
218 /// source frame** - i.e., if `has_leading_bom` is true, spans
219 /// already include the 3-byte BOM offset and index directly into
220 /// the caller's source.
221 pub has_leading_bom: bool,
222 /// The lossless CST root the converter walked to produce
223 /// everything above. Stored as a [`rowan::GreenNode`], which
224 /// is `Send + Sync` and reference-counted internally, so an
225 /// `Arc<ParseResult>` (the shape the LSP caches per document)
226 /// shares this handle across handler invocations without
227 /// re-parsing.
228 ///
229 /// **Prefer [`Self::syntax_node`]** over reading this field
230 /// directly. The method is the supported entry point: it
231 /// returns a [`SyntaxNode`] (the cursor-API view), keeps the
232 /// `rowan::GreenNode` type name out of consumer code, and
233 /// shields callers from minor rowan upgrades that touch the
234 /// `GreenNode` shape. The field is public for two reasons —
235 /// the exhaustive destructure in
236 /// [`__baseline_canonical_payload`] needs to bind it, and
237 /// `Arc::clone`-style sharing patterns benefit from direct
238 /// access — but downstream code should reach for the method.
239 ///
240 /// **Byte-offset frame: post-BOM.** The CST is built from
241 /// the BOM-stripped source — the parser strips a strict-
242 /// byte-0 UTF-8 BOM (see [`crate::bom::strip_leading`]) and
243 /// feeds the stripped slice to `parse_structured`. So every
244 /// `TextRange` / `TextSize` reachable through this tree is
245 /// in the **post-BOM** byte frame: an offset of `0` here
246 /// corresponds to byte `BOM_LEN == 3` of the original source
247 /// when [`Self::has_leading_bom`] is `true`. This differs
248 /// from the typed-AST fields above ([`Self::directives`],
249 /// [`Self::currency_occurrences`], [`Self::account_occurrences`],
250 /// [`Self::errors`], …), whose spans the converter
251 /// pre-shifts back into the *original*-source frame so
252 /// downstream consumers can index directly into the caller's
253 /// source bytes. CST-walking consumers must apply the
254 /// equivalent shift themselves: subtract `BOM_LEN` when
255 /// translating an original-source offset down to a CST
256 /// offset (e.g., `cst.token_at_offset(orig - BOM_LEN)`), and
257 /// add `BOM_LEN` back when emitting an original-source
258 /// position from a `TextRange`. The LSP `selection_range`
259 /// handler does this — see its rustdoc and the
260 /// `bom_prefixed_source_does_not_shift_ranges` regression
261 /// test.
262 ///
263 /// **Canonical-payload exclusion.** This field is deliberately
264 /// NOT fed into [`__baseline_canonical_payload`]. The green
265 /// node is a redundant cache of the source bytes; the
266 /// existing `directives` / `currency_occurrences` /
267 /// `account_occurrences` / `errors` fields already capture
268 /// everything downstream consumers track for drift detection.
269 /// Adding the green node's `Debug` output would multiply
270 /// the fingerprint size without surfacing any new drift
271 /// signal. The corresponding `assert_field_in_hash` arm is
272 /// also intentionally absent in `tests/corpus_baseline.rs`.
273 /// A negative-form test (`__canonical_payload_excludes_syntax_root`
274 /// in this file) pins the exclusion: it confirms that mutating
275 /// `syntax_root` while every other field is equal does NOT
276 /// change the canonical payload bytes.
277 pub syntax_root: rowan::GreenNode,
278 /// File-wide alignment columns the formatter would use for
279 /// this source — pre-computed at parse time so hot formatting
280 /// paths skip the `O(N_postings)` per-call walk.
281 ///
282 /// `PostingAlignment` is `Copy`; pass it directly into the
283 /// `_with_alignment` variants of the formatter
284 /// ([`crate::format::format_node_with_alignment`],
285 /// [`crate::format::format_node_range_with_alignment`],
286 /// [`crate::format::format_source_with_parsed`]) to reuse this
287 /// cached value. The LSP `format_document` /
288 /// `range_formatting` fallback handlers, the FFI `format.source`
289 /// endpoint, and the WASM `ParsedLedger::format` bridge all
290 /// consume the cache to skip both the redundant parse and the
291 /// redundant alignment walk.
292 ///
293 /// **Producer-only cache invariant.** This field is populated
294 /// exactly once by `parse_via_cst`; the value is consistent with
295 /// the `directives` / `syntax_root` fields *at parse time*.
296 /// `ParseResult` exposes every cache input (`directives`,
297 /// `syntax_root`) as `pub`, so technically a consumer with a
298 /// `&mut ParseResult` can mutate one without refreshing the
299 /// other — leaving `alignment` stale. That is OUT-OF-CONTRACT
300 /// for this cache. Callers that mutate `ParseResult` directly
301 /// must either (a) refresh `alignment` by calling
302 /// `crate::format::compute_alignment(&SourceFile::cast(self.syntax_node()))`,
303 /// (b) avoid the `_with_alignment` formatter variants and use
304 /// the bare ones (which re-compute), or (c) treat the
305 /// `ParseResult` as immutable after construction (the common
306 /// case — the LSP wraps it in `Arc<ParseResult>`).
307 ///
308 /// **Equivalence pinned.**
309 /// `parse_result_alignment_cache::*` (7 fixtures) assert that
310 /// `parse(s).alignment` equals
311 /// `compute_alignment(&SourceFile::cast(parse(s).syntax_node()).unwrap())`
312 /// across representative fixtures, so any future divergence
313 /// (a converter change that forgets to refresh the cache, a
314 /// `compute_alignment` change that breaks the contract)
315 /// fails CI.
316 ///
317 /// **Canonical-payload exclusion.** Excluded from
318 /// [`__baseline_canonical_payload`] for the same reason as
319 /// `syntax_root`: it's a redundant derivation of `directives`
320 /// content. Mutating it without changing `directives` would
321 /// silently flip the corpus hash; including it in the
322 /// payload would change the hash for every source with a
323 /// non-default alignment (i.e. essentially every real
324 /// Beancount file). The exclusion is pinned by
325 /// `canonical_payload_excludes_alignment`.
326 pub alignment: crate::format::PostingAlignment,
327}
328
329impl ParseResult {
330 /// Cursor-API view of the lossless CST that produced this
331 /// `ParseResult`. Equivalent to
332 /// `SyntaxNode::new_root(self.syntax_root.clone())`.
333 ///
334 /// Construction is an `Arc` bump (the green node's internal
335 /// refcount); cheap enough to call per request. This is the
336 /// supported entry point for CST consumers — prefer it over
337 /// reading [`Self::syntax_root`] directly, so the `rowan`
338 /// dependency stays an implementation detail.
339 #[must_use]
340 pub fn syntax_node(&self) -> SyntaxNode {
341 SyntaxNode::new_root(self.syntax_root.clone())
342 }
343}
344
345// Compile-time assertion: `ParseResult` is shared as
346// `Arc<ParseResult>` across the LSP's main thread and its
347// background worker (see `rustledger-lsp/src/main_loop.rs`).
348// A future field whose type is not `Send + Sync` (e.g. an `Rc`,
349// a `Cell`, or a non-thread-safe handle) would silently break
350// the LSP build at the call site, far from the parser change
351// that caused it. This assertion fences the invariant at the
352// definition site so the parser crate's own build fails first.
353const _: fn() = || {
354 const fn assert_send_sync<T: Send + Sync>() {}
355 assert_send_sync::<ParseResult>();
356};
357
358/// A warning from the parser (non-fatal).
359#[derive(Debug, Clone)]
360pub struct ParseWarning {
361 /// The warning message.
362 pub message: String,
363 /// Location in source.
364 pub span: Span,
365}
366
367impl ParseWarning {
368 /// Create a new warning.
369 pub fn new(message: impl Into<String>, span: Span) -> Self {
370 Self {
371 message: message.into(),
372 span,
373 }
374 }
375}
376
377/// Parse beancount source code.
378///
379/// Routes through the CST-backed implementation
380/// ([`parse_via_cst`]): a lossless Logos lexer feeds a structured
381/// CST builder, and the converter in `crate::cst::convert` walks
382/// the resulting tree to produce the [`ParseResult`].
383///
384/// # Arguments
385///
386/// * `source` - The beancount source code to parse
387///
388/// # Returns
389///
390/// A `ParseResult` containing directives, options, includes, plugins, and errors.
391#[must_use]
392pub fn parse(source: &str) -> ParseResult {
393 parse_via_cst(source)
394}
395
396/// Parse beancount source code, returning only directives and errors.
397///
398/// This is a simpler interface when you don't need options/includes/plugins.
399#[must_use]
400pub fn parse_directives(source: &str) -> (Vec<Spanned<Directive>>, Vec<ParseError>) {
401 let result = parse(source);
402 (result.directives, result.errors)
403}
404
405/// Canonical hash-payload serialization for the corpus baseline
406/// (#1262 phase 0). **Internal**: this exists only so the baseline
407/// integration test can hash a `ParseResult` without listing fields
408/// outside the defining crate.
409///
410/// Returns a byte string that uniquely identifies the `ParseResult`'s
411/// observable content. Directives route through `serde_json::to_value`
412/// to normalize the `FxHashMap` iteration order in metadata; all
413/// other fields use `Debug` formatting, which is deterministic for
414/// `Vec`-based types.
415///
416/// **Why this lives in `rustledger-parser` instead of the test:**
417/// `ParseResult` is `#[non_exhaustive]`, which blocks exhaustive
418/// destructuring from external crates (including the integration
419/// test). Performing the destructure here forces the compiler to
420/// flag any field added to `ParseResult` that the canonical
421/// serialization does not feed into its output. Without this, a new
422/// `ParseResult` field could silently exit the baseline fingerprint -
423/// the BOM-flag-omission class of bug the round-3 review caught.
424///
425/// **Add a new field?** Add a binding (NOT `_`) AND a hasher feed
426/// line to the destructure below. The compiler enforces the binding;
427/// reviewers must enforce the feed.
428///
429/// **Determinism precondition:** this routes directives through
430/// `serde_json::to_value`, which is only sort-stable when
431/// `serde_json`'s `preserve_order` feature is **off**. Cargo feature
432/// unification can flip this on workspace-wide; the unit test
433/// `serde_json_object_is_sorted` in this crate's tests catches that
434/// flip before the canonical hash silently desyncs.
435#[doc(hidden)]
436#[must_use]
437pub fn __baseline_canonical_payload(result: &ParseResult) -> Vec<u8> {
438 let ParseResult {
439 directives,
440 options,
441 includes,
442 plugins,
443 comments,
444 errors,
445 warnings,
446 currency_occurrences,
447 account_occurrences,
448 has_leading_bom,
449 syntax_root,
450 alignment,
451 } = result;
452 // Both `syntax_root` and `alignment` are redundant
453 // derivations of fields already in the canonical payload
454 // (`syntax_root` of the source bytes captured by
455 // `directives`/`occurrences`/`errors`; `alignment` of the
456 // posting widths inside `directives`). Bind them so the
457 // compiler still flags future field additions on this
458 // exhaustive destructure, but discard them from the canonical
459 // payload. Pinned by `canonical_payload_excludes_syntax_root`
460 // and `canonical_payload_excludes_alignment`.
461 let _ = syntax_root;
462 let _ = alignment;
463 let mut out: Vec<u8> = Vec::new();
464 let directives_json = serde_json::to_value(directives)
465 .map_or_else(|e| format!("serialize-error:{e}"), |v| v.to_string());
466 out.extend_from_slice(b"directives:");
467 out.extend_from_slice(directives_json.as_bytes());
468 out.extend_from_slice(b"\noptions:");
469 out.extend_from_slice(format!("{options:?}").as_bytes());
470 out.extend_from_slice(b"\nincludes:");
471 out.extend_from_slice(format!("{includes:?}").as_bytes());
472 out.extend_from_slice(b"\nplugins:");
473 out.extend_from_slice(format!("{plugins:?}").as_bytes());
474 out.extend_from_slice(b"\ncomments:");
475 out.extend_from_slice(format!("{comments:?}").as_bytes());
476 out.extend_from_slice(b"\nerrors:");
477 out.extend_from_slice(format!("{errors:?}").as_bytes());
478 out.extend_from_slice(b"\nwarnings:");
479 out.extend_from_slice(format!("{warnings:?}").as_bytes());
480 out.extend_from_slice(b"\ncurrency_occurrences:");
481 out.extend_from_slice(format!("{currency_occurrences:?}").as_bytes());
482 out.extend_from_slice(b"\naccount_occurrences:");
483 out.extend_from_slice(format!("{account_occurrences:?}").as_bytes());
484 out.extend_from_slice(b"\nhas_leading_bom:");
485 out.extend_from_slice(format!("{has_leading_bom:?}").as_bytes());
486 out
487}
488
#[cfg(test)]
mod canonical_payload_determinism {
    //! Tripwire for cargo feature unification silently switching on
    //! `serde_json/preserve_order` across the workspace.
    //!
    //! With `preserve_order` OFF, `serde_json::Value::Object` is
    //! BTreeMap-backed and sorts its keys; with it ON, the object is
    //! IndexMap-backed and keeps insertion order.
    //! `__baseline_canonical_payload` depends on the sorted behavior
    //! to neutralize `FxHashMap` iteration order in directive
    //! metadata. If any workspace crate enabled the feature,
    //! canonical hashes would vary with hashbrown state across
    //! machines - the very class of bug the canonicalization was
    //! added to prevent. This test fails fast and names the
    //! cargo-feature cause instead of letting the corpus baseline
    //! drift mysteriously.
    use serde_json::json;

    #[test]
    fn serde_json_object_is_sorted() {
        // Keys are inserted as `b, a`. Under `preserve_order` that
        // order survives; with default features the rendered object
        // starts with `a`.
        let rendered = json!({ "b": 1, "a": 2 }).to_string();
        let keys_are_sorted = rendered.starts_with(r#"{"a""#);
        assert!(
            keys_are_sorted,
            "serde_json::Value::Object is not sorting keys (got {rendered}). \
             This means cargo feature unification turned on \
             serde_json/preserve_order somewhere in the workspace. \
             The corpus baseline's canonical hash assumes sorted \
             Object keys to neutralize FxHashMap iteration order in \
             directive metadata. Find the crate that enabled \
             `serde_json = {{ ..., features = [\"preserve_order\"] }}` \
             and remove it, or thread an alternative canonicalization \
             through __baseline_canonical_payload.",
        );
    }
}
524
#[cfg(test)]
mod cached_syntax_root_matches_fresh_parse {
    //! The `selection_range` handler (and any future CST-walking
    //! handler) consumes [`ParseResult::syntax_root`] instead of
    //! re-invoking [`crate::parse_structured`]. The safety
    //! argument is "the cached green root is the same tree the
    //! converter walked, which is the same tree a fresh
    //! `parse_structured` would return."
    //!
    //! Today that argument is trivially true because the cache is
    //! populated directly from the converter's `source_file`.
    //! But if a future change introduces post-conversion CST
    //! mutation (span rewrites, error-recovery splicing, trivia
    //! reattachment) the cached root would diverge from a fresh
    //! re-parse — silently, since nothing else compares the two
    //! trees. This test pins the invariant across a small fixture
    //! set covering empty source, every directive kind, error
    //! recovery, leading and mid-file BOMs, and metadata-bearing
    //! transactions.
    use super::{cst::parse_structured, parse};

    /// Parses `source` through the public entry point and through a
    /// fresh `parse_structured` call on the BOM-stripped text, then
    /// asserts that both yield the same green tree. `label` only
    /// identifies the fixture in the failure message.
    fn assert_round_trip(label: &str, source: &str) {
        let parsed = parse(source);
        // The cached tree is built from the BOM-stripped source, so
        // the fresh parse must be fed the same stripped slice.
        let (stripped, _bom) = crate::bom::strip_leading(source);
        let fresh = parse_structured(stripped).green().into_owned();
        assert_eq!(
            parsed.syntax_root, fresh,
            "cached syntax_root diverged from fresh parse_structured for {label}: \n\
             this means something is mutating the green tree between converter \
             capture and consumer access. The two are supposed to be identical."
        );
    }

    #[test]
    fn empty_source() {
        assert_round_trip("empty", "");
    }

    #[test]
    fn simple_directive() {
        assert_round_trip("open", "2024-01-01 open Assets:Bank USD\n");
    }

    #[test]
    fn every_directive_shape() {
        assert_round_trip(
            "directive zoo",
            r#"option "title" "Test"
plugin "myplugin"
include "other.beancount"
2024-01-01 open Assets:Bank USD
2024-01-01 commodity USD
2024-06-15 * "Coffee"
  Assets:Bank -5.00 USD
  Expenses:Food
2024-12-31 close Assets:Bank
2024-01-31 balance Assets:Bank 100 USD
2024-01-15 pad Assets:Bank Equity:Opening
2024-01-15 note Assets:Bank "deposit pending"
2024-01-15 event "location" "SF"
2024-01-15 price USD 1.00 EUR
"#,
        );
    }

    #[test]
    fn document_query_custom_directives() {
        // The three directive kinds the zoo fixture above does not
        // exercise; together the two fixtures cover all twelve.
        assert_round_trip(
            "document/query/custom",
            r#"2024-01-15 document Assets:Bank "statement.pdf"
2024-01-15 query "cash" "SELECT account"
2024-01-15 custom "budget" "monthly"
"#,
        );
    }

    #[test]
    fn with_parse_errors() {
        // Trigger error recovery (unterminated string, garbled
        // directive) to ensure the post-pass `fixup_directive_spans`
        // and error-node wrapping don't drift between cache and
        // fresh re-parse.
        assert_round_trip(
            "broken",
            "2024-01-01 open Assets:Bank \"unterminated\n2024-01-02 garbage line here\n",
        );
    }

    #[test]
    fn with_leading_bom() {
        // A strict-byte-0 BOM is stripped before the CST is built;
        // the cached root must equal a parse of the stripped text.
        assert_round_trip("leading BOM", "\u{feff}2024-01-01 open Assets:Bank USD\n");
    }

    #[test]
    fn with_mid_file_bom() {
        // A BOM anywhere other than byte 0 is not a leading BOM, so
        // `strip_leading` must leave it for the lexer on both paths.
        assert_round_trip(
            "mid-file BOM",
            "2024-01-01 open Assets:Bank USD\n\u{feff}2024-01-02 open Assets:Cash USD\n",
        );
    }

    #[test]
    fn with_metadata_and_comments() {
        assert_round_trip(
            "metadata",
            r#"; standalone comment
2024-01-01 open Assets:Bank USD
  payee_account: Assets:Other
2024-06-15 * "Coffee" ; eol comment
  memo: "morning"
  Assets:Bank -5.00 USD
"#,
        );
    }
}
615
#[cfg(test)]
mod canonical_payload_excludes_syntax_root {
    //! Pins the deliberate exclusion of `ParseResult::syntax_root`
    //! from [`__baseline_canonical_payload`]. The exclusion is
    //! documented in three places (the field's rustdoc, the
    //! destructure comment in `__baseline_canonical_payload`, and
    //! the CHANGELOG entry under `[Unreleased] / Features`) but
    //! none of those are executable. A future contributor
    //! mechanically pattern-matching on "all fields get an arm"
    //! could add a `syntax_root` feed to the canonical payload —
    //! the corpus manifest would silently drift on every source
    //! that touched the green tree.
    //!
    //! This test mutates `syntax_root` while leaving every other
    //! field equal, and asserts the canonical payload bytes are
    //! unchanged.
    use super::{__baseline_canonical_payload, parse};

    #[test]
    fn mutating_syntax_root_does_not_change_canonical_payload() {
        let src_a = "2024-01-01 open Assets:Bank USD\n";
        // `ParseResult` is not `Clone`, so parse the same source
        // twice to get two results that agree on every field.
        let parsed_a = parse(src_a);
        let mut mutated = parse(src_a);
        // Swap in ONLY the green tree of a structurally different
        // parse: `parse("")` yields an empty SOURCE_FILE root, while
        // the original root has an OPEN_DIRECTIVE child. Every other
        // field of `mutated` still comes from `src_a`.
        mutated.syntax_root = parse("").syntax_root;

        // Precondition: the swap must really have changed the field,
        // otherwise the payload comparison below passes vacuously.
        assert_ne!(
            parsed_a.syntax_root, mutated.syntax_root,
            "test setup is broken: the replacement green tree is identical \
             to the original, so nothing was mutated",
        );

        let payload_original = __baseline_canonical_payload(&parsed_a);
        let payload_mutated = __baseline_canonical_payload(&mutated);
        assert_eq!(
            payload_original, payload_mutated,
            "canonical payload changed after mutating only `syntax_root`. \
             Either the destructure in `__baseline_canonical_payload` \
             grew a `syntax_root` feed line (revert that — the field \
             is deliberately excluded; see its rustdoc), or another \
             field now reads from `syntax_root` indirectly. Either \
             way the corpus manifest is about to drift."
        );
    }
}
666
#[cfg(test)]
mod canonical_payload_excludes_alignment {
    //! Pins the deliberate exclusion of `ParseResult::alignment`
    //! from [`__baseline_canonical_payload`]. Same shape as
    //! `canonical_payload_excludes_syntax_root`: mutate the field,
    //! re-hash, assert unchanged.
    //!
    //! Including `alignment` in the canonical payload would change
    //! the corpus hash for every source whose postings determine
    //! non-default column widths — i.e. essentially every real
    //! Beancount file. The field is a derivation of `directives`
    //! content (already in the payload via the typed-AST hash);
    //! it carries no independent drift signal.
    use super::{__baseline_canonical_payload, parse};

    #[test]
    fn mutating_alignment_does_not_change_canonical_payload() {
        let src = "\
2024-01-15 * \"Coffee\"
  Assets:Bank -5.00 USD
  Expenses:Food
";
        let parsed = parse(src);
        let mut mutated = parse(src);
        // Synthesize a different alignment by copying the cached
        // value (`PostingAlignment` is `Copy`) and bumping
        // `number_col` by 100 and `number_width` by 7. Copy-then-
        // mutate, rather than a struct literal, keeps this test
        // compiling if `PostingAlignment` ever gains a field.
        let mut bumped = parsed.alignment;
        bumped.number_col += 100;
        bumped.number_width += 7;
        mutated.alignment = bumped;

        // Precondition: the mutation must be observable, otherwise
        // the payload comparison below passes vacuously.
        assert_ne!(
            parsed.alignment, mutated.alignment,
            "test setup is broken: the synthesized alignment equals the \
             cached one, so nothing was mutated",
        );

        let payload_original = __baseline_canonical_payload(&parsed);
        let payload_mutated = __baseline_canonical_payload(&mutated);
        assert_eq!(
            payload_original, payload_mutated,
            "canonical payload changed after mutating only `alignment`. \
             Either the destructure in `__baseline_canonical_payload` \
             grew an `alignment` feed line (revert that — the field \
             is deliberately excluded), or another field now reads \
             from `alignment` indirectly. Either way the corpus \
             manifest is about to drift across every source with \
             postings.",
        );
    }
}
714
#[cfg(test)]
mod parse_result_alignment_cache {
    //! Pins the equivalence between `ParseResult::alignment` (the
    //! pre-computed cache populated by `parse_via_cst`) and a
    //! fresh `compute_alignment` call on the same syntax tree.
    //! A converter change that forgets to refresh the cache, or a
    //! `compute_alignment` change that breaks the cache's
    //! semantics, fails this test before reaching the LSP.
    use super::parse;
    use crate::cst::ast::{AstNode, SourceFile};
    use crate::cst::format::compute_alignment;

    /// Parses `source` and asserts that the cached
    /// `ParseResult::alignment` equals a fresh `compute_alignment`
    /// call on the same parse result's syntax root.
    ///
    /// `label` identifies the fixture in the failure message.
    ///
    /// # Panics
    ///
    /// Panics if the syntax root cannot be cast to [`SourceFile`],
    /// or if the cached and freshly computed alignments differ.
    fn assert_equivalent(label: &str, source: &str) {
        let result = parse(source);
        let source_file = SourceFile::cast(result.syntax_node())
            .expect("ParseResult::syntax_node() must be a SOURCE_FILE");
        let fresh = compute_alignment(&source_file);
        assert_eq!(
            result.alignment, fresh,
            "ParseResult::alignment cache diverged from a fresh \
             compute_alignment call for {label}: cache = {:?}, fresh = {:?}. \
             Either parse_via_cst forgot to call compute_alignment, or \
             compute_alignment's semantics changed without refreshing \
             the cache in the converter.",
            result.alignment, fresh,
        );
    }

    /// Empty input: the cache must still match a fresh computation.
    #[test]
    fn empty_source() {
        assert_equivalent("empty", "");
    }

    /// A file with a single `open` directive and no postings.
    #[test]
    fn open_only_no_postings() {
        assert_equivalent("open only", "2024-01-01 open Assets:Bank USD\n");
    }

    /// One transaction with one amount-bearing posting and one
    /// amount-less posting.
    #[test]
    fn single_transaction() {
        assert_equivalent(
            "single txn",
            "\
2024-01-15 * \"Coffee\"
  Assets:Bank -5.00 USD
  Expenses:Food
",
        );
    }

    /// Two transactions whose account names and amounts differ in
    /// width.
    #[test]
    fn multi_transaction_varying_widths() {
        assert_equivalent(
            "varying widths",
            "\
2024-01-15 * \"A\"
  Assets:Bank -5.00 USD
  Expenses:Food
2024-02-15 * \"B\"
  Assets:Investment:Long:Path -123456.78 USD
  Expenses:Tax 100.00 USD
",
        );
    }

    /// A posting whose amount is an arithmetic expression.
    #[test]
    fn arithmetic_amounts() {
        assert_equivalent(
            "arithmetic amounts",
            "\
2024-01-15 * \"Split\"
  Assets:Bank -10.00 + 5.00 USD
  Expenses:Misc
",
        );
    }

    /// A garbage line between two transactions.
    #[test]
    fn parse_errors() {
        // Even on parse-error files the cache must match a fresh
        // call. The LSP fallback path consumes the cache through
        // a broken file, so equivalence under error recovery is
        // load-bearing.
        assert_equivalent(
            "broken",
            "\
2024-01-15 * \"x\"
  Assets:Bank -5.00 USD
}}}garbage
2024-02-15 * \"y\"
  Assets:Other 100.00 USD
",
        );
    }

    /// Mid-transaction recovery: when the WIDEST transaction's body
    /// breaks (becomes `ERROR_NODE` because a posting is
    /// syntactically incomplete), its postings are EXCLUDED from
    /// `compute_alignment` because the wrapping Transaction node
    /// fails the `ast::Directive::Transaction::cast` check inside
    /// the alignment walk. The cache reflects only the
    /// successfully-parsed transactions' alignment; this is the
    /// behavior the LSP fallback observes when format-on-type fires
    /// during a mid-edit broken state. The test pins the
    /// equivalence (cache matches fresh call) so the producer-side
    /// invariant holds even in this awkward transitional state.
    ///
    /// Note for users: as the user keeps typing and the parser
    /// recovers/breaks the wrapping Transaction across edits, the
    /// alignment columns may visibly shift. This is unavoidable
    /// without speculatively recovering wide-account information
    /// from the broken transaction's source bytes — out of scope
    /// for the cache.
    #[test]
    fn mid_transaction_error_node() {
        // First transaction has wide accounts (Assets:Investment:Long:Path)
        // but is broken — the posting line ends with garbage that
        // the recovery should wrap into an ERROR_NODE around the
        // whole transaction. Second transaction (narrow accounts)
        // parses cleanly. The cache's alignment reflects only the
        // narrow transaction's widths.
        assert_equivalent(
            "mid-transaction breakage",
            "\
2024-01-15 * \"wide broken\"
  Assets:Investment:Long:Path -123456.78 USD }}}
  Expenses:Tax
2024-02-15 * \"narrow clean\"
  Assets:Bank -5.00 USD
  Expenses:Food
",
        );
    }
}