rustledger_parser/bom.rs
1//! Canonical UTF-8 BOM handling — single source of truth.
2//!
3//! # Why this module exists
4//!
5//! Earlier iterations spread BOM-aware logic across three layers:
6//!
7//! 1. A `bom_filter` callback on the lexer that decided which BOMs were
8//! "leading" (skip) vs. "mid-file" (emit as an error token).
9//! 2. An indent-walker arm in `tokenize()` that tried to keep mid-file
10//! BOMs layout-transparent by adjusting `last_newline_end`.
11//! 3. A `source.starts_with('\u{FEFF}')` check in `format_source` to
12//! decide whether to re-prepend a BOM on output.
13//!
14//! Each of those three encoded the same concept ("the source had a
15//! leading BOM") with a different predicate. Every round-trip-fidelity
16//! bug we hit traced back to two of them disagreeing — most recently a
17//! pair of regressions where the lexer accepted a leading-whitespace-
18//! then-BOM input but the formatter dropped the BOM on output.
19//!
20//! # The architecture
21//!
22//! A UTF-8 BOM is a *serialization* concern, not a *parsing* concern.
23//! It carries no semantic meaning to beancount. So:
24//!
25//! * `strip_leading` runs ONCE at the parser's public entry. The lexer,
26//! parser, indent walker, and every other internal layer operate on a
27//! source that is BOM-free by construction.
28//! * The parser records whether a BOM was stripped in
29//! `ParseResult::has_leading_bom`. That flag is the *only* source of
30//! truth downstream. No layer inspects the BOM byte directly.
31//! * `restore_leading` runs ONCE at the formatter's public exit, gated
32//! on the flag, restoring byte-stable round-trip identity.
33//!
34//! # Mid-file BOMs
35//!
//! Because the leading BOM is stripped before lexing, any U+FEFF code
//! point the lexer encounters is by construction mid-file and unrecognized.
38//! Logos produces a `Token::Error` for it, and the parser's existing
39//! error classifier (`error_text.contains('\u{FEFF}')`) surfaces the
40//! dedicated diagnostic.
41//!
42//! # Span coordinates
43//!
44//! The parser preserves the *original-source* coordinate frame for all
45//! spans it returns: if a directive starts at byte 3 of the original
46//! source (because the file began with a 3-byte BOM), its span starts
47//! at 3. The parser shifts every span up by `BOM_LEN` after running the
48//! inner parser on the stripped source. Callers (LSP, FFI, doctor) see
49//! coordinates that index into the source they passed in, with no need
50//! to be BOM-aware themselves.
51
52/// The UTF-8 byte-order mark (`EF BB BF`).
53pub const BOM: &str = "\u{FEFF}";
54
55/// The same BOM as a `char`.
56///
57/// Use this for `char`-typed predicates like `s.contains(BOM_CHAR)`
58/// or `match c { BOM_CHAR => ... }` instead of open-coding
59/// `'\u{FEFF}'` — that scatters the BOM concept across every
60/// detection site and re-creates the contract drift this module
61/// exists to prevent.
62pub const BOM_CHAR: char = '\u{FEFF}';
63
64/// Byte length of [`BOM`] in UTF-8 (always 3).
65///
66/// Used by `parse()` to shift spans back into the original-source
67/// coordinate frame after running the inner parser on the
68/// BOM-stripped view. Inlined as a const so the compiler can constant-
69/// fold the shift arithmetic.
70pub const BOM_LEN: usize = BOM.len();
71
72/// Strip a strict-byte-0 leading BOM, returning `(stripped, had_bom)`.
73///
74/// "Strict byte 0" means the BOM must be the very first bytes of the
75/// source. A BOM preceded by ANY content (whitespace, another
76/// character, anything) is by definition mid-file and is left in place
77/// for the lexer's error path to surface — that's not a "leading BOM"
78/// no matter how innocuous the preceding bytes look.
79///
80/// ```
81/// # use rustledger_parser::bom::{strip_leading, BOM};
82/// let with_bom = format!("{BOM}2024-01-01 open Assets:Bank\n");
83/// let (stripped, had_bom) = strip_leading(&with_bom);
84/// assert!(had_bom);
85/// assert_eq!(stripped, "2024-01-01 open Assets:Bank\n");
86///
87/// let (stripped, had_bom) = strip_leading("2024-01-01 open Assets:Bank\n");
88/// assert!(!had_bom);
89/// assert_eq!(stripped, "2024-01-01 open Assets:Bank\n");
90/// ```
91#[must_use]
92pub fn strip_leading(source: &str) -> (&str, bool) {
93 source
94 .strip_prefix(BOM)
95 .map_or((source, false), |rest| (rest, true))
96}
97
98/// Re-prepend a leading BOM if `had_bom`. Idempotent: a call where
99/// `formatted` already starts with a BOM returns the input unchanged.
100///
101/// Takes and returns an owned `String` so the no-BOM path (the
102/// overwhelming majority of files) returns the input with zero
103/// reallocation and zero byte copies.
104///
105/// The BOM-prepend path is one allocation (guaranteed by the explicit
106/// `reserve(BOM_LEN)` before `insert_str`) plus an O(n) memmove of the
107/// existing bytes by 3 positions. Without the explicit reserve,
108/// `format_source`'s typically-tight-capacity `String` would force
109/// `insert_str` to grow the buffer first AND THEN shift — two passes
110/// over the bytes instead of one.
111///
112/// ```
113/// # use rustledger_parser::bom::{restore_leading, BOM};
114/// let body = "2024-01-01 open Assets:Bank\n".to_string();
115///
116/// // No BOM requested → return as-is.
117/// let out = restore_leading(body.clone(), false);
118/// assert_eq!(out, body);
119///
120/// // BOM requested and not present → prepend.
121/// let out = restore_leading(body.clone(), true);
122/// assert!(out.starts_with(BOM));
123/// assert_eq!(&out[BOM.len()..], body);
124///
125/// // BOM requested and already present → idempotent no-op.
126/// let with_bom = format!("{BOM}{body}");
127/// let out = restore_leading(with_bom.clone(), true);
128/// assert_eq!(out, with_bom);
129/// ```
130#[must_use]
131pub fn restore_leading(mut formatted: String, had_bom: bool) -> String {
132 if had_bom && !formatted.starts_with(BOM) {
133 formatted.reserve(BOM_LEN);
134 formatted.insert_str(0, BOM);
135 }
136 formatted
137}
138
139#[cfg(test)]
140mod tests {
141 use super::*;
142
143 #[test]
144 fn bom_len_matches_utf8_encoding() {
145 assert_eq!(BOM_LEN, 3);
146 assert_eq!(BOM.as_bytes(), &[0xEF, 0xBB, 0xBF]);
147 }
148
149 #[test]
150 fn strip_leading_strict_byte_0() {
151 // BOM at byte 0 → stripped.
152 let s = "\u{FEFF}foo";
153 let (out, had) = strip_leading(s);
154 assert_eq!(out, "foo");
155 assert!(had);
156 }
157
158 #[test]
159 fn strip_leading_no_bom_passthrough() {
160 let s = "foo";
161 let (out, had) = strip_leading(s);
162 assert_eq!(out, "foo");
163 assert!(!had);
164 }
165
166 #[test]
167 fn strip_leading_does_not_match_after_whitespace() {
168 // The clipboard-with-padding case we deliberately do NOT treat
169 // as a leading BOM — it's mid-file by strict definition. The
170 // lexer's error path will surface the U+FEFF byte.
171 let s = " \u{FEFF}foo";
172 let (out, had) = strip_leading(s);
173 assert_eq!(out, s);
174 assert!(!had);
175 }
176
177 #[test]
178 fn strip_leading_only_strips_one_bom() {
179 // Double BOM at byte 0: strip the first, leave the second for
180 // the lexer to flag as mid-file.
181 let s = "\u{FEFF}\u{FEFF}foo";
182 let (out, had) = strip_leading(s);
183 assert_eq!(out, "\u{FEFF}foo");
184 assert!(had);
185 }
186
187 #[test]
188 fn restore_leading_idempotent_when_already_present() {
189 let s = "\u{FEFF}foo".to_string();
190 let out = restore_leading(s.clone(), true);
191 assert_eq!(out, s);
192 }
193
194 #[test]
195 fn restore_leading_noop_when_flag_false() {
196 let s = "foo".to_string();
197 let out = restore_leading(s.clone(), false);
198 assert_eq!(out, s);
199 }
200
201 #[test]
202 fn restore_leading_prepends_when_requested() {
203 let s = "foo".to_string();
204 let out = restore_leading(s, true);
205 assert_eq!(out, "\u{FEFF}foo");
206 }
207
208 #[test]
209 fn strip_then_restore_round_trip() {
210 for input in [
211 "\u{FEFF}2024-01-01 open Assets:Bank USD\n",
212 "2024-01-01 open Assets:Bank USD\n",
213 "\u{FEFF}",
214 "",
215 ] {
216 let (stripped, had) = strip_leading(input);
217 let restored = restore_leading(stripped.to_string(), had);
218 assert_eq!(restored, input, "round trip broke for {input:?}");
219 }
220 }
221}