supersigil_parser/preprocess.rs
1//! Stage 1a: UTF-8 decode, BOM strip, CRLF normalization.
2
3use std::path::Path;
4
5use supersigil_core::ParseError;
6
/// Byte order mark: the Unicode code point U+FEFF, which is the three bytes
/// `EF BB BF` when encoded as UTF-8.
const BOM: char = '\u{FEFF}';
9
10/// Stage 1: Preprocess raw bytes — decode UTF-8, strip BOM, normalize CRLF to LF.
11///
12/// # Errors
13///
14/// Returns `ParseError::IoError` if the input is not valid UTF-8.
15pub fn preprocess(raw: &[u8], path: &Path) -> Result<String, ParseError> {
16 let text = std::str::from_utf8(raw).map_err(|e| ParseError::IoError {
17 path: path.to_path_buf(),
18 source: std::io::Error::new(std::io::ErrorKind::InvalidData, e),
19 })?;
20
21 Ok(normalize(text))
22}
23
24/// Strip BOM and normalize CRLF → LF in already-decoded text.
25///
26/// This is the shared normalization logic used by both the parser pipeline
27/// (via [`preprocess`]) and the snapshot rewriter (which reads files as
28/// `&str` and needs matching byte offsets).
29#[must_use]
30pub fn normalize(text: &str) -> String {
31 let text = text.strip_prefix(BOM).unwrap_or(text);
32
33 // Fast path: no \r means no CRLF normalization needed.
34 if !text.as_bytes().contains(&b'\r') {
35 return text.to_owned();
36 }
37
38 // Normalize CRLF → LF without creating new CRLF from bare \r + replacement \n.
39 // \r and \n are single-byte ASCII, so we can safely scan bytes and reconstruct
40 // valid UTF-8 by copying non-CRLF spans verbatim.
41 let bytes = text.as_bytes();
42 let mut out = String::with_capacity(bytes.len());
43 let mut start = 0;
44 let mut i = 0;
45 while i < bytes.len() {
46 if bytes[i] == b'\r' && bytes.get(i + 1) == Some(&b'\n') {
47 // Flush the span before this \r\n, then emit \n
48 out.push_str(&text[start..i]);
49 out.push('\n');
50 i += 2;
51 start = i;
52 } else {
53 i += 1;
54 }
55 }
56 out.push_str(&text[start..]);
57 out
58}