Skip to main content

supersigil_parser/
preprocess.rs

1//! Stage 1a: UTF-8 decode, BOM strip, CRLF normalization.
2
3use std::path::Path;
4
5use supersigil_core::ParseError;
6
7/// UTF-8 BOM character.
8const BOM: char = '\u{FEFF}';
9
10/// Stage 1: Preprocess raw bytes — decode UTF-8, strip BOM, normalize CRLF to LF.
11///
12/// # Errors
13///
14/// Returns `ParseError::IoError` if the input is not valid UTF-8.
15pub fn preprocess(raw: &[u8], path: &Path) -> Result<String, ParseError> {
16    let text = std::str::from_utf8(raw).map_err(|e| ParseError::IoError {
17        path: path.to_path_buf(),
18        source: std::io::Error::new(std::io::ErrorKind::InvalidData, e),
19    })?;
20
21    Ok(normalize(text))
22}
23
24/// Strip BOM and normalize CRLF → LF in already-decoded text.
25///
26/// This is the shared normalization logic used by both the parser pipeline
27/// (via [`preprocess`]) and the snapshot rewriter (which reads files as
28/// `&str` and needs matching byte offsets).
29#[must_use]
30pub fn normalize(text: &str) -> String {
31    let text = text.strip_prefix(BOM).unwrap_or(text);
32
33    // Fast path: no \r means no CRLF normalization needed.
34    if !text.as_bytes().contains(&b'\r') {
35        return text.to_owned();
36    }
37
38    // Normalize CRLF → LF without creating new CRLF from bare \r + replacement \n.
39    // \r and \n are single-byte ASCII, so we can safely scan bytes and reconstruct
40    // valid UTF-8 by copying non-CRLF spans verbatim.
41    let bytes = text.as_bytes();
42    let mut out = String::with_capacity(bytes.len());
43    let mut start = 0;
44    let mut i = 0;
45    while i < bytes.len() {
46        if bytes[i] == b'\r' && bytes.get(i + 1) == Some(&b'\n') {
47            // Flush the span before this \r\n, then emit \n
48            out.push_str(&text[start..i]);
49            out.push('\n');
50            i += 2;
51            start = i;
52        } else {
53            i += 1;
54        }
55    }
56    out.push_str(&text[start..]);
57    out
58}