Skip to main content

rlsp_yaml_parser/
encoding.rs

// SPDX-License-Identifier: MIT
2
/// Character encoding of a YAML byte stream, as reported by encoding
/// detection.
///
/// The `Le` / `Be` suffixes denote little-endian and big-endian byte order.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Encoding {
    /// UTF-8.
    Utf8,
    /// UTF-16, little-endian.
    Utf16Le,
    /// UTF-16, big-endian.
    Utf16Be,
    /// UTF-32, little-endian.
    Utf32Le,
    /// UTF-32, big-endian.
    Utf32Be,
}
12
/// Reasons `decode` can fail to turn a byte stream into a UTF-8 `String`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum EncodingError {
    /// The bytes are not well-formed in the detected encoding.
    InvalidBytes,
    /// A UTF-16 or UTF-32 sequence holds an unpaired surrogate or a value
    /// outside the Unicode range; the payload is the offending value.
    InvalidCodepoint(u32),
    /// UTF-16 input whose byte length is odd.
    TruncatedUtf16,
    /// UTF-32 input whose byte length is not a multiple of four.
    TruncatedUtf32,
}
26
27impl core::fmt::Display for EncodingError {
28    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
29        match self {
30            Self::InvalidBytes => write!(f, "invalid byte sequence for detected encoding"),
31            Self::InvalidCodepoint(cp) => write!(f, "invalid Unicode codepoint U+{cp:04X}"),
32            Self::TruncatedUtf16 => write!(f, "UTF-16 stream has an odd number of bytes"),
33            Self::TruncatedUtf32 => {
34                write!(f, "UTF-32 stream length is not a multiple of four")
35            }
36        }
37    }
38}
39
// ---------------------------------------------------------------------------
// Encoding detection
// ---------------------------------------------------------------------------
43
44/// Detect the encoding of a YAML byte stream via BOM or null-byte heuristic.
45///
46/// Implements YAML 1.2 §5.2 encoding detection. UTF-32 BOMs are checked
47/// before UTF-16 because the UTF-32 LE BOM (`FF FE 00 00`) is a superset of
48/// the UTF-16 LE BOM (`FF FE`).
49#[must_use]
50pub fn detect_encoding(bytes: &[u8]) -> Encoding {
51    match bytes {
52        // UTF-32 BOMs (must come before UTF-16 checks)
53        [0x00, 0x00, 0xFE, 0xFF, ..] => Encoding::Utf32Be,
54        [0xFF, 0xFE, 0x00, 0x00, ..] => Encoding::Utf32Le,
55        // UTF-16 BOMs
56        [0xFE, 0xFF, ..] => Encoding::Utf16Be,
57        [0xFF, 0xFE, ..] => Encoding::Utf16Le,
58        // Null-byte heuristic (no BOM): YAML streams begin with ASCII content.
59        // UTF-16 LE: odd bytes are null  → first pair is [<ascii>, 0x00]
60        // UTF-16 BE: even bytes are null → first pair is [0x00, <ascii>]
61        [a, 0x00, b, 0x00, ..] if *a != 0 && *b != 0 => Encoding::Utf16Le,
62        [0x00, a, 0x00, b, ..] if *a != 0 && *b != 0 => Encoding::Utf16Be,
63        [a, 0x00, ..] if *a != 0 => Encoding::Utf16Le,
64        [0x00, a, ..] if *a != 0 => Encoding::Utf16Be,
65        _ => Encoding::Utf8,
66    }
67}
68
// ---------------------------------------------------------------------------
// Decoding
// ---------------------------------------------------------------------------
72
73/// Decode a YAML byte stream to a UTF-8 `String`, stripping any BOM.
74///
75/// Detects encoding via [`detect_encoding`], converts to UTF-8, and removes
76/// the BOM character if present.
77///
78/// # Errors
79///
80/// Returns [`EncodingError`] if the byte stream is not valid for the detected
81/// encoding, contains an invalid Unicode codepoint, or is truncated (odd-length
82/// UTF-16 or non-multiple-of-four UTF-32).
83pub fn decode(bytes: &[u8]) -> Result<String, EncodingError> {
84    match detect_encoding(bytes) {
85        Encoding::Utf8 => decode_utf8(bytes),
86        Encoding::Utf16Le => decode_utf16(bytes, Endian::Little),
87        Encoding::Utf16Be => decode_utf16(bytes, Endian::Big),
88        Encoding::Utf32Le => decode_utf32(bytes, Endian::Little),
89        Encoding::Utf32Be => decode_utf32(bytes, Endian::Big),
90    }
91}
92
/// Byte order used to assemble multi-byte code units in the UTF-16 and
/// UTF-32 decoders.
#[derive(Copy, Clone)]
enum Endian {
    /// Least-significant byte first.
    Little,
    /// Most-significant byte first.
    Big,
}
98
99fn decode_utf8(bytes: &[u8]) -> Result<String, EncodingError> {
100    let s = core::str::from_utf8(bytes).map_err(|_| EncodingError::InvalidBytes)?;
101    // Strip UTF-8 BOM (U+FEFF) if present.
102    Ok(s.strip_prefix('\u{FEFF}').unwrap_or(s).to_owned())
103}
104
105fn decode_utf16(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
106    if !bytes.len().is_multiple_of(2) {
107        return Err(EncodingError::TruncatedUtf16);
108    }
109    // Collect u16 code units.
110    let units: Vec<u16> = bytes
111        .chunks_exact(2)
112        .map(|chunk| match (chunk, endian) {
113            ([lo, hi], Endian::Little) => u16::from_le_bytes([*lo, *hi]),
114            ([hi, lo], Endian::Big) => u16::from_be_bytes([*hi, *lo]),
115            _ => 0, // chunks_exact(2) guarantees length 2; unreachable
116        })
117        .collect();
118
119    // Strip BOM (U+FEFF) if first unit is the BOM codepoint.
120    let units = match units.as_slice() {
121        [0xFEFF, rest @ ..] => rest,
122        other => other,
123    };
124
125    // Decode UTF-16 surrogate pairs.
126    char::decode_utf16(units.iter().copied()).try_fold(
127        String::with_capacity(units.len()),
128        |mut s, r| match r {
129            Ok(ch) => {
130                s.push(ch);
131                Ok(s)
132            }
133            Err(e) => Err(EncodingError::InvalidCodepoint(u32::from(
134                e.unpaired_surrogate(),
135            ))),
136        },
137    )
138}
139
140fn decode_utf32(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
141    if !bytes.len().is_multiple_of(4) {
142        return Err(EncodingError::TruncatedUtf32);
143    }
144    let mut out = String::with_capacity(bytes.len() / 4);
145    let mut skip_bom = true;
146    for chunk in bytes.chunks_exact(4) {
147        let cp = match (chunk, endian) {
148            ([a, b, c, d], Endian::Little) => u32::from_le_bytes([*a, *b, *c, *d]),
149            ([a, b, c, d], Endian::Big) => u32::from_be_bytes([*a, *b, *c, *d]),
150            _ => 0, // chunks_exact(4) guarantees length 4; unreachable
151        };
152        // Strip leading BOM.
153        if skip_bom && cp == 0xFEFF {
154            skip_bom = false;
155            continue;
156        }
157        skip_bom = false;
158        let ch = char::from_u32(cp).ok_or(EncodingError::InvalidCodepoint(cp))?;
159        out.push(ch);
160    }
161    Ok(out)
162}
163
// ---------------------------------------------------------------------------
// Line-break normalization
// ---------------------------------------------------------------------------
167
/// Rewrite every line break in `s` as a single LF (`\n`).
///
/// - CRLF (`\r\n`) becomes `\n`
/// - a CR (`\r`) not followed by LF becomes `\n`
/// - LF (`\n`) is left as it is
///
/// Input without any `\r` is returned unchanged, without reallocating.
#[must_use]
pub fn normalize_line_breaks(s: String) -> String {
    if !s.contains('\r') {
        return s;
    }

    let mut normalized = String::with_capacity(s.len());
    // True while the character just processed was a CR, i.e. an LF seen now
    // would be the second half of a CRLF pair.
    let mut after_cr = false;
    for ch in s.chars() {
        match ch {
            // Every CR produces the line break, whether or not an LF follows.
            '\r' => normalized.push('\n'),
            // The LF of a CRLF pair was already emitted when its CR was seen.
            '\n' if after_cr => {}
            other => normalized.push(other),
        }
        after_cr = ch == '\r';
    }
    normalized
}
194
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
198
/// Unit tests for encoding detection, decoding, error formatting and
/// line-break normalization.
#[cfg(test)]
#[expect(clippy::unwrap_used, reason = "test code")]
mod tests {
    use rstest::rstest;

    use super::*;

    // -----------------------------------------------------------------------
    // detect_encoding
    // -----------------------------------------------------------------------

    #[test]
    fn detect_encoding_returns_utf8_for_empty_bytes() {
        assert_eq!(detect_encoding(b""), Encoding::Utf8);
    }

    #[rstest]
    #[case::utf8_bom(&[0xEF, 0xBB, 0xBF, b'a'], Encoding::Utf8)]
    #[case::utf16_le_bom(&[0xFF, 0xFE, b'a', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_bom(&[0xFE, 0xFF, 0x00, b'a'], Encoding::Utf16Be)]
    #[case::utf32_le_bom(&[0xFF, 0xFE, 0x00, 0x00], Encoding::Utf32Le)]
    #[case::utf32_be_bom(&[0x00, 0x00, 0xFE, 0xFF], Encoding::Utf32Be)]
    fn detect_encoding_bom(#[case] bytes: &[u8], #[case] expected: Encoding) {
        assert_eq!(detect_encoding(bytes), expected);
    }

    #[test]
    fn detect_encoding_falls_back_to_utf8_for_plain_ascii() {
        assert_eq!(detect_encoding(b"key: value\n"), Encoding::Utf8);
    }

    #[rstest]
    #[case::utf16_le_without_bom(&[b'a', 0x00, b'b', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_without_bom(&[0x00, b'a', 0x00, b'b'], Encoding::Utf16Be)]
    #[case::utf16_le_single_unit(&[b'a', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_single_unit(&[0x00, b'a'], Encoding::Utf16Be)]
    fn detect_encoding_null_byte_heuristic(#[case] bytes: &[u8], #[case] expected: Encoding) {
        assert_eq!(detect_encoding(bytes), expected);
    }

    // -----------------------------------------------------------------------
    // decode
    // -----------------------------------------------------------------------

    #[rstest]
    #[case::utf8_plain_ascii(b"hello: world\n" as &[u8], "hello: world\n")]
    #[case::utf8_strips_bom(&[0xEF, 0xBB, 0xBF, b'k', b'e', b'y'], "key")]
    #[case::utf16_le_no_bom(&[0x68, 0x00, 0x69, 0x00], "hi")]
    #[case::utf16_be_no_bom(&[0x00, 0x68, 0x00, 0x69], "hi")]
    #[case::utf16_le_strips_bom(&[0xFF, 0xFE, 0x68, 0x00, 0x69, 0x00], "hi")]
    #[case::utf16_be_strips_bom(&[0xFE, 0xFF, 0x00, 0x68, 0x00, 0x69], "hi")]
    #[case::utf16_le_surrogate_pair(&[0xFF, 0xFE, 0x3D, 0xD8, 0x00, 0xDE], "\u{1F600}")]
    #[case::utf32_le_strips_bom(&[0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00], "h")]
    #[case::utf32_be_strips_bom(&[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x68], "h")]
    #[case::empty_input(b"", "")]
    fn decode_ok(#[case] bytes: &[u8], #[case] expected: &str) {
        assert_eq!(decode(bytes).unwrap(), expected);
    }

    #[rstest]
    // Lone continuation byte — not valid UTF-8, no BOM so treated as UTF-8.
    #[case::invalid_utf8(&[0x80], EncodingError::InvalidBytes)]
    #[case::utf16_odd_length(&[0xFF, 0xFE, 0x68], EncodingError::TruncatedUtf16)]
    #[case::utf32_partial_unit(&[0xFF, 0xFE, 0x00, 0x00, 0x68], EncodingError::TruncatedUtf32)]
    #[case::utf16_unpaired_surrogate(
        &[0xFF, 0xFE, 0x00, 0xD8],
        EncodingError::InvalidCodepoint(0xD800)
    )]
    #[case::utf32_out_of_range(
        &[0xFF, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00],
        EncodingError::InvalidCodepoint(0x0011_0000)
    )]
    fn decode_err(#[case] bytes: &[u8], #[case] expected: EncodingError) {
        assert_eq!(decode(bytes), Err(expected));
    }

    // -----------------------------------------------------------------------
    // EncodingError Display
    // -----------------------------------------------------------------------

    #[rstest]
    #[case::invalid_bytes(
        EncodingError::InvalidBytes,
        "invalid byte sequence for detected encoding"
    )]
    #[case::invalid_codepoint(
        EncodingError::InvalidCodepoint(0xD800),
        "invalid Unicode codepoint U+D800"
    )]
    #[case::invalid_codepoint_is_zero_padded(
        EncodingError::InvalidCodepoint(0x41),
        "invalid Unicode codepoint U+0041"
    )]
    #[case::truncated_utf16(
        EncodingError::TruncatedUtf16,
        "UTF-16 stream has an odd number of bytes"
    )]
    #[case::truncated_utf32(
        EncodingError::TruncatedUtf32,
        "UTF-32 stream length is not a multiple of four"
    )]
    fn encoding_error_display(#[case] error: EncodingError, #[case] expected: &str) {
        assert_eq!(error.to_string(), expected);
    }

    // -----------------------------------------------------------------------
    // normalize_line_breaks
    // -----------------------------------------------------------------------

    #[rstest]
    #[case::crlf_to_lf("a\r\nb\r\nc".to_string(), "a\nb\nc")]
    #[case::lone_cr_to_lf("a\rb\rc".to_string(), "a\nb\nc")]
    #[case::lf_only_unchanged("a\nb\nc".to_string(), "a\nb\nc")]
    #[case::mixed_line_endings("a\r\nb\rc\nd".to_string(), "a\nb\nc\nd")]
    #[case::empty_string_unchanged(String::new(), "")]
    #[case::crlf_not_doubled("\r\n".to_string(), "\n")]
    #[case::cr_then_crlf("\r\r\n".to_string(), "\n\n")]
    #[case::trailing_cr("a\r".to_string(), "a\n")]
    fn normalize_line_breaks_cases(#[case] input: String, #[case] expected: &str) {
        assert_eq!(normalize_line_breaks(input), expected);
    }
}