Skip to main content

rlsp_yaml_parser/
encoding.rs

1// SPDX-License-Identifier: MIT
2
/// The encoding detected from the input byte stream.
///
/// Returned by [`detect_encoding`] and used by [`decode`] to pick the decoder.
/// The type is a plain fieldless enum, so it is cheap to copy, compare, and
/// hash (e.g. for use as a map key).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Encoding {
    /// UTF-8 (default when no BOM or null-byte heuristic matches).
    Utf8,
    /// UTF-16 little-endian (BOM `FF FE` or null-byte heuristic).
    Utf16Le,
    /// UTF-16 big-endian (BOM `FE FF` or null-byte heuristic).
    Utf16Be,
    /// UTF-32 little-endian (BOM `FF FE 00 00` or null-byte heuristic).
    Utf32Le,
    /// UTF-32 big-endian (BOM `00 00 FE FF` or null-byte heuristic).
    Utf32Be,
}
17
/// Error produced when `decode` cannot convert the byte stream to UTF-8.
///
/// Implements [`core::fmt::Display`] for human-readable messages and
/// [`std::error::Error`] so it can be boxed as `dyn Error` or propagated with
/// `?` into generic error types.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EncodingError {
    /// Input bytes are not valid for the detected encoding.
    InvalidBytes,
    /// A UTF-16 or UTF-32 sequence contains an unpaired surrogate or an
    /// out-of-range codepoint. The payload is the offending value.
    InvalidCodepoint(u32),
    /// UTF-16 input has an odd number of bytes.
    TruncatedUtf16,
    /// UTF-32 input length is not a multiple of four.
    TruncatedUtf32,
}

impl core::fmt::Display for EncodingError {
    /// Write a one-line description of the error.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::InvalidBytes => write!(f, "invalid byte sequence for detected encoding"),
            // Upper-case hex, zero-padded to at least four digits (U+XXXX form).
            Self::InvalidCodepoint(cp) => write!(f, "invalid Unicode codepoint U+{cp:04X}"),
            Self::TruncatedUtf16 => write!(f, "UTF-16 stream has an odd number of bytes"),
            Self::TruncatedUtf32 => {
                write!(f, "UTF-32 stream length is not a multiple of four")
            }
        }
    }
}

// No underlying cause to expose, so the default `source()` (None) is correct.
impl std::error::Error for EncodingError {}
44
45// ---------------------------------------------------------------------------
46// Encoding detection
47// ---------------------------------------------------------------------------
48
49/// Detect the encoding of a YAML byte stream via BOM or null-byte heuristic.
50///
51/// Implements YAML 1.2 §5.2 encoding detection. UTF-32 BOMs are checked
52/// before UTF-16 because the UTF-32 LE BOM (`FF FE 00 00`) is a superset of
53/// the UTF-16 LE BOM (`FF FE`). The same prefix-overlap reasoning applies to
54/// BOM-less heuristics: UTF-32 null-byte patterns must come before UTF-16
55/// null-byte patterns because `[a, 0x00, 0x00, 0x00, ..]` (UTF-32-LE) is a
56/// strict superset of `[a, 0x00, ..]` (UTF-16-LE).
57#[must_use]
58pub fn detect_encoding(bytes: &[u8]) -> Encoding {
59    match bytes {
60        // UTF-32 BOMs (must come before UTF-16 checks)
61        [0x00, 0x00, 0xFE, 0xFF, ..] => Encoding::Utf32Be,
62        [0xFF, 0xFE, 0x00, 0x00, ..] => Encoding::Utf32Le,
63        // UTF-16 BOMs
64        [0xFE, 0xFF, ..] => Encoding::Utf16Be,
65        [0xFF, 0xFE, ..] => Encoding::Utf16Le,
66        // BOM-less UTF-32 heuristics (must come before UTF-16 heuristics — see doc comment)
67        [0x00, 0x00, 0x00, a, ..] if *a != 0 => Encoding::Utf32Be,
68        [a, 0x00, 0x00, 0x00, ..] if *a != 0 => Encoding::Utf32Le,
69        // Null-byte heuristic (no BOM): YAML streams begin with ASCII content.
70        // UTF-16 LE: odd bytes are null  → first pair is [<ascii>, 0x00]
71        // UTF-16 BE: even bytes are null → first pair is [0x00, <ascii>]
72        [a, 0x00, b, 0x00, ..] if *a != 0 && *b != 0 => Encoding::Utf16Le,
73        [0x00, a, 0x00, b, ..] if *a != 0 && *b != 0 => Encoding::Utf16Be,
74        [a, 0x00, ..] if *a != 0 => Encoding::Utf16Le,
75        [0x00, a, ..] if *a != 0 => Encoding::Utf16Be,
76        _ => Encoding::Utf8,
77    }
78}
79
80// ---------------------------------------------------------------------------
81// Decoding
82// ---------------------------------------------------------------------------
83
84/// Decode a YAML byte stream to a UTF-8 `String`, stripping any BOM.
85///
86/// Detects encoding via [`detect_encoding`], converts to UTF-8, and removes
87/// the BOM character if present.
88///
89/// # Errors
90///
91/// Returns [`EncodingError`] if the byte stream is not valid for the detected
92/// encoding, contains an invalid Unicode codepoint, or is truncated (odd-length
93/// UTF-16 or non-multiple-of-four UTF-32).
94pub fn decode(bytes: &[u8]) -> Result<String, EncodingError> {
95    match detect_encoding(bytes) {
96        Encoding::Utf8 => decode_utf8(bytes),
97        Encoding::Utf16Le => decode_utf16(bytes, Endian::Little),
98        Encoding::Utf16Be => decode_utf16(bytes, Endian::Big),
99        Encoding::Utf32Le => decode_utf32(bytes, Endian::Little),
100        Encoding::Utf32Be => decode_utf32(bytes, Endian::Big),
101    }
102}
103
/// Byte order of the code units in a UTF-16 or UTF-32 stream.
#[derive(Clone, Copy)]
enum Endian {
    /// Least-significant byte first (decoded with `from_le_bytes`).
    Little,
    /// Most-significant byte first (decoded with `from_be_bytes`).
    Big,
}
109
110fn decode_utf8(bytes: &[u8]) -> Result<String, EncodingError> {
111    let s = core::str::from_utf8(bytes).map_err(|_| EncodingError::InvalidBytes)?;
112    // Strip UTF-8 BOM (U+FEFF) if present.
113    Ok(s.strip_prefix('\u{FEFF}').unwrap_or(s).to_owned())
114}
115
116fn decode_utf16(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
117    if !bytes.len().is_multiple_of(2) {
118        return Err(EncodingError::TruncatedUtf16);
119    }
120    // Collect u16 code units.
121    let units: Vec<u16> = bytes
122        .chunks_exact(2)
123        .map(|chunk| match (chunk, endian) {
124            ([lo, hi], Endian::Little) => u16::from_le_bytes([*lo, *hi]),
125            ([hi, lo], Endian::Big) => u16::from_be_bytes([*hi, *lo]),
126            _ => 0, // chunks_exact(2) guarantees length 2; unreachable
127        })
128        .collect();
129
130    // Strip BOM (U+FEFF) if first unit is the BOM codepoint.
131    let units = match units.as_slice() {
132        [0xFEFF, rest @ ..] => rest,
133        other => other,
134    };
135
136    // Decode UTF-16 surrogate pairs.
137    char::decode_utf16(units.iter().copied()).try_fold(
138        String::with_capacity(units.len()),
139        |mut s, r| match r {
140            Ok(ch) => {
141                s.push(ch);
142                Ok(s)
143            }
144            Err(e) => Err(EncodingError::InvalidCodepoint(u32::from(
145                e.unpaired_surrogate(),
146            ))),
147        },
148    )
149}
150
151fn decode_utf32(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
152    if !bytes.len().is_multiple_of(4) {
153        return Err(EncodingError::TruncatedUtf32);
154    }
155    let mut out = String::with_capacity(bytes.len() / 4);
156    let mut skip_bom = true;
157    for chunk in bytes.chunks_exact(4) {
158        let cp = match (chunk, endian) {
159            ([a, b, c, d], Endian::Little) => u32::from_le_bytes([*a, *b, *c, *d]),
160            ([a, b, c, d], Endian::Big) => u32::from_be_bytes([*a, *b, *c, *d]),
161            _ => 0, // chunks_exact(4) guarantees length 4; unreachable
162        };
163        // Strip leading BOM.
164        if skip_bom && cp == 0xFEFF {
165            skip_bom = false;
166            continue;
167        }
168        skip_bom = false;
169        let ch = char::from_u32(cp).ok_or(EncodingError::InvalidCodepoint(cp))?;
170        out.push(ch);
171    }
172    Ok(out)
173}
174
175// ---------------------------------------------------------------------------
176// Line-break normalization
177// ---------------------------------------------------------------------------
178
/// Rewrite every line break in `s` as a single LF (`\n`).
///
/// - `\r\n` (CRLF) becomes `\n`
/// - `\r` (lone CR) becomes `\n`
/// - `\n` (LF) is left as is
///
/// Input without any CR is returned unchanged, without reallocating.
#[must_use]
pub fn normalize_line_breaks(s: String) -> String {
    // Fast path: only CR ever triggers a rewrite.
    if !s.contains('\r') {
        return s;
    }

    let mut normalized = String::with_capacity(s.len());
    let mut segments = s.split('\r');
    // Text before the first CR is copied verbatim.
    if let Some(head) = segments.next() {
        normalized.push_str(head);
    }
    // Every further segment was preceded by a CR: emit one LF for it and, if
    // the CR was the first half of a CRLF, drop the LF that follows so the
    // break is not doubled.
    for segment in segments {
        normalized.push('\n');
        normalized.push_str(segment.strip_prefix('\n').unwrap_or(segment));
    }
    normalized
}
205
206// ---------------------------------------------------------------------------
207// Tests
208// ---------------------------------------------------------------------------
209
#[cfg(test)]
mod tests {
    //! Unit tests for encoding detection, decoding (success and error paths),
    //! and line-break normalization.

    use rstest::rstest;

    use super::*;

    // -----------------------------------------------------------------------
    // detect_encoding
    // -----------------------------------------------------------------------

    #[test]
    fn detect_encoding_returns_utf8_for_empty_bytes() {
        assert_eq!(detect_encoding(b""), Encoding::Utf8);
    }

    #[rstest]
    #[case::utf8_bom(&[0xEF, 0xBB, 0xBF, b'a'], Encoding::Utf8)]
    #[case::utf16_le_bom(&[0xFF, 0xFE, b'a', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_bom(&[0xFE, 0xFF, 0x00, b'a'], Encoding::Utf16Be)]
    #[case::utf32_le_bom(&[0xFF, 0xFE, 0x00, 0x00], Encoding::Utf32Le)]
    #[case::utf32_be_bom(&[0x00, 0x00, 0xFE, 0xFF], Encoding::Utf32Be)]
    fn detect_encoding_bom(#[case] bytes: &[u8], #[case] expected: Encoding) {
        assert_eq!(detect_encoding(bytes), expected);
    }

    #[test]
    fn detect_encoding_falls_back_to_utf8_for_plain_ascii() {
        assert_eq!(detect_encoding(b"key: value\n"), Encoding::Utf8);
    }

    #[rstest]
    #[case::utf16_le_without_bom(&[b'a', 0x00, b'b', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_without_bom(&[0x00, b'a', 0x00, b'b'], Encoding::Utf16Be)]
    // Test 63/64: 2-byte slices — covers the [a, 0x00, ..] and [0x00, a, ..] arms
    // that also match when only two bytes are present.
    #[case::utf16_le_two_byte_heuristic(&[b'a', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_two_byte_heuristic(&[0x00, b'a'], Encoding::Utf16Be)]
    fn detect_encoding_null_byte_heuristic(#[case] bytes: &[u8], #[case] expected: Encoding) {
        assert_eq!(detect_encoding(bytes), expected);
    }

    // IT-1: BOM-less UTF-32-BE positive detection and decode round-trip
    #[test]
    fn detect_encoding_bom_less_utf32_be() {
        let input: &[u8] = &[0x00, 0x00, 0x00, 0x6B, 0x00, 0x00, 0x00, 0x3A];
        assert_eq!(detect_encoding(input), Encoding::Utf32Be);
        assert_eq!(decode(input).unwrap(), "k:");
    }

    // IT-2: BOM-less UTF-32-LE positive detection and decode round-trip
    #[test]
    fn detect_encoding_bom_less_utf32_le() {
        let input: &[u8] = &[0x6B, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x00, 0x00];
        assert_eq!(detect_encoding(input), Encoding::Utf32Le);
        assert_eq!(decode(input).unwrap(), "k:");
    }

    // IT-3: exactly 4 bytes — minimum valid BOM-less UTF-32-BE input
    #[test]
    fn detect_encoding_exactly_four_bytes_utf32_be() {
        assert_eq!(
            detect_encoding(&[0x00, 0x00, 0x00, 0x41]),
            Encoding::Utf32Be
        );
    }

    // IT-4: exactly 4 bytes — minimum valid BOM-less UTF-32-LE input
    #[test]
    fn detect_encoding_exactly_four_bytes_utf32_le() {
        assert_eq!(
            detect_encoding(&[0x41, 0x00, 0x00, 0x00]),
            Encoding::Utf32Le
        );
    }

    // IT-5: all-zero input must fall through to UTF-8 (the non-zero guard)
    #[test]
    fn detect_encoding_all_zero_input_is_utf8() {
        assert_eq!(detect_encoding(&[0x00u8; 16]), Encoding::Utf8);
    }

    // IT-6: UTF-16-BE regression — non-zero at byte 1 means it is not UTF-32-BE
    #[test]
    fn detect_encoding_bom_less_utf32_does_not_shadow_utf16_be() {
        assert_eq!(
            detect_encoding(&[0x00, 0x41, 0x00, 0x42]),
            Encoding::Utf16Be
        );
    }

    // IT-7: UTF-16-LE regression — non-zero at byte 2 means it is not UTF-32-LE
    #[test]
    fn detect_encoding_bom_less_utf32_does_not_shadow_utf16_le() {
        assert_eq!(
            detect_encoding(&[0x41, 0x00, 0x42, 0x00]),
            Encoding::Utf16Le
        );
    }

    // IT-8: 3-byte input must not match the 4-byte UTF-32 arm
    #[test]
    fn detect_encoding_three_bytes_does_not_match_utf32() {
        assert_eq!(detect_encoding(&[0x00, 0x00, 0x00]), Encoding::Utf8);
    }

    // -----------------------------------------------------------------------
    // decode
    // -----------------------------------------------------------------------

    #[rstest]
    #[case::utf8_plain_ascii(b"hello: world\n" as &[u8], "hello: world\n")]
    #[case::utf8_strips_bom(&[0xEF, 0xBB, 0xBF, b'k', b'e', b'y'], "key")]
    #[case::utf16_le_no_bom(&[0x68, 0x00, 0x69, 0x00], "hi")]
    #[case::utf16_be_no_bom(&[0x00, 0x68, 0x00, 0x69], "hi")]
    #[case::utf16_le_strips_bom(&[0xFF, 0xFE, 0x68, 0x00, 0x69, 0x00], "hi")]
    #[case::utf16_be_strips_bom(&[0xFE, 0xFF, 0x00, 0x68, 0x00, 0x69], "hi")]
    // Surrogate pair D83D DE00 (U+1F600) after a UTF-16 LE BOM.
    #[case::utf16_le_surrogate_pair(&[0xFF, 0xFE, 0x3D, 0xD8, 0x00, 0xDE], "\u{1F600}")]
    #[case::empty_input(b"", "")]
    // Test 65: UTF-32 LE with BOM — covers the Endian::Little branch in decode_utf32.
    // BOM (FF FE 00 00) + 'A' in UTF-32 LE (41 00 00 00).
    #[case::utf32_le_with_bom(&[0xFF, 0xFE, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00], "A")]
    // BOM (00 00 FE FF) + 'A' in UTF-32 BE (00 00 00 41).
    #[case::utf32_be_with_bom(&[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x41], "A")]
    fn decode_ok(#[case] bytes: &[u8], #[case] expected: &str) {
        assert_eq!(decode(bytes).unwrap(), expected);
    }

    // Test 67: decode_utf32 BOM handling — first chunk is the BOM (stripped),
    // second chunk is U+FEFF again (kept, because only a leading BOM is removed).
    #[test]
    fn decode_utf32_be_second_bom_codepoint_kept_as_content() {
        // UTF-32 BE BOM (first 4 bytes) + U+FEFF as content (next 4 bytes).
        // Expected: first BOM stripped, second U+FEFF preserved as content.
        let input: &[u8] = &[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0xFE, 0xFF];
        assert_eq!(decode(input).unwrap(), "\u{FEFF}");
    }

    #[test]
    fn decode_invalid_utf8_returns_error() {
        // Lone continuation byte — not valid UTF-8, no BOM so treated as UTF-8
        assert!(decode(&[0x80]).is_err());
    }

    // Error paths: each case pins the exact variant (and payload) returned.
    #[rstest]
    #[case::invalid_utf8(&[0x80], EncodingError::InvalidBytes)]
    // UTF-16 LE BOM followed by a dangling byte.
    #[case::utf16_odd_length(&[0xFF, 0xFE, 0x68], EncodingError::TruncatedUtf16)]
    // UTF-32 LE BOM followed by a dangling byte.
    #[case::utf32_trailing_byte(
        &[0xFF, 0xFE, 0x00, 0x00, 0x41],
        EncodingError::TruncatedUtf32
    )]
    // UTF-16 LE BOM followed by a lone high surrogate (D800).
    #[case::utf16_unpaired_surrogate(
        &[0xFF, 0xFE, 0x00, 0xD8],
        EncodingError::InvalidCodepoint(0xD800)
    )]
    // UTF-32 BE BOM followed by 0x110000, one past the last valid codepoint.
    #[case::utf32_out_of_range(
        &[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x11, 0x00, 0x00],
        EncodingError::InvalidCodepoint(0x0011_0000)
    )]
    fn decode_err(#[case] bytes: &[u8], #[case] expected: EncodingError) {
        assert_eq!(decode(bytes), Err(expected));
    }

    #[test]
    fn encoding_error_display_formats_codepoint_as_padded_hex() {
        assert_eq!(
            EncodingError::InvalidCodepoint(0xD800).to_string(),
            "invalid Unicode codepoint U+D800"
        );
        assert_eq!(
            EncodingError::InvalidCodepoint(0x41).to_string(),
            "invalid Unicode codepoint U+0041"
        );
    }

    // -----------------------------------------------------------------------
    // normalize_line_breaks
    // -----------------------------------------------------------------------

    #[rstest]
    #[case::crlf_to_lf("a\r\nb\r\nc".to_string(), "a\nb\nc")]
    #[case::lone_cr_to_lf("a\rb\rc".to_string(), "a\nb\nc")]
    #[case::lf_only_unchanged("a\nb\nc".to_string(), "a\nb\nc")]
    #[case::mixed_line_endings("a\r\nb\rc\nd".to_string(), "a\nb\nc\nd")]
    #[case::empty_string_unchanged(String::new(), "")]
    #[case::crlf_not_doubled("\r\n".to_string(), "\n")]
    #[case::cr_then_crlf("\r\r\n".to_string(), "\n\n")]
    fn normalize_line_breaks_cases(#[case] input: String, #[case] expected: &str) {
        assert_eq!(normalize_line_breaks(input), expected);
    }
}
363}