Skip to main content

rlsp_yaml_parser/
encoding.rs

1// SPDX-License-Identifier: MIT
2
/// Character encoding of a YAML byte stream, as reported by encoding
/// detection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Encoding {
    /// UTF-8. This is the fallback whenever neither a BOM nor the null-byte
    /// heuristic points at a wider encoding.
    Utf8,
    /// UTF-16, little-endian byte order (signalled by the BOM `FF FE` or by
    /// the null-byte heuristic).
    Utf16Le,
    /// UTF-16, big-endian byte order (signalled by the BOM `FE FF` or by the
    /// null-byte heuristic).
    Utf16Be,
    /// UTF-32, little-endian byte order (signalled by the BOM `FF FE 00 00`).
    Utf32Le,
    /// UTF-32, big-endian byte order (signalled by the BOM `00 00 FE FF`).
    Utf32Be,
}
17
/// Error produced when `decode` cannot convert the byte stream to UTF-8.
///
/// Implements [`core::fmt::Display`] for human-readable messages and
/// [`core::error::Error`] so it can be boxed, chained as a `source`, or
/// propagated with `?` into generic error types.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EncodingError {
    /// Input bytes are not valid for the detected encoding.
    InvalidBytes,
    /// A UTF-16 or UTF-32 sequence contains an unpaired surrogate or an
    /// out-of-range codepoint. The payload is the offending value.
    InvalidCodepoint(u32),
    /// UTF-16 input has an odd number of bytes.
    TruncatedUtf16,
    /// UTF-32 input length is not a multiple of four.
    TruncatedUtf32,
}

impl core::fmt::Display for EncodingError {
    /// Write a short, lowercase, punctuation-free description of the error.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::InvalidBytes => write!(f, "invalid byte sequence for detected encoding"),
            // Zero-padded to at least four hex digits, matching `U+XXXX` notation.
            Self::InvalidCodepoint(cp) => write!(f, "invalid Unicode codepoint U+{cp:04X}"),
            Self::TruncatedUtf16 => write!(f, "UTF-16 stream has an odd number of bytes"),
            Self::TruncatedUtf32 => {
                write!(f, "UTF-32 stream length is not a multiple of four")
            }
        }
    }
}

// No variant wraps another error, so the default `source()` (always `None`)
// is the correct implementation.
impl core::error::Error for EncodingError {}
44
45// ---------------------------------------------------------------------------
46// Encoding detection
47// ---------------------------------------------------------------------------
48
49/// Detect the encoding of a YAML byte stream via BOM or null-byte heuristic.
50///
51/// Implements YAML 1.2 §5.2 encoding detection. UTF-32 BOMs are checked
52/// before UTF-16 because the UTF-32 LE BOM (`FF FE 00 00`) is a superset of
53/// the UTF-16 LE BOM (`FF FE`).
54#[must_use]
55pub fn detect_encoding(bytes: &[u8]) -> Encoding {
56    match bytes {
57        // UTF-32 BOMs (must come before UTF-16 checks)
58        [0x00, 0x00, 0xFE, 0xFF, ..] => Encoding::Utf32Be,
59        [0xFF, 0xFE, 0x00, 0x00, ..] => Encoding::Utf32Le,
60        // UTF-16 BOMs
61        [0xFE, 0xFF, ..] => Encoding::Utf16Be,
62        [0xFF, 0xFE, ..] => Encoding::Utf16Le,
63        // Null-byte heuristic (no BOM): YAML streams begin with ASCII content.
64        // UTF-16 LE: odd bytes are null  → first pair is [<ascii>, 0x00]
65        // UTF-16 BE: even bytes are null → first pair is [0x00, <ascii>]
66        [a, 0x00, b, 0x00, ..] if *a != 0 && *b != 0 => Encoding::Utf16Le,
67        [0x00, a, 0x00, b, ..] if *a != 0 && *b != 0 => Encoding::Utf16Be,
68        [a, 0x00, ..] if *a != 0 => Encoding::Utf16Le,
69        [0x00, a, ..] if *a != 0 => Encoding::Utf16Be,
70        _ => Encoding::Utf8,
71    }
72}
73
74// ---------------------------------------------------------------------------
75// Decoding
76// ---------------------------------------------------------------------------
77
78/// Decode a YAML byte stream to a UTF-8 `String`, stripping any BOM.
79///
80/// Detects encoding via [`detect_encoding`], converts to UTF-8, and removes
81/// the BOM character if present.
82///
83/// # Errors
84///
85/// Returns [`EncodingError`] if the byte stream is not valid for the detected
86/// encoding, contains an invalid Unicode codepoint, or is truncated (odd-length
87/// UTF-16 or non-multiple-of-four UTF-32).
88pub fn decode(bytes: &[u8]) -> Result<String, EncodingError> {
89    match detect_encoding(bytes) {
90        Encoding::Utf8 => decode_utf8(bytes),
91        Encoding::Utf16Le => decode_utf16(bytes, Endian::Little),
92        Encoding::Utf16Be => decode_utf16(bytes, Endian::Big),
93        Encoding::Utf32Le => decode_utf32(bytes, Endian::Little),
94        Encoding::Utf32Be => decode_utf32(bytes, Endian::Big),
95    }
96}
97
/// Byte order of the multi-byte code units handed to the UTF-16 and UTF-32
/// decoders.
#[derive(Clone, Copy)]
enum Endian {
    /// Least-significant byte first.
    Little,
    /// Most-significant byte first.
    Big,
}
103
104fn decode_utf8(bytes: &[u8]) -> Result<String, EncodingError> {
105    let s = core::str::from_utf8(bytes).map_err(|_| EncodingError::InvalidBytes)?;
106    // Strip UTF-8 BOM (U+FEFF) if present.
107    Ok(s.strip_prefix('\u{FEFF}').unwrap_or(s).to_owned())
108}
109
110fn decode_utf16(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
111    if !bytes.len().is_multiple_of(2) {
112        return Err(EncodingError::TruncatedUtf16);
113    }
114    // Collect u16 code units.
115    let units: Vec<u16> = bytes
116        .chunks_exact(2)
117        .map(|chunk| match (chunk, endian) {
118            ([lo, hi], Endian::Little) => u16::from_le_bytes([*lo, *hi]),
119            ([hi, lo], Endian::Big) => u16::from_be_bytes([*hi, *lo]),
120            _ => 0, // chunks_exact(2) guarantees length 2; unreachable
121        })
122        .collect();
123
124    // Strip BOM (U+FEFF) if first unit is the BOM codepoint.
125    let units = match units.as_slice() {
126        [0xFEFF, rest @ ..] => rest,
127        other => other,
128    };
129
130    // Decode UTF-16 surrogate pairs.
131    char::decode_utf16(units.iter().copied()).try_fold(
132        String::with_capacity(units.len()),
133        |mut s, r| match r {
134            Ok(ch) => {
135                s.push(ch);
136                Ok(s)
137            }
138            Err(e) => Err(EncodingError::InvalidCodepoint(u32::from(
139                e.unpaired_surrogate(),
140            ))),
141        },
142    )
143}
144
145fn decode_utf32(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
146    if !bytes.len().is_multiple_of(4) {
147        return Err(EncodingError::TruncatedUtf32);
148    }
149    let mut out = String::with_capacity(bytes.len() / 4);
150    let mut skip_bom = true;
151    for chunk in bytes.chunks_exact(4) {
152        let cp = match (chunk, endian) {
153            ([a, b, c, d], Endian::Little) => u32::from_le_bytes([*a, *b, *c, *d]),
154            ([a, b, c, d], Endian::Big) => u32::from_be_bytes([*a, *b, *c, *d]),
155            _ => 0, // chunks_exact(4) guarantees length 4; unreachable
156        };
157        // Strip leading BOM.
158        if skip_bom && cp == 0xFEFF {
159            skip_bom = false;
160            continue;
161        }
162        skip_bom = false;
163        let ch = char::from_u32(cp).ok_or(EncodingError::InvalidCodepoint(cp))?;
164        out.push(ch);
165    }
166    Ok(out)
167}
168
169// ---------------------------------------------------------------------------
170// Line-break normalization
171// ---------------------------------------------------------------------------
172
/// Rewrite every line break in `s` as a single LF (`\n`).
///
/// - CRLF (`\r\n`) becomes `\n`
/// - a CR (`\r`) not followed by LF becomes `\n`
/// - LF (`\n`) is left as it is
///
/// Input without any CR is returned as-is, without copying.
#[must_use]
pub fn normalize_line_breaks(s: String) -> String {
    // Nothing to rewrite: hand the original allocation back.
    if !s.contains('\r') {
        return s;
    }

    let mut normalized = String::with_capacity(s.len());
    let mut rest = s.as_str();
    // Copy the text up to each CR, emit one LF for it, and swallow the LF
    // half of a CRLF pair so it is not emitted twice.
    while let Some((before, after)) = rest.split_once('\r') {
        normalized.push_str(before);
        normalized.push('\n');
        rest = after.strip_prefix('\n').unwrap_or(after);
    }
    normalized.push_str(rest);
    normalized
}
199
200// ---------------------------------------------------------------------------
201// Tests
202// ---------------------------------------------------------------------------
203
#[cfg(test)]
#[expect(clippy::unwrap_used, reason = "test code")]
mod tests {
    use rstest::rstest;

    use super::*;

    // -----------------------------------------------------------------------
    // detect_encoding
    // -----------------------------------------------------------------------

    #[test]
    fn detect_encoding_returns_utf8_for_empty_bytes() {
        assert_eq!(detect_encoding(b""), Encoding::Utf8);
    }

    #[rstest]
    #[case::utf8_bom(&[0xEF, 0xBB, 0xBF, b'a'], Encoding::Utf8)]
    #[case::utf16_le_bom(&[0xFF, 0xFE, b'a', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_bom(&[0xFE, 0xFF, 0x00, b'a'], Encoding::Utf16Be)]
    #[case::utf32_le_bom(&[0xFF, 0xFE, 0x00, 0x00], Encoding::Utf32Le)]
    #[case::utf32_be_bom(&[0x00, 0x00, 0xFE, 0xFF], Encoding::Utf32Be)]
    fn detect_encoding_bom(#[case] bytes: &[u8], #[case] expected: Encoding) {
        assert_eq!(detect_encoding(bytes), expected);
    }

    #[test]
    fn detect_encoding_falls_back_to_utf8_for_plain_ascii() {
        assert_eq!(detect_encoding(b"key: value\n"), Encoding::Utf8);
    }

    #[rstest]
    #[case::utf16_le_without_bom(&[b'a', 0x00, b'b', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_without_bom(&[0x00, b'a', 0x00, b'b'], Encoding::Utf16Be)]
    // Two-byte inputs: the heuristic must also fire when only a single
    // UTF-16 code unit is available.
    #[case::utf16_le_two_byte_heuristic(&[b'a', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_two_byte_heuristic(&[0x00, b'a'], Encoding::Utf16Be)]
    fn detect_encoding_null_byte_heuristic(#[case] bytes: &[u8], #[case] expected: Encoding) {
        assert_eq!(detect_encoding(bytes), expected);
    }

    // -----------------------------------------------------------------------
    // decode
    // -----------------------------------------------------------------------

    #[rstest]
    #[case::utf8_plain_ascii(b"hello: world\n" as &[u8], "hello: world\n")]
    #[case::utf8_strips_bom(&[0xEF, 0xBB, 0xBF, b'k', b'e', b'y'], "key")]
    #[case::utf16_le_no_bom(&[0x68, 0x00, 0x69, 0x00], "hi")]
    #[case::utf16_be_no_bom(&[0x00, 0x68, 0x00, 0x69], "hi")]
    #[case::utf16_le_strips_bom(&[0xFF, 0xFE, 0x68, 0x00, 0x69, 0x00], "hi")]
    #[case::utf16_be_strips_bom(&[0xFE, 0xFF, 0x00, 0x68, 0x00, 0x69], "hi")]
    #[case::empty_input(b"", "")]
    // UTF-32 LE: BOM (FF FE 00 00) followed by 'A' (41 00 00 00).
    #[case::utf32_le_with_bom(&[0xFF, 0xFE, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00], "A")]
    // UTF-32 BE: BOM (00 00 FE FF) followed by 'A' (00 00 00 41).
    #[case::utf32_be_with_bom(&[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x41], "A")]
    fn decode_ok(#[case] bytes: &[u8], #[case] expected: &str) {
        assert_eq!(decode(bytes).unwrap(), expected);
    }

    // Only a BOM in the very first position is stripped; a U+FEFF that
    // follows it is ordinary content and must survive decoding.
    #[test]
    fn decode_utf32_be_second_bom_codepoint_kept_as_content() {
        // UTF-32 BE BOM (first 4 bytes) + U+FEFF as content (next 4 bytes).
        // Expected: first BOM stripped, second U+FEFF preserved as content.
        let input: &[u8] = &[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0xFE, 0xFF];
        assert_eq!(decode(input).unwrap(), "\u{FEFF}");
    }

    #[test]
    fn decode_invalid_utf8_returns_error() {
        // Lone continuation byte — not valid UTF-8, no BOM so treated as UTF-8
        assert!(decode(&[0x80]).is_err());
    }

    #[rstest]
    // Lone continuation byte, decoded as UTF-8 by default.
    #[case::utf8_invalid_bytes(&[0x80], EncodingError::InvalidBytes)]
    // UTF-16 LE BOM followed by half a code unit.
    #[case::utf16_odd_length(&[0xFF, 0xFE, 0x41], EncodingError::TruncatedUtf16)]
    // UTF-32 LE BOM followed by a single stray byte.
    #[case::utf32_bad_length(&[0xFF, 0xFE, 0x00, 0x00, 0x41], EncodingError::TruncatedUtf32)]
    // UTF-16 LE BOM followed by a high surrogate (D800) with no partner.
    #[case::utf16_unpaired_surrogate(
        &[0xFF, 0xFE, 0x00, 0xD8],
        EncodingError::InvalidCodepoint(0xD800)
    )]
    // UTF-32 BE BOM followed by 0x110000, one past the last valid codepoint.
    #[case::utf32_out_of_range(
        &[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x11, 0x00, 0x00],
        EncodingError::InvalidCodepoint(0x0011_0000)
    )]
    fn decode_err(#[case] bytes: &[u8], #[case] expected: EncodingError) {
        assert_eq!(decode(bytes), Err(expected));
    }

    // -----------------------------------------------------------------------
    // normalize_line_breaks
    // -----------------------------------------------------------------------

    #[rstest]
    #[case::crlf_to_lf("a\r\nb\r\nc".to_string(), "a\nb\nc")]
    #[case::lone_cr_to_lf("a\rb\rc".to_string(), "a\nb\nc")]
    #[case::lf_only_unchanged("a\nb\nc".to_string(), "a\nb\nc")]
    #[case::mixed_line_endings("a\r\nb\rc\nd".to_string(), "a\nb\nc\nd")]
    #[case::empty_string_unchanged(String::new(), "")]
    #[case::crlf_not_doubled("\r\n".to_string(), "\n")]
    fn normalize_line_breaks_cases(#[case] input: String, #[case] expected: &str) {
        assert_eq!(normalize_line_breaks(input), expected);
    }
}