Skip to main content

rlsp_yaml_parser/
encoding.rs

1// SPDX-License-Identifier: MIT
2
/// Character encoding of a YAML byte stream, as reported by
/// [`detect_encoding`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Encoding {
    /// UTF-8. Also the result when no other detection rule matches.
    Utf8,
    /// UTF-16, little-endian byte order.
    Utf16Le,
    /// UTF-16, big-endian byte order.
    Utf16Be,
    /// UTF-32, little-endian byte order.
    Utf32Le,
    /// UTF-32, big-endian byte order.
    Utf32Be,
}
12
/// Reasons `decode` can fail to turn a byte stream into a UTF-8 `String`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum EncodingError {
    /// The bytes do not form a valid sequence in the detected encoding
    /// (reported for malformed UTF-8).
    InvalidBytes,
    /// A UTF-16 stream contained an unpaired surrogate, or a UTF-32 stream
    /// contained a value that is not a Unicode scalar value. The offending
    /// code unit / codepoint is carried in the payload.
    InvalidCodepoint(u32),
    /// The UTF-16 input length is odd, so the last code unit is incomplete.
    TruncatedUtf16,
    /// The UTF-32 input length is not a multiple of four, so the last
    /// codepoint is incomplete.
    TruncatedUtf32,
}
26
27impl core::fmt::Display for EncodingError {
28    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
29        match self {
30            Self::InvalidBytes => write!(f, "invalid byte sequence for detected encoding"),
31            Self::InvalidCodepoint(cp) => write!(f, "invalid Unicode codepoint U+{cp:04X}"),
32            Self::TruncatedUtf16 => write!(f, "UTF-16 stream has an odd number of bytes"),
33            Self::TruncatedUtf32 => {
34                write!(f, "UTF-32 stream length is not a multiple of four")
35            }
36        }
37    }
38}
39
40// ---------------------------------------------------------------------------
41// Encoding detection
42// ---------------------------------------------------------------------------
43
44/// Detect the encoding of a YAML byte stream via BOM or null-byte heuristic.
45///
46/// Implements YAML 1.2 §5.2 encoding detection. UTF-32 BOMs are checked
47/// before UTF-16 because the UTF-32 LE BOM (`FF FE 00 00`) is a superset of
48/// the UTF-16 LE BOM (`FF FE`).
49#[must_use]
50pub fn detect_encoding(bytes: &[u8]) -> Encoding {
51    match bytes {
52        // UTF-32 BOMs (must come before UTF-16 checks)
53        [0x00, 0x00, 0xFE, 0xFF, ..] => Encoding::Utf32Be,
54        [0xFF, 0xFE, 0x00, 0x00, ..] => Encoding::Utf32Le,
55        // UTF-16 BOMs
56        [0xFE, 0xFF, ..] => Encoding::Utf16Be,
57        [0xFF, 0xFE, ..] => Encoding::Utf16Le,
58        // Null-byte heuristic (no BOM): YAML streams begin with ASCII content.
59        // UTF-16 LE: odd bytes are null  → first pair is [<ascii>, 0x00]
60        // UTF-16 BE: even bytes are null → first pair is [0x00, <ascii>]
61        [a, 0x00, b, 0x00, ..] if *a != 0 && *b != 0 => Encoding::Utf16Le,
62        [0x00, a, 0x00, b, ..] if *a != 0 && *b != 0 => Encoding::Utf16Be,
63        [a, 0x00, ..] if *a != 0 => Encoding::Utf16Le,
64        [0x00, a, ..] if *a != 0 => Encoding::Utf16Be,
65        _ => Encoding::Utf8,
66    }
67}
68
69// ---------------------------------------------------------------------------
70// Decoding
71// ---------------------------------------------------------------------------
72
73/// Decode a YAML byte stream to a UTF-8 `String`, stripping any BOM.
74///
75/// Detects encoding via [`detect_encoding`], converts to UTF-8, and removes
76/// the BOM character if present.
77///
78/// # Errors
79///
80/// Returns [`EncodingError`] if the byte stream is not valid for the detected
81/// encoding, contains an invalid Unicode codepoint, or is truncated (odd-length
82/// UTF-16 or non-multiple-of-four UTF-32).
83pub fn decode(bytes: &[u8]) -> Result<String, EncodingError> {
84    match detect_encoding(bytes) {
85        Encoding::Utf8 => decode_utf8(bytes),
86        Encoding::Utf16Le => decode_utf16(bytes, Endian::Little),
87        Encoding::Utf16Be => decode_utf16(bytes, Endian::Big),
88        Encoding::Utf32Le => decode_utf32(bytes, Endian::Little),
89        Encoding::Utf32Be => decode_utf32(bytes, Endian::Big),
90    }
91}
92
/// Byte order used when assembling multi-byte code units.
#[derive(Copy, Clone)]
enum Endian {
    /// Least-significant byte comes first.
    Little,
    /// Most-significant byte comes first.
    Big,
}
98
99fn decode_utf8(bytes: &[u8]) -> Result<String, EncodingError> {
100    let s = core::str::from_utf8(bytes).map_err(|_| EncodingError::InvalidBytes)?;
101    // Strip UTF-8 BOM (U+FEFF) if present.
102    Ok(s.strip_prefix('\u{FEFF}').unwrap_or(s).to_owned())
103}
104
105fn decode_utf16(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
106    if !bytes.len().is_multiple_of(2) {
107        return Err(EncodingError::TruncatedUtf16);
108    }
109    // Collect u16 code units.
110    let units: Vec<u16> = bytes
111        .chunks_exact(2)
112        .map(|chunk| match (chunk, endian) {
113            ([lo, hi], Endian::Little) => u16::from_le_bytes([*lo, *hi]),
114            ([hi, lo], Endian::Big) => u16::from_be_bytes([*hi, *lo]),
115            _ => 0, // chunks_exact(2) guarantees length 2; unreachable
116        })
117        .collect();
118
119    // Strip BOM (U+FEFF) if first unit is the BOM codepoint.
120    let units = match units.as_slice() {
121        [0xFEFF, rest @ ..] => rest,
122        other => other,
123    };
124
125    // Decode UTF-16 surrogate pairs.
126    char::decode_utf16(units.iter().copied()).try_fold(
127        String::with_capacity(units.len()),
128        |mut s, r| match r {
129            Ok(ch) => {
130                s.push(ch);
131                Ok(s)
132            }
133            Err(e) => Err(EncodingError::InvalidCodepoint(u32::from(
134                e.unpaired_surrogate(),
135            ))),
136        },
137    )
138}
139
140fn decode_utf32(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
141    if !bytes.len().is_multiple_of(4) {
142        return Err(EncodingError::TruncatedUtf32);
143    }
144    let mut out = String::with_capacity(bytes.len() / 4);
145    let mut skip_bom = true;
146    for chunk in bytes.chunks_exact(4) {
147        let cp = match (chunk, endian) {
148            ([a, b, c, d], Endian::Little) => u32::from_le_bytes([*a, *b, *c, *d]),
149            ([a, b, c, d], Endian::Big) => u32::from_be_bytes([*a, *b, *c, *d]),
150            _ => 0, // chunks_exact(4) guarantees length 4; unreachable
151        };
152        // Strip leading BOM.
153        if skip_bom && cp == 0xFEFF {
154            skip_bom = false;
155            continue;
156        }
157        skip_bom = false;
158        let ch = char::from_u32(cp).ok_or(EncodingError::InvalidCodepoint(cp))?;
159        out.push(ch);
160    }
161    Ok(out)
162}
163
164// ---------------------------------------------------------------------------
165// Line-break normalization
166// ---------------------------------------------------------------------------
167
/// Rewrite every line break in `s` as a single LF (`\n`).
///
/// - CRLF (`\r\n`) becomes `\n`
/// - a CR (`\r`) not followed by LF becomes `\n`
/// - LF (`\n`) is left as is
///
/// When `s` contains no CR at all it is returned unchanged, without
/// allocating.
#[must_use]
pub fn normalize_line_breaks(s: String) -> String {
    if !s.contains('\r') {
        return s;
    }

    let mut normalized = String::with_capacity(s.len());
    let mut segments = s.split('\r');
    // Everything before the first CR is copied verbatim.
    if let Some(head) = segments.next() {
        normalized.push_str(head);
    }
    // Each remaining segment was preceded by a CR. Emit one LF for that CR
    // and, if the segment starts with the LF half of a CRLF pair, skip it so
    // the break is not doubled.
    for segment in segments {
        normalized.push('\n');
        normalized.push_str(segment.strip_prefix('\n').unwrap_or(segment));
    }
    normalized
}
194
195// ---------------------------------------------------------------------------
196// Tests
197// ---------------------------------------------------------------------------
198
/// Unit tests for encoding detection, decoding, and line-break
/// normalization.
#[cfg(test)]
#[allow(clippy::indexing_slicing, clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // detect_encoding
    // -----------------------------------------------------------------------

    #[test]
    fn detect_encoding_returns_utf8_for_empty_bytes() {
        assert_eq!(detect_encoding(b""), Encoding::Utf8);
    }

    #[test]
    fn detect_encoding_recognizes_utf8_bom() {
        assert_eq!(detect_encoding(&[0xEF, 0xBB, 0xBF, b'a']), Encoding::Utf8);
    }

    #[test]
    fn detect_encoding_recognizes_utf16_le_bom() {
        assert_eq!(
            detect_encoding(&[0xFF, 0xFE, b'a', 0x00]),
            Encoding::Utf16Le
        );
    }

    #[test]
    fn detect_encoding_recognizes_utf16_be_bom() {
        assert_eq!(
            detect_encoding(&[0xFE, 0xFF, 0x00, b'a']),
            Encoding::Utf16Be
        );
    }

    #[test]
    fn detect_encoding_recognizes_utf32_le_bom() {
        assert_eq!(
            detect_encoding(&[0xFF, 0xFE, 0x00, 0x00]),
            Encoding::Utf32Le
        );
    }

    #[test]
    fn detect_encoding_recognizes_utf32_be_bom() {
        assert_eq!(
            detect_encoding(&[0x00, 0x00, 0xFE, 0xFF]),
            Encoding::Utf32Be
        );
    }

    #[test]
    fn detect_encoding_falls_back_to_utf8_for_plain_ascii() {
        assert_eq!(detect_encoding(b"key: value\n"), Encoding::Utf8);
    }

    #[test]
    fn detect_encoding_uses_null_byte_heuristic_for_utf16_le_without_bom() {
        assert_eq!(
            detect_encoding(&[b'a', 0x00, b'b', 0x00]),
            Encoding::Utf16Le
        );
    }

    #[test]
    fn detect_encoding_uses_null_byte_heuristic_for_utf16_be_without_bom() {
        assert_eq!(
            detect_encoding(&[0x00, b'a', 0x00, b'b']),
            Encoding::Utf16Be
        );
    }

    // -----------------------------------------------------------------------
    // decode
    // -----------------------------------------------------------------------

    #[test]
    fn decode_utf8_plain_ascii_roundtrips() {
        let result = decode(b"hello: world\n");
        assert_eq!(result.unwrap(), "hello: world\n");
    }

    #[test]
    fn decode_utf8_strips_bom() {
        let result = decode(&[0xEF, 0xBB, 0xBF, b'k', b'e', b'y']);
        assert_eq!(result.unwrap(), "key");
    }

    #[test]
    fn decode_utf16_le_produces_correct_utf8() {
        // "hi" in UTF-16 LE (no BOM)
        let result = decode(&[0x68, 0x00, 0x69, 0x00]);
        assert_eq!(result.unwrap(), "hi");
    }

    #[test]
    fn decode_utf16_be_produces_correct_utf8() {
        // "hi" in UTF-16 BE (no BOM)
        let result = decode(&[0x00, 0x68, 0x00, 0x69]);
        assert_eq!(result.unwrap(), "hi");
    }

    #[test]
    fn decode_invalid_utf8_returns_error() {
        // Lone continuation byte — not valid UTF-8, no BOM so treated as UTF-8
        let result = decode(&[0x80]);
        assert!(result.is_err());
    }

    #[test]
    fn decode_utf16_le_strips_bom() {
        // UTF-16 LE BOM + "hi"
        let result = decode(&[0xFF, 0xFE, 0x68, 0x00, 0x69, 0x00]);
        assert_eq!(result.unwrap(), "hi");
    }

    #[test]
    fn decode_empty_input_returns_empty_string() {
        let result = decode(b"");
        assert_eq!(result.unwrap(), "");
    }

    #[test]
    fn decode_utf16_be_strips_bom() {
        // UTF-16 BE BOM + "hi"
        let result = decode(&[0xFE, 0xFF, 0x00, 0x68, 0x00, 0x69]);
        assert_eq!(result.unwrap(), "hi");
    }

    #[test]
    fn decode_utf16_combines_surrogate_pair() {
        // UTF-16 LE BOM + U+1F600 encoded as the pair D83D DE00
        let result = decode(&[0xFF, 0xFE, 0x3D, 0xD8, 0x00, 0xDE]);
        assert_eq!(result.unwrap(), "\u{1F600}");
    }

    #[test]
    fn decode_utf32_le_strips_bom() {
        // UTF-32 LE BOM + "hi"
        let result = decode(&[
            0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00,
        ]);
        assert_eq!(result.unwrap(), "hi");
    }

    #[test]
    fn decode_utf32_be_strips_bom() {
        // UTF-32 BE BOM + "hi"
        let result = decode(&[
            0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x69,
        ]);
        assert_eq!(result.unwrap(), "hi");
    }

    // -----------------------------------------------------------------------
    // decode — error variants
    // -----------------------------------------------------------------------

    #[test]
    fn decode_invalid_utf8_reports_invalid_bytes() {
        assert_eq!(decode(&[0x80]), Err(EncodingError::InvalidBytes));
    }

    #[test]
    fn decode_odd_length_utf16_reports_truncation() {
        // UTF-16 LE BOM followed by a single dangling byte
        assert_eq!(
            decode(&[0xFF, 0xFE, 0x68]),
            Err(EncodingError::TruncatedUtf16)
        );
    }

    #[test]
    fn decode_utf32_with_partial_codepoint_reports_truncation() {
        // UTF-32 LE BOM followed by two of the four bytes of a codepoint
        assert_eq!(
            decode(&[0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00]),
            Err(EncodingError::TruncatedUtf32)
        );
    }

    #[test]
    fn decode_utf16_unpaired_surrogate_reports_invalid_codepoint() {
        // UTF-16 LE BOM + lone high surrogate U+D800
        assert_eq!(
            decode(&[0xFF, 0xFE, 0x00, 0xD8]),
            Err(EncodingError::InvalidCodepoint(0xD800))
        );
    }

    #[test]
    fn decode_utf32_out_of_range_reports_invalid_codepoint() {
        // UTF-32 LE BOM + 0x110000, one past the last Unicode codepoint
        assert_eq!(
            decode(&[0xFF, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00]),
            Err(EncodingError::InvalidCodepoint(0x0011_0000))
        );
    }

    // -----------------------------------------------------------------------
    // EncodingError display
    // -----------------------------------------------------------------------

    #[test]
    fn encoding_error_display_messages() {
        assert_eq!(
            EncodingError::InvalidBytes.to_string(),
            "invalid byte sequence for detected encoding"
        );
        assert_eq!(
            EncodingError::InvalidCodepoint(0xD800).to_string(),
            "invalid Unicode codepoint U+D800"
        );
        assert_eq!(
            EncodingError::TruncatedUtf16.to_string(),
            "UTF-16 stream has an odd number of bytes"
        );
        assert_eq!(
            EncodingError::TruncatedUtf32.to_string(),
            "UTF-32 stream length is not a multiple of four"
        );
    }

    // -----------------------------------------------------------------------
    // normalize_line_breaks
    // -----------------------------------------------------------------------

    #[test]
    fn normalize_crlf_to_lf() {
        assert_eq!(normalize_line_breaks("a\r\nb\r\nc".to_string()), "a\nb\nc");
    }

    #[test]
    fn normalize_lone_cr_to_lf() {
        assert_eq!(normalize_line_breaks("a\rb\rc".to_string()), "a\nb\nc");
    }

    #[test]
    fn normalize_lf_only_is_unchanged() {
        assert_eq!(normalize_line_breaks("a\nb\nc".to_string()), "a\nb\nc");
    }

    #[test]
    fn normalize_mixed_line_endings() {
        assert_eq!(
            normalize_line_breaks("a\r\nb\rc\nd".to_string()),
            "a\nb\nc\nd"
        );
    }

    #[test]
    fn normalize_empty_string_is_unchanged() {
        assert_eq!(normalize_line_breaks(String::new()), "");
    }

    #[test]
    fn normalize_does_not_double_lf_after_crlf() {
        assert_eq!(normalize_line_breaks("\r\n".to_string()), "\n");
    }

    #[test]
    fn normalize_trailing_cr_becomes_lf() {
        assert_eq!(normalize_line_breaks("a\r".to_string()), "a\n");
    }
}