/// Text encoding of a byte stream.
///
/// This is the result type of [`detect_encoding`] and selects the decoder
/// used by [`decode`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Encoding {
    /// UTF-8; also the fallback when no other encoding is recognised.
    Utf8,
    /// UTF-16, little-endian byte order.
    Utf16Le,
    /// UTF-16, big-endian byte order.
    Utf16Be,
    /// UTF-32, little-endian byte order.
    Utf32Le,
    /// UTF-32, big-endian byte order.
    Utf32Be,
}
17
/// Reasons a byte stream could not be decoded into a `String`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum EncodingError {
    /// The bytes are not a valid sequence in the detected encoding
    /// (reported when UTF-8 validation fails).
    InvalidBytes,
    /// A value that is not a Unicode scalar value: either an out-of-range
    /// UTF-32 code point or an unpaired UTF-16 surrogate. The offending
    /// value is carried in the variant.
    InvalidCodepoint(u32),
    /// A UTF-16 stream whose byte length is odd.
    TruncatedUtf16,
    /// A UTF-32 stream whose byte length is not a multiple of four.
    TruncatedUtf32,
}
31
32impl core::fmt::Display for EncodingError {
33 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
34 match self {
35 Self::InvalidBytes => write!(f, "invalid byte sequence for detected encoding"),
36 Self::InvalidCodepoint(cp) => write!(f, "invalid Unicode codepoint U+{cp:04X}"),
37 Self::TruncatedUtf16 => write!(f, "UTF-16 stream has an odd number of bytes"),
38 Self::TruncatedUtf32 => {
39 write!(f, "UTF-32 stream length is not a multiple of four")
40 }
41 }
42 }
43}
44
45#[must_use]
58pub fn detect_encoding(bytes: &[u8]) -> Encoding {
59 match bytes {
60 [0x00, 0x00, 0xFE, 0xFF, ..] => Encoding::Utf32Be,
62 [0xFF, 0xFE, 0x00, 0x00, ..] => Encoding::Utf32Le,
63 [0xFE, 0xFF, ..] => Encoding::Utf16Be,
65 [0xFF, 0xFE, ..] => Encoding::Utf16Le,
66 [0x00, 0x00, 0x00, a, ..] if *a != 0 => Encoding::Utf32Be,
68 [a, 0x00, 0x00, 0x00, ..] if *a != 0 => Encoding::Utf32Le,
69 [a, 0x00, b, 0x00, ..] if *a != 0 && *b != 0 => Encoding::Utf16Le,
73 [0x00, a, 0x00, b, ..] if *a != 0 && *b != 0 => Encoding::Utf16Be,
74 [a, 0x00, ..] if *a != 0 => Encoding::Utf16Le,
75 [0x00, a, ..] if *a != 0 => Encoding::Utf16Be,
76 _ => Encoding::Utf8,
77 }
78}
79
80pub fn decode(bytes: &[u8]) -> Result<String, EncodingError> {
95 match detect_encoding(bytes) {
96 Encoding::Utf8 => decode_utf8(bytes),
97 Encoding::Utf16Le => decode_utf16(bytes, Endian::Little),
98 Encoding::Utf16Be => decode_utf16(bytes, Endian::Big),
99 Encoding::Utf32Le => decode_utf32(bytes, Endian::Little),
100 Encoding::Utf32Be => decode_utf32(bytes, Endian::Big),
101 }
102}
103
/// Byte order of the multi-byte code units handled by the UTF-16 and UTF-32
/// decoders.
#[derive(Copy, Clone)]
enum Endian {
    /// Least-significant byte first.
    Little,
    /// Most-significant byte first.
    Big,
}
109
110fn decode_utf8(bytes: &[u8]) -> Result<String, EncodingError> {
111 let s = core::str::from_utf8(bytes).map_err(|_| EncodingError::InvalidBytes)?;
112 Ok(s.strip_prefix('\u{FEFF}').unwrap_or(s).to_owned())
114}
115
116fn decode_utf16(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
117 if !bytes.len().is_multiple_of(2) {
118 return Err(EncodingError::TruncatedUtf16);
119 }
120 let units: Vec<u16> = bytes
122 .chunks_exact(2)
123 .map(|chunk| match (chunk, endian) {
124 ([lo, hi], Endian::Little) => u16::from_le_bytes([*lo, *hi]),
125 ([hi, lo], Endian::Big) => u16::from_be_bytes([*hi, *lo]),
126 _ => 0, })
128 .collect();
129
130 let units = match units.as_slice() {
132 [0xFEFF, rest @ ..] => rest,
133 other => other,
134 };
135
136 char::decode_utf16(units.iter().copied()).try_fold(
138 String::with_capacity(units.len()),
139 |mut s, r| match r {
140 Ok(ch) => {
141 s.push(ch);
142 Ok(s)
143 }
144 Err(e) => Err(EncodingError::InvalidCodepoint(u32::from(
145 e.unpaired_surrogate(),
146 ))),
147 },
148 )
149}
150
151fn decode_utf32(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
152 if !bytes.len().is_multiple_of(4) {
153 return Err(EncodingError::TruncatedUtf32);
154 }
155 let mut out = String::with_capacity(bytes.len() / 4);
156 let mut skip_bom = true;
157 for chunk in bytes.chunks_exact(4) {
158 let cp = match (chunk, endian) {
159 ([a, b, c, d], Endian::Little) => u32::from_le_bytes([*a, *b, *c, *d]),
160 ([a, b, c, d], Endian::Big) => u32::from_be_bytes([*a, *b, *c, *d]),
161 _ => 0, };
163 if skip_bom && cp == 0xFEFF {
165 skip_bom = false;
166 continue;
167 }
168 skip_bom = false;
169 let ch = char::from_u32(cp).ok_or(EncodingError::InvalidCodepoint(cp))?;
170 out.push(ch);
171 }
172 Ok(out)
173}
174
/// Rewrites every line break in `s` to a single line feed (`\n`).
///
/// Both `\r\n` pairs and lone `\r` characters become `\n`; existing `\n`
/// characters are left alone. A string containing no `\r` is returned as-is
/// without reallocating.
#[must_use]
pub fn normalize_line_breaks(s: String) -> String {
    if s.contains('\r') {
        let mut normalized = String::with_capacity(s.len());
        // Splitting on `\r` puts one line break at every seam between pieces.
        let mut pieces = s.split('\r');
        if let Some(first) = pieces.next() {
            normalized.push_str(first);
        }
        for piece in pieces {
            normalized.push('\n');
            // A piece that starts with `\n` was the tail of a `\r\n` pair;
            // that `\n` is already accounted for by the seam.
            normalized.push_str(piece.strip_prefix('\n').unwrap_or(piece));
        }
        normalized
    } else {
        s
    }
}
205
/// Unit tests for encoding detection, decoding and line-break normalisation.
#[cfg(test)]
#[expect(clippy::unwrap_used, reason = "test code")]
mod tests {
    use rstest::rstest;

    use super::*;

    // ------------------------------------------------------------------
    // detect_encoding
    // ------------------------------------------------------------------

    /// Empty input has neither a BOM nor a NUL pattern.
    #[test]
    fn detect_encoding_returns_utf8_for_empty_bytes() {
        assert_eq!(detect_encoding(b""), Encoding::Utf8);
    }

    /// Every byte-order mark selects its encoding; the UTF-8 BOM falls
    /// through to the UTF-8 default.
    #[rstest]
    #[case::utf8_bom(&[0xEF, 0xBB, 0xBF, b'a'], Encoding::Utf8)]
    #[case::utf16_le_bom(&[0xFF, 0xFE, b'a', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_bom(&[0xFE, 0xFF, 0x00, b'a'], Encoding::Utf16Be)]
    #[case::utf32_le_bom(&[0xFF, 0xFE, 0x00, 0x00], Encoding::Utf32Le)]
    #[case::utf32_be_bom(&[0x00, 0x00, 0xFE, 0xFF], Encoding::Utf32Be)]
    fn detect_encoding_bom(#[case] bytes: &[u8], #[case] expected: Encoding) {
        assert_eq!(detect_encoding(bytes), expected);
    }

    #[test]
    fn detect_encoding_falls_back_to_utf8_for_plain_ascii() {
        assert_eq!(detect_encoding(b"key: value\n"), Encoding::Utf8);
    }

    /// BOM-less UTF-16 is recognised from the position of the NUL bytes,
    /// for both four-byte and two-byte inputs.
    #[rstest]
    #[case::utf16_le_without_bom(&[b'a', 0x00, b'b', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_without_bom(&[0x00, b'a', 0x00, b'b'], Encoding::Utf16Be)]
    #[case::utf16_le_two_byte_heuristic(&[b'a', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_two_byte_heuristic(&[0x00, b'a'], Encoding::Utf16Be)]
    fn detect_encoding_null_byte_heuristic(#[case] bytes: &[u8], #[case] expected: Encoding) {
        assert_eq!(detect_encoding(bytes), expected);
    }

    /// BOM-less UTF-32 BE is detected and decoded end to end.
    #[test]
    fn detect_encoding_bom_less_utf32_be() {
        let input: &[u8] = &[0x00, 0x00, 0x00, 0x6B, 0x00, 0x00, 0x00, 0x3A];
        assert_eq!(detect_encoding(input), Encoding::Utf32Be);
        assert_eq!(decode(input).unwrap(), "k:");
    }

    /// BOM-less UTF-32 LE is detected and decoded end to end.
    #[test]
    fn detect_encoding_bom_less_utf32_le() {
        let input: &[u8] = &[0x6B, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x00, 0x00];
        assert_eq!(detect_encoding(input), Encoding::Utf32Le);
        assert_eq!(decode(input).unwrap(), "k:");
    }

    /// Four bytes are enough for the UTF-32 BE heuristic.
    #[test]
    fn detect_encoding_exactly_four_bytes_utf32_be() {
        assert_eq!(
            detect_encoding(&[0x00, 0x00, 0x00, 0x41]),
            Encoding::Utf32Be
        );
    }

    /// Four bytes are enough for the UTF-32 LE heuristic.
    #[test]
    fn detect_encoding_exactly_four_bytes_utf32_le() {
        assert_eq!(
            detect_encoding(&[0x41, 0x00, 0x00, 0x00]),
            Encoding::Utf32Le
        );
    }

    /// All-zero input matches no heuristic (each one requires a non-zero byte).
    #[test]
    fn detect_encoding_all_zero_input_is_utf8() {
        assert_eq!(detect_encoding(&[0x00u8; 16]), Encoding::Utf8);
    }

    /// The UTF-32 heuristics must not capture BOM-less UTF-16 BE input.
    #[test]
    fn detect_encoding_bom_less_utf32_does_not_shadow_utf16_be() {
        assert_eq!(
            detect_encoding(&[0x00, 0x41, 0x00, 0x42]),
            Encoding::Utf16Be
        );
    }

    /// The UTF-32 heuristics must not capture BOM-less UTF-16 LE input.
    #[test]
    fn detect_encoding_bom_less_utf32_does_not_shadow_utf16_le() {
        assert_eq!(
            detect_encoding(&[0x41, 0x00, 0x42, 0x00]),
            Encoding::Utf16Le
        );
    }

    /// The UTF-32 heuristics need four bytes; three zero bytes are not enough.
    #[test]
    fn detect_encoding_three_bytes_does_not_match_utf32() {
        assert_eq!(detect_encoding(&[0x00, 0x00, 0x00]), Encoding::Utf8);
    }

    // ------------------------------------------------------------------
    // decode
    // ------------------------------------------------------------------

    /// Successful decoding across encodings, with and without a BOM.
    #[rstest]
    #[case::utf8_plain_ascii(b"hello: world\n" as &[u8], "hello: world\n")]
    #[case::utf8_strips_bom(&[0xEF, 0xBB, 0xBF, b'k', b'e', b'y'], "key")]
    #[case::utf16_le_no_bom(&[0x68, 0x00, 0x69, 0x00], "hi")]
    #[case::utf16_be_no_bom(&[0x00, 0x68, 0x00, 0x69], "hi")]
    #[case::utf16_le_strips_bom(&[0xFF, 0xFE, 0x68, 0x00, 0x69, 0x00], "hi")]
    #[case::empty_input(b"", "")]
    #[case::utf32_le_with_bom(&[0xFF, 0xFE, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00], "A")]
    #[case::utf16_be_strips_bom(&[0xFE, 0xFF, 0x00, 0x68, 0x00, 0x69], "hi")]
    #[case::utf16_le_surrogate_pair(&[0xFF, 0xFE, 0x3D, 0xD8, 0x00, 0xDE], "\u{1F600}")]
    #[case::utf32_be_with_bom(&[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x41], "A")]
    fn decode_ok(#[case] bytes: &[u8], #[case] expected: &str) {
        assert_eq!(decode(bytes).unwrap(), expected);
    }

    /// Only the first U+FEFF is a byte-order mark; a second one is content.
    #[test]
    fn decode_utf32_be_second_bom_codepoint_kept_as_content() {
        let input: &[u8] = &[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0xFE, 0xFF];
        assert_eq!(decode(input).unwrap(), "\u{FEFF}");
    }

    #[test]
    fn decode_invalid_utf8_returns_error() {
        assert!(decode(&[0x80]).is_err());
    }

    /// Each failure mode is reported with its specific error variant.
    #[rstest]
    #[case::invalid_utf8(&[0x80], EncodingError::InvalidBytes)]
    #[case::truncated_utf16(&[0xFF, 0xFE, 0x41], EncodingError::TruncatedUtf16)]
    #[case::truncated_utf32(&[0x00, 0x00, 0xFE, 0xFF, 0x00], EncodingError::TruncatedUtf32)]
    #[case::utf16_unpaired_surrogate(
        &[0xFF, 0xFE, 0x00, 0xD8],
        EncodingError::InvalidCodepoint(0xD800)
    )]
    #[case::utf32_out_of_range(
        &[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x11, 0x00, 0x00],
        EncodingError::InvalidCodepoint(0x0011_0000)
    )]
    fn decode_err(#[case] bytes: &[u8], #[case] expected: EncodingError) {
        assert_eq!(decode(bytes), Err(expected));
    }

    // ------------------------------------------------------------------
    // normalize_line_breaks
    // ------------------------------------------------------------------

    /// CRLF and lone CR both become LF; LF-only input is untouched.
    #[rstest]
    #[case::crlf_to_lf("a\r\nb\r\nc".to_string(), "a\nb\nc")]
    #[case::lone_cr_to_lf("a\rb\rc".to_string(), "a\nb\nc")]
    #[case::lf_only_unchanged("a\nb\nc".to_string(), "a\nb\nc")]
    #[case::mixed_line_endings("a\r\nb\rc\nd".to_string(), "a\nb\nc\nd")]
    #[case::empty_string_unchanged(String::new(), "")]
    #[case::crlf_not_doubled("\r\n".to_string(), "\n")]
    #[case::cr_followed_by_crlf("\r\r\n".to_string(), "\n\n")]
    #[case::trailing_cr("a\r".to_string(), "a\n")]
    fn normalize_line_breaks_cases(#[case] input: String, #[case] expected: &str) {
        assert_eq!(normalize_line_breaks(input), expected);
    }
}