/// Text encodings that [`detect_encoding`] can report and [`decode`] can read.
///
/// `Hash` is derived alongside the comparison traits so a value can be used
/// directly as a `HashMap` / `HashSet` key (for example to cache one decoder
/// per encoding).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Encoding {
    /// UTF-8. Also the fallback when no other byte pattern is recognised.
    Utf8,
    /// UTF-16 with little-endian code units.
    Utf16Le,
    /// UTF-16 with big-endian code units.
    Utf16Be,
    /// UTF-32 with little-endian code units.
    Utf32Le,
    /// UTF-32 with big-endian code units.
    Utf32Be,
}
12
/// Reasons a byte stream could not be decoded to a `String`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EncodingError {
    /// The bytes are not a valid sequence in the detected encoding.
    InvalidBytes,
    /// A decoded value is not a valid Unicode scalar value; the payload is
    /// the offending value (e.g. an unpaired UTF-16 surrogate).
    InvalidCodepoint(u32),
    /// A UTF-16 stream whose byte length is odd.
    TruncatedUtf16,
    /// A UTF-32 stream whose byte length is not a multiple of four.
    TruncatedUtf32,
}

impl core::fmt::Display for EncodingError {
    /// Writes a short, human-readable description of the error.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::InvalidBytes => f.write_str("invalid byte sequence for detected encoding"),
            // Zero-padded to at least four hex digits, the usual `U+XXXX` notation.
            Self::InvalidCodepoint(cp) => write!(f, "invalid Unicode codepoint U+{cp:04X}"),
            Self::TruncatedUtf16 => f.write_str("UTF-16 stream has an odd number of bytes"),
            Self::TruncatedUtf32 => {
                f.write_str("UTF-32 stream length is not a multiple of four")
            }
        }
    }
}

// Implementing the standard error trait lets callers box this type as
// `dyn Error`, propagate it with `?` into `Box<dyn Error>`, and attach it as
// the `source()` of a higher-level error. There is no underlying cause, so the
// default method implementations are sufficient.
impl core::error::Error for EncodingError {}
39
40#[must_use]
50pub fn detect_encoding(bytes: &[u8]) -> Encoding {
51 match bytes {
52 [0x00, 0x00, 0xFE, 0xFF, ..] => Encoding::Utf32Be,
54 [0xFF, 0xFE, 0x00, 0x00, ..] => Encoding::Utf32Le,
55 [0xFE, 0xFF, ..] => Encoding::Utf16Be,
57 [0xFF, 0xFE, ..] => Encoding::Utf16Le,
58 [a, 0x00, b, 0x00, ..] if *a != 0 && *b != 0 => Encoding::Utf16Le,
62 [0x00, a, 0x00, b, ..] if *a != 0 && *b != 0 => Encoding::Utf16Be,
63 [a, 0x00, ..] if *a != 0 => Encoding::Utf16Le,
64 [0x00, a, ..] if *a != 0 => Encoding::Utf16Be,
65 _ => Encoding::Utf8,
66 }
67}
68
69pub fn decode(bytes: &[u8]) -> Result<String, EncodingError> {
84 match detect_encoding(bytes) {
85 Encoding::Utf8 => decode_utf8(bytes),
86 Encoding::Utf16Le => decode_utf16(bytes, Endian::Little),
87 Encoding::Utf16Be => decode_utf16(bytes, Endian::Big),
88 Encoding::Utf32Le => decode_utf32(bytes, Endian::Little),
89 Encoding::Utf32Be => decode_utf32(bytes, Endian::Big),
90 }
91}
92
/// Byte order of the code units passed to the UTF-16 and UTF-32 decoders.
#[derive(Clone, Copy)]
enum Endian {
    /// Least-significant byte first.
    Little,
    /// Most-significant byte first.
    Big,
}
98
99fn decode_utf8(bytes: &[u8]) -> Result<String, EncodingError> {
100 let s = core::str::from_utf8(bytes).map_err(|_| EncodingError::InvalidBytes)?;
101 Ok(s.strip_prefix('\u{FEFF}').unwrap_or(s).to_owned())
103}
104
105fn decode_utf16(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
106 if !bytes.len().is_multiple_of(2) {
107 return Err(EncodingError::TruncatedUtf16);
108 }
109 let units: Vec<u16> = bytes
111 .chunks_exact(2)
112 .map(|chunk| match (chunk, endian) {
113 ([lo, hi], Endian::Little) => u16::from_le_bytes([*lo, *hi]),
114 ([hi, lo], Endian::Big) => u16::from_be_bytes([*hi, *lo]),
115 _ => 0, })
117 .collect();
118
119 let units = match units.as_slice() {
121 [0xFEFF, rest @ ..] => rest,
122 other => other,
123 };
124
125 char::decode_utf16(units.iter().copied()).try_fold(
127 String::with_capacity(units.len()),
128 |mut s, r| match r {
129 Ok(ch) => {
130 s.push(ch);
131 Ok(s)
132 }
133 Err(e) => Err(EncodingError::InvalidCodepoint(u32::from(
134 e.unpaired_surrogate(),
135 ))),
136 },
137 )
138}
139
140fn decode_utf32(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
141 if !bytes.len().is_multiple_of(4) {
142 return Err(EncodingError::TruncatedUtf32);
143 }
144 let mut out = String::with_capacity(bytes.len() / 4);
145 let mut skip_bom = true;
146 for chunk in bytes.chunks_exact(4) {
147 let cp = match (chunk, endian) {
148 ([a, b, c, d], Endian::Little) => u32::from_le_bytes([*a, *b, *c, *d]),
149 ([a, b, c, d], Endian::Big) => u32::from_be_bytes([*a, *b, *c, *d]),
150 _ => 0, };
152 if skip_bom && cp == 0xFEFF {
154 skip_bom = false;
155 continue;
156 }
157 skip_bom = false;
158 let ch = char::from_u32(cp).ok_or(EncodingError::InvalidCodepoint(cp))?;
159 out.push(ch);
160 }
161 Ok(out)
162}
163
/// Rewrites every `"\r\n"` pair and every lone `'\r'` in `s` as a single
/// `'\n'`.
///
/// A string that contains no `'\r'` is returned as is, without reallocating.
#[must_use]
pub fn normalize_line_breaks(s: String) -> String {
    if !s.contains('\r') {
        return s;
    }

    let mut normalized = String::with_capacity(s.len());
    let mut remaining = s.as_str();
    // Copy the text up to each carriage return, emit one line feed for it, and
    // swallow the line feed of a CRLF pair so it is not emitted twice.
    while let Some((before, after)) = remaining.split_once('\r') {
        normalized.push_str(before);
        normalized.push('\n');
        remaining = after.strip_prefix('\n').unwrap_or(after);
    }
    normalized.push_str(remaining);
    normalized
}
194
#[cfg(test)]
// Panicking helpers are acceptable in tests: a panic is the failure signal.
#[allow(clippy::indexing_slicing, clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;

    // ---- detect_encoding ------------------------------------------------

    #[test]
    fn detect_encoding_returns_utf8_for_empty_bytes() {
        assert_eq!(detect_encoding(b""), Encoding::Utf8);
    }

    #[test]
    fn detect_encoding_recognizes_utf8_bom() {
        assert_eq!(detect_encoding(&[0xEF, 0xBB, 0xBF, b'a']), Encoding::Utf8);
    }

    #[test]
    fn detect_encoding_recognizes_utf16_le_bom() {
        assert_eq!(
            detect_encoding(&[0xFF, 0xFE, b'a', 0x00]),
            Encoding::Utf16Le
        );
    }

    #[test]
    fn detect_encoding_recognizes_utf16_be_bom() {
        assert_eq!(
            detect_encoding(&[0xFE, 0xFF, 0x00, b'a']),
            Encoding::Utf16Be
        );
    }

    #[test]
    fn detect_encoding_recognizes_utf32_le_bom() {
        assert_eq!(
            detect_encoding(&[0xFF, 0xFE, 0x00, 0x00]),
            Encoding::Utf32Le
        );
    }

    #[test]
    fn detect_encoding_recognizes_utf32_be_bom() {
        assert_eq!(
            detect_encoding(&[0x00, 0x00, 0xFE, 0xFF]),
            Encoding::Utf32Be
        );
    }

    #[test]
    fn detect_encoding_falls_back_to_utf8_for_plain_ascii() {
        assert_eq!(detect_encoding(b"key: value\n"), Encoding::Utf8);
    }

    #[test]
    fn detect_encoding_uses_null_byte_heuristic_for_utf16_le_without_bom() {
        assert_eq!(
            detect_encoding(&[b'a', 0x00, b'b', 0x00]),
            Encoding::Utf16Le
        );
    }

    #[test]
    fn detect_encoding_uses_null_byte_heuristic_for_utf16_be_without_bom() {
        assert_eq!(
            detect_encoding(&[0x00, b'a', 0x00, b'b']),
            Encoding::Utf16Be
        );
    }

    // ---- decode: successful decoding --------------------------------------

    #[test]
    fn decode_utf8_plain_ascii_roundtrips() {
        let result = decode(b"hello: world\n");
        assert_eq!(result.unwrap(), "hello: world\n");
    }

    #[test]
    fn decode_utf8_strips_bom() {
        let result = decode(&[0xEF, 0xBB, 0xBF, b'k', b'e', b'y']);
        assert_eq!(result.unwrap(), "key");
    }

    #[test]
    fn decode_utf16_le_produces_correct_utf8() {
        let result = decode(&[0x68, 0x00, 0x69, 0x00]);
        assert_eq!(result.unwrap(), "hi");
    }

    #[test]
    fn decode_utf16_be_produces_correct_utf8() {
        let result = decode(&[0x00, 0x68, 0x00, 0x69]);
        assert_eq!(result.unwrap(), "hi");
    }

    #[test]
    fn decode_invalid_utf8_returns_error() {
        let result = decode(&[0x80]);
        assert!(result.is_err());
    }

    #[test]
    fn decode_utf16_le_strips_bom() {
        let result = decode(&[0xFF, 0xFE, 0x68, 0x00, 0x69, 0x00]);
        assert_eq!(result.unwrap(), "hi");
    }

    #[test]
    fn decode_utf16_be_strips_bom() {
        let result = decode(&[0xFE, 0xFF, 0x00, 0x68, 0x00, 0x69]);
        assert_eq!(result.unwrap(), "hi");
    }

    #[test]
    fn decode_utf16_le_combines_surrogate_pair() {
        // U+1F600 is the surrogate pair D83D DE00.
        let result = decode(&[0xFF, 0xFE, 0x3D, 0xD8, 0x00, 0xDE]);
        assert_eq!(result.unwrap(), "\u{1F600}");
    }

    #[test]
    fn decode_utf32_le_strips_bom() {
        let result = decode(&[
            0xFF, 0xFE, 0x00, 0x00, // BOM
            0x68, 0x00, 0x00, 0x00, // 'h'
            0x69, 0x00, 0x00, 0x00, // 'i'
        ]);
        assert_eq!(result.unwrap(), "hi");
    }

    #[test]
    fn decode_utf32_be_strips_bom() {
        let result = decode(&[
            0x00, 0x00, 0xFE, 0xFF, // BOM
            0x00, 0x00, 0x00, 0x68, // 'h'
            0x00, 0x00, 0x00, 0x69, // 'i'
        ]);
        assert_eq!(result.unwrap(), "hi");
    }

    #[test]
    fn decode_empty_input_returns_empty_string() {
        let result = decode(b"");
        assert_eq!(result.unwrap(), "");
    }

    // ---- decode: error reporting ------------------------------------------

    #[test]
    fn decode_invalid_utf8_reports_invalid_bytes() {
        assert_eq!(decode(&[0x80]), Err(EncodingError::InvalidBytes));
    }

    #[test]
    fn decode_odd_length_utf16_reports_truncation() {
        assert_eq!(
            decode(&[0xFF, 0xFE, 0x68]),
            Err(EncodingError::TruncatedUtf16)
        );
    }

    #[test]
    fn decode_utf32_with_trailing_byte_reports_truncation() {
        assert_eq!(
            decode(&[0xFF, 0xFE, 0x00, 0x00, 0x68]),
            Err(EncodingError::TruncatedUtf32)
        );
    }

    #[test]
    fn decode_utf16_unpaired_surrogate_reports_codepoint() {
        // BOM followed by a lone high surrogate (D800).
        assert_eq!(
            decode(&[0xFF, 0xFE, 0x00, 0xD8]),
            Err(EncodingError::InvalidCodepoint(0xD800))
        );
    }

    #[test]
    fn decode_utf32_out_of_range_value_reports_codepoint() {
        // 0x110000 is one past the last Unicode scalar value.
        assert_eq!(
            decode(&[0xFF, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00]),
            Err(EncodingError::InvalidCodepoint(0x0011_0000))
        );
    }

    #[test]
    fn encoding_error_display_pads_codepoint_to_four_hex_digits() {
        assert_eq!(
            EncodingError::InvalidCodepoint(0x41).to_string(),
            "invalid Unicode codepoint U+0041"
        );
        assert_eq!(
            EncodingError::InvalidCodepoint(0xD800).to_string(),
            "invalid Unicode codepoint U+D800"
        );
    }

    // ---- normalize_line_breaks --------------------------------------------

    #[test]
    fn normalize_crlf_to_lf() {
        assert_eq!(normalize_line_breaks("a\r\nb\r\nc".to_string()), "a\nb\nc");
    }

    #[test]
    fn normalize_lone_cr_to_lf() {
        assert_eq!(normalize_line_breaks("a\rb\rc".to_string()), "a\nb\nc");
    }

    #[test]
    fn normalize_lf_only_is_unchanged() {
        assert_eq!(normalize_line_breaks("a\nb\nc".to_string()), "a\nb\nc");
    }

    #[test]
    fn normalize_mixed_line_endings() {
        assert_eq!(
            normalize_line_breaks("a\r\nb\rc\nd".to_string()),
            "a\nb\nc\nd"
        );
    }

    #[test]
    fn normalize_empty_string_is_unchanged() {
        assert_eq!(normalize_line_breaks(String::new()), "");
    }

    #[test]
    fn normalize_does_not_double_lf_after_crlf() {
        assert_eq!(normalize_line_breaks("\r\n".to_string()), "\n");
    }

    #[test]
    fn normalize_cr_followed_by_crlf_yields_two_line_feeds() {
        assert_eq!(normalize_line_breaks("\r\r\n".to_string()), "\n\n");
    }
}