/// Text encoding of a byte stream, as reported by [`detect_encoding`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Encoding {
    /// UTF-8; also the result when no other encoding is recognised.
    Utf8,
    /// UTF-16 with little-endian code units.
    Utf16Le,
    /// UTF-16 with big-endian code units.
    Utf16Be,
    /// UTF-32 with little-endian code units.
    Utf32Le,
    /// UTF-32 with big-endian code units.
    Utf32Be,
}
17
/// Reasons why [`decode`] can reject a byte stream.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EncodingError {
    /// The bytes are not a valid sequence in the detected encoding.
    InvalidBytes,
    /// A decoded value is not a Unicode scalar value; the payload is the
    /// offending value (an unpaired UTF-16 surrogate or an invalid UTF-32
    /// code point).
    InvalidCodepoint(u32),
    /// A UTF-16 stream whose length is not a multiple of two bytes.
    TruncatedUtf16,
    /// A UTF-32 stream whose length is not a multiple of four bytes.
    TruncatedUtf32,
}

impl core::fmt::Display for EncodingError {
    /// Writes a short, human-readable description of the error.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::InvalidBytes => write!(f, "invalid byte sequence for detected encoding"),
            Self::InvalidCodepoint(cp) => write!(f, "invalid Unicode codepoint U+{cp:04X}"),
            Self::TruncatedUtf16 => write!(f, "UTF-16 stream has an odd number of bytes"),
            Self::TruncatedUtf32 => {
                write!(f, "UTF-32 stream length is not a multiple of four")
            }
        }
    }
}

// Lets callers box the error as `dyn Error`, propagate it with `?` into
// error-erasing return types, and use it as a `source()`.
impl core::error::Error for EncodingError {}
44
45#[must_use]
55pub fn detect_encoding(bytes: &[u8]) -> Encoding {
56 match bytes {
57 [0x00, 0x00, 0xFE, 0xFF, ..] => Encoding::Utf32Be,
59 [0xFF, 0xFE, 0x00, 0x00, ..] => Encoding::Utf32Le,
60 [0xFE, 0xFF, ..] => Encoding::Utf16Be,
62 [0xFF, 0xFE, ..] => Encoding::Utf16Le,
63 [a, 0x00, b, 0x00, ..] if *a != 0 && *b != 0 => Encoding::Utf16Le,
67 [0x00, a, 0x00, b, ..] if *a != 0 && *b != 0 => Encoding::Utf16Be,
68 [a, 0x00, ..] if *a != 0 => Encoding::Utf16Le,
69 [0x00, a, ..] if *a != 0 => Encoding::Utf16Be,
70 _ => Encoding::Utf8,
71 }
72}
73
74pub fn decode(bytes: &[u8]) -> Result<String, EncodingError> {
89 match detect_encoding(bytes) {
90 Encoding::Utf8 => decode_utf8(bytes),
91 Encoding::Utf16Le => decode_utf16(bytes, Endian::Little),
92 Encoding::Utf16Be => decode_utf16(bytes, Endian::Big),
93 Encoding::Utf32Le => decode_utf32(bytes, Endian::Little),
94 Encoding::Utf32Be => decode_utf32(bytes, Endian::Big),
95 }
96}
97
/// Byte order used when assembling multi-byte code units.
#[derive(Copy, Clone)]
enum Endian {
    /// Least-significant byte comes first.
    Little,
    /// Most-significant byte comes first.
    Big,
}
103
104fn decode_utf8(bytes: &[u8]) -> Result<String, EncodingError> {
105 let s = core::str::from_utf8(bytes).map_err(|_| EncodingError::InvalidBytes)?;
106 Ok(s.strip_prefix('\u{FEFF}').unwrap_or(s).to_owned())
108}
109
110fn decode_utf16(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
111 if !bytes.len().is_multiple_of(2) {
112 return Err(EncodingError::TruncatedUtf16);
113 }
114 let units: Vec<u16> = bytes
116 .chunks_exact(2)
117 .map(|chunk| match (chunk, endian) {
118 ([lo, hi], Endian::Little) => u16::from_le_bytes([*lo, *hi]),
119 ([hi, lo], Endian::Big) => u16::from_be_bytes([*hi, *lo]),
120 _ => 0, })
122 .collect();
123
124 let units = match units.as_slice() {
126 [0xFEFF, rest @ ..] => rest,
127 other => other,
128 };
129
130 char::decode_utf16(units.iter().copied()).try_fold(
132 String::with_capacity(units.len()),
133 |mut s, r| match r {
134 Ok(ch) => {
135 s.push(ch);
136 Ok(s)
137 }
138 Err(e) => Err(EncodingError::InvalidCodepoint(u32::from(
139 e.unpaired_surrogate(),
140 ))),
141 },
142 )
143}
144
145fn decode_utf32(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
146 if !bytes.len().is_multiple_of(4) {
147 return Err(EncodingError::TruncatedUtf32);
148 }
149 let mut out = String::with_capacity(bytes.len() / 4);
150 let mut skip_bom = true;
151 for chunk in bytes.chunks_exact(4) {
152 let cp = match (chunk, endian) {
153 ([a, b, c, d], Endian::Little) => u32::from_le_bytes([*a, *b, *c, *d]),
154 ([a, b, c, d], Endian::Big) => u32::from_be_bytes([*a, *b, *c, *d]),
155 _ => 0, };
157 if skip_bom && cp == 0xFEFF {
159 skip_bom = false;
160 continue;
161 }
162 skip_bom = false;
163 let ch = char::from_u32(cp).ok_or(EncodingError::InvalidCodepoint(cp))?;
164 out.push(ch);
165 }
166 Ok(out)
167}
168
/// Rewrites every `\r\n` pair and every lone `\r` in `s` as a single `\n`.
///
/// Input that contains no `\r` is handed back as-is, without copying.
#[must_use]
pub fn normalize_line_breaks(s: String) -> String {
    if !s.contains('\r') {
        return s;
    }

    let mut segments = s.split('\r');
    let mut normalized = String::with_capacity(s.len());
    // Everything before the first `\r` is copied verbatim.
    if let Some(head) = segments.next() {
        normalized.push_str(head);
    }
    // Each later segment was preceded by a `\r`. Emit one `\n` for it and, if
    // the segment starts with `\n`, swallow that so `\r\n` is not doubled.
    for segment in segments {
        normalized.push('\n');
        normalized.push_str(segment.strip_prefix('\n').unwrap_or(segment));
    }
    normalized
}
199
#[cfg(test)]
#[expect(clippy::unwrap_used, reason = "test code")]
mod tests {
    use rstest::rstest;

    use super::*;

    // ---- detect_encoding ----

    #[test]
    fn detect_encoding_returns_utf8_for_empty_bytes() {
        assert_eq!(detect_encoding(b""), Encoding::Utf8);
    }

    #[rstest]
    #[case::utf8_bom(&[0xEF, 0xBB, 0xBF, b'a'], Encoding::Utf8)]
    #[case::utf16_le_bom(&[0xFF, 0xFE, b'a', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_bom(&[0xFE, 0xFF, 0x00, b'a'], Encoding::Utf16Be)]
    #[case::utf32_le_bom(&[0xFF, 0xFE, 0x00, 0x00], Encoding::Utf32Le)]
    #[case::utf32_be_bom(&[0x00, 0x00, 0xFE, 0xFF], Encoding::Utf32Be)]
    fn detect_encoding_bom(#[case] bytes: &[u8], #[case] expected: Encoding) {
        assert_eq!(detect_encoding(bytes), expected);
    }

    #[test]
    fn detect_encoding_falls_back_to_utf8_for_plain_ascii() {
        assert_eq!(detect_encoding(b"key: value\n"), Encoding::Utf8);
    }

    #[rstest]
    #[case::utf16_le_without_bom(&[b'a', 0x00, b'b', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_without_bom(&[0x00, b'a', 0x00, b'b'], Encoding::Utf16Be)]
    #[case::utf16_le_two_byte_heuristic(&[b'a', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_two_byte_heuristic(&[0x00, b'a'], Encoding::Utf16Be)]
    fn detect_encoding_null_byte_heuristic(#[case] bytes: &[u8], #[case] expected: Encoding) {
        assert_eq!(detect_encoding(bytes), expected);
    }

    // ---- decode: successful paths ----

    #[rstest]
    #[case::utf8_plain_ascii(b"hello: world\n" as &[u8], "hello: world\n")]
    #[case::utf8_strips_bom(&[0xEF, 0xBB, 0xBF, b'k', b'e', b'y'], "key")]
    #[case::utf16_le_no_bom(&[0x68, 0x00, 0x69, 0x00], "hi")]
    #[case::utf16_be_no_bom(&[0x00, 0x68, 0x00, 0x69], "hi")]
    #[case::utf16_le_strips_bom(&[0xFF, 0xFE, 0x68, 0x00, 0x69, 0x00], "hi")]
    #[case::utf16_be_strips_bom(&[0xFE, 0xFF, 0x00, 0x68, 0x00, 0x69], "hi")]
    #[case::utf16_le_surrogate_pair(&[0xFF, 0xFE, 0x3D, 0xD8, 0x00, 0xDE], "\u{1F600}")]
    #[case::empty_input(b"", "")]
    #[case::utf32_le_with_bom(&[0xFF, 0xFE, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00], "A")]
    #[case::utf32_be_with_bom(&[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x41], "A")]
    fn decode_ok(#[case] bytes: &[u8], #[case] expected: &str) {
        assert_eq!(decode(bytes).unwrap(), expected);
    }

    #[test]
    fn decode_utf32_be_second_bom_codepoint_kept_as_content() {
        // Only the first U+FEFF is a byte-order mark; the second is text.
        let input: &[u8] = &[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0xFE, 0xFF];
        assert_eq!(decode(input).unwrap(), "\u{FEFF}");
    }

    // ---- decode: error paths ----

    #[test]
    fn decode_invalid_utf8_returns_error() {
        assert!(decode(&[0x80]).is_err());
    }

    #[rstest]
    #[case::invalid_utf8(&[0x80], EncodingError::InvalidBytes)]
    #[case::utf16_odd_length(&[0xFF, 0xFE, 0x41], EncodingError::TruncatedUtf16)]
    #[case::utf32_partial_unit(&[0xFF, 0xFE, 0x00, 0x00, 0x41], EncodingError::TruncatedUtf32)]
    #[case::utf16_unpaired_surrogate(
        &[0xFF, 0xFE, 0x00, 0xD8],
        EncodingError::InvalidCodepoint(0xD800)
    )]
    #[case::utf32_beyond_unicode_range(
        &[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x11, 0x00, 0x00],
        EncodingError::InvalidCodepoint(0x0011_0000)
    )]
    fn decode_err(#[case] bytes: &[u8], #[case] expected: EncodingError) {
        assert_eq!(decode(bytes), Err(expected));
    }

    #[test]
    fn encoding_error_display_messages() {
        assert_eq!(
            EncodingError::InvalidBytes.to_string(),
            "invalid byte sequence for detected encoding"
        );
        assert_eq!(
            EncodingError::InvalidCodepoint(0xD800).to_string(),
            "invalid Unicode codepoint U+D800"
        );
        assert_eq!(
            EncodingError::TruncatedUtf16.to_string(),
            "UTF-16 stream has an odd number of bytes"
        );
        assert_eq!(
            EncodingError::TruncatedUtf32.to_string(),
            "UTF-32 stream length is not a multiple of four"
        );
    }

    // ---- normalize_line_breaks ----

    #[rstest]
    #[case::crlf_to_lf("a\r\nb\r\nc".to_string(), "a\nb\nc")]
    #[case::lone_cr_to_lf("a\rb\rc".to_string(), "a\nb\nc")]
    #[case::lf_only_unchanged("a\nb\nc".to_string(), "a\nb\nc")]
    #[case::mixed_line_endings("a\r\nb\rc\nd".to_string(), "a\nb\nc\nd")]
    #[case::empty_string_unchanged(String::new(), "")]
    #[case::crlf_not_doubled("\r\n".to_string(), "\n")]
    #[case::trailing_lone_cr("a\r".to_string(), "a\n")]
    #[case::cr_then_crlf("\r\r\n".to_string(), "\n\n")]
    fn normalize_line_breaks_cases(#[case] input: String, #[case] expected: &str) {
        assert_eq!(normalize_line_breaks(input), expected);
    }
}