/// Text encodings that [`detect_encoding`] can report and [`decode`] can handle.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Encoding {
    /// UTF-8; also the fallback when no other encoding is recognised.
    Utf8,
    /// UTF-16, little-endian byte order.
    Utf16Le,
    /// UTF-16, big-endian byte order.
    Utf16Be,
    /// UTF-32, little-endian byte order.
    Utf32Le,
    /// UTF-32, big-endian byte order.
    Utf32Be,
}
12
/// Reasons a byte stream can fail to decode into a `String`.
///
/// Implements both [`core::fmt::Display`] (human-readable message) and
/// [`core::error::Error`], so it can be boxed as `dyn Error` or propagated
/// with `?` into generic error types.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EncodingError {
    /// The bytes are not a valid sequence for the detected encoding.
    InvalidBytes,
    /// A decoded value is not a valid Unicode scalar value; carries the
    /// offending value (for UTF-16 this is the unpaired surrogate).
    InvalidCodepoint(u32),
    /// A UTF-16 stream whose byte length is odd.
    TruncatedUtf16,
    /// A UTF-32 stream whose byte length is not a multiple of four.
    TruncatedUtf32,
}

impl core::fmt::Display for EncodingError {
    /// Writes a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::InvalidBytes => write!(f, "invalid byte sequence for detected encoding"),
            Self::InvalidCodepoint(cp) => write!(f, "invalid Unicode codepoint U+{cp:04X}"),
            Self::TruncatedUtf16 => write!(f, "UTF-16 stream has an odd number of bytes"),
            Self::TruncatedUtf32 => {
                write!(f, "UTF-32 stream length is not a multiple of four")
            }
        }
    }
}

// No variant wraps another error, so the default `source()` (None) is correct.
impl core::error::Error for EncodingError {}
39
40#[must_use]
50pub fn detect_encoding(bytes: &[u8]) -> Encoding {
51 match bytes {
52 [0x00, 0x00, 0xFE, 0xFF, ..] => Encoding::Utf32Be,
54 [0xFF, 0xFE, 0x00, 0x00, ..] => Encoding::Utf32Le,
55 [0xFE, 0xFF, ..] => Encoding::Utf16Be,
57 [0xFF, 0xFE, ..] => Encoding::Utf16Le,
58 [a, 0x00, b, 0x00, ..] if *a != 0 && *b != 0 => Encoding::Utf16Le,
62 [0x00, a, 0x00, b, ..] if *a != 0 && *b != 0 => Encoding::Utf16Be,
63 [a, 0x00, ..] if *a != 0 => Encoding::Utf16Le,
64 [0x00, a, ..] if *a != 0 => Encoding::Utf16Be,
65 _ => Encoding::Utf8,
66 }
67}
68
69pub fn decode(bytes: &[u8]) -> Result<String, EncodingError> {
84 match detect_encoding(bytes) {
85 Encoding::Utf8 => decode_utf8(bytes),
86 Encoding::Utf16Le => decode_utf16(bytes, Endian::Little),
87 Encoding::Utf16Be => decode_utf16(bytes, Endian::Big),
88 Encoding::Utf32Le => decode_utf32(bytes, Endian::Little),
89 Encoding::Utf32Be => decode_utf32(bytes, Endian::Big),
90 }
91}
92
/// Byte order used when assembling UTF-16 / UTF-32 code units from raw bytes.
#[derive(Clone, Copy)]
enum Endian {
    /// Least-significant byte first.
    Little,
    /// Most-significant byte first.
    Big,
}
98
99fn decode_utf8(bytes: &[u8]) -> Result<String, EncodingError> {
100 let s = core::str::from_utf8(bytes).map_err(|_| EncodingError::InvalidBytes)?;
101 Ok(s.strip_prefix('\u{FEFF}').unwrap_or(s).to_owned())
103}
104
105fn decode_utf16(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
106 if !bytes.len().is_multiple_of(2) {
107 return Err(EncodingError::TruncatedUtf16);
108 }
109 let units: Vec<u16> = bytes
111 .chunks_exact(2)
112 .map(|chunk| match (chunk, endian) {
113 ([lo, hi], Endian::Little) => u16::from_le_bytes([*lo, *hi]),
114 ([hi, lo], Endian::Big) => u16::from_be_bytes([*hi, *lo]),
115 _ => 0, })
117 .collect();
118
119 let units = match units.as_slice() {
121 [0xFEFF, rest @ ..] => rest,
122 other => other,
123 };
124
125 char::decode_utf16(units.iter().copied()).try_fold(
127 String::with_capacity(units.len()),
128 |mut s, r| match r {
129 Ok(ch) => {
130 s.push(ch);
131 Ok(s)
132 }
133 Err(e) => Err(EncodingError::InvalidCodepoint(u32::from(
134 e.unpaired_surrogate(),
135 ))),
136 },
137 )
138}
139
140fn decode_utf32(bytes: &[u8], endian: Endian) -> Result<String, EncodingError> {
141 if !bytes.len().is_multiple_of(4) {
142 return Err(EncodingError::TruncatedUtf32);
143 }
144 let mut out = String::with_capacity(bytes.len() / 4);
145 let mut skip_bom = true;
146 for chunk in bytes.chunks_exact(4) {
147 let cp = match (chunk, endian) {
148 ([a, b, c, d], Endian::Little) => u32::from_le_bytes([*a, *b, *c, *d]),
149 ([a, b, c, d], Endian::Big) => u32::from_be_bytes([*a, *b, *c, *d]),
150 _ => 0, };
152 if skip_bom && cp == 0xFEFF {
154 skip_bom = false;
155 continue;
156 }
157 skip_bom = false;
158 let ch = char::from_u32(cp).ok_or(EncodingError::InvalidCodepoint(cp))?;
159 out.push(ch);
160 }
161 Ok(out)
162}
163
/// Rewrites every CRLF pair and every lone CR in `s` as a single LF.
///
/// A string that contains no `'\r'` is returned as-is without reallocating.
#[must_use]
pub fn normalize_line_breaks(s: String) -> String {
    if !s.contains('\r') {
        return s;
    }

    let mut normalized = String::with_capacity(s.len());
    // True when the previous character was a CR, for which an LF has already
    // been emitted; an LF right after it is the second half of a CRLF pair.
    let mut after_cr = false;
    for ch in s.chars() {
        match ch {
            '\r' => normalized.push('\n'),
            '\n' if after_cr => {}
            other => normalized.push(other),
        }
        after_cr = ch == '\r';
    }
    normalized
}
194
// Unit tests for encoding detection, decoding (success and error paths), and
// line-break normalisation.
#[cfg(test)]
#[expect(clippy::unwrap_used, reason = "test code")]
mod tests {
    use rstest::rstest;

    use super::*;

    // ---- detect_encoding -------------------------------------------------

    #[test]
    fn detect_encoding_returns_utf8_for_empty_bytes() {
        assert_eq!(detect_encoding(b""), Encoding::Utf8);
    }

    #[rstest]
    #[case::utf8_bom(&[0xEF, 0xBB, 0xBF, b'a'], Encoding::Utf8)]
    #[case::utf16_le_bom(&[0xFF, 0xFE, b'a', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_bom(&[0xFE, 0xFF, 0x00, b'a'], Encoding::Utf16Be)]
    #[case::utf32_le_bom(&[0xFF, 0xFE, 0x00, 0x00], Encoding::Utf32Le)]
    #[case::utf32_be_bom(&[0x00, 0x00, 0xFE, 0xFF], Encoding::Utf32Be)]
    #[case::utf16_le_bom_only(&[0xFF, 0xFE], Encoding::Utf16Le)]
    #[case::utf16_be_bom_only(&[0xFE, 0xFF], Encoding::Utf16Be)]
    // Three bytes are too short for the UTF-32LE BOM, so the UTF-16LE BOM wins.
    #[case::utf32_le_bom_cut_short(&[0xFF, 0xFE, 0x00], Encoding::Utf16Le)]
    fn detect_encoding_bom(#[case] bytes: &[u8], #[case] expected: Encoding) {
        assert_eq!(detect_encoding(bytes), expected);
    }

    #[test]
    fn detect_encoding_falls_back_to_utf8_for_plain_ascii() {
        assert_eq!(detect_encoding(b"key: value\n"), Encoding::Utf8);
    }

    #[rstest]
    #[case::utf16_le_without_bom(&[b'a', 0x00, b'b', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_without_bom(&[0x00, b'a', 0x00, b'b'], Encoding::Utf16Be)]
    #[case::utf16_le_single_unit(&[b'a', 0x00], Encoding::Utf16Le)]
    #[case::utf16_be_single_unit(&[0x00, b'a'], Encoding::Utf16Be)]
    #[case::two_null_bytes_are_utf8(&[0x00, 0x00], Encoding::Utf8)]
    #[case::single_byte_is_utf8(&[b'a'], Encoding::Utf8)]
    fn detect_encoding_null_byte_heuristic(#[case] bytes: &[u8], #[case] expected: Encoding) {
        assert_eq!(detect_encoding(bytes), expected);
    }

    // ---- decode: success paths -------------------------------------------

    #[rstest]
    #[case::utf8_plain_ascii(b"hello: world\n" as &[u8], "hello: world\n")]
    #[case::utf8_strips_bom(&[0xEF, 0xBB, 0xBF, b'k', b'e', b'y'], "key")]
    #[case::utf16_le_no_bom(&[0x68, 0x00, 0x69, 0x00], "hi")]
    #[case::utf16_be_no_bom(&[0x00, 0x68, 0x00, 0x69], "hi")]
    #[case::utf16_le_strips_bom(&[0xFF, 0xFE, 0x68, 0x00, 0x69, 0x00], "hi")]
    #[case::utf16_be_strips_bom(&[0xFE, 0xFF, 0x00, 0x68, 0x00, 0x69], "hi")]
    #[case::utf16_le_surrogate_pair(&[0xFF, 0xFE, 0x3D, 0xD8, 0x00, 0xDE], "\u{1F600}")]
    #[case::utf32_le_strips_bom(
        &[0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00],
        "hi"
    )]
    #[case::utf32_be_strips_bom(
        &[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x69],
        "hi"
    )]
    #[case::empty_input(b"", "")]
    fn decode_ok(#[case] bytes: &[u8], #[case] expected: &str) {
        assert_eq!(decode(bytes).unwrap(), expected);
    }

    // ---- decode: error paths ---------------------------------------------

    #[test]
    fn decode_invalid_utf8_returns_error() {
        assert!(decode(&[0x80]).is_err());
    }

    #[rstest]
    #[case::invalid_utf8(&[0x80], EncodingError::InvalidBytes)]
    #[case::utf16_odd_length(&[0xFF, 0xFE, 0x68], EncodingError::TruncatedUtf16)]
    #[case::utf16_unpaired_surrogate(
        &[0xFF, 0xFE, 0x00, 0xD8],
        EncodingError::InvalidCodepoint(0xD800)
    )]
    #[case::utf32_length_not_multiple_of_four(
        &[0xFF, 0xFE, 0x00, 0x00, 0x68],
        EncodingError::TruncatedUtf32
    )]
    #[case::utf32_value_above_max(
        &[0xFF, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00],
        EncodingError::InvalidCodepoint(0x0011_0000)
    )]
    #[case::utf32_surrogate(
        &[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0xD8, 0x00],
        EncodingError::InvalidCodepoint(0xD800)
    )]
    fn decode_err(#[case] bytes: &[u8], #[case] expected: EncodingError) {
        assert_eq!(decode(bytes), Err(expected));
    }

    #[rstest]
    #[case::invalid_bytes(
        EncodingError::InvalidBytes,
        "invalid byte sequence for detected encoding"
    )]
    #[case::invalid_codepoint(
        EncodingError::InvalidCodepoint(0xD800),
        "invalid Unicode codepoint U+D800"
    )]
    #[case::invalid_codepoint_is_zero_padded(
        EncodingError::InvalidCodepoint(0x41),
        "invalid Unicode codepoint U+0041"
    )]
    #[case::truncated_utf16(
        EncodingError::TruncatedUtf16,
        "UTF-16 stream has an odd number of bytes"
    )]
    #[case::truncated_utf32(
        EncodingError::TruncatedUtf32,
        "UTF-32 stream length is not a multiple of four"
    )]
    fn encoding_error_display(#[case] error: EncodingError, #[case] expected: &str) {
        assert_eq!(error.to_string(), expected);
    }

    // ---- normalize_line_breaks -------------------------------------------

    #[rstest]
    #[case::crlf_to_lf("a\r\nb\r\nc".to_string(), "a\nb\nc")]
    #[case::lone_cr_to_lf("a\rb\rc".to_string(), "a\nb\nc")]
    #[case::lf_only_unchanged("a\nb\nc".to_string(), "a\nb\nc")]
    #[case::mixed_line_endings("a\r\nb\rc\nd".to_string(), "a\nb\nc\nd")]
    #[case::empty_string_unchanged(String::new(), "")]
    #[case::crlf_not_doubled("\r\n".to_string(), "\n")]
    #[case::cr_then_crlf("\r\r\n".to_string(), "\n\n")]
    #[case::trailing_lone_cr("a\r".to_string(), "a\n")]
    fn normalize_line_breaks_cases(#[case] input: String, #[case] expected: &str) {
        assert_eq!(normalize_line_breaks(input), expected);
    }
}