1use chardetng::EncodingDetector;
4use simdutf8::basic::from_utf8;
5
/// Reports whether `data` is well-formed UTF-8.
///
/// Validation is delegated to `simdutf8::basic::from_utf8`; only the
/// success/failure outcome is kept, the decoded `&str` is discarded.
/// An empty slice counts as valid.
pub fn is_utf8(data: &[u8]) -> bool {
    let validation = from_utf8(data);
    validation.is_ok()
}
12
/// Reports whether `data` begins with the three-byte UTF-8 byte-order mark
/// (`EF BB BF`).
///
/// Inputs shorter than three bytes never match.
pub fn has_utf8_bom(data: &[u8]) -> bool {
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];
    data.starts_with(&UTF8_BOM)
}
19
/// Returns `data` with a leading UTF-8 byte-order mark (`EF BB BF`) removed.
///
/// The result is a sub-slice borrowed from the input; nothing is copied.
/// Input that does not start with the mark is returned unchanged.
pub fn skip_bom(data: &[u8]) -> &[u8] {
    match data {
        [0xEF, 0xBB, 0xBF, payload @ ..] => payload,
        _ => data,
    }
}
24
25pub fn detect_encoding(data: &[u8]) -> EncodingInfo {
29 let has_bom = has_utf8_bom(data);
30 let data_without_bom = skip_bom(data);
31 let valid_utf8 = is_utf8(data_without_bom);
32
33 EncodingInfo {
34 is_utf8: valid_utf8,
35 has_bom,
36 }
37}
38
/// Result of a UTF-8 inspection of a byte buffer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct EncodingInfo {
    /// `true` when the inspected bytes are valid UTF-8.
    pub is_utf8: bool,
    /// `true` when the buffer starts with the UTF-8 byte-order mark.
    pub has_bom: bool,
}

impl EncodingInfo {
    /// Builds an `EncodingInfo` from its two flags.
    ///
    /// Declared `const`, so it can be used in constant contexts.
    pub const fn new(is_utf8: bool, has_bom: bool) -> Self {
        EncodingInfo { is_utf8, has_bom }
    }
}
54
55pub fn detect_and_transcode(data: &[u8]) -> (std::borrow::Cow<'_, [u8]>, bool) {
68 if data.len() >= 2 {
70 if data[0] == 0xFF && data[1] == 0xFE {
72 let (decoded, _, _) = encoding_rs::UTF_16LE.decode(data);
73 return (
74 std::borrow::Cow::Owned(decoded.into_owned().into_bytes()),
75 true,
76 );
77 }
78 if data[0] == 0xFE && data[1] == 0xFF {
80 let (decoded, _, _) = encoding_rs::UTF_16BE.decode(data);
81 return (
82 std::borrow::Cow::Owned(decoded.into_owned().into_bytes()),
83 true,
84 );
85 }
86 }
87
88 if is_utf8(data) {
90 return (std::borrow::Cow::Borrowed(data), false);
91 }
92
93 let mut detector = EncodingDetector::new();
95 detector.feed(data, true);
96 let encoding = detector.guess(None, true);
97
98 if encoding == encoding_rs::UTF_8 {
100 return (std::borrow::Cow::Borrowed(data), false);
101 }
102
103 let (decoded, _, _) = encoding.decode(data);
105 (
106 std::borrow::Cow::Owned(decoded.into_owned().into_bytes()),
107 true,
108 )
109}
110
#[cfg(test)]
mod tests {
    use super::*;

    // Valid UTF-8: plain ASCII, multi-byte text, and the empty slice.
    #[test]
    fn test_is_utf8() {
        assert!(is_utf8(b"Hello, World!"));
        assert!(is_utf8("こんにちは".as_bytes()));
        assert!(is_utf8(b""));
    }

    #[test]
    fn test_invalid_utf8() {
        // 0xFF and 0xFE can never appear in UTF-8.
        assert!(!is_utf8(&[0xFF, 0xFE]));
        // Continuation bytes with no lead byte.
        assert!(!is_utf8(&[0x80, 0x81, 0x82]));
    }

    #[test]
    fn test_utf8_bom() {
        let with_bom = [0xEF, 0xBB, 0xBF, b'a', b'b', b'c'];
        let without_bom = b"abc";

        assert!(has_utf8_bom(&with_bom));
        assert!(!has_utf8_bom(without_bom));

        assert_eq!(skip_bom(&with_bom), b"abc");
        assert_eq!(skip_bom(without_bom), b"abc");
    }

    // Inputs too short to hold a full mark must be left alone.
    #[test]
    fn test_utf8_bom_short_input() {
        let empty: [u8; 0] = [];
        let partial: [u8; 2] = [0xEF, 0xBB];

        assert!(!has_utf8_bom(&empty));
        assert!(!has_utf8_bom(&partial));

        assert_eq!(skip_bom(&empty), &empty[..]);
        assert_eq!(skip_bom(&partial), &partial[..]);
    }

    #[test]
    fn test_detect_encoding() {
        let info = detect_encoding(b"Hello");
        assert!(info.is_utf8);
        assert!(!info.has_bom);

        let with_bom = [0xEF, 0xBB, 0xBF, b'H', b'i'];
        let info = detect_encoding(&with_bom);
        assert!(info.is_utf8);
        assert!(info.has_bom);
    }

    // The two flags are independent: a mark may precede an invalid payload.
    #[test]
    fn test_detect_encoding_bom_with_invalid_payload() {
        let data: [u8; 4] = [0xEF, 0xBB, 0xBF, 0xFF];
        assert_eq!(detect_encoding(&data), EncodingInfo::new(false, true));
    }

    #[test]
    fn test_detect_and_transcode_utf8() {
        let data = b"Hello, World!";
        let (result, was_transcoded) = detect_and_transcode(data);
        assert!(!was_transcoded);
        assert_eq!(&result[..], data);
        // Valid UTF-8 must be passed through without allocating a copy.
        assert!(matches!(result, std::borrow::Cow::Borrowed(_)));
    }

    #[test]
    fn test_detect_and_transcode_empty() {
        let (result, was_transcoded) = detect_and_transcode(b"");
        assert!(!was_transcoded);
        assert!(result.is_empty());
    }

    #[test]
    fn test_detect_and_transcode_utf16_le() {
        let data: &[u8] = &[0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
        let (result, was_transcoded) = detect_and_transcode(data);
        assert!(was_transcoded);
        assert!(is_utf8(&result));
        // Pin the decoded text itself, not merely its validity.
        assert_eq!(&result[..], b"Hi");
    }

    #[test]
    fn test_detect_and_transcode_utf16_be() {
        let data: &[u8] = &[0xFE, 0xFF, 0x00, b'H', 0x00, b'i'];
        let (result, was_transcoded) = detect_and_transcode(data);
        assert!(was_transcoded);
        assert!(is_utf8(&result));
        assert_eq!(&result[..], b"Hi");
    }

    #[test]
    fn test_detect_and_transcode_windows1251() {
        // High-bit bytes that are not valid UTF-8, forcing the detector path.
        let data: &[u8] = &[0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2];
        let (result, was_transcoded) = detect_and_transcode(data);
        assert!(was_transcoded);
        assert!(is_utf8(&result));
    }
}