feedparser_rs/util/
encoding.rs1use encoding_rs::{Encoding, UTF_8};
7
8pub fn detect_encoding(data: &[u8]) -> &'static str {
37 if data.starts_with(&[0xEF, 0xBB, 0xBF]) {
38 return "UTF-8";
39 }
40 if data.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) {
43 return "UTF-32BE";
44 }
45 if data.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) {
46 return "UTF-32LE";
47 }
48 if data.starts_with(&[0xFF, 0xFE]) {
49 return "UTF-16LE";
50 }
51 if data.starts_with(&[0xFE, 0xFF]) {
52 return "UTF-16BE";
53 }
54
55 if let Some(encoding) = extract_xml_encoding(data) {
56 return encoding;
57 }
58
59 "UTF-8"
60}
61
62fn extract_xml_encoding(data: &[u8]) -> Option<&'static str> {
66 let search_len = data.len().min(512);
67 let search_data = &data[..search_len];
68
69 if let Ok(header) = std::str::from_utf8(search_data)
70 && let Some(enc_start) = header.find("encoding=")
71 {
72 let after_eq = &header[enc_start + 9..];
73 let quote = after_eq.chars().next()?;
74 if quote == '"' || quote == '\'' {
75 let quote_end = after_eq[1..].find(quote)?;
76 let encoding_name = &after_eq[1..=quote_end];
77 return normalize_encoding_name(encoding_name);
78 }
79 }
80
81 None
82}
83
84fn normalize_encoding_name(name: &str) -> Option<&'static str> {
86 let normalized = name.trim().to_lowercase();
87 Encoding::for_label(normalized.as_bytes()).map(encoding_rs::Encoding::name)
88}
89
90pub fn convert_to_utf8(data: &[u8], encoding_name: &str) -> Result<String, String> {
117 let encoding = Encoding::for_label(encoding_name.as_bytes()).unwrap_or(UTF_8);
118
119 let (cow, _encoding_used, had_errors) = encoding.decode(data);
120
121 if had_errors {
122 Err(format!(
123 "Encoding conversion from {encoding_name} had errors"
124 ))
125 } else {
126 Ok(cow.into_owned())
127 }
128}
129
130pub fn detect_and_convert(data: &[u8]) -> Result<(String, &'static str), String> {
148 let encoding_name = detect_encoding(data);
149 let utf8_string = convert_to_utf8(data, encoding_name)?;
150 Ok((utf8_string, encoding_name))
151}
152
#[cfg(test)]
mod tests {
    use super::*;

    // --- detect_encoding: byte-order marks -------------------------------

    #[test]
    fn test_detect_utf8_bom() {
        let data = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    #[test]
    fn test_detect_utf16le_bom() {
        let data = b"\xFF\xFE<\x00?\x00x\x00m\x00l\x00";
        assert_eq!(detect_encoding(data), "UTF-16LE");
    }

    #[test]
    fn test_detect_utf16be_bom() {
        let data = b"\xFE\xFF\x00<\x00?\x00x\x00m\x00l";
        assert_eq!(detect_encoding(data), "UTF-16BE");
    }

    #[test]
    fn test_detect_utf32be_bom() {
        let data = b"\x00\x00\xFE\xFF\x00\x00\x00<";
        assert_eq!(detect_encoding(data), "UTF-32BE");
    }

    #[test]
    fn test_detect_utf32le_bom() {
        // Starts with the same two bytes as the UTF-16LE mark; the longer
        // UTF-32LE mark must win.
        let data = b"\xFF\xFE\x00\x00<\x00\x00\x00";
        assert_eq!(detect_encoding(data), "UTF-32LE");
    }

    #[test]
    fn test_detect_bom_takes_precedence_over_declaration() {
        let data = b"\xEF\xBB\xBF<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    // --- detect_encoding: declaration and default ------------------------

    #[test]
    fn test_detect_from_xml_declaration() {
        let data = b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>";
        assert_eq!(detect_encoding(data).to_lowercase(), "windows-1252");
    }

    #[test]
    fn test_detect_from_xml_declaration_single_quotes() {
        let data = b"<?xml version='1.0' encoding='UTF-8'?>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    #[test]
    fn test_detect_default_utf8() {
        let data = b"<?xml version=\"1.0\"?>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    #[test]
    fn test_detect_no_encoding_declaration() {
        let data = b"<rss><channel></channel></rss>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    #[test]
    fn test_empty_data() {
        let data = b"";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    // --- convert_to_utf8 -------------------------------------------------

    #[test]
    fn test_convert_iso8859_1() {
        let data = b"\xE9";
        let utf8 = convert_to_utf8(data, "iso-8859-1").unwrap();
        assert_eq!(utf8, "é");
    }

    #[test]
    fn test_convert_windows1252() {
        // 0x93 / 0x94 are the curly double quotes in windows-1252.
        let data = b"\x93Hello\x94";
        let utf8 = convert_to_utf8(data, "windows-1252").unwrap();
        assert_eq!(utf8, "\u{201C}Hello\u{201D}");
    }

    #[test]
    fn test_convert_utf8_to_utf8() {
        let data = b"Hello";
        let result = convert_to_utf8(data, "utf-8").unwrap();
        assert_eq!(result, "Hello");
    }

    #[test]
    fn test_convert_invalid_utf8_is_error() {
        let result = convert_to_utf8(b"\xFF", "utf-8");
        assert!(result.is_err());
    }

    // --- detect_and_convert ----------------------------------------------

    #[test]
    fn test_detect_and_convert() {
        let data = b"<?xml version=\"1.0\"?><root>Test</root>";
        let (utf8, encoding) = detect_and_convert(data).unwrap();
        assert_eq!(encoding, "UTF-8");
        assert!(utf8.contains("Test"));
    }

    // --- extract_xml_encoding --------------------------------------------

    #[test]
    fn test_extract_xml_encoding_double_quotes() {
        let data = b"<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
        assert_eq!(extract_xml_encoding(data), Some("UTF-8"));
    }

    #[test]
    fn test_extract_xml_encoding_single_quotes() {
        let data = b"<?xml version='1.0' encoding='UTF-8'?>";
        assert_eq!(extract_xml_encoding(data), Some("UTF-8"));
    }

    #[test]
    fn test_extract_xml_encoding_none() {
        let data = b"<?xml version=\"1.0\"?>";
        assert!(extract_xml_encoding(data).is_none());
    }

    // --- normalize_encoding_name -----------------------------------------

    #[test]
    fn test_normalize_encoding_name() {
        assert_eq!(normalize_encoding_name("UTF-8"), Some("UTF-8"));
        assert_eq!(normalize_encoding_name("utf-8"), Some("UTF-8"));
        assert_eq!(normalize_encoding_name(" UTF-8 "), Some("UTF-8"));
        assert_eq!(normalize_encoding_name("ISO-8859-1"), Some("windows-1252"));
    }
}