feedparser_rs/util/
encoding.rs

1//! Encoding detection and conversion utilities
2//!
3//! This module provides functions for detecting character encoding
4//! and converting to UTF-8.
5
6use encoding_rs::{Encoding, UTF_8};
7
8/// Detect character encoding from byte data
9///
10/// Detection order:
11/// 1. BOM (Byte Order Mark)
12/// 2. XML declaration (<?xml encoding="..."?>)
13/// 3. Default to UTF-8
14///
15/// # Arguments
16///
17/// * `data` - Raw byte data
18///
19/// # Returns
20///
21/// Detected encoding name (e.g., "utf-8", "iso-8859-1")
22///
23/// # Examples
24///
25/// ```
26/// use feedparser_rs::util::encoding::detect_encoding;
27///
28/// // UTF-8 with BOM
29/// let data = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
30/// assert_eq!(detect_encoding(data), "UTF-8");
31///
32/// // XML declaration
33/// let data = b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>";
34/// assert_eq!(detect_encoding(data), "windows-1252");
35/// ```
36pub fn detect_encoding(data: &[u8]) -> &'static str {
37    if data.starts_with(&[0xEF, 0xBB, 0xBF]) {
38        return "UTF-8";
39    }
40    // UTF-32 BOMs must be checked BEFORE UTF-16 BOMs
41    // because UTF-32LE BOM (FF FE 00 00) starts with UTF-16LE BOM (FF FE)
42    if data.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) {
43        return "UTF-32BE";
44    }
45    if data.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) {
46        return "UTF-32LE";
47    }
48    if data.starts_with(&[0xFF, 0xFE]) {
49        return "UTF-16LE";
50    }
51    if data.starts_with(&[0xFE, 0xFF]) {
52        return "UTF-16BE";
53    }
54
55    if let Some(encoding) = extract_xml_encoding(data) {
56        return encoding;
57    }
58
59    "UTF-8"
60}
61
62/// Extract encoding from XML declaration
63///
64/// Parses <?xml version="1.0" encoding="..."?> declaration
65fn extract_xml_encoding(data: &[u8]) -> Option<&'static str> {
66    let search_len = data.len().min(512);
67    let search_data = &data[..search_len];
68
69    if let Ok(header) = std::str::from_utf8(search_data)
70        && let Some(enc_start) = header.find("encoding=")
71    {
72        let after_eq = &header[enc_start + 9..];
73        let quote = after_eq.chars().next()?;
74        if quote == '"' || quote == '\'' {
75            let quote_end = after_eq[1..].find(quote)?;
76            let encoding_name = &after_eq[1..=quote_end];
77            return normalize_encoding_name(encoding_name);
78        }
79    }
80
81    None
82}
83
84/// Normalize encoding name to `encoding_rs` canonical form
85fn normalize_encoding_name(name: &str) -> Option<&'static str> {
86    let normalized = name.trim().to_lowercase();
87    Encoding::for_label(normalized.as_bytes()).map(encoding_rs::Encoding::name)
88}
89
90/// Convert data to UTF-8 from detected encoding
91///
92/// # Arguments
93///
94/// * `data` - Raw byte data in unknown encoding
95/// * `encoding_name` - Encoding name (e.g., "iso-8859-1")
96///
97/// # Returns
98///
99/// * `Ok(String)` - UTF-8 string
100/// * `Err(String)` - Error message if conversion failed
101///
102/// # Examples
103///
104/// ```
105/// use feedparser_rs::util::encoding::convert_to_utf8;
106///
107/// let latin1 = b"\xE9"; // é in ISO-8859-1
108/// let utf8 = convert_to_utf8(latin1, "iso-8859-1").unwrap();
109/// assert_eq!(utf8, "é");
110/// ```
111///
112/// # Errors
113///
114/// Returns an error if the encoding conversion encounters invalid byte sequences
115/// that cannot be properly decoded.
116pub fn convert_to_utf8(data: &[u8], encoding_name: &str) -> Result<String, String> {
117    let encoding = Encoding::for_label(encoding_name.as_bytes()).unwrap_or(UTF_8);
118
119    let (cow, _encoding_used, had_errors) = encoding.decode(data);
120
121    if had_errors {
122        Err(format!(
123            "Encoding conversion from {encoding_name} had errors"
124        ))
125    } else {
126        Ok(cow.into_owned())
127    }
128}
129
130/// Detect encoding and convert to UTF-8 in one step
131///
132/// # Examples
133///
134/// ```
135/// use feedparser_rs::util::encoding::detect_and_convert;
136///
137/// let data = b"<?xml version=\"1.0\"?><root>Test</root>";
138/// let (utf8, detected_encoding) = detect_and_convert(data).unwrap();
139/// assert_eq!(detected_encoding, "UTF-8");
140/// assert!(utf8.contains("Test"));
141/// ```
142///
143/// # Errors
144///
145/// Returns an error if the encoding conversion encounters invalid byte sequences
146/// that cannot be properly decoded.
147pub fn detect_and_convert(data: &[u8]) -> Result<(String, &'static str), String> {
148    let encoding_name = detect_encoding(data);
149    let utf8_string = convert_to_utf8(data, encoding_name)?;
150    Ok((utf8_string, encoding_name))
151}
152
#[cfg(test)]
mod tests {
    //! Unit tests for encoding detection, label normalization and conversion.

    use super::*;

    // --- BOM detection -----------------------------------------------------

    #[test]
    fn test_detect_utf8_bom() {
        let data = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    #[test]
    fn test_detect_utf16le_bom() {
        let data = b"\xFF\xFE<\x00?\x00x\x00m\x00l\x00";
        assert_eq!(detect_encoding(data), "UTF-16LE");
    }

    #[test]
    fn test_detect_utf16be_bom() {
        let data = b"\xFE\xFF\x00<\x00?\x00x\x00m\x00l";
        assert_eq!(detect_encoding(data), "UTF-16BE");
    }

    // The UTF-32LE BOM starts with the UTF-16LE BOM, so this pins the
    // precedence of the longer signature.
    #[test]
    fn test_detect_utf32le_bom() {
        let data = b"\xFF\xFE\x00\x00<\x00\x00\x00";
        assert_eq!(detect_encoding(data), "UTF-32LE");
    }

    #[test]
    fn test_detect_utf32be_bom() {
        let data = b"\x00\x00\xFE\xFF\x00\x00\x00<";
        assert_eq!(detect_encoding(data), "UTF-32BE");
    }

    // --- XML declaration detection ------------------------------------------

    #[test]
    fn test_detect_from_xml_declaration() {
        let data = b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>";
        assert_eq!(detect_encoding(data).to_lowercase(), "windows-1252");
    }

    #[test]
    fn test_detect_from_xml_declaration_single_quotes() {
        let data = b"<?xml version='1.0' encoding='UTF-8'?>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    #[test]
    fn test_detect_default_utf8() {
        let data = b"<?xml version=\"1.0\"?>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    // An unrecognized label must not leak through; detection falls back to
    // the UTF-8 default.
    #[test]
    fn test_detect_unknown_declared_encoding_defaults_to_utf8() {
        let data = b"<?xml version=\"1.0\" encoding=\"no-such-encoding\"?>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    // --- Conversion ---------------------------------------------------------

    #[test]
    fn test_convert_iso8859_1() {
        let data = b"\xE9";
        let utf8 = convert_to_utf8(data, "iso-8859-1").unwrap();
        assert_eq!(utf8, "é");
    }

    #[test]
    fn test_convert_windows1252() {
        let data = b"\x93Hello\x94";
        let utf8 = convert_to_utf8(data, "windows-1252").unwrap();
        assert!(utf8.contains("Hello"));
    }

    // A lone 0xFF byte can never appear in UTF-8, so decoding must report
    // the failure instead of returning replacement characters.
    #[test]
    fn test_convert_invalid_utf8_is_error() {
        let data = b"\xFF";
        assert!(convert_to_utf8(data, "utf-8").is_err());
    }

    #[test]
    fn test_detect_and_convert() {
        let data = b"<?xml version=\"1.0\"?><root>Test</root>";
        let (utf8, encoding) = detect_and_convert(data).unwrap();
        assert_eq!(encoding, "UTF-8");
        assert!(utf8.contains("Test"));
    }

    // --- Helpers ------------------------------------------------------------

    #[test]
    fn test_extract_xml_encoding_double_quotes() {
        let data = b"<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
        assert!(extract_xml_encoding(data).is_some());
    }

    #[test]
    fn test_extract_xml_encoding_single_quotes() {
        let data = b"<?xml version='1.0' encoding='UTF-8'?>";
        assert!(extract_xml_encoding(data).is_some());
    }

    #[test]
    fn test_extract_xml_encoding_none() {
        let data = b"<?xml version=\"1.0\"?>";
        assert!(extract_xml_encoding(data).is_none());
    }

    #[test]
    fn test_normalize_encoding_name() {
        assert_eq!(normalize_encoding_name("UTF-8"), Some("UTF-8"));
        assert_eq!(normalize_encoding_name("utf-8"), Some("UTF-8"));
        assert_eq!(normalize_encoding_name("  UTF-8  "), Some("UTF-8"));
        assert_eq!(normalize_encoding_name("ISO-8859-1"), Some("windows-1252"));
    }

    #[test]
    fn test_convert_utf8_to_utf8() {
        let data = b"Hello";
        let result = convert_to_utf8(data, "utf-8").unwrap();
        assert_eq!(result, "Hello");
    }

    // --- Inputs without any encoding signal ---------------------------------

    #[test]
    fn test_detect_no_encoding_declaration() {
        let data = b"<rss><channel></channel></rss>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    #[test]
    fn test_empty_data() {
        let data = b"";
        assert_eq!(detect_encoding(data), "UTF-8");
    }
}