feedparser_rs/util/
encoding.rs

1//! Encoding detection and conversion utilities
2//!
3//! This module provides functions for detecting character encoding
4//! and converting to UTF-8.
5//!
6//! Encoding detection follows this priority order:
7//! 1. BOM (Byte Order Mark) - highest priority
8//! 2. HTTP Content-Type charset (if provided)
9//! 3. XML declaration encoding attribute
10//! 4. Default to UTF-8
11
12use encoding_rs::{Encoding, UTF_8};
13
14/// Detect character encoding from byte data
15///
16/// Detection order:
17/// 1. BOM (Byte Order Mark)
18/// 2. XML declaration (<?xml encoding="..."?>)
19/// 3. Default to UTF-8
20///
21/// # Arguments
22///
23/// * `data` - Raw byte data
24///
25/// # Returns
26///
27/// Detected encoding name (e.g., "utf-8", "iso-8859-1")
28///
29/// # Examples
30///
31/// ```
32/// use feedparser_rs::util::encoding::detect_encoding;
33///
34/// // UTF-8 with BOM
35/// let data = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
36/// assert_eq!(detect_encoding(data), "UTF-8");
37///
38/// // XML declaration
39/// let data = b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>";
40/// assert_eq!(detect_encoding(data), "windows-1252");
41/// ```
42pub fn detect_encoding(data: &[u8]) -> &'static str {
43    // Check BOM first
44    if let Some(bom_encoding) = detect_bom(data) {
45        return bom_encoding;
46    }
47
48    // Check XML declaration
49    if let Some(encoding) = extract_xml_encoding(data) {
50        return encoding;
51    }
52
53    // Default to UTF-8
54    "UTF-8"
55}
56
57/// Extract encoding from XML declaration
58///
59/// Parses <?xml version="1.0" encoding="..."?> declaration
60fn extract_xml_encoding(data: &[u8]) -> Option<&'static str> {
61    let search_len = data.len().min(512);
62    let search_data = &data[..search_len];
63
64    if let Ok(header) = std::str::from_utf8(search_data)
65        && let Some(enc_start) = header.find("encoding=")
66    {
67        let after_eq = &header[enc_start + 9..];
68        let quote = after_eq.chars().next()?;
69        if quote == '"' || quote == '\'' {
70            let quote_end = after_eq[1..].find(quote)?;
71            let encoding_name = &after_eq[1..=quote_end];
72            return normalize_encoding_name(encoding_name);
73        }
74    }
75
76    None
77}
78
79/// Normalize encoding name to `encoding_rs` canonical form
80fn normalize_encoding_name(name: &str) -> Option<&'static str> {
81    let normalized = name.trim().to_lowercase();
82    Encoding::for_label(normalized.as_bytes()).map(encoding_rs::Encoding::name)
83}
84
85/// Convert data to UTF-8 from detected encoding
86///
87/// # Arguments
88///
89/// * `data` - Raw byte data in unknown encoding
90/// * `encoding_name` - Encoding name (e.g., "iso-8859-1")
91///
92/// # Returns
93///
94/// * `Ok(String)` - UTF-8 string
95/// * `Err(String)` - Error message if conversion failed
96///
97/// # Examples
98///
99/// ```
100/// use feedparser_rs::util::encoding::convert_to_utf8;
101///
102/// let latin1 = b"\xE9"; // é in ISO-8859-1
103/// let utf8 = convert_to_utf8(latin1, "iso-8859-1").unwrap();
104/// assert_eq!(utf8, "é");
105/// ```
106///
107/// # Errors
108///
109/// Returns an error if the encoding conversion encounters invalid byte sequences
110/// that cannot be properly decoded.
111pub fn convert_to_utf8(data: &[u8], encoding_name: &str) -> Result<String, String> {
112    let encoding = Encoding::for_label(encoding_name.as_bytes()).unwrap_or(UTF_8);
113
114    let (cow, _encoding_used, had_errors) = encoding.decode(data);
115
116    if had_errors {
117        Err(format!(
118            "Encoding conversion from {encoding_name} had errors"
119        ))
120    } else {
121        Ok(cow.into_owned())
122    }
123}
124
125/// Detect encoding and convert to UTF-8 in one step
126///
127/// # Examples
128///
129/// ```
130/// use feedparser_rs::util::encoding::detect_and_convert;
131///
132/// let data = b"<?xml version=\"1.0\"?><root>Test</root>";
133/// let (utf8, detected_encoding) = detect_and_convert(data).unwrap();
134/// assert_eq!(detected_encoding, "UTF-8");
135/// assert!(utf8.contains("Test"));
136/// ```
137///
138/// # Errors
139///
140/// Returns an error if the encoding conversion encounters invalid byte sequences
141/// that cannot be properly decoded.
142pub fn detect_and_convert(data: &[u8]) -> Result<(String, &'static str), String> {
143    let encoding_name = detect_encoding(data);
144    let utf8_string = convert_to_utf8(data, encoding_name)?;
145    Ok((utf8_string, encoding_name))
146}
147
148/// Extract charset from HTTP Content-Type header
149///
150/// Parses the charset parameter from Content-Type headers like:
151/// - `text/xml; charset=utf-8`
152/// - `application/xml;charset=ISO-8859-1`
153/// - `text/html; charset="UTF-8"`
154///
155/// # Arguments
156///
157/// * `content_type` - The Content-Type header value
158///
159/// # Returns
160///
161/// The charset value if found, or None
162///
163/// # Examples
164///
165/// ```
166/// use feedparser_rs::util::encoding::extract_charset_from_content_type;
167///
168/// assert_eq!(
169///     extract_charset_from_content_type("text/xml; charset=utf-8"),
170///     Some("UTF-8")
171/// );
172/// assert_eq!(
173///     extract_charset_from_content_type("text/html"),
174///     None
175/// );
176/// ```
177#[must_use]
178pub fn extract_charset_from_content_type(content_type: &str) -> Option<&'static str> {
179    let lowercase = content_type.to_lowercase();
180
181    // Find charset= parameter
182    let charset_start = lowercase.find("charset=")?;
183    let value_start = charset_start + 8;
184    let rest = &content_type[value_start..];
185
186    // Handle quoted values: charset="UTF-8"
187    let charset_value = if rest.starts_with('"') || rest.starts_with('\'') {
188        let quote = rest.chars().next()?;
189        let end = rest[1..].find(quote)?;
190        &rest[1..=end]
191    } else {
192        // Unquoted value: charset=UTF-8
193        // End at semicolon, space, or end of string
194        let end = rest
195            .find(|c: char| c == ';' || c.is_whitespace())
196            .unwrap_or(rest.len());
197        &rest[..end]
198    };
199
200    normalize_encoding_name(charset_value)
201}
202
203/// Detect encoding with optional HTTP Content-Type hint
204///
205/// This is the preferred function when parsing feeds from HTTP responses,
206/// as it considers the Content-Type charset parameter in addition to
207/// BOM and XML declaration detection.
208///
209/// # Priority Order
210///
211/// 1. BOM (Byte Order Mark) - highest priority, cannot be wrong
212/// 2. HTTP Content-Type charset (if provided)
213/// 3. XML declaration encoding attribute
214/// 4. Default to UTF-8
215///
216/// # Arguments
217///
218/// * `data` - Raw byte data
219/// * `content_type` - Optional HTTP Content-Type header value
220///
221/// # Returns
222///
223/// Detected encoding name
224///
225/// # Examples
226///
227/// ```
228/// use feedparser_rs::util::encoding::detect_encoding_with_hint;
229///
230/// // BOM takes priority over Content-Type
231/// let data = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
232/// assert_eq!(
233///     detect_encoding_with_hint(data, Some("text/xml; charset=ISO-8859-1")),
234///     "UTF-8"
235/// );
236///
237/// // Content-Type is used when no BOM
238/// let data = b"<?xml version=\"1.0\"?>";
239/// assert_eq!(
240///     detect_encoding_with_hint(data, Some("text/xml; charset=ISO-8859-1")),
241///     "windows-1252"
242/// );
243///
244/// // Falls back to XML declaration when no Content-Type
245/// let data = b"<?xml version=\"1.0\" encoding=\"UTF-16\"?>";
246/// assert_eq!(detect_encoding_with_hint(data, None), "UTF-16LE");
247/// ```
248pub fn detect_encoding_with_hint(data: &[u8], content_type: Option<&str>) -> &'static str {
249    // Check BOM first - highest priority
250    if let Some(bom_encoding) = detect_bom(data) {
251        return bom_encoding;
252    }
253
254    // Check Content-Type charset if provided
255    if let Some(ct) = content_type
256        && let Some(charset) = extract_charset_from_content_type(ct)
257    {
258        return charset;
259    }
260
261    // Check XML declaration
262    if let Some(encoding) = extract_xml_encoding(data) {
263        return encoding;
264    }
265
266    // Default to UTF-8
267    "UTF-8"
268}
269
/// Identify the encoding announced by a leading Byte Order Mark.
///
/// Returns the encoding name when `data` begins with a known BOM and
/// `None` otherwise.
fn detect_bom(data: &[u8]) -> Option<&'static str> {
    // Order matters: the UTF-32LE mark (FF FE 00 00) begins with the
    // UTF-16LE mark (FF FE), so the UTF-32 entries must be tried before the
    // UTF-16 ones.
    const SIGNATURES: [(&[u8], &str); 5] = [
        (&[0xEF, 0xBB, 0xBF], "UTF-8"),
        (&[0x00, 0x00, 0xFE, 0xFF], "UTF-32BE"),
        (&[0xFF, 0xFE, 0x00, 0x00], "UTF-32LE"),
        (&[0xFF, 0xFE], "UTF-16LE"),
        (&[0xFE, 0xFF], "UTF-16BE"),
    ];

    SIGNATURES
        .iter()
        .find(|(signature, _)| data.starts_with(signature))
        .map(|&(_, name)| name)
}
293
#[cfg(test)]
mod tests {
    use super::*;

    /// Header value shared by the tests that exercise the Content-Type hint.
    const LATIN1_CONTENT_TYPE: &str = "text/xml; charset=ISO-8859-1";

    /// XML declaration without an `encoding` attribute.
    const BARE_DECLARATION: &[u8] = br#"<?xml version="1.0"?>"#;

    /// Document with no XML declaration at all.
    const NO_DECLARATION: &[u8] = b"<rss><channel></channel></rss>";

    // Encoding detection from BOM, XML declaration, and defaults

    #[test]
    fn test_detect_utf8_bom() {
        let input = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
        assert_eq!("UTF-8", detect_encoding(input));
    }

    #[test]
    fn test_detect_utf16le_bom() {
        let input = b"\xFF\xFE<\x00?\x00x\x00m\x00l\x00";
        assert_eq!("UTF-16LE", detect_encoding(input));
    }

    #[test]
    fn test_detect_utf16be_bom() {
        let input = b"\xFE\xFF\x00<\x00?\x00x\x00m\x00l";
        assert_eq!("UTF-16BE", detect_encoding(input));
    }

    #[test]
    fn test_detect_from_xml_declaration() {
        let input = br#"<?xml version="1.0" encoding="ISO-8859-1"?>"#;
        let detected = detect_encoding(input).to_lowercase();
        assert_eq!(detected, "windows-1252");
    }

    #[test]
    fn test_detect_from_xml_declaration_single_quotes() {
        let input = br"<?xml version='1.0' encoding='UTF-8'?>";
        assert_eq!("UTF-8", detect_encoding(input));
    }

    #[test]
    fn test_detect_default_utf8() {
        assert_eq!("UTF-8", detect_encoding(BARE_DECLARATION));
    }

    // Conversion to UTF-8

    #[test]
    fn test_convert_iso8859_1() {
        let converted = convert_to_utf8(b"\xE9", "iso-8859-1").unwrap();
        assert_eq!(converted, "é");
    }

    #[test]
    fn test_convert_windows1252() {
        let converted = convert_to_utf8(b"\x93Hello\x94", "windows-1252").unwrap();
        assert!(converted.contains("Hello"));
    }

    #[test]
    fn test_detect_and_convert() {
        let input = br#"<?xml version="1.0"?><root>Test</root>"#;
        let (text, detected) = detect_and_convert(input).unwrap();
        assert_eq!("UTF-8", detected);
        assert!(text.contains("Test"));
    }

    // XML declaration parsing

    #[test]
    fn test_extract_xml_encoding_double_quotes() {
        let input = br#"<?xml version="1.0" encoding="UTF-8"?>"#;
        assert!(extract_xml_encoding(input).is_some());
    }

    #[test]
    fn test_extract_xml_encoding_single_quotes() {
        let input = br"<?xml version='1.0' encoding='UTF-8'?>";
        assert!(extract_xml_encoding(input).is_some());
    }

    #[test]
    fn test_extract_xml_encoding_none() {
        assert!(extract_xml_encoding(BARE_DECLARATION).is_none());
    }

    #[test]
    fn test_normalize_encoding_name() {
        let cases = [
            ("UTF-8", "UTF-8"),
            ("utf-8", "UTF-8"),
            ("  UTF-8  ", "UTF-8"),
            ("ISO-8859-1", "windows-1252"),
        ];
        for (label, canonical) in cases {
            assert_eq!(normalize_encoding_name(label), Some(canonical));
        }
    }

    #[test]
    fn test_convert_utf8_to_utf8() {
        let converted = convert_to_utf8(b"Hello", "utf-8").unwrap();
        assert_eq!(converted, "Hello");
    }

    #[test]
    fn test_detect_no_encoding_declaration() {
        assert_eq!("UTF-8", detect_encoding(NO_DECLARATION));
    }

    #[test]
    fn test_empty_data() {
        assert_eq!("UTF-8", detect_encoding(b""));
    }

    // Tests for Content-Type charset extraction

    #[test]
    fn test_extract_charset_basic() {
        let charset = extract_charset_from_content_type("text/xml; charset=utf-8");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_no_space() {
        let charset = extract_charset_from_content_type("text/xml;charset=utf-8");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_quoted() {
        let charset = extract_charset_from_content_type(r#"text/xml; charset="UTF-8""#);
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_single_quoted() {
        let charset = extract_charset_from_content_type("text/xml; charset='UTF-8'");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_uppercase() {
        let charset = extract_charset_from_content_type("TEXT/XML; CHARSET=UTF-8");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_iso8859() {
        let charset = extract_charset_from_content_type("text/html; charset=iso-8859-1");
        assert_eq!(charset, Some("windows-1252"));
    }

    #[test]
    fn test_extract_charset_none() {
        assert!(extract_charset_from_content_type("text/xml").is_none());
    }

    #[test]
    fn test_extract_charset_empty() {
        assert!(extract_charset_from_content_type("").is_none());
    }

    #[test]
    fn test_extract_charset_with_boundary() {
        // Content-Type with multiple parameters
        let header = "multipart/form-data; boundary=----; charset=utf-8";
        assert_eq!(extract_charset_from_content_type(header), Some("UTF-8"));
    }

    // Tests for detect_encoding_with_hint

    #[test]
    fn test_hint_bom_priority() {
        // BOM takes priority over Content-Type
        let input = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
        let detected = detect_encoding_with_hint(input, Some(LATIN1_CONTENT_TYPE));
        assert_eq!(detected, "UTF-8");
    }

    #[test]
    fn test_hint_content_type_used() {
        // Content-Type is used when no BOM
        let detected = detect_encoding_with_hint(BARE_DECLARATION, Some(LATIN1_CONTENT_TYPE));
        assert_eq!(detected, "windows-1252");
    }

    #[test]
    fn test_hint_xml_declaration_fallback() {
        // Falls back to XML declaration when no Content-Type charset
        let input = br#"<?xml version="1.0" encoding="windows-1252"?>"#;
        assert_eq!("windows-1252", detect_encoding_with_hint(input, None));
    }

    #[test]
    fn test_hint_default_utf8() {
        // Default to UTF-8 when no hints
        assert_eq!("UTF-8", detect_encoding_with_hint(NO_DECLARATION, None));
    }

    #[test]
    fn test_hint_content_type_without_charset() {
        // Content-Type without charset falls through to XML declaration
        let input = br#"<?xml version="1.0" encoding="windows-1252"?>"#;
        let detected = detect_encoding_with_hint(input, Some("text/xml"));
        assert_eq!(detected, "windows-1252");
    }

    // Tests for detect_bom

    #[test]
    fn test_detect_bom_utf8() {
        assert_eq!(Some("UTF-8"), detect_bom(&[0xEF, 0xBB, 0xBF]));
    }

    #[test]
    fn test_detect_bom_utf16le() {
        assert_eq!(Some("UTF-16LE"), detect_bom(&[0xFF, 0xFE]));
    }

    #[test]
    fn test_detect_bom_utf16be() {
        assert_eq!(Some("UTF-16BE"), detect_bom(&[0xFE, 0xFF]));
    }

    #[test]
    fn test_detect_bom_utf32le() {
        assert_eq!(Some("UTF-32LE"), detect_bom(&[0xFF, 0xFE, 0x00, 0x00]));
    }

    #[test]
    fn test_detect_bom_utf32be() {
        assert_eq!(Some("UTF-32BE"), detect_bom(&[0x00, 0x00, 0xFE, 0xFF]));
    }

    #[test]
    fn test_detect_bom_none() {
        let without_bom: [&[u8]; 2] = [b"<?xml", b""];
        for input in without_bom {
            assert_eq!(detect_bom(input), None);
        }
    }
}