Skip to main content

feedparser_rs/util/
encoding.rs

1//! Encoding detection and conversion utilities
2//!
3//! This module provides functions for detecting character encoding
4//! and converting to UTF-8.
5//!
6//! Encoding detection follows this priority order:
7//! 1. BOM (Byte Order Mark) - highest priority
8//! 2. HTTP Content-Type charset (if provided)
9//! 3. XML declaration encoding attribute
10//! 4. Default to UTF-8
11
12use encoding_rs::{Encoding, UTF_8};
13
14/// Detect character encoding from byte data
15///
16/// Detection order:
17/// 1. BOM (Byte Order Mark)
18/// 2. XML declaration (<?xml encoding="..."?>)
19/// 3. Default to UTF-8
20///
21/// # Arguments
22///
23/// * `data` - Raw byte data
24///
25/// # Returns
26///
27/// Detected encoding name (e.g., "utf-8", "iso-8859-1")
28///
29/// # Examples
30///
31/// ```
32/// use feedparser_rs::util::encoding::detect_encoding;
33///
34/// // UTF-8 with BOM
35/// let data = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
36/// assert_eq!(detect_encoding(data), "UTF-8");
37///
38/// // XML declaration
39/// let data = b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>";
40/// assert_eq!(detect_encoding(data), "windows-1252");
41/// ```
42pub fn detect_encoding(data: &[u8]) -> &'static str {
43    // Check BOM first
44    if let Some(bom_encoding) = detect_bom(data) {
45        return bom_encoding;
46    }
47
48    // Check XML declaration
49    if let Some(encoding) = extract_xml_encoding(data) {
50        return encoding;
51    }
52
53    // Default to UTF-8
54    "UTF-8"
55}
56
57/// Extract encoding from XML declaration
58///
59/// Parses <?xml version="1.0" encoding="..."?> declaration.
60///
61/// Operates on raw bytes rather than converting to UTF-8 first, so it works
62/// correctly even when non-ASCII bytes appear before the first 512 bytes of
63/// content (e.g. ISO-8859-1 titles that begin within 512 bytes of the start).
64/// The XML declaration itself is always ASCII, so a byte-level search is safe.
65fn extract_xml_encoding(data: &[u8]) -> Option<&'static str> {
66    // Only scan the opening portion of the document; the XML declaration must
67    // appear before any non-ASCII content so 512 bytes is more than enough.
68    let search_data = &data[..data.len().min(512)];
69
70    // Locate b"encoding=" using a byte-level search.
71    let needle = b"encoding=";
72    let enc_pos = search_data
73        .windows(needle.len())
74        .position(|w| w == needle)?;
75
76    let after_eq = &search_data[enc_pos + needle.len()..];
77    let quote = *after_eq.first()?;
78    if quote != b'"' && quote != b'\'' {
79        return None;
80    }
81
82    let value_bytes = &after_eq[1..];
83    let quote_end = value_bytes.iter().position(|&b| b == quote)?;
84    // The encoding name is always ASCII; reject if it contains non-ASCII bytes.
85    let encoding_name = std::str::from_utf8(&value_bytes[..quote_end]).ok()?;
86
87    normalize_encoding_name(encoding_name)
88}
89
90/// Normalize encoding name to `encoding_rs` canonical form
91fn normalize_encoding_name(name: &str) -> Option<&'static str> {
92    let normalized = name.trim().to_lowercase();
93    Encoding::for_label(normalized.as_bytes()).map(encoding_rs::Encoding::name)
94}
95
96/// Convert data to UTF-8 from detected encoding
97///
98/// # Arguments
99///
100/// * `data` - Raw byte data in unknown encoding
101/// * `encoding_name` - Encoding name (e.g., "iso-8859-1")
102///
103/// # Returns
104///
105/// * `Ok(String)` - UTF-8 string
106/// * `Err(String)` - Error message if conversion failed
107///
108/// # Examples
109///
110/// ```
111/// use feedparser_rs::util::encoding::convert_to_utf8;
112///
113/// let latin1 = b"\xE9"; // é in ISO-8859-1
114/// let utf8 = convert_to_utf8(latin1, "iso-8859-1").unwrap();
115/// assert_eq!(utf8, "é");
116/// ```
117///
118/// # Errors
119///
120/// Returns an error if the encoding conversion encounters invalid byte sequences
121/// that cannot be properly decoded.
122pub fn convert_to_utf8(data: &[u8], encoding_name: &str) -> Result<String, String> {
123    let encoding = Encoding::for_label(encoding_name.as_bytes()).unwrap_or(UTF_8);
124
125    let (cow, _encoding_used, had_errors) = encoding.decode(data);
126
127    if had_errors {
128        Err(format!(
129            "Encoding conversion from {encoding_name} had errors"
130        ))
131    } else {
132        Ok(cow.into_owned())
133    }
134}
135
136/// Detect encoding and convert to UTF-8 in one step
137///
138/// # Examples
139///
140/// ```
141/// use feedparser_rs::util::encoding::detect_and_convert;
142///
143/// let data = b"<?xml version=\"1.0\"?><root>Test</root>";
144/// let (utf8, detected_encoding) = detect_and_convert(data).unwrap();
145/// assert_eq!(detected_encoding, "UTF-8");
146/// assert!(utf8.contains("Test"));
147/// ```
148///
149/// # Errors
150///
151/// Returns an error if the encoding conversion encounters invalid byte sequences
152/// that cannot be properly decoded.
153pub fn detect_and_convert(data: &[u8]) -> Result<(String, &'static str), String> {
154    let encoding_name = detect_encoding(data);
155    let utf8_string = convert_to_utf8(data, encoding_name)?;
156    Ok((utf8_string, encoding_name))
157}
158
159/// Extract charset from HTTP Content-Type header
160///
161/// Parses the charset parameter from Content-Type headers like:
162/// - `text/xml; charset=utf-8`
163/// - `application/xml;charset=ISO-8859-1`
164/// - `text/html; charset="UTF-8"`
165///
166/// # Arguments
167///
168/// * `content_type` - The Content-Type header value
169///
170/// # Returns
171///
172/// The charset value if found, or None
173///
174/// # Examples
175///
176/// ```
177/// use feedparser_rs::util::encoding::extract_charset_from_content_type;
178///
179/// assert_eq!(
180///     extract_charset_from_content_type("text/xml; charset=utf-8"),
181///     Some("UTF-8")
182/// );
183/// assert_eq!(
184///     extract_charset_from_content_type("text/html"),
185///     None
186/// );
187/// ```
188#[must_use]
189pub fn extract_charset_from_content_type(content_type: &str) -> Option<&'static str> {
190    let lowercase = content_type.to_lowercase();
191
192    // Find charset= parameter
193    let charset_start = lowercase.find("charset=")?;
194    let value_start = charset_start + 8;
195    let rest = &content_type[value_start..];
196
197    // Handle quoted values: charset="UTF-8"
198    let charset_value = if rest.starts_with('"') || rest.starts_with('\'') {
199        let quote = rest.chars().next()?;
200        let end = rest[1..].find(quote)?;
201        &rest[1..=end]
202    } else {
203        // Unquoted value: charset=UTF-8
204        // End at semicolon, space, or end of string
205        let end = rest
206            .find(|c: char| c == ';' || c.is_whitespace())
207            .unwrap_or(rest.len());
208        &rest[..end]
209    };
210
211    normalize_encoding_name(charset_value)
212}
213
214/// Detect encoding with optional HTTP Content-Type hint
215///
216/// This is the preferred function when parsing feeds from HTTP responses,
217/// as it considers the Content-Type charset parameter in addition to
218/// BOM and XML declaration detection.
219///
220/// # Priority Order
221///
222/// 1. BOM (Byte Order Mark) - highest priority, cannot be wrong
223/// 2. HTTP Content-Type charset (if provided)
224/// 3. XML declaration encoding attribute
225/// 4. Default to UTF-8
226///
227/// # Arguments
228///
229/// * `data` - Raw byte data
230/// * `content_type` - Optional HTTP Content-Type header value
231///
232/// # Returns
233///
234/// Detected encoding name
235///
236/// # Examples
237///
238/// ```
239/// use feedparser_rs::util::encoding::detect_encoding_with_hint;
240///
241/// // BOM takes priority over Content-Type
242/// let data = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
243/// assert_eq!(
244///     detect_encoding_with_hint(data, Some("text/xml; charset=ISO-8859-1")),
245///     "UTF-8"
246/// );
247///
248/// // Content-Type is used when no BOM
249/// let data = b"<?xml version=\"1.0\"?>";
250/// assert_eq!(
251///     detect_encoding_with_hint(data, Some("text/xml; charset=ISO-8859-1")),
252///     "windows-1252"
253/// );
254///
255/// // Falls back to XML declaration when no Content-Type
256/// let data = b"<?xml version=\"1.0\" encoding=\"UTF-16\"?>";
257/// assert_eq!(detect_encoding_with_hint(data, None), "UTF-16LE");
258/// ```
259pub fn detect_encoding_with_hint(data: &[u8], content_type: Option<&str>) -> &'static str {
260    // Check BOM first - highest priority
261    if let Some(bom_encoding) = detect_bom(data) {
262        return bom_encoding;
263    }
264
265    // Check Content-Type charset if provided
266    if let Some(ct) = content_type
267        && let Some(charset) = extract_charset_from_content_type(ct)
268    {
269        return charset;
270    }
271
272    // Check XML declaration
273    if let Some(encoding) = extract_xml_encoding(data) {
274        return encoding;
275    }
276
277    // Default to UTF-8
278    "UTF-8"
279}
280
/// Identify the encoding announced by a leading byte order mark.
///
/// Returns the encoding name when `data` begins with one of the known BOMs
/// (UTF-8, UTF-32BE, UTF-32LE, UTF-16LE, UTF-16BE) and `None` otherwise.
fn detect_bom(data: &[u8]) -> Option<&'static str> {
    // Checked top to bottom. The UTF-32 entries have to precede the UTF-16
    // ones because the UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE
    // mark (FF FE).
    const SIGNATURES: [(&[u8], &str); 5] = [
        (&[0xEF, 0xBB, 0xBF], "UTF-8"),
        (&[0x00, 0x00, 0xFE, 0xFF], "UTF-32BE"),
        (&[0xFF, 0xFE, 0x00, 0x00], "UTF-32LE"),
        (&[0xFF, 0xFE], "UTF-16LE"),
        (&[0xFE, 0xFF], "UTF-16BE"),
    ];

    SIGNATURES
        .iter()
        .find(|(signature, _)| data.starts_with(signature))
        .map(|&(_, name)| name)
}
304
#[cfg(test)]
mod tests {
    use super::*;

    // detect_encoding: BOM, XML declaration, default

    #[test]
    fn test_detect_utf8_bom() {
        let input = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
        let detected = detect_encoding(input);
        assert_eq!(detected, "UTF-8");
    }

    #[test]
    fn test_detect_utf16le_bom() {
        let input = b"\xFF\xFE<\x00?\x00x\x00m\x00l\x00";
        let detected = detect_encoding(input);
        assert_eq!(detected, "UTF-16LE");
    }

    #[test]
    fn test_detect_utf16be_bom() {
        let input = b"\xFE\xFF\x00<\x00?\x00x\x00m\x00l";
        let detected = detect_encoding(input);
        assert_eq!(detected, "UTF-16BE");
    }

    #[test]
    fn test_detect_from_xml_declaration() {
        let input = b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>";
        let detected = detect_encoding(input).to_lowercase();
        assert_eq!(detected, "windows-1252");
    }

    #[test]
    fn test_detect_from_xml_declaration_single_quotes() {
        let input = b"<?xml version='1.0' encoding='UTF-8'?>";
        let detected = detect_encoding(input);
        assert_eq!(detected, "UTF-8");
    }

    #[test]
    fn test_detect_default_utf8() {
        let input = b"<?xml version=\"1.0\"?>";
        let detected = detect_encoding(input);
        assert_eq!(detected, "UTF-8");
    }

    // convert_to_utf8 / detect_and_convert

    #[test]
    fn test_convert_iso8859_1() {
        let decoded = convert_to_utf8(b"\xE9", "iso-8859-1").unwrap();
        assert_eq!(decoded, "é");
    }

    #[test]
    fn test_convert_windows1252() {
        let decoded = convert_to_utf8(b"\x93Hello\x94", "windows-1252").unwrap();
        assert!(decoded.contains("Hello"));
    }

    #[test]
    fn test_detect_and_convert() {
        let input = b"<?xml version=\"1.0\"?><root>Test</root>";
        let (text, detected) = detect_and_convert(input).unwrap();
        assert_eq!(detected, "UTF-8");
        assert!(text.contains("Test"));
    }

    // extract_xml_encoding

    #[test]
    fn test_extract_xml_encoding_double_quotes() {
        let input = b"<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
        let found = extract_xml_encoding(input);
        assert!(found.is_some());
    }

    #[test]
    fn test_extract_xml_encoding_single_quotes() {
        let input = b"<?xml version='1.0' encoding='UTF-8'?>";
        let found = extract_xml_encoding(input);
        assert!(found.is_some());
    }

    #[test]
    fn test_extract_xml_encoding_none() {
        let input = b"<?xml version=\"1.0\"?>";
        let found = extract_xml_encoding(input);
        assert!(found.is_none());
    }

    // normalize_encoding_name

    #[test]
    fn test_normalize_encoding_name() {
        // Exact, differently-cased, and whitespace-padded labels.
        assert_eq!(normalize_encoding_name("UTF-8"), Some("UTF-8"));
        assert_eq!(normalize_encoding_name("utf-8"), Some("UTF-8"));
        assert_eq!(normalize_encoding_name("  UTF-8  "), Some("UTF-8"));
        // ISO-8859-1 is reported under the windows-1252 name.
        assert_eq!(normalize_encoding_name("ISO-8859-1"), Some("windows-1252"));
    }

    // Remaining detection/conversion edge cases

    #[test]
    fn test_convert_utf8_to_utf8() {
        let decoded = convert_to_utf8(b"Hello", "utf-8").unwrap();
        assert_eq!(decoded, "Hello");
    }

    #[test]
    fn test_detect_no_encoding_declaration() {
        let input = b"<rss><channel></channel></rss>";
        let detected = detect_encoding(input);
        assert_eq!(detected, "UTF-8");
    }

    #[test]
    fn test_empty_data() {
        let detected = detect_encoding(b"");
        assert_eq!(detected, "UTF-8");
    }

    // extract_charset_from_content_type

    #[test]
    fn test_extract_charset_basic() {
        let charset = extract_charset_from_content_type("text/xml; charset=utf-8");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_no_space() {
        let charset = extract_charset_from_content_type("text/xml;charset=utf-8");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_quoted() {
        let charset = extract_charset_from_content_type("text/xml; charset=\"UTF-8\"");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_single_quoted() {
        let charset = extract_charset_from_content_type("text/xml; charset='UTF-8'");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_uppercase() {
        let charset = extract_charset_from_content_type("TEXT/XML; CHARSET=UTF-8");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_iso8859() {
        let charset = extract_charset_from_content_type("text/html; charset=iso-8859-1");
        assert_eq!(charset, Some("windows-1252"));
    }

    #[test]
    fn test_extract_charset_none() {
        let charset = extract_charset_from_content_type("text/xml");
        assert_eq!(charset, None);
    }

    #[test]
    fn test_extract_charset_empty() {
        let charset = extract_charset_from_content_type("");
        assert_eq!(charset, None);
    }

    #[test]
    fn test_extract_charset_with_boundary() {
        // charset is not the first parameter in the header
        let header = "multipart/form-data; boundary=----; charset=utf-8";
        assert_eq!(extract_charset_from_content_type(header), Some("UTF-8"));
    }

    // detect_encoding_with_hint

    #[test]
    fn test_hint_bom_priority() {
        // A BOM outranks the Content-Type charset
        let input = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
        let hint = Some("text/xml; charset=ISO-8859-1");
        assert_eq!(detect_encoding_with_hint(input, hint), "UTF-8");
    }

    #[test]
    fn test_hint_content_type_used() {
        // Without a BOM the Content-Type charset decides
        let input = b"<?xml version=\"1.0\"?>";
        let hint = Some("text/xml; charset=ISO-8859-1");
        assert_eq!(detect_encoding_with_hint(input, hint), "windows-1252");
    }

    #[test]
    fn test_hint_xml_declaration_fallback() {
        // No Content-Type at all: the XML declaration decides
        let input = b"<?xml version=\"1.0\" encoding=\"windows-1252\"?>";
        assert_eq!(detect_encoding_with_hint(input, None), "windows-1252");
    }

    #[test]
    fn test_hint_default_utf8() {
        // Nothing to go on: UTF-8
        let input = b"<rss><channel></channel></rss>";
        assert_eq!(detect_encoding_with_hint(input, None), "UTF-8");
    }

    #[test]
    fn test_hint_content_type_without_charset() {
        // A Content-Type that carries no charset defers to the XML declaration
        let input = b"<?xml version=\"1.0\" encoding=\"windows-1252\"?>";
        let hint = Some("text/xml");
        assert_eq!(detect_encoding_with_hint(input, hint), "windows-1252");
    }

    // detect_bom

    #[test]
    fn test_detect_bom_utf8() {
        let found = detect_bom(b"\xEF\xBB\xBF");
        assert_eq!(found, Some("UTF-8"));
    }

    #[test]
    fn test_detect_bom_utf16le() {
        let found = detect_bom(b"\xFF\xFE");
        assert_eq!(found, Some("UTF-16LE"));
    }

    #[test]
    fn test_detect_bom_utf16be() {
        let found = detect_bom(b"\xFE\xFF");
        assert_eq!(found, Some("UTF-16BE"));
    }

    #[test]
    fn test_detect_bom_utf32le() {
        let found = detect_bom(b"\xFF\xFE\x00\x00");
        assert_eq!(found, Some("UTF-32LE"));
    }

    #[test]
    fn test_detect_bom_utf32be() {
        let found = detect_bom(b"\x00\x00\xFE\xFF");
        assert_eq!(found, Some("UTF-32BE"));
    }

    #[test]
    fn test_detect_bom_none() {
        for input in [&b"<?xml"[..], &b""[..]] {
            assert_eq!(detect_bom(input), None);
        }
    }
}