feedparser_rs/util/
encoding.rs1use encoding_rs::{Encoding, UTF_8};
13
14pub fn detect_encoding(data: &[u8]) -> &'static str {
43 if let Some(bom_encoding) = detect_bom(data) {
45 return bom_encoding;
46 }
47
48 if let Some(encoding) = extract_xml_encoding(data) {
50 return encoding;
51 }
52
53 "UTF-8"
55}
56
57fn extract_xml_encoding(data: &[u8]) -> Option<&'static str> {
66 let search_data = &data[..data.len().min(512)];
69
70 let needle = b"encoding=";
72 let enc_pos = search_data
73 .windows(needle.len())
74 .position(|w| w == needle)?;
75
76 let after_eq = &search_data[enc_pos + needle.len()..];
77 let quote = *after_eq.first()?;
78 if quote != b'"' && quote != b'\'' {
79 return None;
80 }
81
82 let value_bytes = &after_eq[1..];
83 let quote_end = value_bytes.iter().position(|&b| b == quote)?;
84 let encoding_name = std::str::from_utf8(&value_bytes[..quote_end]).ok()?;
86
87 normalize_encoding_name(encoding_name)
88}
89
90fn normalize_encoding_name(name: &str) -> Option<&'static str> {
92 let normalized = name.trim().to_lowercase();
93 Encoding::for_label(normalized.as_bytes()).map(encoding_rs::Encoding::name)
94}
95
96pub fn convert_to_utf8(data: &[u8], encoding_name: &str) -> Result<String, String> {
123 let encoding = Encoding::for_label(encoding_name.as_bytes()).unwrap_or(UTF_8);
124
125 let (cow, _encoding_used, had_errors) = encoding.decode(data);
126
127 if had_errors {
128 Err(format!(
129 "Encoding conversion from {encoding_name} had errors"
130 ))
131 } else {
132 Ok(cow.into_owned())
133 }
134}
135
136pub fn detect_and_convert(data: &[u8]) -> Result<(String, &'static str), String> {
154 let encoding_name = detect_encoding(data);
155 let utf8_string = convert_to_utf8(data, encoding_name)?;
156 Ok((utf8_string, encoding_name))
157}
158
159#[must_use]
189pub fn extract_charset_from_content_type(content_type: &str) -> Option<&'static str> {
190 let lowercase = content_type.to_lowercase();
191
192 let charset_start = lowercase.find("charset=")?;
194 let value_start = charset_start + 8;
195 let rest = &content_type[value_start..];
196
197 let charset_value = if rest.starts_with('"') || rest.starts_with('\'') {
199 let quote = rest.chars().next()?;
200 let end = rest[1..].find(quote)?;
201 &rest[1..=end]
202 } else {
203 let end = rest
206 .find(|c: char| c == ';' || c.is_whitespace())
207 .unwrap_or(rest.len());
208 &rest[..end]
209 };
210
211 normalize_encoding_name(charset_value)
212}
213
214pub fn detect_encoding_with_hint(data: &[u8], content_type: Option<&str>) -> &'static str {
260 if let Some(bom_encoding) = detect_bom(data) {
262 return bom_encoding;
263 }
264
265 if let Some(ct) = content_type
267 && let Some(charset) = extract_charset_from_content_type(ct)
268 {
269 return charset;
270 }
271
272 if let Some(encoding) = extract_xml_encoding(data) {
274 return encoding;
275 }
276
277 "UTF-8"
279}
280
/// Identifies the encoding signalled by a byte-order mark at the start of `data`.
///
/// Returns the encoding name for a UTF-8, UTF-32 (BE/LE) or UTF-16 (LE/BE)
/// BOM, or `None` when `data` does not begin with any of them.
fn detect_bom(data: &[u8]) -> Option<&'static str> {
    match data {
        [0xEF, 0xBB, 0xBF, ..] => Some("UTF-8"),
        [0x00, 0x00, 0xFE, 0xFF, ..] => Some("UTF-32BE"),
        // Must precede the UTF-16LE arm: the UTF-32LE BOM starts with the
        // same two bytes (FF FE) and would otherwise never be reached.
        [0xFF, 0xFE, 0x00, 0x00, ..] => Some("UTF-32LE"),
        [0xFF, 0xFE, ..] => Some("UTF-16LE"),
        [0xFE, 0xFF, ..] => Some("UTF-16BE"),
        _ => None,
    }
}
304
// Unit tests for encoding detection and conversion.
#[cfg(test)]
mod tests {
    use super::*;

    // --- detect_encoding: BOM, XML declaration, and default ---

    #[test]
    fn test_detect_utf8_bom() {
        let data = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    #[test]
    fn test_detect_utf16le_bom() {
        let data = b"\xFF\xFE<\x00?\x00x\x00m\x00l\x00";
        assert_eq!(detect_encoding(data), "UTF-16LE");
    }

    #[test]
    fn test_detect_utf16be_bom() {
        let data = b"\xFE\xFF\x00<\x00?\x00x\x00m\x00l";
        assert_eq!(detect_encoding(data), "UTF-16BE");
    }

    // The ISO-8859-1 label is reported under the canonical name windows-1252.
    #[test]
    fn test_detect_from_xml_declaration() {
        let data = b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>";
        assert_eq!(detect_encoding(data).to_lowercase(), "windows-1252");
    }

    #[test]
    fn test_detect_from_xml_declaration_single_quotes() {
        let data = b"<?xml version='1.0' encoding='UTF-8'?>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    #[test]
    fn test_detect_default_utf8() {
        let data = b"<?xml version=\"1.0\"?>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    // --- convert_to_utf8 / detect_and_convert ---

    #[test]
    fn test_convert_iso8859_1() {
        let data = b"\xE9";
        let utf8 = convert_to_utf8(data, "iso-8859-1").unwrap();
        assert_eq!(utf8, "é");
    }

    // 0x93 / 0x94 are bytes outside ASCII; only the ASCII payload is checked.
    #[test]
    fn test_convert_windows1252() {
        let data = b"\x93Hello\x94";
        let utf8 = convert_to_utf8(data, "windows-1252").unwrap();
        assert!(utf8.contains("Hello"));
    }

    #[test]
    fn test_detect_and_convert() {
        let data = b"<?xml version=\"1.0\"?><root>Test</root>";
        let (utf8, encoding) = detect_and_convert(data).unwrap();
        assert_eq!(encoding, "UTF-8");
        assert!(utf8.contains("Test"));
    }

    // --- extract_xml_encoding ---

    #[test]
    fn test_extract_xml_encoding_double_quotes() {
        let data = b"<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
        assert!(extract_xml_encoding(data).is_some());
    }

    #[test]
    fn test_extract_xml_encoding_single_quotes() {
        let data = b"<?xml version='1.0' encoding='UTF-8'?>";
        assert!(extract_xml_encoding(data).is_some());
    }

    #[test]
    fn test_extract_xml_encoding_none() {
        let data = b"<?xml version=\"1.0\"?>";
        assert!(extract_xml_encoding(data).is_none());
    }

    // --- normalize_encoding_name: case, surrounding whitespace, aliases ---

    #[test]
    fn test_normalize_encoding_name() {
        assert_eq!(normalize_encoding_name("UTF-8"), Some("UTF-8"));
        assert_eq!(normalize_encoding_name("utf-8"), Some("UTF-8"));
        assert_eq!(normalize_encoding_name(" UTF-8 "), Some("UTF-8"));
        assert_eq!(normalize_encoding_name("ISO-8859-1"), Some("windows-1252"));
    }

    // --- edge cases: identity conversion, no declaration, empty input ---

    #[test]
    fn test_convert_utf8_to_utf8() {
        let data = b"Hello";
        let result = convert_to_utf8(data, "utf-8").unwrap();
        assert_eq!(result, "Hello");
    }

    #[test]
    fn test_detect_no_encoding_declaration() {
        let data = b"<rss><channel></channel></rss>";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    #[test]
    fn test_empty_data() {
        let data = b"";
        assert_eq!(detect_encoding(data), "UTF-8");
    }

    // --- extract_charset_from_content_type ---

    #[test]
    fn test_extract_charset_basic() {
        assert_eq!(
            extract_charset_from_content_type("text/xml; charset=utf-8"),
            Some("UTF-8")
        );
    }

    #[test]
    fn test_extract_charset_no_space() {
        assert_eq!(
            extract_charset_from_content_type("text/xml;charset=utf-8"),
            Some("UTF-8")
        );
    }

    #[test]
    fn test_extract_charset_quoted() {
        assert_eq!(
            extract_charset_from_content_type("text/xml; charset=\"UTF-8\""),
            Some("UTF-8")
        );
    }

    #[test]
    fn test_extract_charset_single_quoted() {
        assert_eq!(
            extract_charset_from_content_type("text/xml; charset='UTF-8'"),
            Some("UTF-8")
        );
    }

    #[test]
    fn test_extract_charset_uppercase() {
        assert_eq!(
            extract_charset_from_content_type("TEXT/XML; CHARSET=UTF-8"),
            Some("UTF-8")
        );
    }

    #[test]
    fn test_extract_charset_iso8859() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=iso-8859-1"),
            Some("windows-1252")
        );
    }

    #[test]
    fn test_extract_charset_none() {
        assert_eq!(extract_charset_from_content_type("text/xml"), None);
    }

    #[test]
    fn test_extract_charset_empty() {
        assert_eq!(extract_charset_from_content_type(""), None);
    }

    // A preceding parameter must not prevent the charset from being found.
    #[test]
    fn test_extract_charset_with_boundary() {
        assert_eq!(
            extract_charset_from_content_type("multipart/form-data; boundary=----; charset=utf-8"),
            Some("UTF-8")
        );
    }

    // --- detect_encoding_with_hint: priority between the sources ---

    // A BOM wins over a conflicting Content-Type charset.
    #[test]
    fn test_hint_bom_priority() {
        let data = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
        assert_eq!(
            detect_encoding_with_hint(data, Some("text/xml; charset=ISO-8859-1")),
            "UTF-8"
        );
    }

    // Without a BOM, the Content-Type charset is used.
    #[test]
    fn test_hint_content_type_used() {
        let data = b"<?xml version=\"1.0\"?>";
        assert_eq!(
            detect_encoding_with_hint(data, Some("text/xml; charset=ISO-8859-1")),
            "windows-1252"
        );
    }

    // Without a BOM or Content-Type, the XML declaration is used.
    #[test]
    fn test_hint_xml_declaration_fallback() {
        let data = b"<?xml version=\"1.0\" encoding=\"windows-1252\"?>";
        assert_eq!(detect_encoding_with_hint(data, None), "windows-1252");
    }

    // With no information at all, the default applies.
    #[test]
    fn test_hint_default_utf8() {
        let data = b"<rss><channel></channel></rss>";
        assert_eq!(detect_encoding_with_hint(data, None), "UTF-8");
    }

    // A Content-Type lacking a charset falls through to the XML declaration.
    #[test]
    fn test_hint_content_type_without_charset() {
        let data = b"<?xml version=\"1.0\" encoding=\"windows-1252\"?>";
        assert_eq!(
            detect_encoding_with_hint(data, Some("text/xml")),
            "windows-1252"
        );
    }

    // --- detect_bom ---

    #[test]
    fn test_detect_bom_utf8() {
        assert_eq!(detect_bom(b"\xEF\xBB\xBF"), Some("UTF-8"));
    }

    #[test]
    fn test_detect_bom_utf16le() {
        assert_eq!(detect_bom(b"\xFF\xFE"), Some("UTF-16LE"));
    }

    #[test]
    fn test_detect_bom_utf16be() {
        assert_eq!(detect_bom(b"\xFE\xFF"), Some("UTF-16BE"));
    }

    #[test]
    fn test_detect_bom_utf32le() {
        assert_eq!(detect_bom(b"\xFF\xFE\x00\x00"), Some("UTF-32LE"));
    }

    #[test]
    fn test_detect_bom_utf32be() {
        assert_eq!(detect_bom(b"\x00\x00\xFE\xFF"), Some("UTF-32BE"));
    }

    #[test]
    fn test_detect_bom_none() {
        assert_eq!(detect_bom(b"<?xml"), None);
        assert_eq!(detect_bom(b""), None);
    }
}