feedparser_rs/util/
encoding.rs1use encoding_rs::{Encoding, UTF_8};
13
14pub fn detect_encoding(data: &[u8]) -> &'static str {
43 if let Some(bom_encoding) = detect_bom(data) {
45 return bom_encoding;
46 }
47
48 if let Some(encoding) = extract_xml_encoding(data) {
50 return encoding;
51 }
52
53 "UTF-8"
55}
56
57fn extract_xml_encoding(data: &[u8]) -> Option<&'static str> {
61 let search_len = data.len().min(512);
62 let search_data = &data[..search_len];
63
64 if let Ok(header) = std::str::from_utf8(search_data)
65 && let Some(enc_start) = header.find("encoding=")
66 {
67 let after_eq = &header[enc_start + 9..];
68 let quote = after_eq.chars().next()?;
69 if quote == '"' || quote == '\'' {
70 let quote_end = after_eq[1..].find(quote)?;
71 let encoding_name = &after_eq[1..=quote_end];
72 return normalize_encoding_name(encoding_name);
73 }
74 }
75
76 None
77}
78
79fn normalize_encoding_name(name: &str) -> Option<&'static str> {
81 let normalized = name.trim().to_lowercase();
82 Encoding::for_label(normalized.as_bytes()).map(encoding_rs::Encoding::name)
83}
84
85pub fn convert_to_utf8(data: &[u8], encoding_name: &str) -> Result<String, String> {
112 let encoding = Encoding::for_label(encoding_name.as_bytes()).unwrap_or(UTF_8);
113
114 let (cow, _encoding_used, had_errors) = encoding.decode(data);
115
116 if had_errors {
117 Err(format!(
118 "Encoding conversion from {encoding_name} had errors"
119 ))
120 } else {
121 Ok(cow.into_owned())
122 }
123}
124
125pub fn detect_and_convert(data: &[u8]) -> Result<(String, &'static str), String> {
143 let encoding_name = detect_encoding(data);
144 let utf8_string = convert_to_utf8(data, encoding_name)?;
145 Ok((utf8_string, encoding_name))
146}
147
148#[must_use]
178pub fn extract_charset_from_content_type(content_type: &str) -> Option<&'static str> {
179 let lowercase = content_type.to_lowercase();
180
181 let charset_start = lowercase.find("charset=")?;
183 let value_start = charset_start + 8;
184 let rest = &content_type[value_start..];
185
186 let charset_value = if rest.starts_with('"') || rest.starts_with('\'') {
188 let quote = rest.chars().next()?;
189 let end = rest[1..].find(quote)?;
190 &rest[1..=end]
191 } else {
192 let end = rest
195 .find(|c: char| c == ';' || c.is_whitespace())
196 .unwrap_or(rest.len());
197 &rest[..end]
198 };
199
200 normalize_encoding_name(charset_value)
201}
202
203pub fn detect_encoding_with_hint(data: &[u8], content_type: Option<&str>) -> &'static str {
249 if let Some(bom_encoding) = detect_bom(data) {
251 return bom_encoding;
252 }
253
254 if let Some(ct) = content_type
256 && let Some(charset) = extract_charset_from_content_type(ct)
257 {
258 return charset;
259 }
260
261 if let Some(encoding) = extract_xml_encoding(data) {
263 return encoding;
264 }
265
266 "UTF-8"
268}
269
/// Identifies a byte-order mark at the start of `data`.
///
/// Returns the name of the encoding the BOM signals, or `None` when `data`
/// does not begin with one of the known marks.
fn detect_bom(data: &[u8]) -> Option<&'static str> {
    // Order matters: the UTF-32LE mark begins with the UTF-16LE mark, so the
    // four-byte signatures must be tried before the two-byte ones.
    const SIGNATURES: [(&[u8], &str); 5] = [
        (&[0xEF, 0xBB, 0xBF], "UTF-8"),
        (&[0x00, 0x00, 0xFE, 0xFF], "UTF-32BE"),
        (&[0xFF, 0xFE, 0x00, 0x00], "UTF-32LE"),
        (&[0xFF, 0xFE], "UTF-16LE"),
        (&[0xFE, 0xFF], "UTF-16BE"),
    ];

    SIGNATURES
        .iter()
        .find(|(mark, _)| data.starts_with(mark))
        .map(|&(_, name)| name)
}
293
#[cfg(test)]
mod tests {
    use super::*;

    // ---- detect_encoding -------------------------------------------------

    #[test]
    fn test_detect_utf8_bom() {
        let input = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
        assert_eq!(detect_encoding(input), "UTF-8");
    }

    #[test]
    fn test_detect_utf16le_bom() {
        let input = b"\xFF\xFE<\x00?\x00x\x00m\x00l\x00";
        assert_eq!(detect_encoding(input), "UTF-16LE");
    }

    #[test]
    fn test_detect_utf16be_bom() {
        let input = b"\xFE\xFF\x00<\x00?\x00x\x00m\x00l";
        assert_eq!(detect_encoding(input), "UTF-16BE");
    }

    #[test]
    fn test_detect_from_xml_declaration() {
        let input = br#"<?xml version="1.0" encoding="ISO-8859-1"?>"#;
        // ISO-8859-1 is reported under its canonical name, windows-1252.
        assert_eq!(detect_encoding(input).to_lowercase(), "windows-1252");
    }

    #[test]
    fn test_detect_from_xml_declaration_single_quotes() {
        assert_eq!(
            detect_encoding(b"<?xml version='1.0' encoding='UTF-8'?>"),
            "UTF-8"
        );
    }

    #[test]
    fn test_detect_default_utf8() {
        assert_eq!(detect_encoding(br#"<?xml version="1.0"?>"#), "UTF-8");
    }

    // ---- convert_to_utf8 / detect_and_convert ----------------------------

    #[test]
    fn test_convert_iso8859_1() {
        let decoded = convert_to_utf8(b"\xE9", "iso-8859-1").unwrap();
        assert_eq!(decoded, "é");
    }

    #[test]
    fn test_convert_windows1252() {
        // 0x93 / 0x94 are the curly quotes in windows-1252.
        let decoded = convert_to_utf8(b"\x93Hello\x94", "windows-1252").unwrap();
        assert!(decoded.contains("Hello"));
    }

    #[test]
    fn test_detect_and_convert() {
        let input = br#"<?xml version="1.0"?><root>Test</root>"#;
        let (text, name) = detect_and_convert(input).unwrap();
        assert_eq!(name, "UTF-8");
        assert!(text.contains("Test"));
    }

    // ---- extract_xml_encoding --------------------------------------------

    #[test]
    fn test_extract_xml_encoding_double_quotes() {
        let input = br#"<?xml version="1.0" encoding="UTF-8"?>"#;
        assert!(extract_xml_encoding(input).is_some());
    }

    #[test]
    fn test_extract_xml_encoding_single_quotes() {
        let input = b"<?xml version='1.0' encoding='UTF-8'?>";
        assert!(extract_xml_encoding(input).is_some());
    }

    #[test]
    fn test_extract_xml_encoding_none() {
        let input = br#"<?xml version="1.0"?>"#;
        assert!(extract_xml_encoding(input).is_none());
    }

    // ---- normalize_encoding_name -----------------------------------------

    #[test]
    fn test_normalize_encoding_name() {
        assert_eq!(normalize_encoding_name("UTF-8"), Some("UTF-8"));
        assert_eq!(normalize_encoding_name("utf-8"), Some("UTF-8"));
        assert_eq!(normalize_encoding_name(" UTF-8 "), Some("UTF-8"));
        assert_eq!(normalize_encoding_name("ISO-8859-1"), Some("windows-1252"));
    }

    // ---- miscellaneous edge cases ----------------------------------------

    #[test]
    fn test_convert_utf8_to_utf8() {
        let decoded = convert_to_utf8(b"Hello", "utf-8").unwrap();
        assert_eq!(decoded, "Hello");
    }

    #[test]
    fn test_detect_no_encoding_declaration() {
        assert_eq!(detect_encoding(b"<rss><channel></channel></rss>"), "UTF-8");
    }

    #[test]
    fn test_empty_data() {
        assert_eq!(detect_encoding(b""), "UTF-8");
    }

    // ---- extract_charset_from_content_type -------------------------------

    #[test]
    fn test_extract_charset_basic() {
        let found = extract_charset_from_content_type("text/xml; charset=utf-8");
        assert_eq!(found, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_no_space() {
        let found = extract_charset_from_content_type("text/xml;charset=utf-8");
        assert_eq!(found, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_quoted() {
        let found = extract_charset_from_content_type(r#"text/xml; charset="UTF-8""#);
        assert_eq!(found, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_single_quoted() {
        let found = extract_charset_from_content_type("text/xml; charset='UTF-8'");
        assert_eq!(found, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_uppercase() {
        let found = extract_charset_from_content_type("TEXT/XML; CHARSET=UTF-8");
        assert_eq!(found, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_iso8859() {
        let found = extract_charset_from_content_type("text/html; charset=iso-8859-1");
        assert_eq!(found, Some("windows-1252"));
    }

    #[test]
    fn test_extract_charset_none() {
        assert_eq!(extract_charset_from_content_type("text/xml"), None);
    }

    #[test]
    fn test_extract_charset_empty() {
        assert_eq!(extract_charset_from_content_type(""), None);
    }

    #[test]
    fn test_extract_charset_with_boundary() {
        let header = "multipart/form-data; boundary=----; charset=utf-8";
        assert_eq!(extract_charset_from_content_type(header), Some("UTF-8"));
    }

    // ---- detect_encoding_with_hint ---------------------------------------

    #[test]
    fn test_hint_bom_priority() {
        // A BOM outranks whatever the Content-Type header claims.
        let input = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
        let hint = Some("text/xml; charset=ISO-8859-1");
        assert_eq!(detect_encoding_with_hint(input, hint), "UTF-8");
    }

    #[test]
    fn test_hint_content_type_used() {
        let input = br#"<?xml version="1.0"?>"#;
        let hint = Some("text/xml; charset=ISO-8859-1");
        assert_eq!(detect_encoding_with_hint(input, hint), "windows-1252");
    }

    #[test]
    fn test_hint_xml_declaration_fallback() {
        let input = br#"<?xml version="1.0" encoding="windows-1252"?>"#;
        assert_eq!(detect_encoding_with_hint(input, None), "windows-1252");
    }

    #[test]
    fn test_hint_default_utf8() {
        let input = b"<rss><channel></channel></rss>";
        assert_eq!(detect_encoding_with_hint(input, None), "UTF-8");
    }

    #[test]
    fn test_hint_content_type_without_charset() {
        // A header without a charset parameter defers to the XML declaration.
        let input = br#"<?xml version="1.0" encoding="windows-1252"?>"#;
        assert_eq!(
            detect_encoding_with_hint(input, Some("text/xml")),
            "windows-1252"
        );
    }

    // ---- detect_bom ------------------------------------------------------

    #[test]
    fn test_detect_bom_utf8() {
        assert_eq!(detect_bom(b"\xEF\xBB\xBF"), Some("UTF-8"));
    }

    #[test]
    fn test_detect_bom_utf16le() {
        assert_eq!(detect_bom(b"\xFF\xFE"), Some("UTF-16LE"));
    }

    #[test]
    fn test_detect_bom_utf16be() {
        assert_eq!(detect_bom(b"\xFE\xFF"), Some("UTF-16BE"));
    }

    #[test]
    fn test_detect_bom_utf32le() {
        assert_eq!(detect_bom(b"\xFF\xFE\x00\x00"), Some("UTF-32LE"));
    }

    #[test]
    fn test_detect_bom_utf32be() {
        assert_eq!(detect_bom(b"\x00\x00\xFE\xFF"), Some("UTF-32BE"));
    }

    #[test]
    fn test_detect_bom_none() {
        assert_eq!(detect_bom(b"<?xml"), None);
        assert_eq!(detect_bom(b""), None);
    }
}