use encoding_rs::{Encoding, UTF_8};
/// Guesses the character encoding of `data` and returns its name.
///
/// Sources are consulted in priority order:
/// 1. a byte-order mark at the very start of `data`,
/// 2. an `encoding` pseudo-attribute found by `extract_xml_encoding`,
/// 3. otherwise the default, `"UTF-8"`.
pub fn detect_encoding(data: &[u8]) -> &'static str {
    detect_bom(data)
        // The closure keeps the XML scan lazy: it only runs when no BOM matched.
        .or_else(|| extract_xml_encoding(data))
        .unwrap_or("UTF-8")
}
/// Looks for an `encoding` pseudo-attribute near the start of `data` and
/// returns the canonical name of the encoding it declares.
///
/// Only the first 512 bytes are inspected. The value must be enclosed in
/// matching single or double quotes. XML permits whitespace on either side of
/// the `=` sign (`encoding = "UTF-8"`), so space, tab, CR and LF are skipped
/// there.
///
/// Returns `None` when no `encoding` keyword followed by `=` is found, when the
/// value is not quoted or the closing quote is missing within the inspected
/// window, when the value is not valid UTF-8, or when `normalize_encoding_name`
/// does not recognise the label.
fn extract_xml_encoding(data: &[u8]) -> Option<&'static str> {
    const SNIFF_LIMIT: usize = 512;
    const KEYWORD: &[u8] = b"encoding";

    /// Drops leading XML whitespace (space, tab, CR, LF) from `bytes`.
    fn skip_xml_whitespace(bytes: &[u8]) -> &[u8] {
        let skipped = bytes
            .iter()
            .take_while(|b| matches!(**b, b' ' | b'\t' | b'\r' | b'\n'))
            .count();
        &bytes[skipped..]
    }

    let head = &data[..data.len().min(SNIFF_LIMIT)];

    // First occurrence of the keyword that is actually followed by `=`
    // (optionally separated by whitespace); yields the bytes after the `=`.
    let after_eq = head
        .windows(KEYWORD.len())
        .enumerate()
        .filter(|(_, window)| *window == KEYWORD)
        .find_map(|(at, _)| skip_xml_whitespace(&head[at + KEYWORD.len()..]).strip_prefix(b"="))?;

    let (&quote, value) = skip_xml_whitespace(after_eq).split_first()?;
    if quote != b'"' && quote != b'\'' {
        return None;
    }

    // The value ends at the next occurrence of the same quote character.
    let closing = value.iter().position(|&b| b == quote)?;
    let label = std::str::from_utf8(&value[..closing]).ok()?;
    normalize_encoding_name(label)
}
fn normalize_encoding_name(name: &str) -> Option<&'static str> {
let normalized = name.trim().to_lowercase();
Encoding::for_label(normalized.as_bytes()).map(encoding_rs::Encoding::name)
}
/// Decodes `data` from the encoding named by `encoding_name` into an owned
/// `String`.
///
/// A label that `Encoding::for_label` does not recognise is treated as UTF-8
/// rather than rejected. The encoding that `decode` reports as actually used
/// (the second tuple element) is ignored.
///
/// # Errors
///
/// Returns an error message naming `encoding_name` when the decoder reports
/// that the input contained errors.
pub fn convert_to_utf8(data: &[u8], encoding_name: &str) -> Result<String, String> {
    let decoder = match Encoding::for_label(encoding_name.as_bytes()) {
        Some(known) => known,
        None => UTF_8,
    };

    let (text, _, malformed) = decoder.decode(data);
    if !malformed {
        return Ok(text.into_owned());
    }

    Err(format!(
        "Encoding conversion from {encoding_name} had errors"
    ))
}
/// Detects the encoding of `data` with `detect_encoding` and decodes it with
/// `convert_to_utf8`.
///
/// On success, returns the decoded text together with the detected encoding
/// name.
///
/// # Errors
///
/// Propagates the error message from `convert_to_utf8`.
pub fn detect_and_convert(data: &[u8]) -> Result<(String, &'static str), String> {
    let detected = detect_encoding(data);
    convert_to_utf8(data, detected).map(|text| (text, detected))
}
/// Extracts the `charset` parameter from a `Content-Type` header value and
/// returns the canonical name of the encoding it names.
///
/// The parameter name is matched ASCII-case-insensitively. The value may be
/// bare, in which case it ends at the next `;` or whitespace, or wrapped in
/// single or double quotes.
///
/// Returns `None` when there is no `charset=` parameter, when a quoted value
/// has no closing quote, or when `normalize_encoding_name` does not recognise
/// the label.
#[must_use]
pub fn extract_charset_from_content_type(content_type: &str) -> Option<&'static str> {
    const PARAM: &[u8] = b"charset=";

    // Search the original bytes instead of a lower-cased copy: Unicode
    // lower-casing can change byte lengths, so an offset found in the copy is
    // not a valid offset into `content_type`. The match ends on the ASCII `=`,
    // so `value_start` always falls on a char boundary.
    let value_start = content_type
        .as_bytes()
        .windows(PARAM.len())
        .position(|window| window.eq_ignore_ascii_case(PARAM))?
        + PARAM.len();
    let rest = &content_type[value_start..];

    let label = match rest.chars().next() {
        // Quoted value: everything up to the matching closing quote.
        Some(quote @ ('"' | '\'')) => {
            let quoted = &rest[1..];
            &quoted[..quoted.find(quote)?]
        }
        // Bare value: runs to the next parameter separator or whitespace.
        _ => {
            let end = rest
                .find(|c: char| c == ';' || c.is_whitespace())
                .unwrap_or(rest.len());
            &rest[..end]
        }
    };

    normalize_encoding_name(label)
}
pub fn detect_encoding_with_hint(data: &[u8], content_type: Option<&str>) -> &'static str {
if let Some(bom_encoding) = detect_bom(data) {
return bom_encoding;
}
if let Some(ct) = content_type
&& let Some(charset) = extract_charset_from_content_type(ct)
{
return charset;
}
if let Some(encoding) = extract_xml_encoding(data) {
return encoding;
}
"UTF-8"
}
/// Identifies a byte-order mark at the start of `data` and returns the name of
/// the encoding it signals, or `None` when `data` starts with no known BOM.
///
/// NOTE(review): the UTF-32 names are returned as plain strings here; confirm
/// that downstream decoding recognises them, since `convert_to_utf8` falls back
/// to UTF-8 for labels it cannot resolve.
fn detect_bom(data: &[u8]) -> Option<&'static str> {
    // Order matters: the UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE
    // mark (FF FE), so the four-byte signatures are tested before the two-byte
    // ones.
    const SIGNATURES: [(&[u8], &str); 5] = [
        (&[0xEF, 0xBB, 0xBF], "UTF-8"),
        (&[0x00, 0x00, 0xFE, 0xFF], "UTF-32BE"),
        (&[0xFF, 0xFE, 0x00, 0x00], "UTF-32LE"),
        (&[0xFF, 0xFE], "UTF-16LE"),
        (&[0xFE, 0xFF], "UTF-16BE"),
    ];

    SIGNATURES
        .iter()
        .find(|(signature, _)| data.starts_with(signature))
        .map(|&(_, name)| name)
}
#[cfg(test)]
mod tests {
    use super::*;

    // ---- detect_encoding: byte-order marks ----

    #[test]
    fn test_detect_utf8_bom() {
        let input = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
        let detected = detect_encoding(input);
        assert_eq!(detected, "UTF-8");
    }

    #[test]
    fn test_detect_utf16le_bom() {
        let input = b"\xFF\xFE<\x00?\x00x\x00m\x00l\x00";
        let detected = detect_encoding(input);
        assert_eq!(detected, "UTF-16LE");
    }

    #[test]
    fn test_detect_utf16be_bom() {
        let input = b"\xFE\xFF\x00<\x00?\x00x\x00m\x00l";
        let detected = detect_encoding(input);
        assert_eq!(detected, "UTF-16BE");
    }

    // ---- detect_encoding: XML declaration and default ----

    #[test]
    fn test_detect_from_xml_declaration() {
        let input = b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>";
        let detected = detect_encoding(input).to_lowercase();
        assert_eq!(detected, "windows-1252");
    }

    #[test]
    fn test_detect_from_xml_declaration_single_quotes() {
        let input = b"<?xml version='1.0' encoding='UTF-8'?>";
        let detected = detect_encoding(input);
        assert_eq!(detected, "UTF-8");
    }

    #[test]
    fn test_detect_default_utf8() {
        let input = b"<?xml version=\"1.0\"?>";
        let detected = detect_encoding(input);
        assert_eq!(detected, "UTF-8");
    }

    // ---- convert_to_utf8 ----

    #[test]
    fn test_convert_iso8859_1() {
        let decoded = convert_to_utf8(b"\xE9", "iso-8859-1").unwrap();
        assert_eq!(decoded, "é");
    }

    #[test]
    fn test_convert_windows1252() {
        let decoded = convert_to_utf8(b"\x93Hello\x94", "windows-1252").unwrap();
        assert!(decoded.contains("Hello"));
    }

    // ---- detect_and_convert ----

    #[test]
    fn test_detect_and_convert() {
        let input = b"<?xml version=\"1.0\"?><root>Test</root>";
        let (decoded, detected) = detect_and_convert(input).unwrap();
        assert_eq!(detected, "UTF-8");
        assert!(decoded.contains("Test"));
    }

    // ---- extract_xml_encoding ----

    #[test]
    fn test_extract_xml_encoding_double_quotes() {
        let found = extract_xml_encoding(b"<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
        assert!(found.is_some());
    }

    #[test]
    fn test_extract_xml_encoding_single_quotes() {
        let found = extract_xml_encoding(b"<?xml version='1.0' encoding='UTF-8'?>");
        assert!(found.is_some());
    }

    #[test]
    fn test_extract_xml_encoding_none() {
        let found = extract_xml_encoding(b"<?xml version=\"1.0\"?>");
        assert!(found.is_none());
    }

    // ---- normalize_encoding_name ----

    #[test]
    fn test_normalize_encoding_name() {
        // (label, expected canonical name)
        let cases = [
            ("UTF-8", "UTF-8"),
            ("utf-8", "UTF-8"),
            (" UTF-8 ", "UTF-8"),
            ("ISO-8859-1", "windows-1252"),
        ];
        for (label, canonical) in cases {
            assert_eq!(normalize_encoding_name(label), Some(canonical));
        }
    }

    // ---- misc conversions and defaults ----

    #[test]
    fn test_convert_utf8_to_utf8() {
        let decoded = convert_to_utf8(b"Hello", "utf-8").unwrap();
        assert_eq!(decoded, "Hello");
    }

    #[test]
    fn test_detect_no_encoding_declaration() {
        let detected = detect_encoding(b"<rss><channel></channel></rss>");
        assert_eq!(detected, "UTF-8");
    }

    #[test]
    fn test_empty_data() {
        let detected = detect_encoding(b"");
        assert_eq!(detected, "UTF-8");
    }

    // ---- extract_charset_from_content_type ----

    #[test]
    fn test_extract_charset_basic() {
        let charset = extract_charset_from_content_type("text/xml; charset=utf-8");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_no_space() {
        let charset = extract_charset_from_content_type("text/xml;charset=utf-8");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_quoted() {
        let charset = extract_charset_from_content_type("text/xml; charset=\"UTF-8\"");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_single_quoted() {
        let charset = extract_charset_from_content_type("text/xml; charset='UTF-8'");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_uppercase() {
        let charset = extract_charset_from_content_type("TEXT/XML; CHARSET=UTF-8");
        assert_eq!(charset, Some("UTF-8"));
    }

    #[test]
    fn test_extract_charset_iso8859() {
        let charset = extract_charset_from_content_type("text/html; charset=iso-8859-1");
        assert_eq!(charset, Some("windows-1252"));
    }

    #[test]
    fn test_extract_charset_none() {
        let charset = extract_charset_from_content_type("text/xml");
        assert_eq!(charset, None);
    }

    #[test]
    fn test_extract_charset_empty() {
        let charset = extract_charset_from_content_type("");
        assert_eq!(charset, None);
    }

    #[test]
    fn test_extract_charset_with_boundary() {
        let charset =
            extract_charset_from_content_type("multipart/form-data; boundary=----; charset=utf-8");
        assert_eq!(charset, Some("UTF-8"));
    }

    // ---- detect_encoding_with_hint ----

    #[test]
    fn test_hint_bom_priority() {
        // A BOM wins over a conflicting Content-Type charset.
        let input = b"\xEF\xBB\xBF<?xml version=\"1.0\"?>";
        let detected = detect_encoding_with_hint(input, Some("text/xml; charset=ISO-8859-1"));
        assert_eq!(detected, "UTF-8");
    }

    #[test]
    fn test_hint_content_type_used() {
        let input = b"<?xml version=\"1.0\"?>";
        let detected = detect_encoding_with_hint(input, Some("text/xml; charset=ISO-8859-1"));
        assert_eq!(detected, "windows-1252");
    }

    #[test]
    fn test_hint_xml_declaration_fallback() {
        let input = b"<?xml version=\"1.0\" encoding=\"windows-1252\"?>";
        let detected = detect_encoding_with_hint(input, None);
        assert_eq!(detected, "windows-1252");
    }

    #[test]
    fn test_hint_default_utf8() {
        let detected = detect_encoding_with_hint(b"<rss><channel></channel></rss>", None);
        assert_eq!(detected, "UTF-8");
    }

    #[test]
    fn test_hint_content_type_without_charset() {
        // With no charset in the header, the XML declaration is used instead.
        let input = b"<?xml version=\"1.0\" encoding=\"windows-1252\"?>";
        let detected = detect_encoding_with_hint(input, Some("text/xml"));
        assert_eq!(detected, "windows-1252");
    }

    // ---- detect_bom ----

    #[test]
    fn test_detect_bom_utf8() {
        let bom = detect_bom(b"\xEF\xBB\xBF");
        assert_eq!(bom, Some("UTF-8"));
    }

    #[test]
    fn test_detect_bom_utf16le() {
        let bom = detect_bom(b"\xFF\xFE");
        assert_eq!(bom, Some("UTF-16LE"));
    }

    #[test]
    fn test_detect_bom_utf16be() {
        let bom = detect_bom(b"\xFE\xFF");
        assert_eq!(bom, Some("UTF-16BE"));
    }

    #[test]
    fn test_detect_bom_utf32le() {
        let bom = detect_bom(b"\xFF\xFE\x00\x00");
        assert_eq!(bom, Some("UTF-32LE"));
    }

    #[test]
    fn test_detect_bom_utf32be() {
        let bom = detect_bom(b"\x00\x00\xFE\xFF");
        assert_eq!(bom, Some("UTF-32BE"));
    }

    #[test]
    fn test_detect_bom_none() {
        let without_bom: [&[u8]; 2] = [b"<?xml", b""];
        for input in without_bom {
            assert_eq!(detect_bom(input), None);
        }
    }
}