use encoding_rs::Encoding;
/// Upper bound, in bytes, on how much of the input `prescan_meta` examines
/// when looking for a `<meta>` encoding declaration.
const PRESCAN_LIMIT: usize = 1024;
/// Determines the character encoding of `input`.
///
/// Sources are consulted in priority order:
///
/// 1. a byte-order mark at the very start of the input (`detect_bom`);
/// 2. a `<meta>` declaration near the start of the input (`prescan_meta`).
///
/// When neither source yields an encoding, UTF-8 is returned.
pub fn detect(input: &[u8]) -> &'static Encoding {
    detect_bom(input)
        .or_else(|| prescan_meta(input))
        .unwrap_or(encoding_rs::UTF_8)
}
/// Recognises a byte-order mark at the start of `input`.
///
/// Returns UTF-8 for `EF BB BF`, UTF-16LE for `FF FE`, UTF-16BE for `FE FF`,
/// and `None` when the input starts with none of these sequences (including
/// when it is too short to hold one).
fn detect_bom(input: &[u8]) -> Option<&'static Encoding> {
    match input {
        [0xEF, 0xBB, 0xBF, ..] => Some(encoding_rs::UTF_8),
        [0xFF, 0xFE, ..] => Some(encoding_rs::UTF_16LE),
        [0xFE, 0xFF, ..] => Some(encoding_rs::UTF_16BE),
        _ => None,
    }
}
/// Scans the first `PRESCAN_LIMIT` bytes of `input` for a `<meta>` tag that
/// declares an encoding.
///
/// Every complete `<meta ...>` tag inside that window is handed first to
/// `extract_charset_attr` and then to `extract_http_equiv_charset`; the first
/// encoding either of them reports is returned. Returns `None` when no tag in
/// the window yields one.
///
/// The tag name is matched ASCII case-insensitively and must be followed by
/// whitespace or `/`, so elements whose names merely start with "meta" (for
/// example `<metadata>`) are not mistaken for `<meta>`.
fn prescan_meta(input: &[u8]) -> Option<&'static Encoding> {
    const META_OPEN: &[u8] = b"<meta";

    let limit = input.len().min(PRESCAN_LIMIT);
    let haystack = &input[..limit];
    let mut pos = 0;
    while pos < haystack.len() {
        let Some(lt) = memchr_byte(b'<', &haystack[pos..]) else {
            break;
        };
        let tag_start = pos + lt;
        // Unless a whole tag is consumed below, resume right after this `<`.
        pos = tag_start + 1;

        let candidate = &haystack[tag_start..];
        if !starts_with_ci(candidate, META_OPEN) {
            continue;
        }
        // The tag name must end here; otherwise this is some other element
        // whose name happens to begin with "meta".
        let name_ends_here = matches!(
            candidate.get(META_OPEN.len()),
            Some(b) if b.is_ascii_whitespace() || *b == b'/'
        );
        if !name_ends_here {
            continue;
        }

        let Some(gt_offset) = memchr_byte(b'>', candidate) else {
            // No `>` remains in the window, so no later tag can be complete.
            break;
        };
        let tag_end = tag_start + gt_offset + 1;
        let tag_bytes = &haystack[tag_start..tag_end];
        pos = tag_end;

        if let Some(enc) = extract_charset_attr(tag_bytes) {
            return Some(enc);
        }
        if let Some(enc) = extract_http_equiv_charset(tag_bytes) {
            return Some(enc);
        }
    }
    None
}
/// Extracts an encoding from a `charset=...` attribute inside the bytes of a
/// single tag.
///
/// The tag is searched ASCII case-insensitively for `charset`. An occurrence
/// counts only when it is followed (after optional whitespace) by `=` and a
/// value that `Encoding::for_label` recognises. Occurrences that do not
/// qualify, such as the word appearing inside another attribute's value, are
/// skipped and the search continues with the next one.
///
/// Returns `None` when no occurrence in `tag` yields a recognised label.
fn extract_charset_attr(tag: &[u8]) -> Option<&'static Encoding> {
    const CHARSET: &[u8] = b"charset";

    let mut search_start = 0;
    loop {
        // `?` ends the search once no further occurrence exists.
        let found = find_subsequence_ci(&tag[search_start..], CHARSET)?;
        let name_end = search_start + found + CHARSET.len();
        // Always advance past this occurrence so the loop terminates.
        search_start = name_end;

        let Some((&b'=', after_eq)) = skip_ws(&tag[name_end..]).split_first() else {
            continue;
        };
        let resolved = read_attr_value(skip_ws(after_eq))
            .and_then(|value| Encoding::for_label(value.as_bytes()));
        if resolved.is_some() {
            return resolved;
        }
    }
}
/// Extracts an encoding from a tag of the form
/// `<meta http-equiv="Content-Type" content="text/html; charset=...">`.
///
/// Returns `None` unless the tag bytes contain both `http-equiv` and
/// `content-type` (ASCII case-insensitive), a `content` attribute can be read,
/// and that attribute's value carries a `charset` parameter naming a label
/// that `Encoding::for_label` recognises.
///
/// The `charset` parameter is parsed leniently: whitespace may surround the
/// `=`, the label may be wrapped in single or double quotes, and an unquoted
/// label ends at whitespace or `;`.
fn extract_http_equiv_charset(tag: &[u8]) -> Option<&'static Encoding> {
    const CONTENT: &[u8] = b"content";
    const CHARSET: &[u8] = b"charset";

    if !contains_subsequence_ci(tag, b"http-equiv")
        || !contains_subsequence_ci(tag, b"content-type")
    {
        return None;
    }

    // Find the `content` attribute itself. The needle also matches inside
    // "Content-Type", so keep going until an occurrence is followed by `=`.
    let mut search_start = 0;
    let content_value = loop {
        let found = find_subsequence_ci(&tag[search_start..], CONTENT)?;
        let name_end = search_start + found + CONTENT.len();
        if let Some((&b'=', after_eq)) = skip_ws(&tag[name_end..]).split_first() {
            break read_attr_value(skip_ws(after_eq))?;
        }
        search_start = name_end;
    };

    // Within the content value, take the first `charset` that is followed by
    // `=` and resolve whatever label comes after it.
    let value = content_value.as_bytes();
    let mut from = 0;
    loop {
        let found = find_subsequence_ci(&value[from..], CHARSET)?;
        let name_end = from + found + CHARSET.len();
        from = name_end;

        let Some((&b'=', after_eq)) = skip_ws(&value[name_end..]).split_first() else {
            continue;
        };
        let label = read_attr_value(skip_ws(after_eq))?;
        return Encoding::for_label(label.as_bytes());
    }
}
/// Returns the index of the first byte in `haystack` equal to `needle`, or
/// `None` when the byte does not occur.
#[inline]
fn memchr_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
    for (index, &byte) in haystack.iter().enumerate() {
        if byte == needle {
            return Some(index);
        }
    }
    None
}
/// Reports whether `haystack` begins with `needle`, comparing bytes ASCII
/// case-insensitively.
///
/// A haystack shorter than the needle never matches; an empty needle always
/// does.
fn starts_with_ci(haystack: &[u8], needle: &[u8]) -> bool {
    match haystack.get(..needle.len()) {
        Some(prefix) => prefix.eq_ignore_ascii_case(needle),
        None => false,
    }
}
/// Returns the index of the first occurrence of `needle` within `haystack`,
/// comparing bytes ASCII case-insensitively, or `None` when it does not occur.
///
/// An empty needle matches at index 0, mirroring `str::find("")`.
fn find_subsequence_ci(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // `slice::windows` panics on a window size of zero, so handle the empty
    // needle before reaching it.
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window.eq_ignore_ascii_case(needle))
}
/// Reports whether `needle` occurs anywhere in `haystack`, ignoring ASCII
/// case. Thin boolean wrapper over `find_subsequence_ci`.
fn contains_subsequence_ci(haystack: &[u8], needle: &[u8]) -> bool {
    matches!(find_subsequence_ci(haystack, needle), Some(_))
}
/// Returns `input` with any leading ASCII whitespace removed.
///
/// The result is empty when `input` is empty or consists solely of
/// whitespace.
fn skip_ws(input: &[u8]) -> &[u8] {
    let mut rest = input;
    while let Some((byte, tail)) = rest.split_first() {
        if !byte.is_ascii_whitespace() {
            break;
        }
        rest = tail;
    }
    rest
}
/// Reads an attribute value from the start of `input`.
///
/// * If `input` begins with `"` or `'`, the value is everything up to the
///   next matching quote; `None` is returned when that quote is missing.
/// * Otherwise the value runs until the first ASCII whitespace byte, `>`,
///   `/` or `;` (or to the end of `input`); `None` is returned when that
///   leaves nothing.
///
/// Returns `None` for empty input. Bytes that are not valid UTF-8 are
/// replaced during the lossy conversion to `String`.
fn read_attr_value(input: &[u8]) -> Option<String> {
    let (&first, tail) = input.split_first()?;
    let raw = match first {
        b'"' | b'\'' => {
            let closing = memchr_byte(first, tail)?;
            &tail[..closing]
        }
        _ => {
            let stop = input
                .iter()
                .position(|&b| b.is_ascii_whitespace() || matches!(b, b'>' | b'/' | b';'))
                .unwrap_or(input.len());
            if stop == 0 {
                return None;
            }
            &input[..stop]
        }
    };
    Some(String::from_utf8_lossy(raw).into_owned())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `detect` on `bytes` and returns the name of the chosen encoding.
    fn detected_name(bytes: &[u8]) -> &'static str {
        detect(bytes).name()
    }

    // --- Byte-order marks ---------------------------------------------------

    #[test]
    fn bom_utf8() {
        assert_eq!(detected_name(b"\xEF\xBB\xBF<html></html>"), "UTF-8");
    }

    #[test]
    fn bom_utf16le() {
        assert_eq!(
            detected_name(b"\xFF\xFE<\x00h\x00t\x00m\x00l\x00"),
            "UTF-16LE"
        );
    }

    #[test]
    fn bom_utf16be() {
        assert_eq!(
            detected_name(b"\xFE\xFF\x00<\x00h\x00t\x00m\x00l"),
            "UTF-16BE"
        );
    }

    #[test]
    fn bom_takes_priority_over_meta() {
        // The BOM says UTF-8 while the meta tag says windows-1252; BOM wins.
        let html = b"\xEF\xBB\xBF<html><head><meta charset=\"windows-1252\"></head></html>";
        assert_eq!(detected_name(html), "UTF-8");
    }

    // --- <meta charset=...> -------------------------------------------------

    #[test]
    fn meta_charset_double_quote() {
        let html = b"<html><head><meta charset=\"windows-1252\"></head></html>";
        assert_eq!(detected_name(html), "windows-1252");
    }

    #[test]
    fn meta_charset_single_quote() {
        // The label `iso-8859-1` is expected to resolve to windows-1252.
        let html = b"<html><head><meta charset='iso-8859-1'></head></html>";
        assert_eq!(detected_name(html), "windows-1252");
    }

    #[test]
    fn meta_charset_case_insensitive() {
        let html = b"<HTML><HEAD><META CHARSET=\"UTF-8\"></HEAD></HTML>";
        assert_eq!(detected_name(html), "UTF-8");
    }

    #[test]
    fn meta_charset_bare_value() {
        assert_eq!(detected_name(b"<meta charset=utf-8>"), "UTF-8");
    }

    // --- <meta http-equiv=...> ----------------------------------------------

    #[test]
    fn meta_http_equiv() {
        let html = b"<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1254\"></head></html>";
        assert_eq!(detected_name(html), "windows-1254");
    }

    // --- Fallbacks ----------------------------------------------------------

    #[test]
    fn fallback_utf8() {
        let html = b"<html><head></head><body>Hello</body></html>";
        assert_eq!(detected_name(html), "UTF-8");
    }

    #[test]
    fn empty_input() {
        assert_eq!(detected_name(b""), "UTF-8");
    }

    #[test]
    fn no_meta_in_first_1kb() {
        // 1100 bytes of padding push the tag beyond PRESCAN_LIMIT, so it must
        // be ignored and the UTF-8 fallback used.
        let mut padded = vec![b' '; 1100];
        padded.extend_from_slice(b"<meta charset=\"iso-8859-1\">");
        assert_eq!(detected_name(&padded), "UTF-8");
    }
}