Skip to main content

fhp_encoding/
detect.rs

1//! Encoding detection from raw HTML bytes.
2//!
3//! Implements a simplified version of the HTML spec's encoding sniffing
4//! algorithm: BOM → meta prescan → UTF-8 fallback.
5
6use encoding_rs::Encoding;
7
/// Maximum number of bytes to prescan for `<meta>` tags.
///
/// The prescan only looks at the first `PRESCAN_LIMIT` bytes of the input;
/// a `<meta>` declaration that starts beyond this window is never seen.
const PRESCAN_LIMIT: usize = 1024;
10
11/// Detect the character encoding of raw HTML bytes.
12///
13/// The detection order is:
14/// 1. **BOM** — UTF-8 (`EF BB BF`), UTF-16 LE (`FF FE`), UTF-16 BE (`FE FF`)
15/// 2. **`<meta charset="...">`** — first occurrence in the first 1 KB
16/// 3. **`<meta http-equiv="Content-Type" content="...charset=...">`**
17/// 4. **Fallback** — UTF-8
18///
19/// # Example
20///
21/// ```
22/// use fhp_encoding::detect;
23///
24/// let html = b"\xEF\xBB\xBF<html>UTF-8 with BOM</html>";
25/// assert_eq!(detect(html).name(), "UTF-8");
26/// ```
27pub fn detect(input: &[u8]) -> &'static Encoding {
28    // 1. BOM detection.
29    if let Some(enc) = detect_bom(input) {
30        return enc;
31    }
32
33    // 2–3. Meta prescan.
34    if let Some(enc) = prescan_meta(input) {
35        return enc;
36    }
37
38    // 4. Fallback.
39    encoding_rs::UTF_8
40}
41
42/// Check for a Byte Order Mark at the start of the input.
43fn detect_bom(input: &[u8]) -> Option<&'static Encoding> {
44    if input.len() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF {
45        return Some(encoding_rs::UTF_8);
46    }
47    if input.len() >= 2 {
48        if input[0] == 0xFF && input[1] == 0xFE {
49            return Some(encoding_rs::UTF_16LE);
50        }
51        if input[0] == 0xFE && input[1] == 0xFF {
52            return Some(encoding_rs::UTF_16BE);
53        }
54    }
55    None
56}
57
58/// Prescan the first [`PRESCAN_LIMIT`] bytes for `<meta>` encoding declarations.
59///
60/// Looks for two patterns:
61/// - `<meta charset="ENCODING">`
62/// - `<meta http-equiv="Content-Type" content="...charset=ENCODING...">`
63fn prescan_meta(input: &[u8]) -> Option<&'static Encoding> {
64    let limit = input.len().min(PRESCAN_LIMIT);
65    let haystack = &input[..limit];
66
67    // Find each '<' and check if it starts a <meta tag.
68    let mut pos = 0;
69    while pos < haystack.len() {
70        // Find next '<'.
71        let Some(lt) = memchr_byte(b'<', &haystack[pos..]) else {
72            break;
73        };
74        let lt = pos + lt;
75        pos = lt + 1;
76
77        // Check for "<meta" (case-insensitive).
78        if !starts_with_ci(&haystack[lt..], b"<meta") {
79            continue;
80        }
81
82        // Extract everything up to the closing '>'.
83        let tag_start = lt;
84        let Some(gt_offset) = memchr_byte(b'>', &haystack[tag_start..]) else {
85            break;
86        };
87        let tag_bytes = &haystack[tag_start..tag_start + gt_offset + 1];
88        pos = tag_start + gt_offset + 1;
89
90        // Try <meta charset="...">
91        if let Some(enc) = extract_charset_attr(tag_bytes) {
92            return Some(enc);
93        }
94
95        // Try <meta http-equiv="Content-Type" content="...charset=...">
96        if let Some(enc) = extract_http_equiv_charset(tag_bytes) {
97            return Some(enc);
98        }
99    }
100    None
101}
102
103/// Extract encoding from `charset="VALUE"` or `charset=VALUE` in a `<meta>` tag.
104fn extract_charset_attr(tag: &[u8]) -> Option<&'static Encoding> {
105    let charset_needle = b"charset";
106
107    let idx = find_subsequence_ci(tag, charset_needle)?;
108    let rest = &tag[idx + charset_needle.len()..];
109
110    // Skip whitespace and '='.
111    let rest = skip_ws(rest);
112    if rest.first() != Some(&b'=') {
113        return None;
114    }
115    let rest = skip_ws(&rest[1..]);
116
117    // Read the value (quoted or unquoted).
118    let value = read_attr_value(rest)?;
119    Encoding::for_label(value.as_bytes())
120}
121
122/// Extract encoding from `http-equiv="Content-Type" content="...charset=..."`.
123fn extract_http_equiv_charset(tag: &[u8]) -> Option<&'static Encoding> {
124    // Must have http-equiv="content-type".
125    if !contains_subsequence_ci(tag, b"http-equiv") {
126        return None;
127    }
128    if !contains_subsequence_ci(tag, b"content-type") {
129        return None;
130    }
131
132    // Find the `content` attribute — skip occurrences inside "content-type".
133    // We look for "content" followed by optional whitespace then "=".
134    let content_needle = b"content";
135    let mut search_start = 0;
136    let content_value = loop {
137        let idx = find_subsequence_ci(&tag[search_start..], content_needle)?;
138        let abs_idx = search_start + idx;
139        let after = &tag[abs_idx + content_needle.len()..];
140        let after = skip_ws(after);
141        if after.first() == Some(&b'=') {
142            let rest = skip_ws(&after[1..]);
143            break read_attr_value(rest)?;
144        }
145        // Not followed by '=', skip past and try again.
146        search_start = abs_idx + content_needle.len();
147    };
148
149    // Find charset= inside the content value.
150    let cv_lower: String = content_value.to_ascii_lowercase();
151    let charset_pos = cv_lower.find("charset=")?;
152    let enc_str = &cv_lower[charset_pos + 8..];
153    // Trim trailing ';' or whitespace.
154    let enc_str = enc_str.split(';').next().unwrap_or("").trim();
155
156    Encoding::for_label(enc_str.as_bytes())
157}
158
159// ---------------------------------------------------------------------------
160// Helper utilities
161// ---------------------------------------------------------------------------
162
/// Locate the first byte equal to `needle` in `haystack`.
///
/// A dependency-free stand-in for `memchr`; returns the byte's index, or
/// `None` when `needle` does not occur.
#[inline]
fn memchr_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
    haystack
        .iter()
        .enumerate()
        .find_map(|(index, &byte)| (byte == needle).then_some(index))
}
168
/// Report whether `haystack` begins with `needle`, ignoring ASCII case.
///
/// A `haystack` shorter than `needle` never matches.
fn starts_with_ci(haystack: &[u8], needle: &[u8]) -> bool {
    match haystack.get(..needle.len()) {
        Some(prefix) => prefix.eq_ignore_ascii_case(needle),
        None => false,
    }
}
179
/// Find the first occurrence of `needle` in `haystack` (case-insensitive).
///
/// Comparison ignores ASCII case only. Returns the index at which the match
/// starts, or `None` when there is no match. An empty `needle` matches at
/// index 0 of any haystack, including an empty one.
fn find_subsequence_ci(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // `windows(0)` panics, so the empty needle is answered up front.
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|w| w.eq_ignore_ascii_case(needle))
}
186
187/// Check if `haystack` contains `needle` (case-insensitive).
188fn contains_subsequence_ci(haystack: &[u8], needle: &[u8]) -> bool {
189    find_subsequence_ci(haystack, needle).is_some()
190}
191
/// Drop leading ASCII whitespace from `input`.
///
/// Returns the sub-slice starting at the first non-whitespace byte; the result
/// is empty when `input` is empty or consists solely of whitespace.
fn skip_ws(input: &[u8]) -> &[u8] {
    let mut remaining = input;
    // Peel bytes off the front for as long as they are whitespace.
    while let [first, tail @ ..] = remaining {
        if !first.is_ascii_whitespace() {
            break;
        }
        remaining = tail;
    }
    remaining
}
200
201/// Read an attribute value — either quoted (`"..."` / `'...'`) or bare token.
202fn read_attr_value(input: &[u8]) -> Option<String> {
203    if input.is_empty() {
204        return None;
205    }
206    let quote = input[0];
207    if quote == b'"' || quote == b'\'' {
208        let end = memchr_byte(quote, &input[1..])?;
209        let value = &input[1..1 + end];
210        Some(String::from_utf8_lossy(value).into_owned())
211    } else {
212        // Bare token: ends at whitespace, '>', '/', or ';'.
213        let end = input
214            .iter()
215            .position(|&b| b.is_ascii_whitespace() || b == b'>' || b == b'/' || b == b';')
216            .unwrap_or(input.len());
217        if end == 0 {
218            return None;
219        }
220        Some(String::from_utf8_lossy(&input[..end]).into_owned())
221    }
222}
223
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that sniffing `input` yields the encoding named `expected`.
    #[track_caller]
    fn assert_detects(input: &[u8], expected: &str) {
        assert_eq!(detect(input).name(), expected);
    }

    // --- BOM handling -------------------------------------------------------

    #[test]
    fn bom_utf8() {
        assert_detects(b"\xEF\xBB\xBF<html></html>", "UTF-8");
    }

    #[test]
    fn bom_utf16le() {
        assert_detects(b"\xFF\xFE<\x00h\x00t\x00m\x00l\x00", "UTF-16LE");
    }

    #[test]
    fn bom_utf16be() {
        assert_detects(b"\xFE\xFF\x00<\x00h\x00t\x00m\x00l", "UTF-16BE");
    }

    // --- <meta charset> -----------------------------------------------------

    #[test]
    fn meta_charset_double_quote() {
        assert_detects(
            b"<html><head><meta charset=\"windows-1252\"></head></html>",
            "windows-1252",
        );
    }

    #[test]
    fn meta_charset_single_quote() {
        // encoding_rs maps the iso-8859-1 label to windows-1252.
        assert_detects(
            b"<html><head><meta charset='iso-8859-1'></head></html>",
            "windows-1252",
        );
    }

    #[test]
    fn meta_charset_case_insensitive() {
        assert_detects(
            b"<HTML><HEAD><META CHARSET=\"UTF-8\"></HEAD></HTML>",
            "UTF-8",
        );
    }

    // --- <meta http-equiv> --------------------------------------------------

    #[test]
    fn meta_http_equiv() {
        assert_detects(
            b"<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1254\"></head></html>",
            "windows-1254",
        );
    }

    // --- Fallback -----------------------------------------------------------

    #[test]
    fn fallback_utf8() {
        assert_detects(b"<html><head></head><body>Hello</body></html>", "UTF-8");
    }

    #[test]
    fn empty_input() {
        assert_detects(b"", "UTF-8");
    }

    #[test]
    fn no_meta_in_first_1kb() {
        // The declaration sits past the 1 KB prescan window, so it must be
        // ignored and the UTF-8 fallback used instead.
        let mut input = b" ".repeat(1100);
        input.extend_from_slice(b"<meta charset=\"iso-8859-1\">");
        assert_detects(&input, "UTF-8");
    }

    #[test]
    fn meta_charset_bare_value() {
        assert_detects(b"<meta charset=utf-8>", "UTF-8");
    }

    #[test]
    fn bom_takes_priority_over_meta() {
        // UTF-8 BOM but meta says windows-1252. BOM wins.
        assert_detects(
            b"\xEF\xBB\xBF<html><head><meta charset=\"windows-1252\"></head></html>",
            "UTF-8",
        );
    }
}