1use encoding_rs::Encoding;
7
/// Upper bound, in bytes, on how much of the input `prescan_meta` inspects
/// when looking for a `<meta>` encoding declaration. Anything past this
/// offset is ignored by the prescan.
const PRESCAN_LIMIT: usize = 1024;
10
11pub fn detect(input: &[u8]) -> &'static Encoding {
28 if let Some(enc) = detect_bom(input) {
30 return enc;
31 }
32
33 if let Some(enc) = prescan_meta(input) {
35 return enc;
36 }
37
38 encoding_rs::UTF_8
40}
41
42fn detect_bom(input: &[u8]) -> Option<&'static Encoding> {
44 if input.len() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF {
45 return Some(encoding_rs::UTF_8);
46 }
47 if input.len() >= 2 {
48 if input[0] == 0xFF && input[1] == 0xFE {
49 return Some(encoding_rs::UTF_16LE);
50 }
51 if input[0] == 0xFE && input[1] == 0xFF {
52 return Some(encoding_rs::UTF_16BE);
53 }
54 }
55 None
56}
57
58fn prescan_meta(input: &[u8]) -> Option<&'static Encoding> {
64 let limit = input.len().min(PRESCAN_LIMIT);
65 let haystack = &input[..limit];
66
67 let mut pos = 0;
69 while pos < haystack.len() {
70 let Some(lt) = memchr_byte(b'<', &haystack[pos..]) else {
72 break;
73 };
74 let lt = pos + lt;
75 pos = lt + 1;
76
77 if !starts_with_ci(&haystack[lt..], b"<meta") {
79 continue;
80 }
81
82 let tag_start = lt;
84 let Some(gt_offset) = memchr_byte(b'>', &haystack[tag_start..]) else {
85 break;
86 };
87 let tag_bytes = &haystack[tag_start..tag_start + gt_offset + 1];
88 pos = tag_start + gt_offset + 1;
89
90 if let Some(enc) = extract_charset_attr(tag_bytes) {
92 return Some(enc);
93 }
94
95 if let Some(enc) = extract_http_equiv_charset(tag_bytes) {
97 return Some(enc);
98 }
99 }
100 None
101}
102
103fn extract_charset_attr(tag: &[u8]) -> Option<&'static Encoding> {
105 let charset_needle = b"charset";
106
107 let idx = find_subsequence_ci(tag, charset_needle)?;
108 let rest = &tag[idx + charset_needle.len()..];
109
110 let rest = skip_ws(rest);
112 if rest.first() != Some(&b'=') {
113 return None;
114 }
115 let rest = skip_ws(&rest[1..]);
116
117 let value = read_attr_value(rest)?;
119 Encoding::for_label(value.as_bytes())
120}
121
122fn extract_http_equiv_charset(tag: &[u8]) -> Option<&'static Encoding> {
124 if !contains_subsequence_ci(tag, b"http-equiv") {
126 return None;
127 }
128 if !contains_subsequence_ci(tag, b"content-type") {
129 return None;
130 }
131
132 let content_needle = b"content";
135 let mut search_start = 0;
136 let content_value = loop {
137 let idx = find_subsequence_ci(&tag[search_start..], content_needle)?;
138 let abs_idx = search_start + idx;
139 let after = &tag[abs_idx + content_needle.len()..];
140 let after = skip_ws(after);
141 if after.first() == Some(&b'=') {
142 let rest = skip_ws(&after[1..]);
143 break read_attr_value(rest)?;
144 }
145 search_start = abs_idx + content_needle.len();
147 };
148
149 let cv_lower: String = content_value.to_ascii_lowercase();
151 let charset_pos = cv_lower.find("charset=")?;
152 let enc_str = &cv_lower[charset_pos + 8..];
153 let enc_str = enc_str.split(';').next().unwrap_or("").trim();
155
156 Encoding::for_label(enc_str.as_bytes())
157}
158
/// Returns the index of the first byte in `haystack` equal to `needle`, or
/// `None` when the byte does not occur.
#[inline]
fn memchr_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
    haystack
        .iter()
        .enumerate()
        .find_map(|(offset, &byte)| (byte == needle).then_some(offset))
}
168
/// Reports whether `haystack` begins with `needle`, comparing bytes ASCII
/// case-insensitively.
///
/// A `haystack` shorter than `needle` never matches; an empty `needle`
/// always matches.
fn starts_with_ci(haystack: &[u8], needle: &[u8]) -> bool {
    match haystack.get(..needle.len()) {
        Some(prefix) => prefix.eq_ignore_ascii_case(needle),
        None => false,
    }
}
179
/// Returns the offset of the first ASCII case-insensitive occurrence of
/// `needle` in `haystack`, or `None` when there is none.
///
/// An empty `needle` matches at offset 0 of any haystack.
fn find_subsequence_ci(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // `slice::windows` panics when asked for zero-length windows, so the
    // empty needle has to be answered before reaching it.
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window.eq_ignore_ascii_case(needle))
}
186
187fn contains_subsequence_ci(haystack: &[u8], needle: &[u8]) -> bool {
189 find_subsequence_ci(haystack, needle).is_some()
190}
191
/// Returns `input` with its leading ASCII whitespace bytes removed.
///
/// "Whitespace" is whatever `u8::is_ascii_whitespace` accepts. The result is
/// empty when `input` is empty or consists solely of whitespace.
fn skip_ws(input: &[u8]) -> &[u8] {
    let mut rest = input;
    while let [first, tail @ ..] = rest {
        if !first.is_ascii_whitespace() {
            break;
        }
        rest = tail;
    }
    rest
}
200
201fn read_attr_value(input: &[u8]) -> Option<String> {
203 if input.is_empty() {
204 return None;
205 }
206 let quote = input[0];
207 if quote == b'"' || quote == b'\'' {
208 let end = memchr_byte(quote, &input[1..])?;
209 let value = &input[1..1 + end];
210 Some(String::from_utf8_lossy(value).into_owned())
211 } else {
212 let end = input
214 .iter()
215 .position(|&b| b.is_ascii_whitespace() || b == b'>' || b == b'/' || b == b';')
216 .unwrap_or(input.len());
217 if end == 0 {
218 return None;
219 }
220 Some(String::from_utf8_lossy(&input[..end]).into_owned())
221 }
222}
223
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `detect` on `input` and checks the name of the encoding it
    /// reports.
    fn assert_detects(input: &[u8], expected: &str) {
        assert_eq!(detect(input).name(), expected);
    }

    #[test]
    fn bom_utf8() {
        assert_detects(b"\xEF\xBB\xBF<html></html>", "UTF-8");
    }

    #[test]
    fn bom_utf16le() {
        assert_detects(b"\xFF\xFE<\x00h\x00t\x00m\x00l\x00", "UTF-16LE");
    }

    #[test]
    fn bom_utf16be() {
        assert_detects(b"\xFE\xFF\x00<\x00h\x00t\x00m\x00l", "UTF-16BE");
    }

    #[test]
    fn meta_charset_double_quote() {
        assert_detects(
            b"<html><head><meta charset=\"windows-1252\"></head></html>",
            "windows-1252",
        );
    }

    #[test]
    fn meta_charset_single_quote() {
        // The `iso-8859-1` label is expected to resolve to windows-1252.
        assert_detects(
            b"<html><head><meta charset='iso-8859-1'></head></html>",
            "windows-1252",
        );
    }

    #[test]
    fn meta_charset_case_insensitive() {
        assert_detects(
            b"<HTML><HEAD><META CHARSET=\"UTF-8\"></HEAD></HTML>",
            "UTF-8",
        );
    }

    #[test]
    fn meta_http_equiv() {
        assert_detects(
            b"<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1254\"></head></html>",
            "windows-1254",
        );
    }

    #[test]
    fn fallback_utf8() {
        assert_detects(b"<html><head></head><body>Hello</body></html>", "UTF-8");
    }

    #[test]
    fn empty_input() {
        assert_detects(b"", "UTF-8");
    }

    #[test]
    fn no_meta_in_first_1kb() {
        // Pad past the prescan window so the declaration is never examined
        // and the UTF-8 fallback is used.
        let mut input = vec![b' '; 1100];
        input.extend_from_slice(b"<meta charset=\"iso-8859-1\">");
        assert_detects(&input, "UTF-8");
    }

    #[test]
    fn meta_charset_bare_value() {
        assert_detects(b"<meta charset=utf-8>", "UTF-8");
    }

    #[test]
    fn bom_takes_priority_over_meta() {
        assert_detects(
            b"\xEF\xBB\xBF<html><head><meta charset=\"windows-1252\"></head></html>",
            "UTF-8",
        );
    }
}