1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
//! Single-pass byte scanner that cleans HTML and emits signals
//! consumed by the tier-1/tier-2 router (added in M2).
use std::borrow::Cow;
use std::ops::Range;
use std::str;
use memchr::memchr;
/// Signals captured during the single prescan pass.
///
/// Every field starts at its `Default` value (`None` / `false`) and is only
/// ever switched on by [`run`]; an empty input yields the all-default report.
#[derive(Debug, Default, Clone)]
pub struct PrescanReport {
    /// Byte range of the contents of `<head>…</head>` (between the tags) in the
    /// **cleaned** buffer, or `None` when no `<head>` open tag was recognised
    /// outside `<svg>`. If `<head>` is opened but never closed, the range runs
    /// to the end of the cleaned buffer.
    pub head_range: Option<Range<usize>>,
    /// Any tag-open whose name contains `-` (custom-elements heuristic).
    /// Only evaluated outside `<svg>`; closing tags are not inspected.
    pub had_custom_elements: bool,
    /// Any occurrence of `<![CDATA[` (exact, case-sensitive match) at a `<`
    /// the scanner visits, regardless of SVG depth.
    pub had_cdata: bool,
    /// Any `<` that the prescan routed through the invalid-tag branch, i.e. a
    /// `<` that is not followed by something tag-like.
    pub had_unescaped_lt: bool,
    /// Saw a `<script …>` or `<style …>` open tag (with a terminating `>`)
    /// outside `<svg>`; the content of such elements is stripped.
    pub has_script_or_style: bool,
    /// SVG depth ever exceeded zero, i.e. at least one `<svg …>` open tag with
    /// a terminating `>` was seen.
    pub has_svg: bool,
}
// Tags that are stripped of their content by the prescan (matched
// case-insensitively, only outside `<svg>`).
const STRIP_CONTENT_TAGS: [&[u8]; 2] = [b"script", b"style"];
// Tag name used to track SVG nesting depth.
const SVG_TAG: &[u8] = b"svg";
// Tag name used to locate the `<head>` contents range.
const HEAD_TAG: &[u8] = b"head";
// Literal prefix that raises the `had_cdata` signal (case-sensitive).
const CDATA_START: &[u8] = b"<![CDATA[";
// Keyword looked for (case-insensitively) after `<!` plus optional whitespace.
const DOCTYPE: &[u8] = b"doctype";
// Exact byte sequence that is rewritten to `<!-- -->`.
const EMPTY_COMMENT: &[u8] = b"<!---->";
// (pattern, replacement) pairs for self-closing void tags. Patterns are
// matched exactly: lowercase, no attributes, no whitespace before `/`.
const SELF_CLOSING: [(&[u8], &str); 3] =
    [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];
/// Run the prescan over `html`, returning the cleaned buffer and signals.
///
/// In one left-to-right pass over the bytes of `html` this:
///
/// * rewrites `<!---->` to `<!-- -->` and `<br/>`, `<hr/>`, `<img/>` to their
///   non-self-closing forms (at any SVG depth);
/// * outside `<svg>`, drops the content of `<script>` / `<style>` elements
///   (the open tag is kept verbatim and a fresh lowercase close tag is
///   emitted) and removes `<!doctype …>` declarations;
/// * escapes every `<` that does not begin something tag-like as `&lt;`.
///
/// Alongside the cleaning it fills in a [`PrescanReport`]. All offsets in the
/// report refer to the **cleaned** buffer.
///
/// `Cow::Borrowed` is returned when no transformation was needed.
///
/// # Panics
///
/// Panics only if an entry of `STRIP_CONTENT_TAGS` is not valid UTF-8. This
/// cannot happen in practice because the entries are ASCII literals.
pub fn run(html: &str) -> (Cow<'_, str>, PrescanReport) {
    let bytes = html.as_bytes();
    let len = bytes.len();
    if len == 0 {
        return (Cow::Borrowed(html), PrescanReport::default());
    }
    let mut report = PrescanReport::default();
    // Current scan position in the source.
    let mut idx = 0usize;
    // Start of the source span that has not yet been copied into `output`.
    let mut last = 0usize;
    // Allocated lazily on the first rewrite so untouched input stays borrowed.
    let mut output: Option<String> = None;
    let mut svg_depth = 0usize;
    // Head-range tracking: byte index in the *output* buffer after `<head…>` closes.
    let mut head_open_end: Option<usize> = None;

    while idx < len {
        // Jump to the next `<`; everything in between is copied verbatim later.
        if bytes[idx] != b'<' {
            match memchr(b'<', &bytes[idx + 1..]) {
                Some(next) => idx += next + 1,
                None => break,
            }
        }

        // ── `<![CDATA[` detection (signal only; cleaning falls through) ─────────
        // The `<` of `<![CDATA[` is not a valid tag start (`!` followed by `[`
        // fails the validity test below), so it is escaped to `&lt;` further
        // down. Only the signal is recorded here, without `continue`.
        if bytes[idx..].starts_with(CDATA_START) {
            report.had_cdata = true;
        }

        // ── Empty-comment normalisation: `<!---->` → `<!-- -->` ───────────────
        if bytes[idx..].starts_with(EMPTY_COMMENT) {
            let out = output.get_or_insert_with(|| String::with_capacity(html.len()));
            out.push_str(&html[last..idx]);
            out.push_str("<!-- -->");
            idx += EMPTY_COMMENT.len();
            last = idx;
            continue;
        }

        // ── Self-closing normalisation: `<br/>` → `<br>` etc. ────────────────
        {
            let mut replaced = false;
            for (pattern, replacement) in &SELF_CLOSING {
                if bytes[idx..].starts_with(pattern) {
                    let out = output.get_or_insert_with(|| String::with_capacity(html.len()));
                    out.push_str(&html[last..idx]);
                    out.push_str(replacement);
                    idx += pattern.len();
                    last = idx;
                    replaced = true;
                    break;
                }
            }
            if replaced {
                continue;
            }
        }

        // ── SVG open / close ──────────────────────────────────────────────────
        // The tags themselves are left untouched (not flushed, just skipped).
        if matches_tag_start(bytes, idx + 1, SVG_TAG) {
            if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG_TAG.len()) {
                svg_depth += 1;
                report.has_svg = true;
                idx = open_end;
                continue;
            }
        } else if matches_end_tag_start(bytes, idx + 1, SVG_TAG)
            && let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG_TAG.len())
        {
            // A stray `</svg>` at depth 0 must not underflow the counter.
            svg_depth = svg_depth.saturating_sub(1);
            idx = close_end;
            continue;
        }

        // ── Operations only outside SVG ───────────────────────────────────────
        if svg_depth == 0 {
            // ── `<script>` / `<style>` content stripping ──────────────────────
            let mut handled = false;
            for tag in &STRIP_CONTENT_TAGS {
                if matches_tag_start(bytes, idx + 1, tag)
                    && let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len())
                {
                    report.has_script_or_style = true;
                    // An unterminated element swallows the rest of the input.
                    let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
                    let out = output.get_or_insert_with(|| String::with_capacity(html.len()));
                    out.push_str(&html[last..idx]);
                    // Keep the open tag (with attributes), drop the content,
                    // and emit a normalised close tag.
                    out.push_str(&html[idx..open_end]);
                    out.push_str("</");
                    out.push_str(
                        str::from_utf8(tag).expect("STRIP_CONTENT_TAGS entries are ASCII"),
                    );
                    out.push('>');
                    last = remove_end;
                    idx = remove_end;
                    handled = true;
                    break;
                }
            }
            if handled {
                continue;
            }

            // ── DOCTYPE stripping ─────────────────────────────────────────────
            if idx + 2 < len && bytes[idx + 1] == b'!' {
                let mut cursor = idx + 2;
                while cursor < len && bytes[cursor].is_ascii_whitespace() {
                    cursor += 1;
                }
                if cursor + DOCTYPE.len() <= len
                    && bytes[cursor..cursor + DOCTYPE.len()].eq_ignore_ascii_case(DOCTYPE)
                    && let Some(end) = find_tag_end(bytes, cursor + DOCTYPE.len())
                {
                    // Flush what precedes the declaration and skip over it.
                    let out = output.get_or_insert_with(|| String::with_capacity(html.len()));
                    out.push_str(&html[last..idx]);
                    last = end;
                    idx = end;
                    continue;
                }
            }

            // ── Signal: `<head>` / `</head>` ─────────────────────────────────
            if matches_tag_start(bytes, idx + 1, HEAD_TAG) {
                if let Some(open_end) = find_tag_end(bytes, idx + 1 + HEAD_TAG.len()) {
                    // Output offset just after the `<head…>` close-bracket:
                    // bytes already emitted plus the pending unflushed span.
                    // With no output yet, source and output offsets coincide.
                    let out_pos = output
                        .as_ref()
                        .map_or(open_end, |out| out.len() + (open_end - last));
                    head_open_end = Some(out_pos);
                    idx = open_end;
                    continue;
                }
            } else if matches_end_tag_start(bytes, idx + 1, HEAD_TAG)
                && let Some(close_end) = find_tag_end(bytes, idx + 2 + HEAD_TAG.len())
            {
                if let Some(start) = head_open_end.take() {
                    // The `</head>` tag itself starts at the current output position.
                    let out_pos = output
                        .as_ref()
                        .map_or(idx, |out| out.len() + (idx - last));
                    report.head_range = Some(start..out_pos);
                }
                idx = close_end;
                continue;
            }

            // ── Signal: custom elements (tag name contains `-`) ───────────────
            // Only fires for open tags (first byte alphabetic), not close tags.
            if bytes.get(idx + 1).is_some_and(|b| b.is_ascii_alphabetic()) {
                let name_has_dash = bytes[idx + 1..]
                    .iter()
                    .take_while(|&&b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_')
                    .any(|&b| b == b'-');
                if name_has_dash {
                    report.had_custom_elements = true;
                }
            }
        }

        // ── Validity check (applies at all depths) ────────────────────────────
        // A `<` is tag-like when followed by a letter, `?`, `/` + letter, or
        // `!` + (`-` or letter). Anything else is treated as literal text.
        let is_valid_tag = match bytes.get(idx + 1) {
            Some(b'!') => bytes
                .get(idx + 2)
                .is_some_and(|&b| b == b'-' || b.is_ascii_alphabetic()),
            Some(b'/') => bytes.get(idx + 2).is_some_and(|b| b.is_ascii_alphabetic()),
            Some(b'?') => true,
            Some(c) => c.is_ascii_alphabetic(),
            None => false,
        };
        if !is_valid_tag {
            report.had_unescaped_lt = true;
            let out = output.get_or_insert_with(|| String::with_capacity(html.len() + 4));
            out.push_str(&html[last..idx]);
            // Replace the stray `<` with its entity so downstream parsers
            // treat it as text.
            out.push_str("&lt;");
            idx += 1;
            last = idx;
            continue;
        }
        idx += 1;
    }

    // If `<head>` was opened but `</head>` was never seen, record to EOF.
    if let Some(start) = head_open_end.take() {
        let end = output
            .as_ref()
            .map_or(len, |out| out.len() + (len - last));
        report.head_range = Some(start..end);
    }

    let cleaned = match output {
        Some(mut out) => {
            // Copy the untouched tail (possibly empty).
            out.push_str(&html[last..]);
            Cow::Owned(out)
        }
        None => Cow::Borrowed(html),
    };
    (cleaned, report)
}
// ── Private helpers (mirrors of the ones in converter.rs) ──────────────────
/// Returns `true` when `bytes[start..]` begins with `tag` (ASCII
/// case-insensitive) and the name is immediately followed by `>`, `/`, a
/// space, tab, LF, CR, or the end of the input.
///
/// `start` is expected to point at the first byte of the tag name (just past
/// `<`). An out-of-range `start` or a name that would run past the end of
/// `bytes` yields `false`.
fn matches_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
    if start >= bytes.len() {
        return false;
    }
    let name_end = start + tag.len();
    let name_matches = bytes
        .get(start..name_end)
        .map_or(false, |name| name.eq_ignore_ascii_case(tag));
    if !name_matches {
        return false;
    }
    // The name must end here; otherwise `<svgfoo>` would match `svg`.
    match bytes.get(name_end) {
        None => true,
        Some(&delim) => matches!(delim, b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r'),
    }
}
/// Returns `true` when `bytes[start]` is `/` and the bytes after it form a
/// match for `tag` according to `matches_tag_start`.
///
/// `start` is expected to point just past the `<` of a candidate close tag.
fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
    match bytes.get(start) {
        Some(b'/') => matches_tag_start(bytes, start + 1, tag),
        _ => false,
    }
}
/// Scans forward from `idx` for the `>` that terminates a tag and returns the
/// index one past it.
///
/// A `>` inside a single- or double-quoted run is ignored; a quoted run ends
/// only at the same quote character that opened it. Returns `None` when no
/// terminating `>` is found before the end of `bytes`.
fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
    let mut open_quote: Option<u8> = None;
    while idx < bytes.len() {
        // Only quotes and `>` are interesting; skip straight to the next one.
        let pos = find_quote_or_close(bytes, idx)?;
        let byte = bytes[pos];
        match (byte, open_quote) {
            (b'>', None) => return Some(pos + 1),
            (b'"' | b'\'', None) => open_quote = Some(byte),
            (quote, Some(opener)) if quote == opener => open_quote = None,
            // A `>` or the other quote kind inside a quoted run is plain text.
            _ => {}
        }
        idx = pos + 1;
    }
    None
}
/// Finds the close tag that balances an already-consumed `<tag …>` open tag.
///
/// Scanning starts at `idx` with one element open. Each further complete
/// `<tag …>` raises the nesting count and each complete `</tag …>` lowers it.
/// Returns the index one past the `>` of the close tag that brings the count
/// to zero, or `None` if the input ends first.
fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
    let mut open_count = 1usize;
    while idx < bytes.len() {
        let lt = find_lt(bytes, idx)?;
        let name_at = lt + 1;
        // Default: step past this `<` unless a complete tag is recognised below.
        idx = name_at;
        if matches_tag_start(bytes, name_at, tag) {
            if let Some(after_open) = find_tag_end(bytes, name_at + tag.len()) {
                open_count += 1;
                idx = after_open;
            }
        } else if matches_end_tag_start(bytes, name_at, tag)
            && let Some(after_close) = find_tag_end(bytes, name_at + 1 + tag.len())
        {
            open_count -= 1;
            if open_count == 0 {
                return Some(after_close);
            }
            idx = after_close;
        }
    }
    None
}
/// Absolute index of the first `"`, `'` or `>` at or after `start`, if any.
#[inline]
fn find_quote_or_close(bytes: &[u8], start: usize) -> Option<usize> {
    let tail = &bytes[start..];
    let offset = memchr::memchr3(b'"', b'\'', b'>', tail)?;
    Some(start + offset)
}
/// Absolute index of the first `<` at or after `start`, if any.
#[inline]
fn find_lt(bytes: &[u8], start: usize) -> Option<usize> {
    let offset = memchr(b'<', &bytes[start..])?;
    Some(start + offset)
}