1use crate::dom::comment::CommentNode;
4use crate::dom::element::HTMLElement;
5use crate::dom::node::Node;
6use crate::dom::text::TextNode;
7use crate::dom::void_tag::VoidTag;
8use regex::Regex;
9use std::collections::HashMap;
10use std::sync::OnceLock;
11
12use super::attrs::parse_id_class_attrs_fast;
13use super::fast_parser::parse_tags_zero_copy;
14use super::types::{Options, StackEntry};
15use super::utils::{find_closing_tag_case_insensitive, strip_trailing_self_close_optimized};
16
/// Lazily-compiled pattern that matches either an HTML comment or an opening/closing tag
/// (with the attribute text exposed through the named `attrs` group). It is populated by
/// `parse_with_options` on first use.
static TAG_REGEX: OnceLock<Regex> = OnceLock::new();
/// Lazily-compiled pattern that matches one attribute key with an optional quoted or bare
/// value. It is populated by `parse_with_options` on first use.
static ATTR_KEY_REGEX: OnceLock<Regex> = OnceLock::new();
20
21pub fn parse_with_options(input: &str, opts: &Options) -> Box<HTMLElement> {
22 let _tag_re = TAG_REGEX.get_or_init(|| {
28 Regex::new(r#"<!--(?s:.*?)-->|<(\/)?([A-Za-z][A-Za-z0-9._:@\-\p{L}\p{M}]*)(?P<attrs>(?:[^>"']|"[^"]*"|'[^']*')*)(/?)>"#).unwrap()
29 });
30 let _attr_key_re = ATTR_KEY_REGEX.get_or_init(|| {
31 Regex::new(
32 r#"([a-zA-Z()\[\]#@$.?:][a-zA-Z0-9-._:()\[\]#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|[^\s>]+))?"#,
33 )
34 .unwrap()
35 });
36 let void_tag = VoidTag::new(&opts.void_tag);
37
38 let mut closed_by_open: HashMap<&'static str, Vec<&'static str>> = HashMap::new();
40 macro_rules! ins_open {
42 ($parent:expr, [ $($v:expr),* ]) => {{
43 closed_by_open.insert($parent, vec![$($v),*]);
44 let up = $parent.to_uppercase();
45 if up != $parent { closed_by_open.insert(Box::leak(up.into_boxed_str()), vec![$($v),*]); }
46 }};
47 }
48 ins_open!("li", ["li", "LI"]);
49 ins_open!("p", ["p", "P", "div", "DIV"]);
50 ins_open!("b", ["div", "DIV"]);
51 ins_open!("td", ["td", "th", "TD", "TH"]);
52 ins_open!("th", ["td", "th", "TD", "TH"]);
53 ins_open!("h1", ["h1", "H1"]);
54 ins_open!("h2", ["h2", "H2"]);
55 ins_open!("h3", ["h3", "H3"]);
56 ins_open!("h4", ["h4", "H4"]);
57 ins_open!("h5", ["h5", "H5"]);
58 ins_open!("h6", ["h6", "H6"]);
59
60 let mut closed_by_close: HashMap<&'static str, Vec<&'static str>> = HashMap::new();
61 macro_rules! ins_close {
62 ($parent:expr, [ $($v:expr),* ]) => {{
63 closed_by_close.insert($parent, vec![$($v),*]);
64 let up = $parent.to_uppercase();
65 if up != $parent { closed_by_close.insert(Box::leak(up.into_boxed_str()), vec![$($v),*]); }
66 }};
67 }
68 ins_close!("li", ["ul", "UL", "ol", "OL"]);
69 ins_close!("a", ["div", "DIV"]);
70 ins_close!("b", ["div", "DIV"]);
71 ins_close!("i", ["div", "DIV"]);
72 ins_close!("p", ["div", "DIV"]);
73 ins_close!("td", ["tr", "TR", "table", "TABLE"]);
74 ins_close!("th", ["tr", "TR", "table", "TABLE"]);
75
76 let root = Box::new(HTMLElement::new(
77 None,
78 String::new(),
79 Vec::new(),
80 false,
81 opts.void_tag.add_closing_slash,
82 ));
83 let mut stack: Vec<StackEntry> = Vec::with_capacity(32);
85 stack.push(StackEntry { elem: root });
86 if let Some(first) = stack.last_mut() {
88 first.elem.parse_comment = opts.comment;
89 first.elem.parse_lowercase = opts.lower_case_tag_name;
90 }
91 let mut no_nested_a_index: Option<usize> = None;
92
93 let frameflag = "documentfragmentcontainer";
94 let frame_prefix = format!("<{}>", frameflag);
95 let frame_suffix = format!("</{}>", frameflag);
96 let data = format!("{}{}{}", frame_prefix, input, frame_suffix);
97 let frame_offset = frame_prefix.len();
98
99 let tag_matches = parse_tags_zero_copy(&data);
101 let mut match_index = 0;
102 let mut last_text_pos = 0usize;
103
104 let mut skipped_closing_starts: std::collections::HashSet<usize> =
106 std::collections::HashSet::new();
107
108 while match_index < tag_matches.len() {
110 let tag_match = &tag_matches[match_index];
111 match_index += 1;
112
113 if skipped_closing_starts.contains(&tag_match.start) {
115 continue;
116 }
117
118 let match_start = tag_match.start;
119 let is_comment = tag_match.is_comment;
120 let _leading_slash = if tag_match.is_closing { "/" } else { "" };
121 let tag_name = tag_match.tag_name.to_string(); let raw_attr_part = &tag_match.attrs;
123 let _trailing_self_close = tag_match.self_closing;
124 let (attr_part, trailing_self_close) = strip_trailing_self_close_optimized(raw_attr_part);
126 if match_start > last_text_pos {
128 let text_slice = &data[last_text_pos..match_start];
129 if !text_slice.is_empty() {
130 let top = stack.last_mut().unwrap();
131 let start_src = last_text_pos.saturating_sub(frame_offset);
132 let end_src = match_start.saturating_sub(frame_offset);
133 top.elem.children.push(Node::Text(TextNode::with_range(
135 text_slice.to_string(), start_src,
137 end_src,
138 )));
139 }
140 }
141 last_text_pos = tag_match.end;
142 if is_comment {
143 if opts.comment {
144 let full_comment = &data[tag_match.start..tag_match.end];
146 let inner = full_comment
147 .trim_start_matches("<!--")
148 .trim_end_matches("-->");
149 let top = stack.last_mut().unwrap();
150 let start_src = match_start.saturating_sub(frame_offset);
152 let end_src = tag_match.end.saturating_sub(frame_offset);
153 top.elem
154 .children
155 .push(Node::Comment(CommentNode::with_range(
156 inner.to_string(),
157 start_src,
158 end_src,
159 )));
160 }
161 continue;
162 }
163 if tag_name == frameflag {
164 continue;
165 }
166
167 let lower_tag_name = tag_name.to_lowercase(); let final_tag_name = if opts.lower_case_tag_name {
171 &lower_tag_name
172 } else {
173 &tag_name
174 };
175
176 let mut self_closing = trailing_self_close
177 || attr_part.trim_end().ends_with('/') || void_tag.is_void(final_tag_name);
179
180 if !tag_match.is_closing {
181 if !opts.preserve_tag_nesting {
184 if let Some(parent) = stack.last() {
185 if let Some(list) = closed_by_open.get(parent.elem.name()) {
186 if list
187 .iter()
188 .any(|t| **t == tag_name || *t == tag_name.to_uppercase())
189 {
190 if stack.len() > 1 {
191 let closed = stack.pop().unwrap();
192 stack
193 .last_mut()
194 .unwrap()
195 .elem
196 .children
197 .push(Node::Element(closed.elem));
198 }
199 }
200 }
201 }
202 }
203 if opts.fix_nested_a_tags && (final_tag_name.eq("a") || final_tag_name.eq("A")) {
205 if let Some(idx) = no_nested_a_index {
206 while stack.len() > idx {
207 let closed = stack.pop().unwrap();
208 stack
209 .last_mut()
210 .unwrap()
211 .elem
212 .children
213 .push(Node::Element(closed.elem));
214 }
215 }
216 no_nested_a_index = Some(stack.len());
217 }
218
219 let (attrs, saw_other_attr) = parse_id_class_attrs_fast(&attr_part);
221 let raw_attr_string = if attr_part.starts_with(' ') && attr_part.len() > 1 {
223 attr_part[1..].to_string()
224 } else {
225 attr_part.to_string()
226 };
227 let mut elem = Box::new(HTMLElement::new(
228 Some(final_tag_name.to_string()),
229 raw_attr_string,
230 attrs,
231 self_closing && void_tag.is_void(final_tag_name),
232 opts.void_tag.add_closing_slash,
233 ));
234 elem.parse_comment = opts.comment;
237 elem.parse_lowercase = opts.lower_case_tag_name;
238 if saw_other_attr {
239 elem.attrs_complete = false;
240 }
241 let open_start = match_start.saturating_sub(frame_offset);
242 let open_end = tag_match.end.saturating_sub(frame_offset);
243 elem.set_range_start(open_start);
245 if self_closing {
246 elem.set_range_end(open_end);
248 }
249 if let Some((_, v)) = elem.attrs.iter().find(|(k, _)| k == "id") {
251 elem.id = v.clone();
252 }
253
254 if let Some(extract) = opts.block_text_elements.get(&lower_tag_name) {
256 let close_markup = format!("</{}>", final_tag_name);
258 let search_slice = &data[last_text_pos..];
259
260 if let Some(rel) = find_closing_tag_case_insensitive(search_slice, &close_markup) {
262 let close_start = last_text_pos + rel; let suppress = opts.suppress_script_style_text
264 && (lower_tag_name == "script" || lower_tag_name == "style");
265 if *extract && !suppress {
266 let inner_text = &data[last_text_pos..close_start];
268 if !inner_text.is_empty() {
269 let inner_start = last_text_pos.saturating_sub(frame_offset);
270 let inner_end = close_start.saturating_sub(frame_offset);
271 elem.children.push(Node::Text(TextNode::with_range(
272 inner_text.to_string(),
273 inner_start,
274 inner_end,
275 )));
276 }
277 }
278 skipped_closing_starts.insert(close_start);
280 last_text_pos = close_start + close_markup.len();
281 let close_end_src =
282 (close_start + close_markup.len()).saturating_sub(frame_offset);
283 elem.set_range_end(close_end_src);
284 self_closing = true; } else {
286 last_text_pos = data.len() + 1;
288 }
289 }
290 if self_closing {
293 let parent = stack.last_mut().unwrap();
294 let parent_ptr: *mut HTMLElement = &mut *parent.elem;
295 elem.parent = Some(parent_ptr);
296 parent.elem.children.push(Node::Element(elem));
297 } else {
298 let parent_ptr: *mut HTMLElement = &mut *stack.last_mut().unwrap().elem;
299 elem.parent = Some(parent_ptr);
300 stack.push(StackEntry { elem });
301 }
302 } else {
303 if no_nested_a_index.is_some() && (final_tag_name.eq("a") || final_tag_name.eq("A")) {
306 no_nested_a_index = None;
307 }
308 let target = &lower_tag_name;
310 let mut i = stack.len();
312 while i > 1 {
313 i -= 1;
315 if stack[i].elem.name().eq_ignore_ascii_case(target) {
316 while stack.len() > i + 1 {
317 let closed = stack.pop().unwrap();
318 let parent = stack.last_mut().unwrap();
319 let mut e = closed.elem;
320 let parent_ptr: *mut HTMLElement = &mut *parent.elem;
321 e.parent = Some(parent_ptr);
322 let close_end = tag_match.end.saturating_sub(frame_offset);
323 if e.range().is_some() {
324 e.set_range_end(close_end);
325 }
326 parent.elem.children.push(Node::Element(e));
327 }
328 let closed = stack.pop().unwrap();
329 let parent = stack.last_mut().unwrap();
330 let mut e = closed.elem;
331 let parent_ptr: *mut HTMLElement = &mut *parent.elem;
332 e.parent = Some(parent_ptr);
333 let close_end_main = tag_match.end.saturating_sub(frame_offset);
334 if e.range().is_some() {
335 e.set_range_end(close_end_main);
336 }
337 parent.elem.children.push(Node::Element(e));
338 break;
339 } else {
340 let parent_name = stack[i].elem.name().to_lowercase();
342 if let Some(list) = closed_by_close.get(parent_name.as_str()) {
343 if list.iter().any(|x| x.eq_ignore_ascii_case(&tag_name)) {
344 let closed = stack.pop().unwrap();
345 let parent = stack.last_mut().unwrap();
346 let mut e = closed.elem;
347 let parent_ptr: *mut HTMLElement = &mut *parent.elem;
348 e.parent = Some(parent_ptr);
349 parent.elem.children.push(Node::Element(e));
350 continue;
351 }
352 }
353 }
354 }
355 }
356 }
357 if last_text_pos < data.len() {
359 let text = &data[last_text_pos..];
360 if !text.is_empty() {
361 let top = stack.last_mut().unwrap();
362 let start_src = last_text_pos.saturating_sub(frame_offset);
363 let end_src = data.len().saturating_sub(frame_offset);
364 top.elem.children.push(Node::Text(TextNode::with_range(
365 text.to_string(),
366 start_src,
367 end_src,
368 )));
369 }
370 }
371 if opts.parse_none_closed_tags {
373 while stack.len() > 1 {
375 let closed = stack.pop().unwrap();
376 let parent = stack.last_mut().unwrap();
377 let mut e = closed.elem;
378 let parent_ptr: *mut HTMLElement = &mut *parent.elem;
379 e.parent = Some(parent_ptr);
380 parent.elem.children.push(Node::Element(e));
381 }
382 let root = stack.pop().unwrap().elem;
383 return root;
384 }
385 while stack.len() > 1 {
388 let last = stack.pop().unwrap();
389 let one_before = stack.last_mut().unwrap();
390 let mut last_elem = last.elem; let is_pair = last_elem.name() == one_before.elem.name();
393 if is_pair {
394 if let Some(one_parent_ptr) = one_before.elem.parent {
398 unsafe {
399 let one_parent = &mut *one_parent_ptr;
400 one_parent.remove_children_where(
401 |n| matches!(n, Node::Element(e) if e.name()==last_elem.name()),
402 );
403 for mut child in last_elem.children.drain(..) {
404 if let Node::Element(e) = &mut child {
405 let parent_ptr: *mut HTMLElement = one_parent_ptr;
406 e.parent = Some(parent_ptr);
407 }
408 one_parent.children.push(child);
409 }
410 }
411 }
412 continue;
413 }
414 let end_fix_needed =
417 last_elem.range().is_some() && last_elem.range().unwrap().1 < input.len();
418 if end_fix_needed {
419 last_elem.set_range_end(input.len());
421 let parent_ptr: *mut HTMLElement = &mut *one_before.elem;
423 last_elem.parent = Some(parent_ptr);
424 one_before.elem.children.push(Node::Element(last_elem));
425 continue;
426 } else {
427 let target_name = last_elem.name().to_string();
428 one_before
429 .elem
430 .remove_children_where(|n| matches!(n, Node::Element(e) if e.name()==target_name));
431 for mut child in last_elem.children.drain(..) {
432 match &mut child {
433 Node::Element(e) => {
434 let parent_ptr: *mut HTMLElement = &mut *one_before.elem;
435 e.parent = Some(parent_ptr);
436 }
437 _ => {}
438 }
439 one_before.elem.children.push(child);
440 }
441 }
442 }
443 let root = stack.pop().unwrap().elem;
445 fn promote_heading_duplicates(node: &mut HTMLElement) {
448 use crate::dom::node::Node;
449 let mut i = 0;
450 while i + 1 < node.children.len() {
451 let promote = match (&node.children[i], &node.children[i + 1]) {
452 (Node::Element(a), Node::Element(b)) => {
453 let n1 = a.name();
454 let n2 = b.name();
455 if n1 == n2 && matches!(n1, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
456 Some((n1.to_string(), i + 1))
457 } else {
458 None
459 }
460 }
461 _ => None,
462 };
463 if promote.is_none() && i + 1 < node.children.len() {
465 let (left, right) = node.children.split_at_mut(i + 1); if let Some(Node::Element(h)) = left.last_mut() {
468 if let Some(Node::Element(next_el)) = right.first() {
469 let hn = h.name();
470 if matches!(hn, "h1" | "h2" | "h3" | "h4" | "h5" | "h6")
471 && hn != next_el.name()
472 {
473 if let Some(r) = h.range() {
474 if r.0 == r.1 {
475 h.set_range_end(r.0);
476 }
477 }
478 }
479 }
480 }
481 }
482 if let Some((_name, dup_idx)) = promote {
483 let mut dup = match node.children.remove(dup_idx) {
485 Node::Element(e) => e,
486 _ => unreachable!(),
487 };
488 let insertion_pos = i + 1; let mut promoted: Vec<Node> = Vec::new();
491 for child in dup.children.drain(..) {
492 let keep = match &child {
493 Node::Element(e) => {
494 let name = e.name();
495 if name == "div" && e.raw_attrs.is_empty() && e.children.is_empty() {
497 false
498 } else {
499 true
500 }
501 }
502 _ => true,
503 };
504 if keep {
505 promoted.push(child);
506 }
507 }
508 for (offset, mut ch) in promoted.into_iter().enumerate() {
509 if let Node::Element(ref mut e) = ch {
510 e.parent = Some(node as *mut HTMLElement);
511 }
512 node.children.insert(insertion_pos + offset, ch);
513 }
514 if let Node::Element(first_h) = &mut node.children[i] {
516 if matches!(first_h.name(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
517 if let Some((s, e)) = first_h.range() {
518 if s == e {
519 first_h.set_range_end(s + 1);
520 }
521 }
522 }
523 }
524 continue;
526 }
527 i += 1;
528 }
529 for child in node.children.iter_mut() {
531 if let Node::Element(e) = child {
532 promote_heading_duplicates(e);
533 }
534 }
535 }
536 let mut root_box = root;
537 promote_heading_duplicates(&mut root_box);
538 root_box
539}