use crate::dom::comment::CommentNode;
use crate::dom::element::HTMLElement;
use crate::dom::node::Node;
use crate::dom::text::TextNode;
use crate::dom::void_tag::VoidTag;
use regex::Regex;
use std::collections::HashMap;
use std::sync::OnceLock;
use super::attrs::parse_id_class_attrs_fast;
use super::fast_parser::parse_tags_zero_copy;
use super::types::{Options, StackEntry};
use super::utils::{find_closing_tag_case_insensitive, strip_trailing_self_close_optimized};
/// Lazily compiled pattern covering HTML comments and opening/closing tags.
/// It is initialized on the first call to `parse_with_options`; that function
/// binds the result to an underscore-prefixed local and does not use it further.
static TAG_REGEX: OnceLock<Regex> = OnceLock::new();
/// Lazily compiled pattern for an attribute key with an optional `= value` part.
/// It is initialized on the first call to `parse_with_options`; that function
/// binds the result to an underscore-prefixed local and does not use it further.
static ATTR_KEY_REGEX: OnceLock<Regex> = OnceLock::new();
pub fn parse_with_options(input: &str, opts: &Options) -> Box<HTMLElement> {
let _tag_re = TAG_REGEX.get_or_init(|| {
Regex::new(r#"<!--(?s:.*?)-->|<(\/)?([A-Za-z][A-Za-z0-9._:@\-\p{L}\p{M}]*)(?P<attrs>(?:[^>"']|"[^"]*"|'[^']*')*)(/?)>"#).unwrap()
});
let _attr_key_re = ATTR_KEY_REGEX.get_or_init(|| {
Regex::new(
r#"([a-zA-Z()\[\]#@$.?:][a-zA-Z0-9-._:()\[\]#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|[^\s>]+))?"#,
)
.unwrap()
});
let void_tag = VoidTag::new(&opts.void_tag);
let mut closed_by_open: HashMap<&'static str, Vec<&'static str>> = HashMap::new();
macro_rules! ins_open {
($parent:expr, [ $($v:expr),* ]) => {{
closed_by_open.insert($parent, vec![$($v),*]);
let up = $parent.to_uppercase();
if up != $parent { closed_by_open.insert(Box::leak(up.into_boxed_str()), vec![$($v),*]); }
}};
}
ins_open!("li", ["li", "LI"]);
ins_open!("p", ["p", "P", "div", "DIV"]);
ins_open!("b", ["div", "DIV"]);
ins_open!("td", ["td", "th", "TD", "TH"]);
ins_open!("th", ["td", "th", "TD", "TH"]);
ins_open!("h1", ["h1", "H1"]);
ins_open!("h2", ["h2", "H2"]);
ins_open!("h3", ["h3", "H3"]);
ins_open!("h4", ["h4", "H4"]);
ins_open!("h5", ["h5", "H5"]);
ins_open!("h6", ["h6", "H6"]);
let mut closed_by_close: HashMap<&'static str, Vec<&'static str>> = HashMap::new();
macro_rules! ins_close {
($parent:expr, [ $($v:expr),* ]) => {{
closed_by_close.insert($parent, vec![$($v),*]);
let up = $parent.to_uppercase();
if up != $parent { closed_by_close.insert(Box::leak(up.into_boxed_str()), vec![$($v),*]); }
}};
}
ins_close!("li", ["ul", "UL", "ol", "OL"]);
ins_close!("a", ["div", "DIV"]);
ins_close!("b", ["div", "DIV"]);
ins_close!("i", ["div", "DIV"]);
ins_close!("p", ["div", "DIV"]);
ins_close!("td", ["tr", "TR", "table", "TABLE"]);
ins_close!("th", ["tr", "TR", "table", "TABLE"]);
let root = Box::new(HTMLElement::new(
None,
String::new(),
Vec::new(),
false,
opts.void_tag.add_closing_slash,
));
let mut stack: Vec<StackEntry> = Vec::with_capacity(32);
stack.push(StackEntry { elem: root });
if let Some(first) = stack.last_mut() {
first.elem.parse_comment = opts.comment;
first.elem.parse_lowercase = opts.lower_case_tag_name;
}
let mut no_nested_a_index: Option<usize> = None;
let frameflag = "documentfragmentcontainer";
let frame_prefix = format!("<{}>", frameflag);
let frame_suffix = format!("</{}>", frameflag);
let data = format!("{}{}{}", frame_prefix, input, frame_suffix);
let frame_offset = frame_prefix.len();
let tag_matches = parse_tags_zero_copy(&data);
let mut match_index = 0;
let mut last_text_pos = 0usize;
let mut skipped_closing_starts: std::collections::HashSet<usize> =
std::collections::HashSet::new();
while match_index < tag_matches.len() {
let tag_match = &tag_matches[match_index];
match_index += 1;
if skipped_closing_starts.contains(&tag_match.start) {
continue;
}
let match_start = tag_match.start;
let is_comment = tag_match.is_comment;
let _leading_slash = if tag_match.is_closing { "/" } else { "" };
let tag_name = tag_match.tag_name.to_string(); let raw_attr_part = &tag_match.attrs;
let _trailing_self_close = tag_match.self_closing;
let (attr_part, trailing_self_close) = strip_trailing_self_close_optimized(raw_attr_part);
if match_start > last_text_pos {
let text_slice = &data[last_text_pos..match_start];
if !text_slice.is_empty() {
let top = stack.last_mut().unwrap();
let start_src = last_text_pos.saturating_sub(frame_offset);
let end_src = match_start.saturating_sub(frame_offset);
top.elem.children.push(Node::Text(TextNode::with_range(
text_slice.to_string(), start_src,
end_src,
)));
}
}
last_text_pos = tag_match.end;
if is_comment {
if opts.comment {
let full_comment = &data[tag_match.start..tag_match.end];
let inner = full_comment
.trim_start_matches("<!--")
.trim_end_matches("-->");
let top = stack.last_mut().unwrap();
let start_src = match_start.saturating_sub(frame_offset);
let end_src = tag_match.end.saturating_sub(frame_offset);
top.elem
.children
.push(Node::Comment(CommentNode::with_range(
inner.to_string(),
start_src,
end_src,
)));
}
continue;
}
if tag_name == frameflag {
continue;
}
let lower_tag_name = tag_name.to_lowercase();
let final_tag_name = if opts.lower_case_tag_name {
&lower_tag_name
} else {
&tag_name
};
let mut self_closing = trailing_self_close
|| attr_part.trim_end().ends_with('/') || void_tag.is_void(final_tag_name);
if !tag_match.is_closing {
if !opts.preserve_tag_nesting {
if let Some(parent) = stack.last() {
if let Some(list) = closed_by_open.get(parent.elem.name()) {
if list
.iter()
.any(|t| **t == tag_name || *t == tag_name.to_uppercase())
{
if stack.len() > 1 {
let closed = stack.pop().unwrap();
stack
.last_mut()
.unwrap()
.elem
.children
.push(Node::Element(closed.elem));
}
}
}
}
}
if opts.fix_nested_a_tags && (final_tag_name.eq("a") || final_tag_name.eq("A")) {
if let Some(idx) = no_nested_a_index {
while stack.len() > idx {
let closed = stack.pop().unwrap();
stack
.last_mut()
.unwrap()
.elem
.children
.push(Node::Element(closed.elem));
}
}
no_nested_a_index = Some(stack.len());
}
let (attrs, saw_other_attr) = parse_id_class_attrs_fast(&attr_part);
let raw_attr_string = if attr_part.starts_with(' ') && attr_part.len() > 1 {
attr_part[1..].to_string()
} else {
attr_part.to_string()
};
let mut elem = Box::new(HTMLElement::new(
Some(final_tag_name.to_string()),
raw_attr_string,
attrs,
self_closing && void_tag.is_void(final_tag_name),
opts.void_tag.add_closing_slash,
));
elem.parse_comment = opts.comment;
elem.parse_lowercase = opts.lower_case_tag_name;
if saw_other_attr {
elem.attrs_complete = false;
}
let open_start = match_start.saturating_sub(frame_offset);
let open_end = tag_match.end.saturating_sub(frame_offset);
elem.set_range_start(open_start);
if self_closing {
elem.set_range_end(open_end);
}
if let Some((_, v)) = elem.attrs.iter().find(|(k, _)| k == "id") {
elem.id = v.clone();
}
if let Some(extract) = opts.block_text_elements.get(&lower_tag_name) {
let close_markup = format!("</{}>", final_tag_name);
let search_slice = &data[last_text_pos..];
if let Some(rel) = find_closing_tag_case_insensitive(search_slice, &close_markup) {
let close_start = last_text_pos + rel; let suppress = opts.suppress_script_style_text
&& (lower_tag_name == "script" || lower_tag_name == "style");
if *extract && !suppress {
let inner_text = &data[last_text_pos..close_start];
if !inner_text.is_empty() {
let inner_start = last_text_pos.saturating_sub(frame_offset);
let inner_end = close_start.saturating_sub(frame_offset);
elem.children.push(Node::Text(TextNode::with_range(
inner_text.to_string(),
inner_start,
inner_end,
)));
}
}
skipped_closing_starts.insert(close_start);
last_text_pos = close_start + close_markup.len();
let close_end_src =
(close_start + close_markup.len()).saturating_sub(frame_offset);
elem.set_range_end(close_end_src);
self_closing = true; } else {
last_text_pos = data.len() + 1;
}
}
if self_closing {
let parent = stack.last_mut().unwrap();
let parent_ptr: *mut HTMLElement = &mut *parent.elem;
elem.parent = Some(parent_ptr);
parent.elem.children.push(Node::Element(elem));
} else {
let parent_ptr: *mut HTMLElement = &mut *stack.last_mut().unwrap().elem;
elem.parent = Some(parent_ptr);
stack.push(StackEntry { elem });
}
} else {
if no_nested_a_index.is_some() && (final_tag_name.eq("a") || final_tag_name.eq("A")) {
no_nested_a_index = None;
}
let target = &lower_tag_name;
let mut i = stack.len();
while i > 1 {
i -= 1;
if stack[i].elem.name().eq_ignore_ascii_case(target) {
while stack.len() > i + 1 {
let closed = stack.pop().unwrap();
let parent = stack.last_mut().unwrap();
let mut e = closed.elem;
let parent_ptr: *mut HTMLElement = &mut *parent.elem;
e.parent = Some(parent_ptr);
let close_end = tag_match.end.saturating_sub(frame_offset);
if e.range().is_some() {
e.set_range_end(close_end);
}
parent.elem.children.push(Node::Element(e));
}
let closed = stack.pop().unwrap();
let parent = stack.last_mut().unwrap();
let mut e = closed.elem;
let parent_ptr: *mut HTMLElement = &mut *parent.elem;
e.parent = Some(parent_ptr);
let close_end_main = tag_match.end.saturating_sub(frame_offset);
if e.range().is_some() {
e.set_range_end(close_end_main);
}
parent.elem.children.push(Node::Element(e));
break;
} else {
let parent_name = stack[i].elem.name().to_lowercase();
if let Some(list) = closed_by_close.get(parent_name.as_str()) {
if list.iter().any(|x| x.eq_ignore_ascii_case(&tag_name)) {
let closed = stack.pop().unwrap();
let parent = stack.last_mut().unwrap();
let mut e = closed.elem;
let parent_ptr: *mut HTMLElement = &mut *parent.elem;
e.parent = Some(parent_ptr);
parent.elem.children.push(Node::Element(e));
continue;
}
}
}
}
}
}
if last_text_pos < data.len() {
let text = &data[last_text_pos..];
if !text.is_empty() {
let top = stack.last_mut().unwrap();
let start_src = last_text_pos.saturating_sub(frame_offset);
let end_src = data.len().saturating_sub(frame_offset);
top.elem.children.push(Node::Text(TextNode::with_range(
text.to_string(),
start_src,
end_src,
)));
}
}
if opts.parse_none_closed_tags {
while stack.len() > 1 {
let closed = stack.pop().unwrap();
let parent = stack.last_mut().unwrap();
let mut e = closed.elem;
let parent_ptr: *mut HTMLElement = &mut *parent.elem;
e.parent = Some(parent_ptr);
parent.elem.children.push(Node::Element(e));
}
let root = stack.pop().unwrap().elem;
return root;
}
while stack.len() > 1 {
let last = stack.pop().unwrap();
let one_before = stack.last_mut().unwrap();
let mut last_elem = last.elem; let is_pair = last_elem.name() == one_before.elem.name();
if is_pair {
if let Some(one_parent_ptr) = one_before.elem.parent {
unsafe {
let one_parent = &mut *one_parent_ptr;
one_parent.remove_children_where(
|n| matches!(n, Node::Element(e) if e.name()==last_elem.name()),
);
for mut child in last_elem.children.drain(..) {
if let Node::Element(e) = &mut child {
let parent_ptr: *mut HTMLElement = one_parent_ptr;
e.parent = Some(parent_ptr);
}
one_parent.children.push(child);
}
}
}
continue;
}
let end_fix_needed =
last_elem.range().is_some() && last_elem.range().unwrap().1 < input.len();
if end_fix_needed {
last_elem.set_range_end(input.len());
let parent_ptr: *mut HTMLElement = &mut *one_before.elem;
last_elem.parent = Some(parent_ptr);
one_before.elem.children.push(Node::Element(last_elem));
continue;
} else {
let target_name = last_elem.name().to_string();
one_before
.elem
.remove_children_where(|n| matches!(n, Node::Element(e) if e.name()==target_name));
for mut child in last_elem.children.drain(..) {
match &mut child {
Node::Element(e) => {
let parent_ptr: *mut HTMLElement = &mut *one_before.elem;
e.parent = Some(parent_ptr);
}
_ => {}
}
one_before.elem.children.push(child);
}
}
}
let root = stack.pop().unwrap().elem;
fn promote_heading_duplicates(node: &mut HTMLElement) {
use crate::dom::node::Node;
let mut i = 0;
while i + 1 < node.children.len() {
let promote = match (&node.children[i], &node.children[i + 1]) {
(Node::Element(a), Node::Element(b)) => {
let n1 = a.name();
let n2 = b.name();
if n1 == n2 && matches!(n1, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
Some((n1.to_string(), i + 1))
} else {
None
}
}
_ => None,
};
if promote.is_none() && i + 1 < node.children.len() {
let (left, right) = node.children.split_at_mut(i + 1); if let Some(Node::Element(h)) = left.last_mut() {
if let Some(Node::Element(next_el)) = right.first() {
let hn = h.name();
if matches!(hn, "h1" | "h2" | "h3" | "h4" | "h5" | "h6")
&& hn != next_el.name()
{
if let Some(r) = h.range() {
if r.0 == r.1 {
h.set_range_end(r.0);
}
}
}
}
}
}
if let Some((_name, dup_idx)) = promote {
let mut dup = match node.children.remove(dup_idx) {
Node::Element(e) => e,
_ => unreachable!(),
};
let insertion_pos = i + 1; let mut promoted: Vec<Node> = Vec::new();
for child in dup.children.drain(..) {
let keep = match &child {
Node::Element(e) => {
let name = e.name();
if name == "div" && e.raw_attrs.is_empty() && e.children.is_empty() {
false
} else {
true
}
}
_ => true,
};
if keep {
promoted.push(child);
}
}
for (offset, mut ch) in promoted.into_iter().enumerate() {
if let Node::Element(ref mut e) = ch {
e.parent = Some(node as *mut HTMLElement);
}
node.children.insert(insertion_pos + offset, ch);
}
if let Node::Element(first_h) = &mut node.children[i] {
if matches!(first_h.name(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
if let Some((s, e)) = first_h.range() {
if s == e {
first_h.set_range_end(s + 1);
}
}
}
}
continue;
}
i += 1;
}
for child in node.children.iter_mut() {
if let Node::Element(e) = child {
promote_heading_duplicates(e);
}
}
}
let mut root_box = root;
promote_heading_duplicates(&mut root_box);
root_box
}