use super::content::parse_fragment;
use crate::dom::{node::Node, text::TextNode};
use regex::Regex;
use std::collections::HashMap;
use std::fmt;
#[cfg(feature = "parallel")]
use rayon::prelude::*;
/// An HTML element node; an instance without a tag name acts as the root
/// container of a parsed tree.
///
/// `Clone` is derived, so a derived copy duplicates the raw `parent` pointer
/// as-is. `clone_node` / `clone_shallow` produce detached copies instead.
#[derive(Debug, Clone)]
pub struct HTMLElement {
    /// Tag name; `None` marks the root container (this is what `is_root` tests).
    pub(super) tag_name: Option<String>,
    /// Attribute text for this element — presumably the unparsed source
    /// string; TODO confirm against the parser.
    pub(crate) raw_attrs: String,
    /// `(name, value)` attribute pairs; `new` scans these for the `id` value.
    pub attrs: Vec<(String, String)>,
    /// Child nodes (elements, text and comments).
    pub children: Vec<Node>,
    /// Raw back-pointer to the owning element, `None` when detached.
    /// `set_inner_html` points direct element children at `self`.
    pub(crate) parent: Option<*mut HTMLElement>,
    /// Presumably true for void elements such as `<br>`; TODO confirm.
    pub(super) is_void: bool,
    /// Presumably controls whether a void element serialises with a
    /// trailing slash; TODO confirm.
    pub(super) void_add_slash: bool,
    /// Attribute lookup cache (presumably raw name -> raw value); reset to
    /// `None` by `clone_node` and `clone_shallow`.
    pub(super) cache_raw_map: Option<HashMap<String, String>>,
    /// Attribute lookup cache (presumably lower-cased name -> decoded value);
    /// reset to `None` by `clone_node` and `clone_shallow`.
    pub(super) cache_lower_decoded: Option<HashMap<String, String>>,
    /// Value of the `id` attribute as captured by `new`; empty when absent.
    pub id: String,
    /// Cached class list — presumably the split `class` attribute; TODO confirm.
    pub(super) class_cache: Option<Vec<String>>,
    /// `(start, end)` pair maintained by `set_range_start` / `set_range_end`;
    /// presumably offsets into the source text.
    pub(super) range: Option<(usize, usize)>,
    /// NOTE(review): looks like "all attributes have been parsed" — confirm.
    pub(crate) attrs_complete: bool,
    /// NOTE(review): looks like "attributes changed since parsing" — confirm.
    pub(crate) attrs_modified: bool,
    /// NOTE(review): presumably a parser option carried on the node — confirm.
    pub(crate) parse_comment: bool,
    /// NOTE(review): presumably a parser option carried on the node — confirm.
    pub(crate) parse_lowercase: bool,
}
impl HTMLElement {
/// Builds a detached element with no parent and no children yet.
///
/// `id` is seeded from the first attribute whose name equals `id` ignoring
/// ASCII case, or left empty when there is none. All caches start out unset
/// and all flags start out `false`.
pub fn new(
    tag: Option<String>,
    raw_attrs: String,
    attrs: Vec<(String, String)>,
    is_void: bool,
    void_add_slash: bool,
) -> Self {
    let id = attrs
        .iter()
        .find(|(key, _)| key.eq_ignore_ascii_case("id"))
        .map(|(_, value)| value.clone())
        .unwrap_or_default();
    Self {
        tag_name: tag,
        raw_attrs,
        attrs,
        id,
        is_void,
        void_add_slash,
        // Same initial capacity as before: room for two children.
        children: Vec::with_capacity(2),
        parent: None,
        range: None,
        cache_raw_map: None,
        cache_lower_decoded: None,
        class_cache: None,
        attrs_complete: false,
        attrs_modified: false,
        parse_comment: false,
        parse_lowercase: false,
    }
}
/// Returns `true` when this element has no tag name, i.e. it is a root container.
pub fn is_root(&self) -> bool {
    matches!(self.tag_name, None)
}
/// Returns the tag name, or the empty string for a tag-less (root) element.
pub fn name(&self) -> &str {
    match &self.tag_name {
        Some(tag) => tag.as_str(),
        None => "",
    }
}
/// Replaces the tag name with the lower-cased form of `new_name`.
pub fn set_tag_name(&mut self, new_name: &str) {
    self.tag_name = Some(new_name.to_lowercase());
}
/// Returns the concatenated raw text of all children.
///
/// A non-root element named `br` (ASCII case-insensitive) yields a single
/// newline instead.
pub fn raw_text(&self) -> String {
    let is_line_break = !self.is_root() && self.name().eq_ignore_ascii_case("br");
    if is_line_break {
        return String::from("\n");
    }
    self.children.iter().fold(String::new(), |mut text, child| {
        text.push_str(&child.raw_text());
        text
    })
}
/// Returns the value of the `class` attribute, or an empty string when it is absent.
pub fn class_names(&self) -> String {
    match self.get_attr("class") {
        Some(value) => value.to_string(),
        None => String::new(),
    }
}
/// Serialises every child with `to_html` and concatenates the results.
pub fn inner_html(&self) -> String {
    let mut html = String::new();
    for child in &self.children {
        html.push_str(&child.to_html());
    }
    html
}
/// Replaces all children with the nodes parsed from `html`.
///
/// When `parse_fragment` returns nothing, a single text node holding `html`
/// verbatim is used instead. Only the direct element children get their
/// `parent` pointer set to `self`.
pub fn set_inner_html(&mut self, html: &str) {
    let mut parsed = parse_fragment(html);
    if parsed.is_empty() {
        let fallback = TextNode::new(html.to_string());
        parsed.push(Node::Text(fallback));
    }
    self.children.clear();
    let owner = self as *mut HTMLElement;
    for node in &mut parsed {
        if let Node::Element(el) = node {
            el.parent = Some(owner);
        }
    }
    self.children.append(&mut parsed);
}
/// Returns `true` when `self` is one of the elements that `selector`
/// selects under `root`.
///
/// Membership is decided by pointer identity, not by structural equality.
pub fn matches_selector<'a>(&'a self, root: &'a HTMLElement, selector: &str) -> bool {
    root.query_selector_all(selector)
        .iter()
        .any(|hit| std::ptr::eq(*hit, self))
}
/// Returns `true` when this element matches `selector`, evaluated from the
/// top of the tree this element belongs to.
pub fn matches(&self, selector: &str) -> bool {
    self.matches_selector(self.root(), selector)
}
/// Follows `parent()` links upward and returns the topmost element
/// (which is `self` when there is no parent).
pub fn root(&self) -> &HTMLElement {
    let mut top: &HTMLElement = self;
    loop {
        match top.parent() {
            Some(up) => top = up,
            None => return top,
        }
    }
}
pub fn closest(&self, selector: &str) -> Option<&HTMLElement> {
let mut cur: Option<&HTMLElement> = Some(self);
while let Some(c) = cur {
if c.matches(selector) {
return Some(c);
}
cur = c.parent();
}
None
}
pub fn clone(&self) -> HTMLElement {
self.clone_node()
}
pub fn iter_elements<'a>(&'a self) -> impl Iterator<Item = &'a HTMLElement> + 'a {
self.children.iter().filter_map(|n| n.as_element())
}
pub fn query_selector_all<'a>(&'a self, selector: &str) -> Vec<&'a HTMLElement> {
crate::css_select::select_all(selector, self)
}
/// Returns the first result of `query_selector_all`, if any.
pub fn query_selector<'a>(&'a self, selector: &str) -> Option<&'a HTMLElement> {
    let hits = self.query_selector_all(selector);
    hits.first().copied()
}
/// Strips insignificant whitespace from this subtree, in place.
///
/// * Text nodes that are whitespace-only are dropped; the rest are reduced
///   to their `trimmed_raw_text`.
/// * Element children are processed recursively.
/// * Comment nodes are dropped.
///
/// `ensure_all_attrs` runs before and `rebuild_raw_attrs` after the children
/// are rewritten.
pub fn remove_whitespace(&mut self) {
    self.ensure_all_attrs();
    let mut kept = Vec::with_capacity(self.children.len());
    for mut child in self.children.drain(..) {
        match &mut child {
            Node::Text(text) => {
                if text.is_whitespace() {
                    continue;
                }
                let trimmed = text.trimmed_raw_text().to_string();
                text.set_raw(trimmed);
            }
            // Recurse on the element itself rather than on a copy: the boxed
            // element keeps its heap address, so raw `parent` pointers held
            // by its children stay valid, and no subtree is duplicated.
            Node::Element(el) => el.remove_whitespace(),
            Node::Comment(_) => continue,
        }
        kept.push(child);
    }
    self.children = kept;
    self.rebuild_raw_attrs();
}
/// Cuts this subtree off at the first text match of `pattern`.
///
/// Children are visited in order. Elements are processed recursively and
/// comments are skipped. For the first text node whose raw text matches,
/// everything from the match start is removed from that node and all
/// following siblings are discarded. Nothing changes at this level when no
/// direct text child matches.
pub fn trim_right(&mut self, pattern: &Regex) {
    let mut cut_after = None;
    for (idx, child) in self.children.iter_mut().enumerate() {
        match child {
            // In place: avoids copying the subtree and keeps the element at
            // the address its children's raw `parent` pointers refer to.
            Node::Element(el) => el.trim_right(pattern),
            Node::Text(text) => {
                let hit = pattern.find(&text.raw).map(|m| m.start());
                if let Some(start) = hit {
                    let kept = text.raw[..start].to_string();
                    text.set_raw(kept);
                    cut_after = Some(idx);
                    break;
                }
            }
            Node::Comment(_) => {}
        }
    }
    if let Some(idx) = cut_after {
        // Keep the trimmed text node itself, drop everything after it.
        self.children.truncate(idx + 1);
    }
}
/// Renders an indented outline of the tree, one node per line, joined by `\n`.
///
/// Each element line is `name`, then `#id` when `id` is non-empty, then
/// `.class` for each distinct whitespace-separated class in first-seen order.
/// Non-whitespace text nodes appear as `#text`. Comments and whitespace-only
/// text are omitted. A root element emits no line of its own and only its
/// element children are listed, starting at depth 0.
pub fn structure(&self) -> String {
    fn outline(el: &HTMLElement, depth: usize, lines: &mut Vec<String>) {
        if el.is_root() {
            for node in &el.children {
                if let Node::Element(inner) = node {
                    outline(inner, 0, lines);
                }
            }
            return;
        }

        let mut label = " ".repeat(depth);
        label.push_str(el.name());
        if !el.id.is_empty() {
            label.push('#');
            label.push_str(&el.id);
        }
        // A missing or empty `class` attribute simply yields no tokens.
        let mut seen = std::collections::HashSet::new();
        for class in el.get_attr("class").unwrap_or("").split_whitespace() {
            if seen.insert(class) {
                label.push('.');
                label.push_str(class);
            }
        }
        lines.push(label);

        let child_depth = depth + 1;
        for node in &el.children {
            match node {
                Node::Element(inner) => outline(inner, child_depth, lines),
                Node::Text(text) => {
                    if !text.is_whitespace() {
                        lines.push(format!("{}#text", " ".repeat(child_depth)));
                    }
                }
                Node::Comment(_) => {}
            }
        }
    }

    let mut lines = Vec::new();
    outline(self, 0, &mut lines);
    lines.join("\n")
}
/// Collects all descendant elements named `tag` (ASCII case-insensitive),
/// in depth-first pre-order. `"*"` selects every descendant element.
/// `self` is never included.
pub fn get_elements_by_tag_name<'a>(&'a self, tag: &str) -> Vec<&'a HTMLElement> {
    /// Direct element children of `el`, in order.
    fn child_elements<'t>(
        el: &'t HTMLElement,
    ) -> impl DoubleEndedIterator<Item = &'t HTMLElement> + 't {
        el.children.iter().filter_map(|node| match node {
            Node::Element(boxed) => Some(&**boxed),
            _ => None,
        })
    }

    let wanted = tag.to_lowercase();
    let mut found = Vec::new();
    // Explicit stack; children are pushed reversed so the first child is
    // popped first, giving pre-order.
    let mut pending: Vec<&'a HTMLElement> = child_elements(self).rev().collect();
    while let Some(el) = pending.pop() {
        if wanted == "*" || el.name().eq_ignore_ascii_case(&wanted) {
            found.push(el);
        }
        pending.extend(child_elements(el).rev());
    }
    found
}
/// Returns the first descendant (depth-first, pre-order) whose `id`
/// attribute equals `id`. `self` itself is not considered.
pub fn get_element_by_id<'a>(&'a self, id: &str) -> Option<&'a HTMLElement> {
    fn search<'b>(node: &'b HTMLElement, id: &str) -> Option<&'b HTMLElement> {
        node.children.iter().find_map(|child| match child {
            Node::Element(boxed) => {
                let el: &'b HTMLElement = boxed;
                if el.get_attr("id") == Some(id) {
                    Some(el)
                } else {
                    search(el, id)
                }
            }
            _ => None,
        })
    }
    search(self, id)
}
/// Mutable lookup of the first descendant (depth-first, pre-order) whose
/// cached `id` field or `id` attribute equals `id`. `self` itself is not
/// considered.
pub fn get_element_by_id_mut<'a>(&'a mut self, id: &str) -> Option<&'a mut HTMLElement> {
    fn descend<'b>(node: &'b mut HTMLElement, id: &str) -> Option<&'b mut HTMLElement> {
        for child in &mut node.children {
            let el = match child {
                Node::Element(el) => el,
                _ => continue,
            };
            let is_hit = el.id == id || el.get_attr("id") == Some(id);
            if is_hit {
                return Some(el);
            }
            // The recursive borrow only escapes on the returning path, so
            // the loop may carry on with the next sibling otherwise.
            let deeper = descend(el, id);
            if deeper.is_some() {
                return deeper;
            }
        }
        None
    }
    descend(self, id)
}
/// Returns a deep, detached copy of this element.
///
/// Each element in the copy is produced by `clone_shallow` (no parent, no
/// range, attribute map caches cleared) and then receives recursively cloned
/// children. Text and comment nodes are cloned as-is. `parent` pointers in
/// the copy are left as `None`.
pub fn clone_node(&self) -> HTMLElement {
    // Reuse `clone_shallow` so the per-field copy rules live in one place.
    let mut copy = self.clone_shallow();
    copy.children = self
        .children
        .iter()
        .map(|child| match child {
            Node::Element(el) => Node::Element(Box::new(el.clone_node())),
            Node::Text(text) => Node::Text(text.clone()),
            Node::Comment(comment) => Node::Comment(comment.clone()),
        })
        .collect();
    copy
}
pub fn clone_shallow(&self) -> HTMLElement {
HTMLElement {
tag_name: self.tag_name.clone(),
raw_attrs: self.raw_attrs.clone(),
attrs: self.attrs.clone(),
children: Vec::new(),
parent: None,
is_void: self.is_void,
void_add_slash: self.void_add_slash,
cache_raw_map: None,
cache_lower_decoded: None,
id: self.id.clone(),
class_cache: self.class_cache.clone(),
range: None,
attrs_complete: self.attrs_complete,
attrs_modified: self.attrs_modified,
parse_comment: self.parse_comment,
parse_lowercase: self.parse_lowercase,
}
}
/// Sets the start of `range`, keeping the existing end.
/// When no range is set yet, both ends become `start`.
pub fn set_range_start(&mut self, start: usize) {
    let end = self.range.map_or(start, |(_, end)| end);
    self.range = Some((start, end));
}
/// Sets the end of `range`, keeping the existing start.
/// When no range is set yet, both ends become `end`.
pub fn set_range_end(&mut self, end: usize) {
    let start = self.range.map_or(end, |(start, _)| start);
    self.range = Some((start, end));
}
/// Returns the stored `(start, end)` pair, or `None` when neither
/// `set_range_start` nor `set_range_end` has been called on this element.
pub fn range(&self) -> Option<(usize, usize)> {
self.range
}
/// Calls `ensure_all_attrs` on every element of the slice, sequentially
/// and in order.
#[cfg(feature = "parallel")]
pub fn batch_ensure_attributes_safe(elements: &mut [HTMLElement]) {
    elements.iter_mut().for_each(|el| {
        el.ensure_all_attrs();
    });
}
/// Calls `is_whitespace` and `trimmed_raw_text` on every text node in
/// parallel and discards the results — presumably to warm per-node caches;
/// TODO confirm.
///
/// Slices shorter than `PARALLEL_THRESHOLD` are left completely untouched.
#[cfg(feature = "parallel")]
pub fn process_text_nodes_parallel(text_nodes: &mut [crate::dom::text::TextNode]) {
    const PARALLEL_THRESHOLD: usize = 20;
    if text_nodes.len() < PARALLEL_THRESHOLD {
        return;
    }
    text_nodes.par_iter_mut().for_each(|text| {
        let _ = text.is_whitespace();
        let _ = text.trimmed_raw_text();
    });
}
}
/// Formats the element as its `outer_html()` serialisation.
impl fmt::Display for HTMLElement {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let html = self.outer_html();
        f.write_str(&html)
    }
}