use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use std::fmt;
use std::sync::Arc;
/// A parsed HTML document that can be queried with simple CSS-like selectors.
///
/// Cloning is cheap: the derived `Clone` only clones the `Arc`, so every clone
/// shares the same underlying DOM tree.
#[derive(Clone)]
pub struct HtmlDocument {
// NOTE(review): the DOM is accessed through `RefCell` borrows elsewhere in this
// file, so `RcDom` is presumably neither `Send` nor `Sync`; the `Arc` then only
// buys cheap cloning, not cross-thread sharing — confirm whether `Rc` suffices.
dom: Arc<RcDom>,
}
impl fmt::Debug for HtmlDocument {
    /// Writes the fixed type name only; the DOM contents are deliberately not dumped.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter.write_str("HtmlDocument")
    }
}
/// A single node selected from an [`HtmlDocument`].
///
/// Values are produced by the document's selector methods and expose the
/// node's text, attributes and serialized markup.
#[derive(Clone)]
pub struct HtmlElement {
// Handle to the underlying DOM node (presumably a reference-counted pointer,
// since handles are cloned freely when collecting matches — verify).
handle: Handle,
}
impl fmt::Debug for HtmlElement {
    /// Writes the fixed type name only; the wrapped node is not inspected.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter.write_str("HtmlElement")
    }
}
impl HtmlDocument {
    /// Parses `content` into a DOM tree.
    ///
    /// This never returns an error: the input is already a `&str`, and the
    /// bytes are read from memory.
    pub fn parse(content: &str) -> Self {
        let dom = parse_document(RcDom::default(), Default::default())
            .from_utf8()
            .read_from(&mut content.as_bytes())
            // The only error `read_from` can surface is an I/O error from the
            // reader, and reading from an in-memory slice does not perform I/O.
            .expect("reading HTML from an in-memory byte slice should never fail");
        Self { dom: Arc::new(dom) }
    }

    /// Returns every element matching `selector`, in document order.
    ///
    /// Supported syntax:
    /// * simple selectors: `tag`, `.class`, `#id`, `[attr]`, `[attr=value]`
    ///   (the value may be wrapped in single or double quotes);
    /// * the descendant combinator (whitespace) and the child combinator (`>`),
    ///   e.g. `div p` or `ul > li`.
    ///
    /// Other combinators (`+`, `~`), the universal selector `*` and compound
    /// selectors such as `div.note` are not interpreted.
    ///
    /// # Errors
    ///
    /// Returns a message when the selector is empty or its brackets are
    /// unbalanced or nested.
    pub fn select(&self, selector: &str) -> Result<Vec<HtmlElement>, String> {
        self.validate_selector(selector)?;
        Ok(Self::select_nodes(&self.dom.document, selector)
            .into_iter()
            .map(|handle| HtmlElement { handle })
            .collect())
    }

    /// Returns the first element (in document order) matching `selector`.
    ///
    /// The traversal stops at the first match instead of collecting them all.
    ///
    /// # Errors
    ///
    /// Same conditions as [`HtmlDocument::select`].
    pub fn query_selector(&self, selector: &str) -> Result<Option<HtmlElement>, String> {
        self.validate_selector(selector)?;
        let chain = Self::parse_chain(selector);
        Ok(Self::find_first(&self.dom.document, &chain).map(|handle| HtmlElement { handle }))
    }

    /// Alias of [`HtmlDocument::select`], named after the DOM API.
    ///
    /// # Errors
    ///
    /// Same conditions as [`HtmlDocument::select`].
    pub fn query_selector_all(&self, selector: &str) -> Result<Vec<HtmlElement>, String> {
        self.select(selector)
    }

    /// Collects every node at or below `node` that matches `selector`.
    fn select_nodes(node: &Handle, selector: &str) -> Vec<Handle> {
        // Parse once up front rather than once per visited node.
        let chain = Self::parse_chain(selector);
        let mut found = Vec::new();
        Self::collect_matches(node, &chain, &mut found);
        found
    }

    /// Pre-order traversal that appends matching nodes to one shared buffer.
    fn collect_matches(node: &Handle, chain: &[SelectorStep<'_>], found: &mut Vec<Handle>) {
        if Self::matches_chain(node, chain) {
            found.push(node.clone());
        }
        for child in node.children.borrow().iter() {
            Self::collect_matches(child, chain, found);
        }
    }

    /// Pre-order traversal returning the first matching node, if any.
    fn find_first(node: &Handle, chain: &[SelectorStep<'_>]) -> Option<Handle> {
        if Self::matches_chain(node, chain) {
            return Some(node.clone());
        }
        for child in node.children.borrow().iter() {
            if let Some(hit) = Self::find_first(child, chain) {
                return Some(hit);
            }
        }
        None
    }

    /// Reports whether `node` matches `selector`, including its ancestor
    /// requirements (`div p`, `ul > li`).
    fn matches_selector(node: &Handle, selector: &str) -> bool {
        Self::matches_chain(node, &Self::parse_chain(selector))
    }

    /// Splits a selector into simple selectors joined by combinators.
    ///
    /// Whitespace and `>` only act as separators outside `[...]`, so an
    /// attribute value such as `[title='a b']` stays in one piece.
    fn parse_chain(selector: &str) -> Vec<SelectorStep<'_>> {
        let mut chain = Vec::new();
        let mut pending = Combinator::Descendant;
        let mut depth = 0usize;
        let mut start: Option<usize> = None;

        for (idx, ch) in selector.char_indices() {
            match ch {
                '[' => depth += 1,
                ']' => depth = depth.saturating_sub(1),
                _ => {}
            }
            let is_separator = depth == 0 && (ch.is_whitespace() || ch == '>');
            if is_separator {
                if let Some(begin) = start.take() {
                    chain.push(SelectorStep {
                        combinator: pending,
                        simple: &selector[begin..idx],
                    });
                    pending = Combinator::Descendant;
                }
                if ch == '>' {
                    pending = Combinator::Child;
                }
            } else if start.is_none() {
                start = Some(idx);
            }
        }
        if let Some(begin) = start {
            chain.push(SelectorStep {
                combinator: pending,
                simple: &selector[begin..],
            });
        }
        chain
    }

    /// Matches `node` against the last step of `chain`, then checks the
    /// preceding steps against its ancestors.
    ///
    /// The search backtracks over ancestors, so `a > b c` succeeds when any
    /// enclosing `b` (not just the nearest one) is a direct child of an `a`.
    fn matches_chain(node: &Handle, chain: &[SelectorStep<'_>]) -> bool {
        let (step, preceding) = match chain.split_last() {
            Some(parts) => parts,
            None => return false,
        };
        if !Self::matches_simple(node, step.simple) {
            return false;
        }
        if preceding.is_empty() {
            return true;
        }
        match step.combinator {
            Combinator::Child => Self::parent_of(node)
                .map_or(false, |parent| Self::matches_chain(&parent, preceding)),
            Combinator::Descendant => {
                let mut ancestor = Self::parent_of(node);
                while let Some(current) = ancestor {
                    if Self::matches_chain(&current, preceding) {
                        return true;
                    }
                    ancestor = Self::parent_of(&current);
                }
                false
            }
        }
    }

    /// Returns the parent of `node`, if it has one that is still alive.
    fn parent_of(node: &Handle) -> Option<Handle> {
        // The parent link sits in a `Cell`, so it has to be moved out to be
        // inspected and then put back untouched.
        let link = node.parent.take();
        let parent = link.as_ref().and_then(|weak| weak.upgrade());
        node.parent.set(link);
        parent
    }

    /// Matches one simple selector (`tag`, `.class`, `#id`, `[attr]`,
    /// `[attr=value]`) against `node`. Non-element nodes never match.
    fn matches_simple(node: &Handle, simple: &str) -> bool {
        let simple = simple.trim();
        let (name, attrs) = match &node.data {
            NodeData::Element { name, attrs, .. } => (name, attrs.borrow()),
            _ => return false,
        };

        if let Some(class_name) = simple.strip_prefix('.') {
            return attrs.iter().any(|attr| {
                attr.name.local.as_ref() == "class"
                    && attr
                        .value
                        .as_ref()
                        .split_whitespace()
                        .any(|candidate| candidate == class_name)
            });
        }

        if let Some(id_name) = simple.strip_prefix('#') {
            return attrs
                .iter()
                .any(|attr| attr.name.local.as_ref() == "id" && attr.value.as_ref() == id_name);
        }

        if simple.starts_with('[') && simple.ends_with(']') {
            let inner = simple[1..simple.len() - 1].trim();
            if let Some((attr_name, attr_value)) = inner.split_once('=') {
                // Tolerate `[ type = "text" ]`: trim spaces, then the quotes.
                let attr_name = attr_name.trim();
                let attr_value = attr_value.trim().trim_matches('\'').trim_matches('"');
                return attrs.iter().any(|attr| {
                    attr.name.local.as_ref() == attr_name && attr.value.as_ref() == attr_value
                });
            }
            return attrs.iter().any(|attr| attr.name.local.as_ref() == inner);
        }

        // The parser lowercases HTML tag names, so compare without regard to
        // ASCII case; otherwise `DIV` would never match.
        let tag_name: &str = name.local.as_ref();
        tag_name.eq_ignore_ascii_case(simple)
    }

    /// Rejects selectors that are empty or whose brackets are unbalanced,
    /// out of order, or nested.
    fn validate_selector(&self, selector: &str) -> Result<(), String> {
        let selector = selector.trim();
        if selector.is_empty() {
            return Err("Selector cannot be empty".to_string());
        }

        let open_brackets = selector.matches('[').count();
        let close_brackets = selector.matches(']').count();
        if open_brackets != close_brackets {
            return Err(format!(
                "Invalid selector syntax: unmatched brackets in '{selector}'"
            ));
        }

        // Equal counts are not enough: `]a[` balances numerically but is
        // malformed, and `[a[b]c]` nests without containing `[[` or `]]`.
        let mut inside_brackets = false;
        for ch in selector.chars() {
            match ch {
                '[' if inside_brackets => {
                    return Err(format!(
                        "Invalid selector syntax: nested brackets in '{selector}'"
                    ));
                }
                '[' => inside_brackets = true,
                ']' if !inside_brackets => {
                    return Err(format!(
                        "Invalid selector syntax: unmatched brackets in '{selector}'"
                    ));
                }
                ']' => inside_brackets = false,
                _ => {}
            }
        }
        Ok(())
    }
}

/// How a selector step relates to the step on its left.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Combinator {
    /// Whitespace: the left step may match any ancestor.
    Descendant,
    /// `>`: the left step must match the direct parent.
    Child,
}

/// One simple selector together with the combinator that precedes it.
///
/// The combinator of the first step in a chain is never consulted.
#[derive(Clone, Copy, Debug)]
struct SelectorStep<'a> {
    combinator: Combinator,
    simple: &'a str,
}
impl HtmlElement {
    /// Returns the concatenated text of this node and all its descendants,
    /// in document order. Whitespace is preserved as stored in the DOM.
    pub fn text(&self) -> String {
        Self::collect_text(&self.handle)
    }

    /// Returns the value of attribute `name`, or `None` when the attribute is
    /// absent or this node is not an element.
    pub fn attr(&self, name: &str) -> Option<String> {
        match &self.handle.data {
            NodeData::Element { attrs, .. } => attrs
                .borrow()
                .iter()
                .find(|attr| attr.name.local.as_ref() == name)
                .map(|attr| attr.value.to_string()),
            _ => None,
        }
    }

    /// Serializes this node (its own tag included) and its subtree to HTML.
    ///
    /// Text and attribute values are escaped so the output parses back to the
    /// same content, and void elements such as `<br>` get no end tag. Nodes
    /// that are neither elements nor text (for example comments) produce no
    /// markup of their own.
    pub fn html(&self) -> String {
        Self::serialize_node(&self.handle)
    }

    /// Gathers the text below `node` into a fresh string.
    fn collect_text(node: &Handle) -> String {
        let mut text = String::new();
        Self::append_text(node, &mut text);
        text
    }

    /// Appends the text below `node` to `out`; one shared buffer avoids
    /// allocating and copying a string for every node.
    fn append_text(node: &Handle, out: &mut String) {
        match &node.data {
            NodeData::Text { contents } => out.push_str(&contents.borrow()),
            _ => {
                for child in node.children.borrow().iter() {
                    Self::append_text(child, out);
                }
            }
        }
    }

    /// Serializes `node` and its subtree into a fresh string.
    fn serialize_node(node: &Handle) -> String {
        let mut html = String::new();
        Self::write_node(node, false, &mut html);
        html
    }

    /// Appends the markup for `node` to `out`.
    ///
    /// `raw_text` is true when the parent is an element whose text content
    /// must be emitted verbatim (for example `<script>`).
    fn write_node(node: &Handle, raw_text: bool, out: &mut String) {
        const HTML_NAMESPACE: &str = "http://www.w3.org/1999/xhtml";
        // Elements the HTML serialization algorithm writes without an end tag.
        const VOID_ELEMENTS: &[&str] = &[
            "area", "base", "basefont", "bgsound", "br", "col", "embed", "frame", "hr", "img",
            "input", "keygen", "link", "meta", "param", "source", "track", "wbr",
        ];
        // Elements whose text children are written without escaping.
        // NOTE(review): `noscript` is only raw text when the document was
        // parsed with scripting enabled; this assumes the default parser
        // options behave that way — confirm.
        const RAW_TEXT_ELEMENTS: &[&str] = &[
            "style",
            "script",
            "xmp",
            "iframe",
            "noembed",
            "noframes",
            "noscript",
            "plaintext",
        ];

        match &node.data {
            NodeData::Element { name, attrs, .. } => {
                let tag_name: &str = name.local.as_ref();
                let namespace: &str = name.ns.as_ref();
                // The void and raw-text rules only apply to HTML elements, not
                // to same-named SVG/MathML ones.
                let is_html = namespace == HTML_NAMESPACE;

                out.push('<');
                out.push_str(tag_name);
                for attr in attrs.borrow().iter() {
                    out.push(' ');
                    out.push_str(attr.name.local.as_ref());
                    out.push_str("=\"");
                    Self::push_escaped(&attr.value, true, out);
                    out.push('"');
                }
                out.push('>');

                if is_html && VOID_ELEMENTS.contains(&tag_name) {
                    return;
                }

                let children_raw = is_html && RAW_TEXT_ELEMENTS.contains(&tag_name);
                for child in node.children.borrow().iter() {
                    Self::write_node(child, children_raw, out);
                }
                out.push_str("</");
                out.push_str(tag_name);
                out.push('>');
            }
            NodeData::Text { contents } => {
                let contents = contents.borrow();
                if raw_text {
                    out.push_str(&contents);
                } else {
                    Self::push_escaped(&contents, false, out);
                }
            }
            _ => {
                for child in node.children.borrow().iter() {
                    Self::write_node(child, false, out);
                }
            }
        }
    }

    /// Appends `text` to `out`, escaping the characters that are significant
    /// in the given context: `&` and U+00A0 always, `"` inside attribute
    /// values, `<` and `>` inside text.
    fn push_escaped(text: &str, in_attribute: bool, out: &mut String) {
        for ch in text.chars() {
            match ch {
                '&' => out.push_str("&amp;"),
                '\u{a0}' => out.push_str("&nbsp;"),
                '"' if in_attribute => out.push_str("&quot;"),
                '<' if !in_attribute => out.push_str("&lt;"),
                '>' if !in_attribute => out.push_str("&gt;"),
                _ => out.push(ch),
            }
        }
    }
}
// Unit tests for `HtmlDocument` and `HtmlElement`: parsing, each supported
// selector form, text/attribute/markup extraction, and the trait impls.
#[cfg(test)]
mod tests {
use super::*;
// Parsing a fragment still yields a document node with children.
#[test]
fn test_parse_simple_html() {
let html = HtmlDocument::parse("<div>Test</div>");
assert!(!html.dom.document.children.borrow().is_empty());
}
#[test]
fn test_select_by_tag() {
let html = HtmlDocument::parse("<div><p>Test</p></div>");
let elements = html.select("p").expect("operation should succeed in test");
assert_eq!(elements.len(), 1);
}
#[test]
fn test_select_by_class() {
let html = HtmlDocument::parse("<div class='test'>Hello</div>");
let elements = html
.select(".test")
.expect("operation should succeed in test");
assert_eq!(elements.len(), 1);
}
#[test]
fn test_element_text() {
let html = HtmlDocument::parse("<p>Hello World</p>");
let p = html
.query_selector("p")
.expect("operation should succeed in test")
.expect("operation should succeed in test");
assert_eq!(p.text().trim(), "Hello World");
}
#[test]
fn test_element_attr() {
let html = HtmlDocument::parse("<a href='test.html'>Link</a>");
let link = html
.query_selector("a")
.expect("operation should succeed in test")
.expect("operation should succeed in test");
assert_eq!(link.attr("href"), Some("test.html".to_string()));
}
#[test]
fn test_element_attr_missing() {
let html = HtmlDocument::parse("<a>Link</a>");
let link = html
.query_selector("a")
.expect("operation should succeed in test")
.expect("operation should succeed in test");
assert_eq!(link.attr("href"), None);
}
#[test]
fn test_multiple_elements() {
let html = HtmlDocument::parse("<p>1</p><p>2</p><p>3</p>");
let elements = html.select("p").expect("operation should succeed in test");
assert_eq!(elements.len(), 3);
}
#[test]
fn test_query_selector_none() {
let html = HtmlDocument::parse("<div>Test</div>");
let element = html
.query_selector("p")
.expect("operation should succeed in test");
assert!(element.is_none());
}
// Unclosed tags must not prevent the `<p>` from being found.
#[test]
fn test_malformed_html() {
let html = HtmlDocument::parse("<div><p>Unclosed");
let elements = html.select("p").expect("operation should succeed in test");
assert_eq!(elements.len(), 1);
}
#[test]
fn test_method_chaining_simulation() {
let html = HtmlDocument::parse("<div class='content'>Hello World</div>");
let elements = html
.select(".content")
.expect("operation should succeed in test");
assert_eq!(elements.len(), 1, "Should have 1 element");
let element = &elements[0];
let text = element.text();
assert_eq!(text.trim(), "Hello World", "Text extraction should work");
}
// NOTE(review): the selector engine has no universal-selector handling, so `*`
// is compared as a literal tag name and matches nothing. This assertion depends
// on that; it would need revisiting if `*` support were ever added.
#[test]
fn test_empty_html() {
let html = HtmlDocument::parse("");
let elements = html.select("*").expect("operation should succeed in test");
assert_eq!(elements.len(), 0);
}
// Property test over arbitrary strings; needs the `proptest` crate and is
// skipped unless ignored tests are requested.
#[test]
#[ignore = "Property test - run with: cargo test -- --ignored"]
fn prop_parse_never_panics() {
use proptest::prelude::*;
proptest!(|(html_str in ".*")| {
let _ = HtmlDocument::parse(&html_str);
});
}
#[test]
fn test_select_by_id() {
let html = HtmlDocument::parse("<div id='main'>Content</div>");
let elements = html.select("#main").expect("should parse");
assert_eq!(elements.len(), 1);
}
// `[attr]`: presence of the attribute is enough.
#[test]
fn test_select_by_attribute() {
let html = HtmlDocument::parse("<input type='text' name='field'>");
let elements = html.select("[type]").expect("should parse");
assert_eq!(elements.len(), 1);
}
// `[attr=value]` with an unquoted value.
#[test]
fn test_select_by_attribute_value() {
let html = HtmlDocument::parse("<input type='text'><input type='checkbox'>");
let elements = html.select("[type=text]").expect("should parse");
assert_eq!(elements.len(), 1);
}
// `[attr='value']`: surrounding quotes are stripped before comparing.
#[test]
fn test_select_by_attribute_value_quoted() {
let html = HtmlDocument::parse("<input data-test='value'>");
let elements = html.select("[data-test='value']").expect("should parse");
assert_eq!(elements.len(), 1);
}
#[test]
fn test_query_selector_all() {
let html = HtmlDocument::parse("<p>1</p><p>2</p>");
let elements = html.query_selector_all("p").expect("should parse");
assert_eq!(elements.len(), 2);
}
// The `<p>` sits inside the `<div>` through an intermediate `<span>`.
#[test]
fn test_descendant_selector() {
let html = HtmlDocument::parse("<div><span><p>Nested</p></span></div>");
let elements = html.select("div p").expect("should parse");
assert_eq!(elements.len(), 1);
}
#[test]
fn test_select_article_tag() {
let html = HtmlDocument::parse("<article>Content</article>");
let elem = html
.query_selector("article")
.expect("should parse")
.unwrap();
assert!(elem.text().contains("Content"));
}
// Only checks that the child tag and its text appear in the output, not the
// exact serialization.
#[test]
fn test_element_html() {
let html = HtmlDocument::parse("<div><span>Inner</span></div>");
let elem = html.query_selector("div").expect("should parse").unwrap();
let inner = elem.html();
assert!(inner.contains("span"));
assert!(inner.contains("Inner"));
}
// A class selector matches one token of a space-separated class list.
#[test]
fn test_element_multiple_classes() {
let html = HtmlDocument::parse("<div class='a b c'>Test</div>");
let elements = html.select(".b").expect("should parse");
assert_eq!(elements.len(), 1);
}
#[test]
fn test_debug_impl_document() {
let html = HtmlDocument::parse("<div>Test</div>");
let debug_str = format!("{:?}", html);
assert_eq!(debug_str, "HtmlDocument");
}
#[test]
fn test_debug_impl_element() {
let html = HtmlDocument::parse("<div>Test</div>");
let elem = html.query_selector("div").expect("should parse").unwrap();
let debug_str = format!("{:?}", elem);
assert_eq!(debug_str, "HtmlElement");
}
#[test]
fn test_nested_text_extraction() {
let html = HtmlDocument::parse("<div>Hello <b>World</b>!</div>");
let elem = html.query_selector("div").expect("should parse").unwrap();
let text = elem.text();
assert!(text.contains("Hello"));
assert!(text.contains("World"));
}
#[test]
fn test_select_no_match() {
let html = HtmlDocument::parse("<div>Test</div>");
let elements = html.select("nonexistent").expect("should parse");
assert!(elements.is_empty());
}
// Leading and trailing whitespace around the selector is ignored.
#[test]
fn test_select_whitespace_in_selector() {
let html = HtmlDocument::parse("<div><p>Test</p></div>");
let elements = html.select(" p ").expect("should parse");
assert_eq!(elements.len(), 1);
}
// Compile-time check that the document type is `Clone`.
#[test]
fn test_html_clone() {
let html = HtmlDocument::parse("<div>Test</div>");
let _cloned = html.clone();
}
// Compile-time check that the element type is `Clone`.
#[test]
fn test_element_clone() {
let html = HtmlDocument::parse("<div>Test</div>");
let elem = html.query_selector("div").expect("should parse").unwrap();
let _cloned = elem.clone();
}
// NOTE(review): despite the name, this queries the `div` element, not a text
// node; it verifies that a missing attribute on an element yields `None`.
#[test]
fn test_attr_missing_on_text_node() {
let html = HtmlDocument::parse("<div>Just text</div>");
let elem = html.query_selector("div").expect("should parse").unwrap();
assert!(elem.attr("data-nonexistent").is_none());
}
}