/// A flat, best-effort view of an HTML fragment.
///
/// The parser is intentionally simple: every opening tag found in the input
/// becomes one entry in `elements`, in document order, regardless of nesting.
#[derive(Debug, Clone, Default)]
pub struct HtmlDocument {
    /// Every opening tag encountered by [`HtmlDocument::from_html`], in
    /// document order.
    pub elements: Vec<HtmlElement>,
}

/// A single opening tag together with the raw text that follows it.
#[derive(Debug, Clone, Default)]
pub struct HtmlElement {
    /// Tag name exactly as written in the source (case is not normalised).
    pub tag: String,
    /// Raw text between this tag's `>` and the next `<` (or end of input).
    pub text: String,
    /// Attribute `(name, value)` pairs in source order. An attribute written
    /// without a value (e.g. `hidden`) has an empty value.
    pub attrs: Vec<(String, String)>,
    /// Nested elements. [`HtmlDocument::from_html`] produces a flat list and
    /// always leaves this empty.
    pub children: Vec<HtmlElement>,
}

impl HtmlDocument {
    /// Parses `html` into a flat list of elements.
    ///
    /// Each opening tag (`<name ...>` or self-closing `<name ... />`) yields
    /// one [`HtmlElement`] whose `text` is the raw text up to the next `<`.
    /// Closing tags, comments (`<!-- -->`), doctype declarations and
    /// processing instructions (`<? ?>`) produce no element, and text that
    /// directly follows them is discarded. Text before the first `<` is
    /// ignored.
    ///
    /// This is a tokenizer-level scan, not a conforming HTML parser: a `<` or
    /// `>` inside an attribute value or a comment body is treated as markup.
    #[must_use]
    pub fn from_html(html: &str) -> Self {
        let elements: Vec<HtmlElement> = html
            .split('<')
            // The first piece is whatever precedes the first `<` (possibly
            // empty). It is plain text and never a tag, even if it happens
            // to contain a `>`.
            .skip(1)
            .filter_map(|piece| {
                // A `<` that is never closed by `>` is not a tag.
                let (tag_part, text) = piece.split_once('>')?;
                let (tag, attrs) = Self::parse_open_tag(tag_part)?;
                Some(HtmlElement {
                    tag,
                    text: text.to_string(),
                    attrs,
                    children: Vec::new(),
                })
            })
            .collect();
        Self { elements }
    }

    /// Splits the inside of a `<...>` into a tag name and its attributes.
    ///
    /// Returns `None` for anything that is not an opening tag: closing tags,
    /// comments/doctype (`!`), processing instructions (`?`) and empty tags.
    fn parse_open_tag(tag_part: &str) -> Option<(String, Vec<(String, String)>)> {
        let inner = tag_part.trim();
        if inner.starts_with(|c: char| c == '/' || c == '!' || c == '?') {
            return None;
        }
        // Drop the trailing slash of a self-closing tag so `<br/>` is `br`.
        let inner = inner.trim_end_matches('/').trim_end();
        let name_end = inner.find(char::is_whitespace).unwrap_or(inner.len());
        let (name, raw_attrs) = inner.split_at(name_end);
        if name.is_empty() {
            return None;
        }
        Some((name.to_string(), Self::parse_attrs(raw_attrs)))
    }

    /// Parses `name="value"`, `name='value'`, `name=value` and bare `name`
    /// attributes from the text that follows a tag name.
    fn parse_attrs(raw: &str) -> Vec<(String, String)> {
        let mut attrs = Vec::new();
        let mut rest = raw.trim_start();
        while !rest.is_empty() {
            let name_end = rest
                .find(|c: char| c.is_whitespace() || c == '=')
                .unwrap_or(rest.len());
            let (name, after_name) = rest.split_at(name_end);
            rest = after_name.trim_start();

            let mut value = "";
            if let Some(after_eq) = rest.strip_prefix('=') {
                let after_eq = after_eq.trim_start();
                let quote = after_eq.chars().next().filter(|c| *c == '"' || *c == '\'');
                if let Some(q) = quote {
                    // Both quote characters are one byte wide.
                    let body = &after_eq[1..];
                    // An unterminated quote swallows the remainder of the tag.
                    let (quoted, tail) = body.split_once(q).unwrap_or((body, ""));
                    value = quoted;
                    rest = tail;
                } else {
                    let end = after_eq
                        .find(char::is_whitespace)
                        .unwrap_or(after_eq.len());
                    let (bare, tail) = after_eq.split_at(end);
                    value = bare;
                    rest = tail;
                }
            }

            // A stray `=` with no name in front of it is consumed above (so
            // the loop always advances) but does not produce an attribute.
            if !name.is_empty() {
                attrs.push((name.to_string(), value.to_string()));
            }
            rest = rest.trim_start();
        }
        attrs
    }

    /// Returns every element whose tag name equals `tag` (case-sensitive),
    /// in document order.
    #[must_use]
    pub fn select(&self, tag: &str) -> Vec<&HtmlElement> {
        self.elements.iter().filter(|e| e.tag == tag).collect()
    }

    /// Concatenates the `text` of every element, separated by single spaces.
    #[must_use]
    pub fn text_content(&self) -> String {
        self.elements
            .iter()
            .map(|e| e.text.as_str())
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// Reports whether [`Self::text_content`] contains `text` as a substring.
    #[must_use]
    pub fn contains_text(&self, text: &str) -> bool {
        self.text_content().contains(text)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two sibling tags produce two elements; the first keeps its tag name
    /// and the text that follows it.
    #[test]
    fn parse_basic_html() {
        let parsed = HtmlDocument::from_html("<h1>Hello</h1><p>World</p>");
        let heading = &parsed.elements[0];
        assert_eq!(2, parsed.elements.len());
        assert_eq!("h1", heading.tag);
        assert_eq!("Hello", heading.text);
    }

    /// `select` returns only the elements with the requested tag name.
    #[test]
    fn select_by_tag() {
        let parsed = HtmlDocument::from_html("<p>One</p><div>Two</div><p>Three</p>");
        assert_eq!(2, parsed.select("p").len());
    }

    /// The combined text includes the text of every element.
    #[test]
    fn text_content() {
        let parsed = HtmlDocument::from_html("<h1>Title</h1><p>Body</p>");
        let combined = parsed.text_content();
        for needle in ["Title", "Body"].iter() {
            assert!(combined.contains(needle));
        }
    }

    /// A substring of an element's text is found.
    #[test]
    fn contains_text_positive() {
        let parsed = HtmlDocument::from_html("<p>Hello World</p>");
        assert_eq!(parsed.contains_text("Hello"), true);
    }

    /// Text that appears nowhere in the document is not found.
    #[test]
    fn contains_text_negative() {
        let parsed = HtmlDocument::from_html("<p>Hello</p>");
        assert_eq!(parsed.contains_text("Goodbye"), false);
    }

    /// Empty input yields no elements and no text.
    #[test]
    fn empty_document() {
        let parsed = HtmlDocument::from_html("");
        assert_eq!(parsed.elements.len(), 0);
        assert_eq!(parsed.text_content().len(), 0);
    }
}