use crate::node::Node;
#[cfg(test)]
use crate::node::NodeType;
use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use std::default::Default;
/// Parses an HTML string and converts the resulting DOM into this crate's
/// [`Node`] tree.
///
/// The input is handed to the html5ever document parser as UTF-8 bytes, and
/// the parsed document root is then converted recursively, starting outside
/// of any `<code>` or `<pre>` context.
///
/// # Panics
///
/// Panics if reading the input into the parser returns an error.
pub fn parse_html(html: &str) -> Node {
    let mut input = html.as_bytes();
    let sink = parse_document(RcDom::default(), Default::default()).from_utf8();
    // The reader is an in-memory byte slice, so a read error is not expected here.
    let dom = sink.read_from(&mut input).unwrap();
    convert_handle(&dom.document, false, false)
}
/// Recursively converts one parsed DOM handle into a [`Node`].
///
/// * `handle`  - the DOM node to convert.
/// * `in_code` - `true` when an ancestor is a `<code>` element that was not
///   itself inside a `<pre>`. Once set it stays set for every descendant.
/// * `in_pre`  - `true` when an ancestor is a `<pre>` element.
///
/// Text nodes keep their content verbatim when either flag is set; otherwise
/// the content goes through `crate::utilities::collapse_whitespace`. A text
/// node's `is_code` field mirrors `in_code` only, so text under
/// `<pre><code>` is left unmarked.
fn convert_handle(handle: &Handle, in_code: bool, in_pre: bool) -> Node {
    let source = handle.as_ref();
    match &source.data {
        NodeData::Document => {
            // The document root always starts its children with both flags cleared.
            let mut document = Node::new_document();
            source
                .children
                .borrow()
                .iter()
                .map(|kid| convert_handle(kid, false, false))
                .for_each(|converted| {
                    document.add_child(converted);
                });
            document
        }
        NodeData::Element { name, attrs, .. } => {
            let tag = name.local.to_string();
            let mut element = Node::new_element(&tag);

            // Only the local part of each attribute name is carried over.
            for attribute in attrs.borrow().iter() {
                let key = attribute.name.local.to_string();
                let value = attribute.value.to_string();
                element.set_attribute(&key, &value);
            }

            // Tag names are compared case-insensitively. A `<code>` element
            // only switches code mode on when it is not inside a `<pre>`.
            let inside_pre = in_pre || tag.eq_ignore_ascii_case("PRE");
            let inside_code = in_code || (!inside_pre && tag.eq_ignore_ascii_case("CODE"));

            for kid in source.children.borrow().iter() {
                element.add_child(convert_handle(kid, inside_code, inside_pre));
            }
            element
        }
        NodeData::Text { contents } => {
            let raw = contents.borrow().to_string();
            // Whitespace is preserved inside code/pre and collapsed everywhere else.
            let value = if !in_code && !in_pre {
                crate::utilities::collapse_whitespace(&raw)
            } else {
                raw
            };
            let mut text = Node::new_text(&value);
            text.is_code = in_code;
            text
        }
        NodeData::Comment { contents } => Node::new_comment(&contents.to_string()),
        // NOTE(review): doctypes and processing instructions become an empty
        // document node; presumably a placeholder that later stages ignore —
        // confirm against the consumers of `Node`.
        NodeData::ProcessingInstruction { .. } | NodeData::Doctype { .. } => Node::new_document(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `html` and reports whether the resulting root node has any children.
    fn root_has_children(html: &str) -> bool {
        !parse_html(html).children.is_empty()
    }

    #[test]
    fn test_parse_simple_html() {
        assert!(root_has_children("<p>Hello</p>"));
    }

    #[test]
    fn test_parse_with_attributes() {
        assert!(root_has_children(
            r#"<a href="https://example.com" title="Example">Link</a>"#
        ));
    }

    #[test]
    fn test_parse_nested_elements() {
        assert!(root_has_children(
            "<div><p>Hello <strong>World</strong></p></div>"
        ));
    }

    #[test]
    fn test_parse_multiple_elements() {
        assert!(root_has_children("<p>First</p><p>Second</p>"));
    }

    /// Text directly inside an inline `<code>` element is flagged as code and
    /// keeps its content.
    #[test]
    fn test_code_element_marking() {
        // Depth-first search for the first code-flagged text node that is a
        // direct child of a `CODE` element.
        fn find_code_text(node: &Node) -> Option<String> {
            node.children.iter().find_map(|child| {
                let direct = if child.node_name == "CODE" {
                    child
                        .children
                        .iter()
                        .find(|inner| inner.node_type == NodeType::Text && inner.is_code)
                        .map(|inner| inner.node_value.clone())
                } else {
                    None
                };
                direct.or_else(|| find_code_text(child))
            })
        }

        let parsed = parse_html("<p>Use <code>console.log()</code> function.</p>");
        assert_eq!(find_code_text(&parsed), Some("console.log()".to_string()));
    }

    /// Text inside `<pre><code>` must not carry the inline-code flag.
    #[test]
    fn test_code_in_pre_not_marked() {
        // Depth-first search for the first text node that is a direct child
        // of a `CODE` element, returning its `is_code` flag.
        fn find_code_is_marked(node: &Node) -> Option<bool> {
            node.children.iter().find_map(|child| {
                let direct = if child.node_name == "CODE" {
                    child
                        .children
                        .iter()
                        .find(|inner| inner.node_type == NodeType::Text)
                        .map(|inner| inner.is_code)
                } else {
                    None
                };
                direct.or_else(|| find_code_is_marked(child))
            })
        }

        let parsed =
            parse_html("<pre><code>function hello() {\n console.log();\n}</code></pre>");
        assert_eq!(find_code_is_marked(&parsed), Some(false));
    }
}