#![allow(deprecated)]
pub mod grammar;
use std::{
collections::HashMap,
fmt::{self, Write},
};
use enum_extract_macro::EnumExtract;
use indextree::{Arena, NodeId};
use std::sync::LazyLock;
use regex::{Captures, Regex};
pub use crate::html::grammar::parse;
pub use crate::html::grammar::parse_fragment;
pub use crate::html::grammar::QuirksMode;
/// Tag names that `display_node` serializes as a lone opening tag: their
/// children are not visited and no closing tag is written.
static VOID_TAGS: &[&str] = &[
"meta", "link", "img", "input", "br", "hr", "col", "area", "base", "embed", "keygen",
"param", "source", "track", "wbr",
];
/// Attribute name to attribute value map carried by an [`HtmlTag`].
type TagAttributes = HashMap<String, String>;
/// An element node: a tag name plus its attributes.
///
/// Child nodes are not stored here; they are looked up through the
/// [`HtmlDocument`] that owns the node.
#[derive(Debug, PartialEq, Clone)]
pub struct HtmlTag {
// Name of the tag as written between `<` and `>` when serialized.
pub name: String,
// Attributes of the tag, keyed by attribute name.
pub attributes: TagAttributes,
}
impl HtmlTag {
pub fn new(name: String) -> HtmlTag {
HtmlTag {
name,
attributes: HashMap::new(),
}
}
pub fn get_text(&self, doc_node: &DocumentNode, document: &HtmlDocument) -> Option<String> {
self.internal_get_text(doc_node, document, false)
}
pub fn get_all_text(&self, doc_node: &DocumentNode, document: &HtmlDocument) -> Option<String> {
self.internal_get_text(doc_node, document, true)
}
fn internal_get_text(
&self,
doc_node: &DocumentNode,
document: &HtmlDocument,
recurse: bool,
) -> Option<String> {
let mut o_text: Option<String> = None;
let mut stack: Vec<DocumentNode> = doc_node.children(document).collect();
stack.reverse();
while let Some(child) = stack.pop() {
let child_node = document.get_html_node(&child);
if let Some(child_node) = child_node {
match child_node {
HtmlNode::Text(text) => {
o_text = Some(HtmlTag::append_text(o_text, text.value.to_string()));
}
HtmlNode::Tag(_) => {
if recurse {
let grandchildren: Vec<DocumentNode> = child.children(document).collect();
for gc in grandchildren.into_iter().rev() {
stack.push(gc);
}
}
}
HtmlNode::Comment(_)
| HtmlNode::ProcessingInstruction(_)
| HtmlNode::Doctype(_) => {}
}
}
}
o_text
}
fn append_text(o_text: Option<String>, append_text: String) -> String {
match o_text {
Some(t) => {
if t.ends_with(|ch: char| ch.is_whitespace())
|| append_text.starts_with(|ch: char| ch.is_whitespace())
{
format!("{}{}", t, append_text)
} else {
format!("{} {}", t, append_text)
}
}
None => append_text,
}
}
}
/// A text node holding already-unescaped text.
#[derive(PartialEq, Clone, Debug)]
pub struct HtmlText {
    /// The text after character references have been decoded.
    pub value: String,
    /// `true` when `value` is empty or consists solely of whitespace.
    pub only_whitespace: bool,
}

impl HtmlText {
    /// Builds a text node from raw markup text.
    ///
    /// `value` is passed through `unescape_characters` before it is stored,
    /// and the whitespace-only flag is computed from the decoded text.
    pub fn new(value: &str) -> HtmlText {
        let value = unescape_characters(value);
        let only_whitespace = value.chars().all(char::is_whitespace);
        Self {
            value,
            only_whitespace,
        }
    }
}
/// A comment node.
#[derive(PartialEq, Clone, Debug)]
pub struct HtmlComment {
    /// The comment text, without the `<!--` / `-->` delimiters that are
    /// added when the node is serialized.
    pub value: String,
}

impl HtmlComment {
    /// Wraps `value` as a comment node.
    pub fn new(value: String) -> HtmlComment {
        Self { value }
    }
}
/// A processing instruction node, serialized as `<?target data?>`.
#[derive(PartialEq, Clone, Debug)]
pub struct HtmlProcessingInstruction {
    /// The name that directly follows `<?`.
    pub target: String,
    /// The remaining content; may be empty.
    pub data: String,
}

impl HtmlProcessingInstruction {
    /// Builds a processing instruction from its `target` and `data`.
    pub fn new(target: String, data: String) -> HtmlProcessingInstruction {
        Self { target, data }
    }
}
/// A document type declaration node.
#[derive(PartialEq, Clone, Debug)]
pub struct HtmlDoctype {
    /// The name written right after `<!DOCTYPE`.
    pub name: String,
    /// The public identifier, if one was given.
    pub public_id: Option<String>,
    /// The system identifier, if one was given.
    pub system_id: Option<String>,
}

impl HtmlDoctype {
    /// Builds a doctype from its name and optional identifiers.
    pub fn new(name: String, public_id: Option<String>, system_id: Option<String>) -> HtmlDoctype {
        Self {
            name,
            public_id,
            system_id,
        }
    }
}
/// Decodes the character references this module understands.
///
/// Handled forms:
/// * numeric references, decimal (`&#60;`) and hexadecimal (`&#x3C;`);
/// * the named references `&lt;`, `&gt;`, `&quot;`, `&apos;` and `&amp;`.
///
/// A numeric reference that does not denote a valid Unicode scalar value
/// (out of range, a surrogate, or too large to parse) decodes to U+FFFD.
/// Anything else, including unknown named references, is left untouched.
///
/// All references are decoded in a single pass, so the result of one
/// replacement is never decoded a second time: `&amp;lt;` yields the text
/// `&lt;`, and so does `&#38;lt;`.
pub fn unescape_characters(text: &str) -> String {
    // Group 1: hex digits, group 2: decimal digits, group 3: entity name.
    static CHAR_REF_RE: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"&(?:#x([0-9a-fA-F]+)|#(\d+)|(lt|gt|quot|apos|amp));").unwrap()
    });

    CHAR_REF_RE
        .replace_all(text, |caps: &Captures| -> String {
            if let Some(named) = caps.get(3) {
                let decoded = match named.as_str() {
                    "lt" => "<",
                    "gt" => ">",
                    "quot" => "\"",
                    "apos" => "'",
                    // The pattern only admits the names listed above, so the
                    // remaining possibility is `amp`.
                    _ => "&",
                };
                return decoded.to_string();
            }

            let code_point = match (caps.get(1), caps.get(2)) {
                (Some(hex), _) => u32::from_str_radix(hex.as_str(), 16).ok(),
                (None, Some(dec)) => dec.as_str().parse::<u32>().ok(),
                (None, None) => None,
            };
            code_point
                .and_then(char::from_u32)
                .unwrap_or('\u{FFFD}')
                .to_string()
        })
        .into_owned()
}
/// Escapes the characters that are significant in HTML markup.
///
/// `&`, `<`, `>` and `"` become `&amp;`, `&lt;`, `&gt;` and `&quot;`.
/// The apostrophe becomes the numeric reference `&#39;`, which
/// `unescape_characters` decodes through its numeric-reference handling.
/// Every other character is copied through unchanged.
pub fn escape_characters(text: &str) -> String {
    // Single pass: each input character is inspected exactly once, so the
    // `&` introduced by a replacement can never be escaped again.
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Normalizes whitespace in `text`.
///
/// Leading and trailing whitespace is removed and every internal run of
/// whitespace characters is collapsed into a single ASCII space.
pub fn trim_internal_whitespace(text: &str) -> String {
    let mut words = text.split_whitespace();
    let mut normalized = String::with_capacity(text.len());

    if let Some(first) = words.next() {
        normalized.push_str(first);
        for word in words {
            normalized.push(' ');
            normalized.push_str(word);
        }
    }

    normalized
}
/// Payload of a single node stored in an [`HtmlDocument`] arena.
#[derive(Clone, Debug, EnumExtract)]
pub enum HtmlNode {
/// An element with a name and attributes.
Tag(HtmlTag),
/// A run of text.
Text(HtmlText),
/// A comment.
Comment(HtmlComment),
/// A processing instruction.
ProcessingInstruction(HtmlProcessingInstruction),
/// A document type declaration.
Doctype(HtmlDoctype),
}
impl HtmlNode {
    /// Returns the node's own text.
    ///
    /// For a tag this is the joined text of its direct text children; for a
    /// text node it is the text itself. Comments, processing instructions
    /// and doctypes yield `None`.
    pub fn get_text(&self, doc_node: &DocumentNode, document: &HtmlDocument) -> Option<String> {
        self.internal_get_text(doc_node, document, false)
    }

    /// Like [`HtmlNode::get_text`], but a tag also contributes the text of
    /// all of its descendants.
    pub fn get_all_text(&self, doc_node: &DocumentNode, document: &HtmlDocument) -> Option<String> {
        self.internal_get_text(doc_node, document, true)
    }

    /// Dispatches on the node kind; `recurse` selects between the shallow
    /// and the deep text lookup for tags.
    fn internal_get_text(
        &self,
        doc_node: &DocumentNode,
        document: &HtmlDocument,
        recurse: bool,
    ) -> Option<String> {
        match (self, recurse) {
            (HtmlNode::Tag(tag), true) => tag.get_all_text(doc_node, document),
            (HtmlNode::Tag(tag), false) => tag.get_text(doc_node, document),
            (HtmlNode::Text(text), _) => Some(text.value.clone()),
            (
                HtmlNode::Comment(_) | HtmlNode::ProcessingInstruction(_) | HtmlNode::Doctype(_),
                _,
            ) => None,
        }
    }

    /// Returns the attribute map when the node is a tag, `None` otherwise.
    pub fn get_attributes(&self) -> Option<&TagAttributes> {
        if let HtmlNode::Tag(tag) = self {
            Some(&tag.attributes)
        } else {
            None
        }
    }
}
/// A parsed document: an arena of [`HtmlNode`]s plus the handle of the node
/// that serialization starts from.
#[deprecated(
since = "0.8.0",
note = "Use `XpathItemTree` directly via `html::parse()` and `XpathItemTree::from(&doc)` instead"
)]
#[derive(Clone)]
pub struct HtmlDocument {
// Storage for every node; parent/child links live in the arena.
pub(crate) arena: Arena<HtmlNode>,
// Node used as the starting point by `Display` and `to_formatted_string`.
pub root_node: DocumentNode,
}
impl HtmlDocument {
    /// Assembles a document from a populated arena and its root handle.
    pub fn new(arena: Arena<HtmlNode>, root_node: DocumentNode) -> HtmlDocument {
        Self { arena, root_node }
    }

    /// Looks up the payload behind `node`, or `None` when the arena has no
    /// entry for it.
    pub fn get_html_node(&self, node: &DocumentNode) -> Option<&HtmlNode> {
        let entry = self.arena.get(node.id)?;
        Some(entry.get())
    }

    /// Serializes the document, starting at the root, using `format_type`.
    ///
    /// # Panics
    ///
    /// Panics when a node reached during serialization cannot be resolved.
    pub fn to_formatted_string(&self, format_type: DocumentFormatType) -> String {
        let rendered = display_node(0, self, &self.root_node, format_type);
        rendered.expect("failed to display node")
    }

    /// Iterates over a handle for every entry the arena yields.
    pub fn iter(&self) -> impl Iterator<Item = DocumentNode> + '_ {
        let arena = &self.arena;
        arena.iter().map(move |entry| {
            let id = arena.get_node_id(entry).unwrap();
            DocumentNode::new(id)
        })
    }
}
/// Renders the document with [`DocumentFormatType::Standard`].
impl fmt::Display for HtmlDocument {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let rendered = display_node(0, self, &self.root_node, DocumentFormatType::Standard)?;
        f.write_str(&rendered)
    }
}
/// Selects how `display_node` lays out the serialized document.
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Debug, Hash)]
pub enum DocumentFormatType {
/// Nodes are written back to back; text nodes are written in full.
Standard,
/// Like `Standard`, but text nodes that are only whitespace are skipped.
IgnoreWhitespace,
/// Every node goes on its own line, indented by its depth; whitespace-only
/// text nodes are skipped and other text is trimmed.
Indented,
}
/// Serializes the subtree rooted at `start_node` into markup.
///
/// The tree is walked with an explicit work stack instead of recursion, so
/// the depth of the document does not affect call-stack usage. `start_indent`
/// is the depth assigned to `start_node`; it only matters for
/// [`DocumentFormatType::Indented`].
///
/// Attributes are written sorted by name so the output is deterministic.
/// Tags listed in `VOID_TAGS` are written without children or closing tag.
///
/// # Errors
///
/// Returns `fmt::Error` when a node on the stack cannot be resolved in `doc`.
fn display_node(
    start_indent: usize,
    doc: &HtmlDocument,
    start_node: &DocumentNode,
    format_type: DocumentFormatType,
) -> Result<String, fmt::Error> {
    /// One unit of pending work on the traversal stack.
    enum Step {
        /// Write the node itself and schedule its children.
        Open(DocumentNode, usize),
        /// Write the closing tag of an element whose children are done.
        Close(String, usize),
    }

    let indented = format_type == DocumentFormatType::Indented;

    // Leading indentation of a line; a no-op unless pretty-printing.
    let pad = |out: &mut String, depth: usize| -> fmt::Result {
        if indented {
            for _ in 0..depth {
                out.write_str(" ")?;
            }
        }
        Ok(())
    };
    // Line terminator; a no-op unless pretty-printing.
    let end_line = |out: &mut String| -> fmt::Result {
        if indented {
            out.write_char('\n')?;
        }
        Ok(())
    };

    let mut out = String::new();
    let mut pending = vec![Step::Open(*start_node, start_indent)];

    while let Some(step) = pending.pop() {
        let (node, depth) = match step {
            Step::Close(name, depth) => {
                pad(&mut out, depth)?;
                write!(out, "</{}>", name)?;
                end_line(&mut out)?;
                continue;
            }
            Step::Open(node, depth) => (node, depth),
        };

        match doc.get_html_node(&node).ok_or(fmt::Error)? {
            HtmlNode::Tag(tag) => {
                pad(&mut out, depth)?;
                write!(out, "<{}", tag.name)?;

                let mut attributes: Vec<(&String, &String)> = tag.attributes.iter().collect();
                attributes.sort_by(|(left, _), (right, _)| left.cmp(right));
                for (key, value) in attributes {
                    write!(out, r#" {}="{}""#, key, value)?;
                }

                out.push('>');
                end_line(&mut out)?;

                if !VOID_TAGS.contains(&tag.name.as_str()) {
                    // The closing tag goes below the children on the stack so
                    // it is emitted only after all of them were processed.
                    pending.push(Step::Close(tag.name.clone(), depth));
                    let first_child = pending.len();
                    pending.extend(
                        node.children(doc)
                            .map(|child| Step::Open(child, depth + 1)),
                    );
                    // Reverse so the first child ends up on top of the stack.
                    pending[first_child..].reverse();
                }
            }
            HtmlNode::Text(text) => {
                let escaped = escape_characters(text.value.as_str());
                match format_type {
                    DocumentFormatType::Standard => out.push_str(&escaped),
                    DocumentFormatType::IgnoreWhitespace if !text.only_whitespace => {
                        out.push_str(&escaped);
                    }
                    DocumentFormatType::Indented if !text.only_whitespace => {
                        pad(&mut out, depth)?;
                        out.push_str(escaped.trim());
                        out.push('\n');
                    }
                    // Whitespace-only text is dropped in these two modes.
                    DocumentFormatType::IgnoreWhitespace | DocumentFormatType::Indented => {}
                }
            }
            HtmlNode::Comment(comment) => {
                pad(&mut out, depth)?;
                // Break up `--` so the body cannot terminate the comment early.
                write!(out, "<!--{}-->", comment.value.replace("--", "- -"))?;
                end_line(&mut out)?;
            }
            HtmlNode::ProcessingInstruction(pi) => {
                pad(&mut out, depth)?;
                if pi.data.is_empty() {
                    write!(out, "<?{}?>", pi.target)?;
                } else {
                    write!(out, "<?{} {}?>", pi.target, pi.data)?;
                }
                end_line(&mut out)?;
            }
            HtmlNode::Doctype(doctype) => {
                pad(&mut out, depth)?;
                write!(out, "<!DOCTYPE {}", doctype.name)?;
                match (&doctype.public_id, &doctype.system_id) {
                    (Some(public_id), system_id) => {
                        write!(out, r#" PUBLIC "{}""#, public_id)?;
                        if let Some(system_id) = system_id {
                            write!(out, r#" "{}""#, system_id)?;
                        }
                    }
                    (None, Some(system_id)) => {
                        write!(out, r#" SYSTEM "{}""#, system_id)?;
                    }
                    (None, None) => {}
                }
                out.push('>');
                end_line(&mut out)?;
            }
        }
    }

    Ok(out)
}
/// Lightweight, copyable handle to a node stored in an [`HtmlDocument`].
///
/// The handle carries no data of its own; every accessor takes the document
/// that owns the node.
#[deprecated(
since = "0.8.0",
note = "Use `XpathItemTree` directly via `html::parse()` and `XpathItemTree::from(&doc)` instead"
)]
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Debug, Hash)]
pub struct DocumentNode {
// Arena identifier of the node this handle refers to.
id: NodeId,
}
impl DocumentNode {
    /// Wraps an arena identifier in a handle.
    pub fn new(id: NodeId) -> DocumentNode {
        Self { id }
    }

    /// Text of this node including the text of all descendants.
    ///
    /// Returns `None` when the node is not in `document` or has no text.
    pub fn get_all_text(&self, document: &HtmlDocument) -> Option<String> {
        document.get_html_node(self)?.get_all_text(self, document)
    }

    /// Text of this node, ignoring text nested inside child tags.
    ///
    /// Returns `None` when the node is not in `document` or has no text.
    pub fn get_text(&self, document: &HtmlDocument) -> Option<String> {
        document.get_html_node(self)?.get_text(self, document)
    }

    /// Attribute map of this node.
    ///
    /// Returns `None` when the node is not in `document` or is not a tag.
    pub fn get_attributes<'a>(&'a self, document: &'a HtmlDocument) -> Option<&'a TagAttributes> {
        document.get_html_node(self)?.get_attributes()
    }

    /// Iterates over the direct children of this node, in document order.
    pub fn children<'a>(
        &self,
        document: &'a HtmlDocument,
    ) -> impl Iterator<Item = DocumentNode> + 'a {
        self.id.children(&document.arena).map(DocumentNode::new)
    }

    /// Returns the parent of this node, or `None` for a node without one.
    pub fn parent(&self, document: &HtmlDocument) -> Option<DocumentNode> {
        // `ancestors` starts with the node itself, so the parent is second.
        let mut lineage = self.id.ancestors(&document.arena);
        lineage.nth(1).map(DocumentNode::new)
    }
}
#[cfg(test)]
mod tests {
    use indoc::indoc;

    use super::*;

    /// Adds a text node holding `value` to `arena`.
    fn add_text(arena: &mut Arena<HtmlNode>, value: &str) -> NodeId {
        arena.new_node(HtmlNode::Text(HtmlText::new(value)))
    }

    /// Adds an attribute-less tag called `name` to `arena`.
    fn add_tag(arena: &mut Arena<HtmlNode>, name: &str) -> NodeId {
        arena.new_node(HtmlNode::Tag(HtmlTag::new(String::from(name))))
    }

    /// Builds `<tag>hello<tag2>world</tag2></tag>` rooted at the outer tag.
    fn nested_document() -> (HtmlDocument, DocumentNode) {
        let mut arena = Arena::new();
        let hello = add_text(&mut arena, "hello");
        let world = add_text(&mut arena, "world");
        let outer = add_tag(&mut arena, "tag");
        outer.append(hello, &mut arena);
        let inner = add_tag(&mut arena, "tag2");
        inner.append(world, &mut arena);
        outer.append(inner, &mut arena);

        let root = DocumentNode::new(outer);
        (HtmlDocument::new(arena, root), root)
    }

    /// A `div` tag node carrying the single attribute `attr_name="attr_value"`.
    fn div_with_attribute() -> HtmlNode {
        HtmlNode::Tag(HtmlTag {
            name: "div".to_string(),
            attributes: HashMap::from([("attr_name".to_string(), "attr_value".to_string())]),
        })
    }

    #[test]
    fn html_node_get_text_should_work_on_text_node() {
        let mut arena = Arena::new();
        let root = DocumentNode::new(add_text(&mut arena, "hello world"));
        let document = HtmlDocument::new(arena, root);

        let node = document.get_html_node(&root).unwrap();

        assert_eq!("hello world", node.get_text(&root, &document).unwrap());
    }

    #[test]
    fn html_node_get_text_should_work_on_tag_node_with_one_text_child() {
        let mut arena = Arena::new();
        let text = add_text(&mut arena, "hello world");
        let tag = add_tag(&mut arena, "tag");
        tag.append(text, &mut arena);
        let root = DocumentNode::new(tag);
        let document = HtmlDocument::new(arena, root);

        let node = document.get_html_node(&root).unwrap();

        assert_eq!("hello world", node.get_text(&root, &document).unwrap());
    }

    #[test]
    fn html_node_get_text_should_work_on_tag_node_with_two_text_children() {
        let mut arena = Arena::new();
        let hello = add_text(&mut arena, "hello");
        let world = add_text(&mut arena, "world");
        let tag = add_tag(&mut arena, "tag");
        tag.append(hello, &mut arena);
        tag.append(world, &mut arena);
        let root = DocumentNode::new(tag);
        let document = HtmlDocument::new(arena, root);

        let node = document.get_html_node(&root).unwrap();

        assert_eq!("hello world", node.get_text(&root, &document).unwrap());
    }

    #[test]
    fn html_node_get_text_should_ignore_nested_text() {
        let (document, root) = nested_document();

        let node = document.get_html_node(&root).unwrap();

        assert_eq!("hello", node.get_text(&root, &document).unwrap());
    }

    #[test]
    fn html_node_get_all_text_should_include_nested_text() {
        let (document, root) = nested_document();

        let node = document.get_html_node(&root).unwrap();

        assert_eq!("hello world", node.get_all_text(&root, &document).unwrap());
    }

    #[test]
    fn html_node_get_attributes_for_tag() {
        let node = div_with_attribute();

        assert!(node.get_attributes().is_some());
        assert_eq!(node.get_attributes().unwrap()["attr_name"], "attr_value");
    }

    #[test]
    fn html_node_get_attributes_for_text() {
        let node = HtmlNode::Text(HtmlText::new("hello world"));

        assert!(node.get_attributes().is_none())
    }

    #[test]
    fn document_node_get_attributes_for_tag() {
        let mut arena = Arena::new();
        let root = DocumentNode::new(arena.new_node(div_with_attribute()));
        let document = HtmlDocument::new(arena, root);

        let attributes = document.get_html_node(&root).unwrap().get_attributes();

        assert!(attributes.is_some());
        assert_eq!(attributes.unwrap()["attr_name"], "attr_value");
    }

    #[test]
    fn document_node_get_attributes_for_text() {
        let mut arena = Arena::new();
        let root = DocumentNode::new(add_text(&mut arena, "hello world"));
        let document = HtmlDocument::new(arena, root);

        let attributes = document.get_html_node(&root).unwrap().get_attributes();

        assert!(attributes.is_none());
    }
}