use crate::structs::{
AttributeValues, Attributes, Node,
NodeType::{self, *},
};
use std::{collections::VecDeque, fmt::Display};
/// Reasons a tag could not be parsed.
///
/// Each variant carries the approximate index in the input at which the
/// offending tag starts (the parser reports its current position, cast to `u32`).
#[derive(Debug, PartialEq, Eq)]
pub enum MalformedTagError {
/// No terminating `>` (or `-->` for a comment) was found for the tag. The parser in this
/// file also reports this variant for a closing tag that has no open element to close.
MissingClosingBracket(u32),
/// The tag has no name, e.g. `<>` or `< class="a">`.
MissingTagName(u32),
}
/// Reasons the attribute list of a tag could not be parsed.
///
/// Each variant carries the approximate index in the input of the tag that
/// owns the attributes (the position handed to the attribute parser, cast to `u32`).
#[derive(Debug, PartialEq, Eq)]
pub enum MalformedAttributeError {
/// A quoted attribute value was opened but never closed.
MissingQuotationMark(u32),
/// A `=` or an opening quote appeared without a preceding attribute name.
MissingAttributeName(u32),
/// An attribute is missing its value.
// NOTE(review): no code visible in this file constructs this variant — confirm whether it is
// produced elsewhere.
MissingAttributeValue(u32),
}
/// Error returned by `safe_parse_html` when the input cannot be parsed.
///
/// The `String` in each variant is the fragment of input the error refers to; the second
/// field describes what is wrong with it.
#[derive(Debug, PartialEq, Eq)]
pub enum ParseHTMLError {
/// A tag is malformed; the `String` holds the tag text (or the remaining input).
MalformedTag(String, MalformedTagError),
/// An attribute list is malformed; the `String` holds the attribute text involved.
MalformedAttribute(String, MalformedAttributeError),
}
/// Human-readable rendering of a parse error.
///
/// Every message has the shape
/// `Malformed <kind>: <fragment> - Missing <what> at around index <index>`.
impl Display for ParseHTMLError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Reduce every variant to the four pieces that differ between messages,
        // then render them through one shared template.
        let (kind, fragment, missing, index) = match self {
            ParseHTMLError::MalformedTag(tag, reason) => match reason {
                MalformedTagError::MissingClosingBracket(at) => {
                    ("tag", tag, "closing bracket", at)
                }
                MalformedTagError::MissingTagName(at) => ("tag", tag, "tag name", at),
            },
            ParseHTMLError::MalformedAttribute(attr, reason) => match reason {
                MalformedAttributeError::MissingQuotationMark(at) => {
                    ("attribute", attr, "quotation mark", at)
                }
                MalformedAttributeError::MissingAttributeName(at) => {
                    ("attribute", attr, "attribute name", at)
                }
                MalformedAttributeError::MissingAttributeValue(at) => {
                    ("attribute", attr, "attribute value", at)
                }
            },
        };
        write!(
            f,
            "Malformed {}: {} - Missing {} at around index {}",
            kind, fragment, missing, index
        )
    }
}
/// Parses `input` as HTML and returns the resulting [`Node`] tree, reporting malformed
/// markup as an error instead of panicking.
///
/// The input is scanned once from left to right while a stack of currently open elements
/// is maintained:
///
/// * `<!DOCTYPE ...>` declarations (matched case-insensitively) are skipped.
/// * Other `<! ... -->` constructs become `Comment` nodes.
/// * An opening tag pushes a new element onto the stack; a closing tag pops the innermost
///   open element and attaches it to its parent (the name of the closing tag is not
///   compared with the element being closed).
/// * A self-closing tag (`.../>`) is attached directly to the innermost open element.
/// * Text that is not only whitespace becomes a `Text` node.
///
/// Elements that are still open at the end of the input are appended to the top-level
/// nodes. When exactly one top-level node exists it is returned as-is; otherwise the
/// top-level nodes are returned as the children of a wrapper node whose fields are all
/// `None`.
///
/// # Errors
///
/// * [`ParseHTMLError::MalformedTag`] with [`MalformedTagError::MissingClosingBracket`]
///   when a tag, DOCTYPE or comment is never terminated, or when a closing tag appears
///   while no element is open.
/// * [`ParseHTMLError::MalformedTag`] with [`MalformedTagError::MissingTagName`] when a tag
///   has no name.
/// * Any [`ParseHTMLError::MalformedAttribute`] produced while parsing a tag's attributes.
pub fn safe_parse_html(input: String) -> Result<Node, ParseHTMLError> {
    let mut current_index = 0;
    let mut nodes = Vec::new();
    let mut stack: Vec<Node> = Vec::new();

    while current_index < input.len() {
        let rest = &input[current_index..];

        if rest.starts_with("<!") {
            // The DOCTYPE keyword is case-insensitive in HTML (`<!doctype html>` is common).
            let is_doctype = rest
                .get(.."<!DOCTYPE".len())
                .map_or(false, |prefix| prefix.eq_ignore_ascii_case("<!DOCTYPE"));
            if is_doctype {
                match rest.find('>') {
                    Some(end) => {
                        current_index += end + 1;
                        continue;
                    }
                    // An unterminated DOCTYPE is an error, not a panic.
                    None => {
                        return Err(ParseHTMLError::MalformedTag(
                            rest.to_string(),
                            MalformedTagError::MissingClosingBracket(current_index as u32),
                        ));
                    }
                }
            }

            if let Some(closing_comment_index) = rest.find("-->") {
                let comment = &rest[..closing_comment_index + 3];
                nodes.push(Node {
                    tag_name: Some(Comment),
                    value: Some(
                        comment
                            .trim_start_matches("<!")
                            .trim_start_matches("--")
                            .trim_end_matches("-->")
                            .to_string(),
                    ),
                    attributes: None,
                    within_special_tag: None,
                    children: Vec::new(),
                });
                current_index += closing_comment_index + 3;
                continue;
            }

            return Err(ParseHTMLError::MalformedTag(
                rest.to_string(),
                MalformedTagError::MissingClosingBracket(current_index as u32),
            ));
        }

        if rest.starts_with('<') {
            let bracket_index = match find_closing_bracket_index(rest) {
                Some(index) => index,
                None => {
                    return Err(ParseHTMLError::MalformedTag(
                        rest.to_string(),
                        MalformedTagError::MissingClosingBracket(current_index as u32),
                    ));
                }
            };
            // Number of bytes occupied by the whole tag, including the final `>`.
            let tag_len = bracket_index + 1;

            // `bracket_index` is a byte offset, so the check for a trailing `/` must be
            // byte-based as well; counting chars breaks once non-ASCII text is involved.
            let self_closing = rest[..bracket_index].ends_with('/');
            let content_end = if self_closing {
                bracket_index - 1
            } else {
                bracket_index
            };
            let tag_content = &rest[1..content_end];

            let (node_name, attribute_map) = match tag_content.find(char::is_whitespace) {
                Some(space_index) => (
                    &tag_content[..space_index],
                    parse_tag_attributes(&tag_content[space_index..], current_index)?,
                ),
                None => (tag_content, None),
            };

            if node_name.is_empty() {
                return Err(ParseHTMLError::MalformedTag(
                    tag_content.to_string(),
                    MalformedTagError::MissingTagName(current_index as u32),
                ));
            }

            if rest.starts_with("</") {
                match stack.pop() {
                    Some(finished) => {
                        // Attach the finished element to its parent, or promote it to a
                        // top-level node when nothing else is open.
                        match stack.last_mut() {
                            Some(parent) => parent.children.push(finished),
                            None => nodes.push(finished),
                        }
                        current_index += tag_len;
                        continue;
                    }
                    None => {
                        let offending_tag = match rest.find('>') {
                            Some(index) => rest[..index + 1].to_string(),
                            None => rest.to_string(),
                        };
                        return Err(ParseHTMLError::MalformedTag(
                            offending_tag,
                            MalformedTagError::MissingClosingBracket(current_index as u32),
                        ));
                    }
                }
            }

            let mut new_node = Node {
                tag_name: Some(NodeType::from_tag_str(node_name)),
                value: None,
                attributes: attribute_map,
                within_special_tag: None,
                children: Vec::new(),
            };

            if self_closing {
                if let Some(parent) = stack.last_mut() {
                    modify_node_with_parent(&mut new_node, parent);
                    parent.children.push(new_node);
                } else {
                    nodes.push(new_node);
                }
                current_index += tag_len;
                continue;
            }

            if let Some(parent) = stack.last_mut() {
                modify_node_with_parent(&mut new_node, parent);
            }
            stack.push(new_node);
            current_index += tag_len;
            continue;
        }

        // Plain text up to the next tag (or the end of the input). `rest` is non-empty and
        // does not start with `<` here, so this always makes progress.
        let next_opening_tag = rest.find('<').unwrap_or(rest.len());
        let text = &rest[..next_opening_tag];
        current_index += next_opening_tag;
        if text.trim().is_empty() {
            continue;
        }
        modify_stack_with_node(
            &mut stack,
            Node {
                tag_name: Some(Text),
                value: Some(text.to_string()),
                attributes: None,
                within_special_tag: None,
                children: Vec::new(),
            },
        );
    }

    // Whatever is still open when the input ends is kept as top-level nodes.
    nodes.append(&mut stack);

    if nodes.len() == 1 {
        return Ok(nodes.remove(0));
    }
    Ok(Node {
        tag_name: None,
        value: None,
        attributes: None,
        within_special_tag: None,
        children: nodes,
    })
}
/// Places `new_node` into the tree that is being built.
///
/// When an element is currently open (the stack is non-empty) the node inherits that
/// element's special-tag context and is appended to its children. Otherwise the node
/// itself is pushed onto the stack.
///
/// The node is moved into its destination; no copy of it is made.
fn modify_stack_with_node(stack: &mut Vec<Node>, mut new_node: Node) {
    match stack.last_mut() {
        Some(parent) => {
            modify_node_with_parent(&mut new_node, parent);
            parent.children.push(new_node);
        }
        None => stack.push(new_node),
    }
}
/// Propagates the "inside a special tag" context from `parent` to `node`.
///
/// `node` first takes over a copy of the parent's own `within_special_tag` list, if the
/// parent has one. If the parent's tag is itself a special tag, that tag is then appended
/// to the node's list (creating the list when the node has none yet).
fn modify_node_with_parent(node: &mut Node, parent: &Node) {
    if let Some(inherited) = &parent.within_special_tag {
        node.within_special_tag = Some(inherited.clone());
    }

    match &parent.tag_name {
        Some(parent_tag) if parent_tag.is_special_tag() => {
            node.within_special_tag
                .get_or_insert_with(Vec::new)
                .push(parent_tag.clone());
        }
        _ => {}
    }
}
#[deprecated(
since = "0.7.0",
note = "This function is deprecated and will be removed in future versions. Please use the safe_parse_html function instead."
)]
pub fn parse_html(input: String) -> Node {
let parsed = safe_parse_html(input);
match parsed {
Ok(node) => node,
Err(err) => panic!("error parsing html: {:?}", err),
}
}
/// Parses the attribute portion of a tag (everything after the tag name) into an
/// [`Attributes`] map.
///
/// Supported forms:
///
/// * `name="value"` and `name='value'` (whitespace is allowed between `=` and the quote),
/// * `name=value` (unquoted, terminated by whitespace or the end of the tag),
/// * `name` (boolean attribute, stored as `true`), including as the last item in the tag.
///
/// Attributes whose value is empty are not stored.
///
/// `current_index` is the position of the owning tag in the document; it is only used to
/// annotate errors.
///
/// Returns `Ok(None)` when no attribute was stored.
///
/// # Errors
///
/// * [`MalformedAttributeError::MissingAttributeName`] when a `=` or an opening `"` appears
///   without a preceding attribute name.
/// * [`MalformedAttributeError::MissingQuotationMark`] when a quoted value is never closed.
fn parse_tag_attributes(
    tag_attributes: &str,
    current_index: usize,
) -> Result<Option<Attributes>, ParseHTMLError> {
    let tag_attributes = tag_attributes.trim();
    if tag_attributes.is_empty() {
        return Ok(None);
    }

    let mut attribute_map = Attributes::new();
    let mut current_key = String::new();
    let mut current_value = String::new();
    // `Some(q)` while inside a value delimited by the quote character `q`.
    let mut open_quote: Option<char> = None;
    // Set after a `=`: subsequent characters belong to a value unless a quote opens.
    let mut reading_unquoted_value = false;

    for ch in tag_attributes.chars() {
        if let Some(quote) = open_quote {
            // Only the quote character that opened the value can close it, so the other
            // kind of quote is ordinary content.
            if ch == quote {
                add_to_attribute_map(&mut attribute_map, &current_key, &current_value);
                current_key.clear();
                current_value.clear();
                open_quote = None;
            } else {
                current_value.push(ch);
            }
            continue;
        }

        // A `'` only opens a quoted value directly after `=`; elsewhere it stays a literal
        // character so that unquoted values such as `title=don't` keep their meaning.
        let opens_single_quoted =
            ch == '\'' && reading_unquoted_value && current_value.is_empty();
        if ch == '"' || opens_single_quoted {
            if current_key.is_empty() {
                return Err(ParseHTMLError::MalformedAttribute(
                    tag_attributes.to_string(),
                    MalformedAttributeError::MissingAttributeName(current_index as u32),
                ));
            }
            open_quote = Some(ch);
            reading_unquoted_value = false;
            continue;
        }

        if ch.is_whitespace() {
            if reading_unquoted_value {
                // Whitespace between `=` and the value is skipped; whitespace after an
                // unquoted value terminates it.
                if !current_value.is_empty() {
                    add_to_attribute_map(&mut attribute_map, &current_key, &current_value);
                    current_key.clear();
                    current_value.clear();
                    reading_unquoted_value = false;
                }
            } else if !current_key.is_empty() {
                // A bare name followed by whitespace is a boolean attribute.
                attribute_map.insert(
                    std::mem::take(&mut current_key),
                    AttributeValues::from(true),
                );
            }
            continue;
        }

        if ch == '=' && !reading_unquoted_value {
            if current_key.is_empty() {
                return Err(ParseHTMLError::MalformedAttribute(
                    tag_attributes.to_string(),
                    MalformedAttributeError::MissingAttributeName(current_index as u32),
                ));
            }
            reading_unquoted_value = true;
            continue;
        }

        if reading_unquoted_value {
            current_value.push(ch);
        } else {
            current_key.push(ch);
        }
    }

    if open_quote.is_some() {
        return Err(ParseHTMLError::MalformedAttribute(
            current_value,
            MalformedAttributeError::MissingQuotationMark(current_index as u32),
        ));
    }

    if reading_unquoted_value {
        // Trailing unquoted value; a dangling `name=` without a value is not stored.
        add_to_attribute_map(&mut attribute_map, &current_key, &current_value);
    } else if !current_key.is_empty() {
        // Trailing boolean attribute, e.g. the `disabled` in `<input disabled>`.
        attribute_map.insert(current_key, AttributeValues::from(true));
    }

    if attribute_map.is_empty() {
        Ok(None)
    } else {
        Ok(Some(attribute_map))
    }
}
/// Stores `current_key` -> `current_value_in_quotes` in `attribute_map`.
///
/// Nothing is stored when either the key or the value is empty.
fn add_to_attribute_map(
    attribute_map: &mut Attributes,
    current_key: &str,
    current_value_in_quotes: &str,
) {
    let has_key_and_value = !current_key.is_empty() && !current_value_in_quotes.is_empty();
    if has_key_and_value {
        let key = current_key.to_string();
        let value = AttributeValues::from(current_value_in_quotes);
        attribute_map.insert(key, value);
    }
}
/// Returns the byte index of the `>` that terminates the tag at the start of `rest`.
///
/// A `>` inside a quoted attribute value (`"..."` or `'...'`) is ignored. Within a quoted
/// value only the quote character that opened it closes it, so the other quote character
/// is treated as ordinary content (e.g. the apostrophe in `alt="don't"`).
///
/// Returns `None` when no terminating `>` exists outside of quotes.
fn find_closing_bracket_index(rest: &str) -> Option<usize> {
    // `Some(q)` while scanning inside a value delimited by the quote character `q`.
    let mut open_quote: Option<char> = None;

    for (idx, ch) in rest.char_indices() {
        match open_quote {
            Some(quote) => {
                if ch == quote {
                    open_quote = None;
                }
            }
            None => match ch {
                '"' | '\'' => open_quote = Some(ch),
                '>' => return Some(idx),
                _ => {}
            },
        }
    }
    None
}
/// Regression test for issue 25: whitespace between `=` and the opening quote of an
/// attribute value must be tolerated.
#[test]
fn issue_25() {
    let raw_attributes = "property=\"og:type\" content= \"website\"";

    let actual = parse_tag_attributes(raw_attributes, 0).unwrap().unwrap();

    let wanted = Attributes::from(vec![
        ("property".to_string(), AttributeValues::from("og:type")),
        ("content".to_string(), AttributeValues::from("website")),
    ]);
    assert_eq!(actual, wanted);
}
/// Regression test for issue 31: a `>` inside a quoted attribute value must not be taken
/// for the end of the tag.
#[test]
fn issue_31() {
    let html = String::from(r#"<img src="https://exmaple.com/img.png" alt="Rust<br/>Logo"/>"#);

    let attribute_pairs = [
        (
            "src".to_string(),
            AttributeValues::from("https://exmaple.com/img.png"),
        ),
        ("alt".to_string(), AttributeValues::from("Rust<br/>Logo")),
    ];
    let wanted = Node {
        tag_name: Some(Unknown("img".to_string())),
        value: None,
        attributes: Some(Attributes {
            id: None,
            class: None,
            href: None,
            attributes: std::collections::HashMap::from(attribute_pairs),
        }),
        children: Vec::new(),
        within_special_tag: None,
    };

    let actual = safe_parse_html(html).unwrap();
    assert_eq!(actual, wanted)
}
/// Regression test for issue 36.
///
/// Checks that a self-closing tag with whitespace before `/>` parses to the expected node,
/// and that a larger document containing non-ASCII text inside attribute values and
/// elements without closing tags parses without returning an error.
#[test]
fn issue_36() {
// Part 1: `<img ... />` with a space before the closing `/>`.
let input = "<img src=\"https://hoerspiele.dra.de/fileadmin/www.hoerspiele.dra.de/images/vollinfo/4970918_B01.jpg\" />".to_string();
let expected = Node {
tag_name: Some(Unknown("img".to_string())),
value: None,
attributes: Some(Attributes {
id: None,
class: None,
href: None,
attributes: std::collections::HashMap::from([(
"src".to_string(),
AttributeValues::from("https://hoerspiele.dra.de/fileadmin/www.hoerspiele.dra.de/images/vollinfo/4970918_B01.jpg"),
)]),
}),
children: Vec::new(),
within_special_tag: None,
};
let parsed = safe_parse_html(input).unwrap();
assert_eq!(parsed, expected);
// Part 2: a multi-line document with multi-byte characters in attribute values; only
// successful parsing is asserted (the resulting tree is not inspected).
let input = r#"<!DOCTYPE html><meta http-equiv="content-type" content="text/html; charset=utf-8"><div class="column"><div class="gallery-wrap single">
<div class="gallery-container">
<figure class="image">
<figure class="image">
<img title="Illustration »Der dunkle Kongress« © ARD / Jürgen Frey"
alt="Illustration »Der dunkle Kongress« © ARD / Jürgen Frey"
src="https://hoerspiele.dra.de/fileadmin/www.hoerspiele.dra.de/images/vollinfo/4970918_B01.jpg">
<figcaption class="image-caption">Illustration »Der dunkle Kongress«
© ARD / Jürgen Frey</figcaption>
</figure></div></div></div>"#.to_string();
safe_parse_html(input).unwrap();
}