use html5ever::{
local_name, namespace_url, ns, parse_document, parse_fragment,
tendril::TendrilSink, QualName,
};
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use once_cell::sync::Lazy;
use sauron_core::{
html::{
attributes,
attributes::{
AttributeValue, Style, HTML_ATTRS, HTML_ATTRS_SPECIAL, HTML_STYLES,
},
html_element_self_closing,
tags::{
HTML_SC_TAGS, HTML_TAGS, HTML_TAGS_NON_COMMON,
HTML_TAGS_WITH_MACRO_NON_COMMON,
},
text,
},
mt_dom,
svg::{
attributes::{SVG_ATTRS, SVG_ATTRS_SPECIAL, SVG_ATTRS_XLINK},
tags::{SVG_TAGS, SVG_TAGS_NON_COMMON, SVG_TAGS_SPECIAL},
SVG_NAMESPACE,
},
Attribute, Node,
};
use std::collections::HashSet;
use std::iter::FromIterator;
use std::{fmt, io};
use thiserror::Error;
/// Set of every SVG tag name known to this module, built on first access.
///
/// It is the union of `SVG_TAGS`, `SVG_TAGS_NON_COMMON` and the second tuple
/// field of each `SVG_TAGS_SPECIAL` entry.
static ALL_SVG_TAGS: Lazy<HashSet<&&'static str>> = Lazy::new(|| {
    // Only the second field of each special entry is collected as a tag name.
    let special_tags = SVG_TAGS_SPECIAL.iter().map(|(_func, tag)| tag);
    let regular_tags = SVG_TAGS.iter().chain(SVG_TAGS_NON_COMMON.iter());
    HashSet::from_iter(regular_tags.chain(special_tags))
});
/// Set of every HTML tag name known to this module, built on first access.
///
/// It is the union of `HTML_TAGS`, `HTML_SC_TAGS`, `HTML_TAGS_NON_COMMON` and
/// `HTML_TAGS_WITH_MACRO_NON_COMMON`.
static ALL_HTML_TAGS: Lazy<HashSet<&&'static str>> = Lazy::new(|| {
    let common_tags = HTML_TAGS.iter().chain(HTML_SC_TAGS.iter());
    let non_common_tags = HTML_TAGS_NON_COMMON
        .iter()
        .chain(HTML_TAGS_WITH_MACRO_NON_COMMON.iter());
    HashSet::from_iter(common_tags.chain(non_common_tags))
});
/// Set of the tag names listed in `HTML_SC_TAGS`, built on first access.
static SELF_CLOSING_TAGS: Lazy<HashSet<&&'static str>> =
    Lazy::new(|| HTML_SC_TAGS.iter().collect());
/// Error type returned by the parsing functions of this module.
///
/// NOTE(review): none of the variants is constructed by the code visible in
/// this file; they appear to exist for callers and `?` conversions — confirm.
#[derive(Debug, Error)]
pub enum ParseError {
/// A free-form error carrying a message; displayed as `Generic Error <message>`.
#[error("Generic Error {0}")]
Generic(String),
/// Wraps a `std::io::Error`; convertible with `From` and displayed as the inner error.
#[error("{0}")]
IoError(#[from] io::Error),
/// Wraps a `std::fmt::Error`; convertible with `From` and displayed as the inner error.
#[error("{0}")]
FmtError(#[from] fmt::Error),
}
/// Look up `tag` among the known HTML tags first, then the known SVG tags,
/// ignoring ASCII case, and return the canonical `'static` spelling if found.
fn match_tag(tag: &str) -> Option<&'static str> {
    let html_tags = ALL_HTML_TAGS.iter();
    let svg_tags = ALL_SVG_TAGS.iter();
    html_tags
        .chain(svg_tags)
        .map(|known| **known)
        .find(|known| known.eq_ignore_ascii_case(tag))
}
/// Resolve an attribute name as it appears in markup, ignoring ASCII case.
///
/// Plain entries of `HTML_ATTRS` / `SVG_ATTRS` are tried first and returned
/// as-is. Otherwise the special tables are searched by their second tuple
/// field (the attribute name) and the first tuple field (the function name)
/// of the matching entry is returned.
fn match_attribute(key: &str) -> Option<&'static str> {
    let plain = HTML_ATTRS
        .iter()
        .chain(SVG_ATTRS.iter())
        .copied()
        .find(|name| name.eq_ignore_ascii_case(key));
    if plain.is_some() {
        return plain;
    }

    HTML_ATTRS_SPECIAL
        .iter()
        .chain(SVG_ATTRS_SPECIAL.iter())
        .chain(SVG_ATTRS_XLINK.iter())
        .find(|(_func, name)| name.eq_ignore_ascii_case(key))
        .map(|(func, _name)| *func)
}
/// Find the entry of `HTML_STYLES` equal to `key` ignoring ASCII case and
/// return its canonical `'static` spelling.
fn match_style_name(key: &str) -> Option<&'static str> {
    HTML_STYLES
        .iter()
        .copied()
        .find(|known| known.eq_ignore_ascii_case(key))
}
/// Resolve `key` to a known attribute function name, ignoring ASCII case.
///
/// Plain entries of `HTML_ATTRS` / `SVG_ATTRS` are checked first. After that
/// the special tables are searched by their first tuple field (the function
/// name), which is also what gets returned.
pub fn match_attribute_function(key: &str) -> Option<&'static str> {
    for name in HTML_ATTRS.iter().chain(SVG_ATTRS.iter()) {
        if name.eq_ignore_ascii_case(key) {
            return Some(*name);
        }
    }

    let special_entries = HTML_ATTRS_SPECIAL
        .iter()
        .chain(SVG_ATTRS_SPECIAL.iter())
        .chain(SVG_ATTRS_XLINK.iter());
    for (func, _att) in special_entries {
        if func.eq_ignore_ascii_case(key) {
            return Some(*func);
        }
    }

    None
}
/// Return the namespace a tag has to be created in.
///
/// Yields `Some(SVG_NAMESPACE)` only when `tag` is a known SVG tag that is
/// not also a known HTML tag; every other tag yields `None`. The lookup is an
/// exact (case-sensitive) set membership test.
pub fn tag_namespace(tag: &str) -> Option<&'static str> {
    let known_html = ALL_HTML_TAGS.contains(&tag);
    let known_svg = ALL_SVG_TAGS.contains(&tag);
    match (known_html, known_svg) {
        // Tags present in both sets are treated as HTML, hence the `false`.
        (false, true) => Some(SVG_NAMESPACE),
        _ => None,
    }
}
/// Tell whether `tag` is one of the self-closing tags collected in
/// `SELF_CLOSING_TAGS` (exact, case-sensitive match).
pub fn is_self_closing(tag: &str) -> bool {
    SELF_CLOSING_TAGS.get(&tag).is_some()
}
/// Convert the attributes of a parsed element into virtual-dom attributes.
///
/// * `style` is parsed with `extract_styles` and stored as a styles value.
/// * Any other attribute is kept only when `match_attribute` recognizes its
///   name; unrecognized ones are logged as a warning and dropped.
fn extract_attributes<MSG>(
    attrs: &Vec<html5ever::Attribute>,
) -> Vec<Attribute<MSG>> {
    let mut converted: Vec<Attribute<MSG>> = Vec::new();
    for raw in attrs.iter() {
        let name = raw.name.local.to_string();
        let raw_value = raw.value.to_string();

        if name == "style" {
            let styles = extract_styles(&raw_value);
            converted
                .push(mt_dom::attr("style", AttributeValue::from_styles(styles)));
            continue;
        }

        match match_attribute(&name) {
            Some(known) => {
                converted.push(attributes::attr(known, raw_value));
            }
            None => {
                log::warn!("Not a standard html attribute: {}", name);
            }
        }
    }
    converted
}
fn extract_styles(style: &str) -> Vec<Style> {
let mut extracted = vec![];
println!("processing style: {}", style);
let mut single_styles: Vec<&str> = style.split(";").collect();
single_styles.retain(|item| !item.trim().is_empty());
for single in single_styles {
let key_value: Vec<&str> = single.split(":").collect();
assert_eq!(key_value.len(), 2);
let key = key_value[0].trim();
let value = key_value[1].trim();
println!("style [{}] = [{}]", key, value);
if let Some(match_style) = match_style_name(key) {
extracted.push(Style::new(match_style, value.to_string().into()));
}
}
extracted
}
/// Convert every child of `node` with `process_node`, keeping only the
/// children that produce a virtual-dom node, in document order.
fn process_children<MSG>(node: &Handle) -> Vec<Node<MSG>> {
    let children = node.children.borrow();
    let mut converted: Vec<Node<MSG>> = Vec::new();
    for child in children.iter() {
        if let Some(processed) = process_node(child) {
            converted.push(processed);
        }
    }
    converted
}
/// Convert one html5ever DOM node into a virtual-dom [`Node`].
///
/// * Text: whitespace-only text yields `None`, anything else a text node
///   holding the untrimmed content.
/// * Element: converted (children first, then attributes) when `match_tag`
///   knows the tag; unknown tags are logged as a warning and yield `None`.
/// * Document: yields its only converted child, or the second one when there
///   are exactly two; any other count yields `None`.
/// * Every other node kind yields `None`.
fn process_node<MSG>(node: &Handle) -> Option<Node<MSG>> {
    match &node.data {
        NodeData::Text { contents } => {
            let raw_text = contents.borrow().to_string();
            if !raw_text.trim().is_empty() {
                Some(text(raw_text))
            } else {
                None
            }
        }
        NodeData::Element { name, attrs, .. } => {
            let tag = name.local.to_string();
            match match_tag(&tag) {
                Some(known_tag) => {
                    let children = process_children(node);
                    let converted_attrs = extract_attributes(&attrs.borrow());
                    let self_closing = HTML_SC_TAGS.contains(&known_tag);
                    Some(html_element_self_closing(
                        known_tag,
                        converted_attrs,
                        children,
                        self_closing,
                    ))
                }
                None => {
                    log::warn!("Invalid tag: {}", tag);
                    None
                }
            }
        }
        NodeData::Document => {
            let mut children = process_children(node);
            match children.len() {
                1 => Some(children.remove(0)),
                2 => Some(children.remove(1)),
                _ => None,
            }
        }
        _ => None,
    }
}
/// Return `true` when `input` begins with `prefix`, comparing ASCII letters
/// case-insensitively. Works on bytes, so it never panics on char boundaries.
fn starts_with_ignore_ascii_case(input: &str, prefix: &str) -> bool {
    input
        .as_bytes()
        .get(..prefix.len())
        .map_or(false, |head| head.eq_ignore_ascii_case(prefix.as_bytes()))
}

/// Parse an html string into a virtual-dom [`Node`].
///
/// Input that starts (after leading whitespace) with `<html` or `<!DOCTYPE`
/// is parsed as a full document; anything else is parsed as a fragment in the
/// context of a `div` element. Both prefixes are matched ignoring ASCII case,
/// because HTML treats `<!doctype html>` and `<HTML>` the same as their
/// canonical spellings.
///
/// Returns `Ok(None)` when `process_node` produces no node for the parsed
/// document. The current implementation never returns `Err`.
pub fn parse<MSG>(html: &str) -> Result<Option<Node<MSG>>, ParseError> {
    let html_start = html.trim_start();
    let is_document = starts_with_ignore_ascii_case(html_start, "<html")
        || starts_with_ignore_ascii_case(html_start, "<!DOCTYPE");

    let parser = if is_document {
        parse_document(RcDom::default(), Default::default())
    } else {
        parse_fragment(
            RcDom::default(),
            Default::default(),
            QualName::new(None, ns!(html), local_name!("div")),
            vec![],
        )
    };

    let dom = parser.one(html);
    Ok(process_node(&dom.document))
}
pub fn parse_simple<MSG>(html: &str) -> Result<Vec<Node<MSG>>, ParseError> {
if let Some(html) = parse(html)? {
if let Some(element) = html.take_element() {
assert_eq!(*element.tag(), "html");
Ok(element.take_children())
} else {
Ok(vec![])
}
} else {
Ok(vec![])
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use sauron_core::{html::div, Render};

    /// Smoke test: a fragment with nested elements parses, and the result can
    /// be wrapped in a `div` and rendered. Output is printed, not asserted.
    #[test]
    fn test_html_child() {
        let html = r#"<article class="side-to-side">
<div>
This is div content1
</div>
<footer>
This is footer
</footer>
</article>"#;
        let parsed: Vec<Node<()>> = parse_simple(html).expect("must parse");
        println!("node: {:#?}", parsed);
        let wrapper = div(vec![], parsed);
        println!("one: {}", wrapper.render_to_string());
    }

    /// HTML tags have no namespace, including the ones that also exist in SVG.
    #[test]
    fn tag_namespace_is_none_in_html_div() {
        let html_tags = ["div", "span", "a", "title", "style", "script"];
        for tag in html_tags.iter() {
            assert_eq!(None, tag_namespace(tag));
        }
    }

    /// Tags that exist only in SVG resolve to the SVG namespace.
    #[test]
    fn tag_namespace_in_svg_should_return_svg_namespace() {
        let svg_tags = ["svg", "rect", "line", "circle"];
        for tag in svg_tags.iter() {
            assert_eq!(Some(SVG_NAMESPACE), tag_namespace(tag));
        }
    }
}