sauron_parse/
parser.rs

//! This module parses literal HTML and returns a sauron DOM tree.
2
3use html5ever::{
4    local_name, namespace_url, ns, parse_document, parse_fragment,
5    tendril::TendrilSink, QualName,
6};
7use markup5ever_rcdom::{Handle, NodeData, RcDom};
8use once_cell::sync::Lazy;
9use sauron_core::{
10    html::{
11        attributes,
12        attributes::{
13            AttributeValue, Style, HTML_ATTRS, HTML_ATTRS_SPECIAL, HTML_STYLES,
14        },
15        html_element_self_closing,
16        tags::{
17            HTML_SC_TAGS, HTML_TAGS, HTML_TAGS_NON_COMMON,
18            HTML_TAGS_WITH_MACRO_NON_COMMON,
19        },
20        text,
21    },
22    mt_dom,
23    svg::{
24        attributes::{SVG_ATTRS, SVG_ATTRS_SPECIAL, SVG_ATTRS_XLINK},
25        tags::{SVG_TAGS, SVG_TAGS_NON_COMMON, SVG_TAGS_SPECIAL},
26        SVG_NAMESPACE,
27    },
28    Attribute, Node,
29};
30use std::collections::HashSet;
31use std::iter::FromIterator;
32use std::{fmt, io};
33use thiserror::Error;
34
35static ALL_SVG_TAGS: Lazy<HashSet<&&'static str>> = Lazy::new(|| {
36    HashSet::from_iter(
37        SVG_TAGS
38            .iter()
39            .chain(SVG_TAGS_NON_COMMON.iter())
40            .chain(SVG_TAGS_SPECIAL.iter().map(|(_func, t)| t)),
41    )
42});
43
44/// All of the html tags, excluding the SVG tags.
45/// This is mainly used for checking whether element should be
46/// created with namespace or not.
47///
48/// False negatives are:
49///    script; // this conflicts with html::script        , html::tags::script       > svg::tags::script
50///    style; // conflics with html::attributes::style    , html::attributes::style  > svg::tags::style
51///    text; // conflicts with html::text                 , html::text               > svg::tags::text
52///    a;   // conflicts with html::a                     , html::tags::a            > svg::tags::a
53///
54/// If used inside an svg node, svg elements scuh as text, a, style, script will not work correcly
55/// in client-side rendering.
56/// However, in server-side rendering it will work just fine.
57static ALL_HTML_TAGS: Lazy<HashSet<&&'static str>> = Lazy::new(|| {
58    HashSet::from_iter(
59        HTML_TAGS
60            .iter()
61            .chain(HTML_SC_TAGS.iter())
62            .chain(HTML_TAGS_NON_COMMON.iter())
63            .chain(HTML_TAGS_WITH_MACRO_NON_COMMON.iter()),
64    )
65});
66
67static SELF_CLOSING_TAGS: Lazy<HashSet<&&'static str>> =
68    Lazy::new(|| HashSet::from_iter(HTML_SC_TAGS.iter()));
69
70/// all the possible error when parsing html string
71#[derive(Debug, Error)]
72pub enum ParseError {
73    /// generic parser error, expressed in string
74    #[error("Generic Error {0}")]
75    Generic(String),
76    /// io error
77    #[error("{0}")]
78    IoError(#[from] io::Error),
79    /// formatting error
80    #[error("{0}")]
81    FmtError(#[from] fmt::Error),
82}
83
84fn match_tag(tag: &str) -> Option<&'static str> {
85    ALL_HTML_TAGS
86        .iter()
87        .chain(ALL_SVG_TAGS.iter())
88        .find(|item| item.eq_ignore_ascii_case(&tag))
89        .map(|item| **item)
90}
91
92fn match_attribute(key: &str) -> Option<&'static str> {
93    HTML_ATTRS
94        .iter()
95        .chain(SVG_ATTRS.iter())
96        .find(|att| att.eq_ignore_ascii_case(&key))
97        .map(|att| *att)
98        .or_else(|| {
99            HTML_ATTRS_SPECIAL
100                .iter()
101                .chain(SVG_ATTRS_SPECIAL.iter())
102                .chain(SVG_ATTRS_XLINK.iter())
103                .find(|(_func, att)| att.eq_ignore_ascii_case(&key))
104                .map(|(func, _att)| *func)
105        })
106}
107
108fn match_style_name(key: &str) -> Option<&'static str> {
109    HTML_STYLES
110        .iter()
111        .find(|name| name.eq_ignore_ascii_case(&key))
112        .map(|name| *name)
113}
114
115/// return the static str of this function name
116pub fn match_attribute_function(key: &str) -> Option<&'static str> {
117    HTML_ATTRS
118        .iter()
119        .chain(SVG_ATTRS.iter())
120        .find(|att| att.eq_ignore_ascii_case(key))
121        .map(|att| *att)
122        .or_else(|| {
123            HTML_ATTRS_SPECIAL
124                .iter()
125                .chain(SVG_ATTRS_SPECIAL.iter())
126                .chain(SVG_ATTRS_XLINK.iter())
127                .find(|(func, _att)| func.eq_ignore_ascii_case(key))
128                .map(|(func, _att)| *func)
129        })
130}
131
132/// Find the namespace of this tag
133/// if the arg tag is an SVG tag, return the svg namespace
134/// html tags don't need to have namespace while svg does, otherwise it will not be properly
135/// mounted into the DOM
136/// # Examples
137/// ```rust
138/// use sauron_core::prelude::*;
139/// use sauron_parse::tag_namespace;
140///     assert_eq!(None, tag_namespace("div"));
141///     assert_eq!(Some(SVG_NAMESPACE), tag_namespace("rect"));
142/// ```
143///
144/// Limitations: `script`, `style`,and `a` used inside svg will return `None`, as these are also valid html tags.
145pub fn tag_namespace(tag: &str) -> Option<&'static str> {
146    let is_html = ALL_HTML_TAGS.contains(&tag);
147    let is_svg = ALL_SVG_TAGS.contains(&tag);
148    if !is_html {
149        if is_svg {
150            // we return the svg namespace only when the tag is not an html, but an svg tag
151            // False negatives:
152            // This means that script, style, a and title used inside in svg tag will not work
153            // properly, since this 3 tags are valid html tags
154            Some(SVG_NAMESPACE)
155        } else {
156            None
157        }
158    } else {
159        None
160    }
161}
162
163/// Returns true if this html tag is self closing
164pub fn is_self_closing(tag: &str) -> bool {
165    SELF_CLOSING_TAGS.contains(&tag)
166}
167
168fn extract_attributes<MSG>(
169    attrs: &Vec<html5ever::Attribute>,
170) -> Vec<Attribute<MSG>> {
171    attrs
172        .iter()
173        .filter_map(|att| {
174            let key = att.name.local.to_string();
175            let value = att.value.to_string();
176            if key == "style" {
177                let styles = extract_styles(&value);
178                Some(mt_dom::attr("style", AttributeValue::from_styles(styles)))
179            } else if let Some(attr) = match_attribute(&key) {
180                Some(attributes::attr(attr, value))
181            } else {
182                log::warn!("Not a standard html attribute: {}", key);
183                None
184            }
185        })
186        .collect()
187}
188
189/// extract the styles into an arry
190/// example: display:flex; flex-direction: column;
191fn extract_styles(style: &str) -> Vec<Style> {
192    let mut extracted = vec![];
193    println!("processing style: {}", style);
194    let mut single_styles: Vec<&str> = style.split(";").collect();
195    single_styles.retain(|item| !item.trim().is_empty());
196    for single in single_styles {
197        let key_value: Vec<&str> = single.split(":").collect();
198        assert_eq!(key_value.len(), 2);
199        let key = key_value[0].trim();
200        let value = key_value[1].trim();
201        println!("style   [{}] = [{}]", key, value);
202        if let Some(match_style) = match_style_name(key) {
203            extracted.push(Style::new(match_style, value.to_string().into()));
204        }
205    }
206    extracted
207}
208
209fn process_children<MSG>(node: &Handle) -> Vec<Node<MSG>> {
210    node.children
211        .borrow()
212        .iter()
213        .filter_map(|child_node| process_node(child_node))
214        .collect()
215}
216
217fn process_node<MSG>(node: &Handle) -> Option<Node<MSG>> {
218    match &node.data {
219        NodeData::Text { ref contents } => {
220            let text_content = contents.borrow().to_string();
221            if text_content.trim().is_empty() {
222                None
223            } else {
224                Some(text(text_content))
225            }
226        }
227
228        NodeData::Element {
229            ref name,
230            ref attrs,
231            ..
232        } => {
233            let tag = name.local.to_string();
234            if let Some(html_tag) = match_tag(&tag) {
235                let children_nodes = process_children(node);
236                let attributes = extract_attributes(&attrs.borrow());
237                let is_self_closing = HTML_SC_TAGS.contains(&html_tag);
238                Some(html_element_self_closing(
239                    html_tag,
240                    attributes,
241                    children_nodes,
242                    is_self_closing,
243                ))
244            } else {
245                log::warn!("Invalid tag: {}", tag);
246                None
247            }
248        }
249        NodeData::Document => {
250            let mut children_nodes = process_children(node);
251            let children_len = children_nodes.len();
252            if children_len == 1 {
253                Some(children_nodes.remove(0))
254            } else if children_len == 2 {
255                Some(children_nodes.remove(1))
256            } else {
257                None
258            }
259        }
260        _ => None,
261    }
262}
263
264/// Parse html string and convert it into sauron Node
265pub fn parse<MSG>(html: &str) -> Result<Option<Node<MSG>>, ParseError> {
266    let html_start = html.trim_start();
267    let parser = if html_start.starts_with("<html")
268        || html_start.starts_with("<!DOCTYPE")
269    {
270        parse_document(RcDom::default(), Default::default())
271    } else {
272        parse_fragment(
273            RcDom::default(),
274            Default::default(),
275            QualName::new(None, ns!(html), local_name!("div")),
276            vec![],
277        )
278    };
279
280    let dom = parser.one(html);
281    let node = process_node(&dom.document);
282    Ok(node)
283}
284
285/// the document is not wrapped with html
286pub fn parse_simple<MSG>(html: &str) -> Result<Vec<Node<MSG>>, ParseError> {
287    if let Some(html) = parse(html)? {
288        if let Some(element) = html.take_element() {
289            assert_eq!(*element.tag(), "html");
290            Ok(element.take_children())
291        } else {
292            Ok(vec![])
293        }
294    } else {
295        Ok(vec![])
296    }
297}
298
#[cfg(test)]
mod tests {
    use super::*;
    use sauron_core::{html::div, Render};

    /// A fragment without an `html` wrapper must parse, and the result must
    /// be usable as the children of another element.
    #[test]
    fn test_html_child() {
        let html = r#"<article class="side-to-side">
    <div>
        This is div content1
    </div>
    <footer>
        This is footer
    </footer>
</article>"#;
        let parsed = parse_simple::<()>(html);
        let node: Vec<Node<()>> = parsed.expect("must parse");
        println!("node: {:#?}", node);
        let one = div(vec![], node);
        println!("one: {}", one.render_to_string());
    }

    /// Html tags, including the ones that also exist in svg, have no
    /// namespace.
    #[test]
    fn tag_namespace_is_none_in_html_div() {
        let html_tags = ["div", "span", "a", "title", "style", "script"];
        for tag in html_tags.iter() {
            assert_eq!(None, tag_namespace(tag), "tag: {}", tag);
        }
    }

    /// Tags that only exist in svg are given the svg namespace.
    #[test]
    fn tag_namespace_in_svg_should_return_svg_namespace() {
        let svg_tags = ["svg", "rect", "line", "circle"];
        for tag in svg_tags.iter() {
            assert_eq!(
                Some(SVG_NAMESPACE),
                tag_namespace(tag),
                "tag: {}",
                tag
            );
        }
    }
}