sauron_html_parser/
lib.rs

1#![deny(warnings)]
2
3use std::{fmt, io, ops::Deref};
4
5use rphtml::{
6    config::ParseOptions,
7    parser::{Doc, NodeType},
8    types::BoxDynError,
9};
10
11use sauron_core::{
12    html::{attributes::*, lookup, *},
13    vdom::{AttributeValue, Node, Value},
14};
15
16/// all the possible error when parsing html string
17#[derive(Debug, thiserror::Error)]
18pub enum ParseError {
19    /// io error
20    #[error("{0}")]
21    IoError(#[from] io::Error),
22    /// formatting error
23    #[error("{0}")]
24    FmtError(#[from] fmt::Error),
25    /// rphtml specific error
26    #[error("{0}")]
27    RpHtmlError(#[from] BoxDynError),
28    /// the tag is not a valid html
29    #[error("Invalid tag: {0}")]
30    InvalidTag(String),
31}
32
33/// Parse escaped html strings like `"Hello world!"`
34/// into `"Hello world!"` and then into a node tree.
35pub fn raw_html<MSG>(html: &str) -> Node<MSG> {
36    // decode html entitiesd back since it will be safely converted into text
37    let html = html_escape::decode_html_entities(html);
38    parse_html(&html)
39        .expect("must be ok")
40        .expect("must have a node")
41}
42
43/// Parse none-escaped html strings like `"Hello world!"`
44/// into a node tree (see also [raw_html]).
45pub fn parse_html<MSG>(html: &str) -> Result<Option<Node<MSG>>, ParseError> {
46    let doc = Doc::parse(
47        html,
48        ParseOptions {
49            case_sensitive_tagname: false,
50            allow_self_closing: true,
51            auto_fix_unclosed_tag: true,
52            auto_fix_unexpected_endtag: true,
53            auto_fix_unescaped_lt: true,
54        },
55    )?;
56    process_node(doc.get_root_node().borrow().deref())
57}
58
59//TODO: This is not dealing with html symbols such as
60//   `&#9650;`
61//   `&#9660;`
62fn process_node<MSG>(node: &rphtml::parser::Node) -> Result<Option<Node<MSG>>, ParseError> {
63    let content = if let Some(content) = &node.content {
64        let content = String::from_iter(content.iter());
65        Some(content)
66    } else {
67        None
68    };
69
70    let mut child_nodes = if let Some(childs) = &node.childs {
71        childs
72            .iter()
73            .flat_map(|child| process_node(child.borrow().deref()).ok().flatten())
74            .collect()
75    } else {
76        vec![]
77    };
78
79    match node.node_type {
80        NodeType::Tag => {
81            let tag = &node.meta.as_ref().expect("must have a tag");
82            let tag_name = String::from_iter(tag.borrow().name.iter());
83            if let Some(html_tag) = lookup::match_tag(&tag_name) {
84                let is_self_closing = HTML_SC_TAGS.contains(&html_tag);
85                let attributes: Vec<Attribute<MSG>> = tag
86                    .borrow()
87                    .attrs
88                    .iter()
89                    .filter_map(|attr| {
90                        attr.key.as_ref().and_then(|key| {
91                            let key = String::from_iter(key.content.iter());
92                            if let Some(attr_key) = lookup::match_attribute(&key) {
93                                let value = if let Some(value) = &attr.value {
94                                    let value = String::from_iter(value.content.iter());
95                                    AttributeValue::Simple(Value::from(value))
96                                } else {
97                                    AttributeValue::Empty
98                                };
99                                Some(Attribute::new(None, attr_key, value))
100                            } else {
101                                log::warn!("Not a standard html attribute: {}", key);
102                                None
103                            }
104                        })
105                    })
106                    .collect();
107
108                Ok(Some(html_element(
109                    None,
110                    html_tag,
111                    attributes,
112                    child_nodes,
113                    is_self_closing,
114                )))
115            } else {
116                log::error!("invalid tag: {}", tag_name);
117                Err(ParseError::InvalidTag(tag_name))
118            }
119        }
120        NodeType::Text => {
121            let content = content.expect("must have a content");
122            Ok(Some(text(content)))
123        }
124        NodeType::AbstractRoot => {
125            let child_nodes_len = child_nodes.len();
126            match child_nodes_len {
127                0 => Ok(Some(node_list([]))),
128                1 => Ok(Some(child_nodes.remove(0))),
129                _ => Ok(Some(node_list(child_nodes))),
130            }
131        }
132        _ => Ok(None),
133    }
134}