html_parser/dom/
mod.rs

1use crate::Result;
2use pest::{iterators::Pair, iterators::Pairs, Parser};
3use serde::Serialize;
4use std::default::Default;
5
6use crate::error::Error;
7use crate::grammar::Grammar;
8use crate::Rule;
9
10pub mod element;
11pub mod formatting;
12pub mod node;
13pub mod span;
14
15use crate::dom::span::SourceSpan;
16use element::{Element, ElementVariant};
17use node::Node;
18
19/// Document, DocumentFragment or Empty
20#[derive(Debug, Clone, PartialEq, Serialize)]
21#[serde(rename_all = "camelCase")]
22pub enum DomVariant {
23    /// This means that the parsed html had the representation of an html document. The doctype is optional but a document should only have one root node with the name of html.
24    /// Example:
25    /// ```text
26    /// <!doctype html>
27    /// <html>
28    ///     <head></head>
29    ///     <body>
30    ///         <h1>Hello world</h1>
31    ///     </body>
32    /// </html>
33    /// ```
34    Document,
35    /// A document fragment means that the parsed html did not have the representation of a document. A fragment can have multiple root children of any name except html, body or head.
36    /// Example:
37    /// ```text
38    /// <h1>Hello world</h1>
39    /// ```
40    DocumentFragment,
41    /// An empty dom means that the input was empty
42    Empty,
43}
44
45/// **The main struct** & the result of the parsed html
46#[derive(Debug, Clone, Serialize, PartialEq)]
47#[serde(rename_all = "camelCase")]
48pub struct Dom {
49    /// The type of the tree that was parsed
50    pub tree_type: DomVariant,
51
52    /// All of the root children in the tree
53    #[serde(skip_serializing_if = "Vec::is_empty")]
54    pub children: Vec<Node>,
55
56    /// A collection of all errors during parsing
57    #[serde(skip_serializing)]
58    pub errors: Vec<String>,
59}
60
61impl Default for Dom {
62    fn default() -> Self {
63        Self {
64            tree_type: DomVariant::Empty,
65            children: vec![],
66            errors: vec![],
67        }
68    }
69}
70
71impl Dom {
72    pub fn parse(input: &str) -> Result<Self> {
73        let pairs = match Grammar::parse(Rule::html, input) {
74            Ok(pairs) => pairs,
75            Err(error) => return formatting::error_msg(error),
76        };
77        Self::build_dom(pairs)
78    }
79
80    pub fn to_json(&self) -> Result<String> {
81        Ok(serde_json::to_string(self)?)
82    }
83
84    pub fn to_json_pretty(&self) -> Result<String> {
85        Ok(serde_json::to_string_pretty(self)?)
86    }
87
88    fn build_dom(pairs: Pairs<Rule>) -> Result<Self> {
89        let mut dom = Self::default();
90
91        // NOTE: The logic is roughly as follows:
92        // 1) A document containing nothing but comments is DomVariant::Empty even though it will have
93        //    children in this first pass.  We fix this in the next section.  This allows us to use
94        //    DomVariant::Empty to indicate "we haven't decided the type yet".
95        // 2) If the type is DomVariant::Empty _so far_, then it can be changed to DomVariant::Document
96        //    or DomVariant::DocumentFragment.  DomVariant is only selected in this stage if we see a
97        //    DOCTYPE tag.  Comments do not change the type.
98        // 3) If the type is non-empty, we don't re-set the type.  We do look for conflicts between
99        //    the type and the tokens in the next stage.
100        for pair in pairs {
101            match pair.as_rule() {
102                // A <!DOCTYPE> tag means a full-fledged document.  Note that because of the way
103                // the grammar is written, we will only get this token if the <!DOCTYPE> occurs
104                // before any other tag; otherwise it will be parsed as a custom tag.
105                Rule::doctype => {
106                    if dom.tree_type == DomVariant::Empty {
107                        dom.tree_type = DomVariant::Document;
108                    }
109                }
110
111                // If we see an element, build the sub-tree and add it as a child.  If we don't
112                // have a document type yet (i.e. "empty"), select DocumentFragment
113                Rule::node_element => match Self::build_node_element(pair, &mut dom) {
114                    Ok(el) => {
115                        if let Some(node) = el {
116                            if dom.tree_type == DomVariant::Empty {
117                                dom.tree_type = DomVariant::DocumentFragment;
118                            };
119                            dom.children.push(node);
120                        }
121                    }
122                    Err(error) => {
123                        dom.errors.push(format!("{}", error));
124                    }
125                },
126
127                // Similar to an element, we add it as a child and select DocumentFragment if we
128                // don't already have a document type.
129                Rule::node_text => {
130                    if dom.tree_type == DomVariant::Empty {
131                        dom.tree_type = DomVariant::DocumentFragment;
132                    }
133                    let text = pair.as_str().to_string();
134                    if !text.trim().is_empty() {
135                        dom.children.push(Node::Text(text));
136                    }
137                }
138
139                // Store comments as a child, but it doesn't affect the document type selection
140                // until the next phase (validation).
141                Rule::node_comment => {
142                    dom.children
143                        .push(Node::Comment(pair.into_inner().as_str().to_string()));
144                }
145
146                // Ignore 'end of input', which then allows the catch-all unreachable!() arm to
147                // function properly.
148                Rule::EOI => (),
149
150                // This should be unreachable, due to the way the grammar is written
151                _ => unreachable!("[build dom] unknown rule: {:?}", pair.as_rule()),
152            };
153        }
154
155        // Implement some checks on the generated dom's data and initial type.  The type may be
156        // modified in this section.
157        match dom.tree_type {
158            // A DomVariant::Empty can only have comments. Anything else is an error.
159            DomVariant::Empty => {
160                for node in &dom.children {
161                    if let Node::Comment(_) = node {
162                        // An "empty" document, but it has comments - this is where we cleanup the
163                        // earlier assumption that a document with only comments is "empty".
164                        // Really, it is a "fragment".
165                        dom.tree_type = DomVariant::DocumentFragment
166                    } else {
167                        // Anything else (i.e. Text() or Element() ) can't happen at the top level;
168                        // if we had seen one, we would have set the document type above
169                        unreachable!("[build dom] empty document with an Element {:?}", node)
170                    }
171                }
172            }
173
174            // A DomVariant::Document can only have comments and an <HTML> node at the top level.
175            // Only one <HTML> tag is permitted.
176            DomVariant::Document => {
177                if dom
178                    .children
179                    .iter()
180                    .filter(|x| match x {
181                        Node::Element(el) if el.name.to_lowercase() == "html" => true,
182                        _ => false,
183                    })
184                    .count()
185                    > 1
186                {
187                    return Err(Error::Parsing(format!("Document with multiple HTML tags",)));
188                }
189            }
190
191            // A DomVariant::DocumentFragment should not have <HEAD>, or <BODY> tags at the
192            // top-level.  If we find an <HTML> tag, then we consider this a Document instead (if
193            // it comes before any other elements, and if there is only one <HTML> tag).
194            DomVariant::DocumentFragment => {
195                let mut seen_html = false;
196                let mut seen_elements = false;
197
198                for node in &dom.children {
199                    match node {
200                        // Nodes other than <HTML> - reject <HEAD> and <BODY>
201                        Node::Element(ref el) if el.name.clone().to_lowercase() != "html" => {
202                            if el.name == "head" || el.name == "body" {
203                                return Err(Error::Parsing(format!(
204                                    "A document fragment should not include {}",
205                                    el.name
206                                )));
207                            }
208                            seen_elements = true;
209                        }
210                        // <HTML> Nodes - one (before any other elements) is okay
211                        Node::Element(ref el) if el.name.clone().to_lowercase() == "html" => {
212                            if seen_html || seen_elements {
213                                return Err(Error::Parsing(format!(
214                                    "A document fragment should not include {}",
215                                    el.name
216                                )));
217                            };
218
219                            // A fragment with just an <HTML> tag is a document
220                            dom.tree_type = DomVariant::Document;
221                            seen_html = true;
222                        }
223                        // Comment() and Text() nodes are permitted at the top-level of a
224                        // DocumentFragment
225                        _ => (),
226                    }
227                }
228            }
229        }
230
231        // The result is the validated tree
232        Ok(dom)
233    }
234
235    fn build_node_element(pair: Pair<Rule>, dom: &mut Dom) -> Result<Option<Node>> {
236        let source_span = {
237            let pair_span = pair.as_span();
238            let (start_line, start_column) = pair_span.start_pos().line_col();
239            let (end_line, end_column) = pair_span.end_pos().line_col();
240
241            SourceSpan::new(
242                String::from(pair_span.as_str()),
243                start_line,
244                end_line,
245                start_column,
246                end_column,
247            )
248        };
249
250        let mut element = Element {
251            source_span,
252            ..Element::default()
253        };
254
255        for pair in pair.into_inner() {
256            match pair.as_rule() {
257                Rule::node_element | Rule::el_raw_text => {
258                    match Self::build_node_element(pair, dom) {
259                        Ok(el) => {
260                            if let Some(child_element) = el {
261                                element.children.push(child_element)
262                            }
263                        }
264                        Err(error) => {
265                            dom.errors.push(format!("{}", error));
266                        }
267                    }
268                }
269                Rule::node_text | Rule::el_raw_text_content => {
270                    let text = pair.as_str().to_string();
271                    if !text.trim().is_empty() {
272                        element.children.push(Node::Text(text));
273                    }
274                }
275                Rule::node_comment => {
276                    element
277                        .children
278                        .push(Node::Comment(pair.into_inner().as_str().to_string()));
279                }
280                // TODO: To enable some kind of validation we should probably align this with
281                // https://html.spec.whatwg.org/multipage/syntax.html#elements-2
282                // Also see element variants
283                Rule::el_name | Rule::el_void_name | Rule::el_raw_text_name => {
284                    element.name = pair.as_str().to_string();
285                }
286                Rule::attr => match Self::build_attribute(pair.into_inner()) {
287                    Ok((attr_key, attr_value)) => {
288                        match attr_key.as_str() {
289                            "id" => element.id = attr_value,
290                            "class" => {
291                                if let Some(classes) = attr_value {
292                                    let classes = classes.split_whitespace().collect::<Vec<_>>();
293                                    for class in classes {
294                                        element.classes.push(class.to_string());
295                                    }
296                                }
297                            }
298                            _ => {
299                                element.attributes.insert(attr_key, attr_value);
300                            }
301                        };
302                    }
303                    Err(error) => {
304                        dom.errors.push(format!("{}", error));
305                    }
306                },
307                Rule::el_normal_end | Rule::el_raw_text_end => {
308                    element.variant = ElementVariant::Normal;
309                    break;
310                }
311                Rule::el_dangling => (),
312                Rule::EOI => (),
313                _ => {
314                    return Err(Error::Parsing(format!(
315                        "Failed to create element at rule: {:?}",
316                        pair.as_rule()
317                    )))
318                }
319            }
320        }
321        if element.name != "" {
322            Ok(Some(Node::Element(element)))
323        } else {
324            Ok(None)
325        }
326    }
327
328    fn build_attribute(pairs: Pairs<Rule>) -> Result<(String, Option<String>)> {
329        let mut attribute = ("".to_string(), None);
330        for pair in pairs {
331            match pair.as_rule() {
332                Rule::attr_key => {
333                    attribute.0 = pair.as_str().trim().to_string();
334                }
335                Rule::attr_non_quoted => {
336                    attribute.1 = Some(pair.as_str().trim().to_string());
337                }
338                Rule::attr_quoted => {
339                    let inner_pair = pair
340                        .into_inner()
341                        .into_iter()
342                        .next()
343                        .expect("attribute value");
344
345                    match inner_pair.as_rule() {
346                        Rule::attr_value => attribute.1 = Some(inner_pair.as_str().to_string()),
347                        _ => {
348                            return Err(Error::Parsing(format!(
349                                "Failed to parse attr value: {:?}",
350                                inner_pair.as_rule()
351                            )))
352                        }
353                    }
354                }
355                _ => {
356                    return Err(Error::Parsing(format!(
357                        "Failed to parse attr: {:?}",
358                        pair.as_rule()
359                    )))
360                }
361            }
362        }
363        Ok(attribute)
364    }
365}