html_parser/dom/mod.rs
1use crate::Result;
2use pest::{iterators::Pair, iterators::Pairs, Parser};
3use serde::Serialize;
4use std::default::Default;
5
6use crate::error::Error;
7use crate::grammar::Grammar;
8use crate::Rule;
9
10pub mod element;
11pub mod formatting;
12pub mod node;
13pub mod span;
14
15use crate::dom::span::SourceSpan;
16use element::{Element, ElementVariant};
17use node::Node;
18
19/// Document, DocumentFragment or Empty
20#[derive(Debug, Clone, PartialEq, Serialize)]
21#[serde(rename_all = "camelCase")]
22pub enum DomVariant {
23 /// This means that the parsed html had the representation of an html document. The doctype is optional but a document should only have one root node with the name of html.
24 /// Example:
25 /// ```text
26 /// <!doctype html>
27 /// <html>
28 /// <head></head>
29 /// <body>
30 /// <h1>Hello world</h1>
31 /// </body>
32 /// </html>
33 /// ```
34 Document,
35 /// A document fragment means that the parsed html did not have the representation of a document. A fragment can have multiple root children of any name except html, body or head.
36 /// Example:
37 /// ```text
38 /// <h1>Hello world</h1>
39 /// ```
40 DocumentFragment,
41 /// An empty dom means that the input was empty
42 Empty,
43}
44
45/// **The main struct** & the result of the parsed html
46#[derive(Debug, Clone, Serialize, PartialEq)]
47#[serde(rename_all = "camelCase")]
48pub struct Dom {
49 /// The type of the tree that was parsed
50 pub tree_type: DomVariant,
51
52 /// All of the root children in the tree
53 #[serde(skip_serializing_if = "Vec::is_empty")]
54 pub children: Vec<Node>,
55
56 /// A collection of all errors during parsing
57 #[serde(skip_serializing)]
58 pub errors: Vec<String>,
59}
60
61impl Default for Dom {
62 fn default() -> Self {
63 Self {
64 tree_type: DomVariant::Empty,
65 children: vec![],
66 errors: vec![],
67 }
68 }
69}
70
71impl Dom {
72 pub fn parse(input: &str) -> Result<Self> {
73 let pairs = match Grammar::parse(Rule::html, input) {
74 Ok(pairs) => pairs,
75 Err(error) => return formatting::error_msg(error),
76 };
77 Self::build_dom(pairs)
78 }
79
80 pub fn to_json(&self) -> Result<String> {
81 Ok(serde_json::to_string(self)?)
82 }
83
84 pub fn to_json_pretty(&self) -> Result<String> {
85 Ok(serde_json::to_string_pretty(self)?)
86 }
87
88 fn build_dom(pairs: Pairs<Rule>) -> Result<Self> {
89 let mut dom = Self::default();
90
91 // NOTE: The logic is roughly as follows:
92 // 1) A document containing nothing but comments is DomVariant::Empty even though it will have
93 // children in this first pass. We fix this in the next section. This allows us to use
94 // DomVariant::Empty to indicate "we haven't decided the type yet".
95 // 2) If the type is DomVariant::Empty _so far_, then it can be changed to DomVariant::Document
96 // or DomVariant::DocumentFragment. DomVariant is only selected in this stage if we see a
97 // DOCTYPE tag. Comments do not change the type.
98 // 3) If the type is non-empty, we don't re-set the type. We do look for conflicts between
99 // the type and the tokens in the next stage.
100 for pair in pairs {
101 match pair.as_rule() {
102 // A <!DOCTYPE> tag means a full-fledged document. Note that because of the way
103 // the grammar is written, we will only get this token if the <!DOCTYPE> occurs
104 // before any other tag; otherwise it will be parsed as a custom tag.
105 Rule::doctype => {
106 if dom.tree_type == DomVariant::Empty {
107 dom.tree_type = DomVariant::Document;
108 }
109 }
110
111 // If we see an element, build the sub-tree and add it as a child. If we don't
112 // have a document type yet (i.e. "empty"), select DocumentFragment
113 Rule::node_element => match Self::build_node_element(pair, &mut dom) {
114 Ok(el) => {
115 if let Some(node) = el {
116 if dom.tree_type == DomVariant::Empty {
117 dom.tree_type = DomVariant::DocumentFragment;
118 };
119 dom.children.push(node);
120 }
121 }
122 Err(error) => {
123 dom.errors.push(format!("{}", error));
124 }
125 },
126
127 // Similar to an element, we add it as a child and select DocumentFragment if we
128 // don't already have a document type.
129 Rule::node_text => {
130 if dom.tree_type == DomVariant::Empty {
131 dom.tree_type = DomVariant::DocumentFragment;
132 }
133 let text = pair.as_str().to_string();
134 if !text.trim().is_empty() {
135 dom.children.push(Node::Text(text));
136 }
137 }
138
139 // Store comments as a child, but it doesn't affect the document type selection
140 // until the next phase (validation).
141 Rule::node_comment => {
142 dom.children
143 .push(Node::Comment(pair.into_inner().as_str().to_string()));
144 }
145
146 // Ignore 'end of input', which then allows the catch-all unreachable!() arm to
147 // function properly.
148 Rule::EOI => (),
149
150 // This should be unreachable, due to the way the grammar is written
151 _ => unreachable!("[build dom] unknown rule: {:?}", pair.as_rule()),
152 };
153 }
154
155 // Implement some checks on the generated dom's data and initial type. The type may be
156 // modified in this section.
157 match dom.tree_type {
158 // A DomVariant::Empty can only have comments. Anything else is an error.
159 DomVariant::Empty => {
160 for node in &dom.children {
161 if let Node::Comment(_) = node {
162 // An "empty" document, but it has comments - this is where we cleanup the
163 // earlier assumption that a document with only comments is "empty".
164 // Really, it is a "fragment".
165 dom.tree_type = DomVariant::DocumentFragment
166 } else {
167 // Anything else (i.e. Text() or Element() ) can't happen at the top level;
168 // if we had seen one, we would have set the document type above
169 unreachable!("[build dom] empty document with an Element {:?}", node)
170 }
171 }
172 }
173
174 // A DomVariant::Document can only have comments and an <HTML> node at the top level.
175 // Only one <HTML> tag is permitted.
176 DomVariant::Document => {
177 if dom
178 .children
179 .iter()
180 .filter(|x| match x {
181 Node::Element(el) if el.name.to_lowercase() == "html" => true,
182 _ => false,
183 })
184 .count()
185 > 1
186 {
187 return Err(Error::Parsing(format!("Document with multiple HTML tags",)));
188 }
189 }
190
191 // A DomVariant::DocumentFragment should not have <HEAD>, or <BODY> tags at the
192 // top-level. If we find an <HTML> tag, then we consider this a Document instead (if
193 // it comes before any other elements, and if there is only one <HTML> tag).
194 DomVariant::DocumentFragment => {
195 let mut seen_html = false;
196 let mut seen_elements = false;
197
198 for node in &dom.children {
199 match node {
200 // Nodes other than <HTML> - reject <HEAD> and <BODY>
201 Node::Element(ref el) if el.name.clone().to_lowercase() != "html" => {
202 if el.name == "head" || el.name == "body" {
203 return Err(Error::Parsing(format!(
204 "A document fragment should not include {}",
205 el.name
206 )));
207 }
208 seen_elements = true;
209 }
210 // <HTML> Nodes - one (before any other elements) is okay
211 Node::Element(ref el) if el.name.clone().to_lowercase() == "html" => {
212 if seen_html || seen_elements {
213 return Err(Error::Parsing(format!(
214 "A document fragment should not include {}",
215 el.name
216 )));
217 };
218
219 // A fragment with just an <HTML> tag is a document
220 dom.tree_type = DomVariant::Document;
221 seen_html = true;
222 }
223 // Comment() and Text() nodes are permitted at the top-level of a
224 // DocumentFragment
225 _ => (),
226 }
227 }
228 }
229 }
230
231 // The result is the validated tree
232 Ok(dom)
233 }
234
235 fn build_node_element(pair: Pair<Rule>, dom: &mut Dom) -> Result<Option<Node>> {
236 let source_span = {
237 let pair_span = pair.as_span();
238 let (start_line, start_column) = pair_span.start_pos().line_col();
239 let (end_line, end_column) = pair_span.end_pos().line_col();
240
241 SourceSpan::new(
242 String::from(pair_span.as_str()),
243 start_line,
244 end_line,
245 start_column,
246 end_column,
247 )
248 };
249
250 let mut element = Element {
251 source_span,
252 ..Element::default()
253 };
254
255 for pair in pair.into_inner() {
256 match pair.as_rule() {
257 Rule::node_element | Rule::el_raw_text => {
258 match Self::build_node_element(pair, dom) {
259 Ok(el) => {
260 if let Some(child_element) = el {
261 element.children.push(child_element)
262 }
263 }
264 Err(error) => {
265 dom.errors.push(format!("{}", error));
266 }
267 }
268 }
269 Rule::node_text | Rule::el_raw_text_content => {
270 let text = pair.as_str().to_string();
271 if !text.trim().is_empty() {
272 element.children.push(Node::Text(text));
273 }
274 }
275 Rule::node_comment => {
276 element
277 .children
278 .push(Node::Comment(pair.into_inner().as_str().to_string()));
279 }
280 // TODO: To enable some kind of validation we should probably align this with
281 // https://html.spec.whatwg.org/multipage/syntax.html#elements-2
282 // Also see element variants
283 Rule::el_name | Rule::el_void_name | Rule::el_raw_text_name => {
284 element.name = pair.as_str().to_string();
285 }
286 Rule::attr => match Self::build_attribute(pair.into_inner()) {
287 Ok((attr_key, attr_value)) => {
288 match attr_key.as_str() {
289 "id" => element.id = attr_value,
290 "class" => {
291 if let Some(classes) = attr_value {
292 let classes = classes.split_whitespace().collect::<Vec<_>>();
293 for class in classes {
294 element.classes.push(class.to_string());
295 }
296 }
297 }
298 _ => {
299 element.attributes.insert(attr_key, attr_value);
300 }
301 };
302 }
303 Err(error) => {
304 dom.errors.push(format!("{}", error));
305 }
306 },
307 Rule::el_normal_end | Rule::el_raw_text_end => {
308 element.variant = ElementVariant::Normal;
309 break;
310 }
311 Rule::el_dangling => (),
312 Rule::EOI => (),
313 _ => {
314 return Err(Error::Parsing(format!(
315 "Failed to create element at rule: {:?}",
316 pair.as_rule()
317 )))
318 }
319 }
320 }
321 if element.name != "" {
322 Ok(Some(Node::Element(element)))
323 } else {
324 Ok(None)
325 }
326 }
327
328 fn build_attribute(pairs: Pairs<Rule>) -> Result<(String, Option<String>)> {
329 let mut attribute = ("".to_string(), None);
330 for pair in pairs {
331 match pair.as_rule() {
332 Rule::attr_key => {
333 attribute.0 = pair.as_str().trim().to_string();
334 }
335 Rule::attr_non_quoted => {
336 attribute.1 = Some(pair.as_str().trim().to_string());
337 }
338 Rule::attr_quoted => {
339 let inner_pair = pair
340 .into_inner()
341 .into_iter()
342 .next()
343 .expect("attribute value");
344
345 match inner_pair.as_rule() {
346 Rule::attr_value => attribute.1 = Some(inner_pair.as_str().to_string()),
347 _ => {
348 return Err(Error::Parsing(format!(
349 "Failed to parse attr value: {:?}",
350 inner_pair.as_rule()
351 )))
352 }
353 }
354 }
355 _ => {
356 return Err(Error::Parsing(format!(
357 "Failed to parse attr: {:?}",
358 pair.as_rule()
359 )))
360 }
361 }
362 }
363 Ok(attribute)
364 }
365}