//! Builds a `Dom` tree (document, document fragment or empty) from HTML source text using the crate's pest grammar.
use crate::Result;
use pest::{iterators::Pair, iterators::Pairs, Parser};
use serde::Serialize;
use std::default::Default;
use crate::error::Error;
use crate::grammar::Grammar;
use crate::Rule;
pub mod element;
pub mod formatting;
pub mod node;
pub mod span;
use crate::dom::span::SourceSpan;
use element::{Element, ElementVariant};
use node::Node;
/// The kind of tree a parse produced: a full document, a document fragment, or nothing.
///
/// Serialized by serde with camelCase variant names.
#[derive(Debug, Clone, PartialEq, Serialize)]
#[serde(rename_all = "camelCase")]
pub enum DomVariant {
/// The parsed html has the shape of an html document. The doctype is optional, but a
/// document may contain at most one root element named `html` (more than one is rejected
/// while the tree is built).
/// Example:
/// ```text
/// <!doctype html>
/// <html>
/// <head></head>
/// <body>
/// <h1>Hello world</h1>
/// </body>
/// </html>
/// ```
Document,
/// The parsed html does not have the shape of a document. A fragment can have multiple root
/// children of any name except `html`, `body` or `head`. Input consisting only of comments
/// is also classified as a fragment.
/// Example:
/// ```text
/// <h1>Hello world</h1>
/// ```
DocumentFragment,
/// Nothing was parsed into the tree: no doctype, no element, no text and no comment was
/// seen (typically because the input was empty). This is also the initial value used while
/// the tree type has not been decided yet.
Empty,
}
/// **The main struct** & the result of the parsed html.
///
/// Serialized by serde with camelCase field names (e.g. `tree_type` becomes `treeType`).
#[derive(Debug, Clone, Serialize, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct Dom {
/// The type of the tree that was parsed
pub tree_type: DomVariant,
/// All of the root children in the tree, in source order.
/// Omitted from the serialized output when empty.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub children: Vec<Node>,
/// Messages for the non-fatal errors collected while building elements and attributes.
/// Never included in the serialized output.
#[serde(skip_serializing)]
pub errors: Vec<String>,
}
impl Default for Dom {
fn default() -> Self {
Self {
tree_type: DomVariant::Empty,
children: vec![],
errors: vec![],
}
}
}
impl Dom {
/// Parses `input` as html and builds the resulting tree.
///
/// The input is run through the grammar starting at `Rule::html`; on success the produced
/// pairs are handed to `build_dom`.
///
/// # Errors
/// If the grammar rejects the input, the result of `formatting::error_msg` for that grammar
/// error is returned. Errors raised by `build_dom` are returned unchanged.
pub fn parse(input: &str) -> Result<Self> {
    match Grammar::parse(Rule::html, input) {
        Ok(parsed) => Self::build_dom(parsed),
        Err(grammar_error) => formatting::error_msg(grammar_error),
    }
}
/// Serializes this tree to a compact JSON string via `serde_json::to_string`.
///
/// # Errors
/// Any serialization error is converted into the crate's error type and returned.
pub fn to_json(&self) -> Result<String> {
    let json = serde_json::to_string(self)?;
    Ok(json)
}
/// Serializes this tree to a pretty-printed JSON string via `serde_json::to_string_pretty`.
///
/// # Errors
/// Any serialization error is converted into the crate's error type and returned.
pub fn to_json_pretty(&self) -> Result<String> {
    let json = serde_json::to_string_pretty(self)?;
    Ok(json)
}
fn build_dom(pairs: Pairs<Rule>) -> Result<Self> {
let mut dom = Self::default();
// NOTE: The logic is roughly as follows:
// 1) A document containing nothing but comments is DomVariant::Empty even though it will have
// children in this first pass. We fix this in the next section. This allows us to use
// DomVariant::Empty to indicate "we haven't decided the type yet".
// 2) If the type is DomVariant::Empty _so far_, then it can be changed to DomVariant::Document
// or DomVariant::DocumentFragment. DomVariant is only selected in this stage if we see a
// DOCTYPE tag. Comments do not change the type.
// 3) If the type is non-empty, we don't re-set the type. We do look for conflicts between
// the type and the tokens in the next stage.
for pair in pairs {
match pair.as_rule() {
// A <!DOCTYPE> tag means a full-fledged document. Note that because of the way
// the grammar is written, we will only get this token if the <!DOCTYPE> occurs
// before any other tag; otherwise it will be parsed as a custom tag.
Rule::doctype => {
if dom.tree_type == DomVariant::Empty {
dom.tree_type = DomVariant::Document;
}
}
// If we see an element, build the sub-tree and add it as a child. If we don't
// have a document type yet (i.e. "empty"), select DocumentFragment
Rule::node_element => match Self::build_node_element(pair, &mut dom) {
Ok(el) => {
if let Some(node) = el {
if dom.tree_type == DomVariant::Empty {
dom.tree_type = DomVariant::DocumentFragment;
};
dom.children.push(node);
}
}
Err(error) => {
dom.errors.push(format!("{}", error));
}
},
// Similar to an element, we add it as a child and select DocumentFragment if we
// don't already have a document type.
Rule::node_text => {
if dom.tree_type == DomVariant::Empty {
dom.tree_type = DomVariant::DocumentFragment;
}
let text = pair.as_str().to_string();
if !text.trim().is_empty() {
dom.children.push(Node::Text(text));
}
}
// Store comments as a child, but it doesn't affect the document type selection
// until the next phase (validation).
Rule::node_comment => {
dom.children
.push(Node::Comment(pair.into_inner().as_str().to_string()));
}
// Ignore 'end of input', which then allows the catch-all unreachable!() arm to
// function properly.
Rule::EOI => (),
// This should be unreachable, due to the way the grammar is written
_ => unreachable!("[build dom] unknown rule: {:?}", pair.as_rule()),
};
}
// Implement some checks on the generated dom's data and initial type. The type may be
// modified in this section.
match dom.tree_type {
// A DomVariant::Empty can only have comments. Anything else is an error.
DomVariant::Empty => {
for node in &dom.children {
if let Node::Comment(_) = node {
// An "empty" document, but it has comments - this is where we cleanup the
// earlier assumption that a document with only comments is "empty".
// Really, it is a "fragment".
dom.tree_type = DomVariant::DocumentFragment
} else {
// Anything else (i.e. Text() or Element() ) can't happen at the top level;
// if we had seen one, we would have set the document type above
unreachable!("[build dom] empty document with an Element {:?}", node)
}
}
}
// A DomVariant::Document can only have comments and an <HTML> node at the top level.
// Only one <HTML> tag is permitted.
DomVariant::Document => {
if dom
.children
.iter()
.filter(|x| match x {
Node::Element(el) if el.name.to_lowercase() == "html" => true,
_ => false,
})
.count()
> 1
{
return Err(Error::Parsing(format!("Document with multiple HTML tags",)));
}
}
// A DomVariant::DocumentFragment should not have <HEAD>, or <BODY> tags at the
// top-level. If we find an <HTML> tag, then we consider this a Document instead (if
// it comes before any other elements, and if there is only one <HTML> tag).
DomVariant::DocumentFragment => {
let mut seen_html = false;
let mut seen_elements = false;
for node in &dom.children {
match node {
// Nodes other than <HTML> - reject <HEAD> and <BODY>
Node::Element(ref el) if el.name.clone().to_lowercase() != "html" => {
if el.name == "head" || el.name == "body" {
return Err(Error::Parsing(format!(
"A document fragment should not include {}",
el.name
)));
}
seen_elements = true;
}
// <HTML> Nodes - one (before any other elements) is okay
Node::Element(ref el) if el.name.clone().to_lowercase() == "html" => {
if seen_html || seen_elements {
return Err(Error::Parsing(format!(
"A document fragment should not include {}",
el.name
)));
};
// A fragment with just an <HTML> tag is a document
dom.tree_type = DomVariant::Document;
seen_html = true;
}
// Comment() and Text() nodes are permitted at the top-level of a
// DocumentFragment
_ => (),
}
}
}
}
// The result is the validated tree
Ok(dom)
}
/// Builds one element node (including its whole subtree) from an element pair.
///
/// Returns `Ok(None)` when no element name was found among the inner pairs, otherwise
/// `Ok(Some(Node::Element(..)))`.
///
/// Failures while building child elements or attributes are not fatal: their messages are
/// appended to `dom.errors` and processing continues.
///
/// # Errors
/// Returns `Error::Parsing` if an inner pair has a rule this function does not handle.
fn build_node_element(pair: Pair<Rule>, dom: &mut Dom) -> Result<Option<Node>> {
    // Record where the element sits in the source text before the pair is consumed.
    let span = pair.as_span();
    let (start_line, start_column) = span.start_pos().line_col();
    let (end_line, end_column) = span.end_pos().line_col();
    let source_span = SourceSpan::new(
        span.as_str().to_string(),
        start_line,
        end_line,
        start_column,
        end_column,
    );

    let mut element = Element {
        source_span,
        ..Element::default()
    };

    for inner in pair.into_inner() {
        match inner.as_rule() {
            // Nested elements are built recursively; unnamed ones are silently dropped.
            Rule::node_element | Rule::el_raw_text => {
                match Self::build_node_element(inner, dom) {
                    Ok(Some(child)) => element.children.push(child),
                    Ok(None) => (),
                    Err(error) => dom.errors.push(error.to_string()),
                }
            }
            // Whitespace-only text is not kept.
            Rule::node_text | Rule::el_raw_text_content => {
                let text = inner.as_str();
                if !text.trim().is_empty() {
                    element.children.push(Node::Text(text.to_string()));
                }
            }
            Rule::node_comment => {
                let comment = inner.into_inner().as_str().to_string();
                element.children.push(Node::Comment(comment));
            }
            // TODO: To enable some kind of validation we should probably align this with
            // https://html.spec.whatwg.org/multipage/syntax.html#elements-2
            // Also see element variants
            Rule::el_name | Rule::el_void_name | Rule::el_raw_text_name => {
                element.name = inner.as_str().to_string();
            }
            // `id` and `class` get dedicated fields; everything else goes to `attributes`.
            Rule::attr => match Self::build_attribute(inner.into_inner()) {
                Ok((key, value)) => match key.as_str() {
                    "id" => element.id = value,
                    "class" => {
                        if let Some(class_list) = value {
                            for class_name in class_list.split_whitespace() {
                                element.classes.push(class_name.to_string());
                            }
                        }
                    }
                    _ => {
                        element.attributes.insert(key, value);
                    }
                },
                Err(error) => dom.errors.push(error.to_string()),
            },
            // An end tag marks a normal element and finishes it.
            Rule::el_normal_end | Rule::el_raw_text_end => {
                element.variant = ElementVariant::Normal;
                break;
            }
            Rule::el_dangling | Rule::EOI => (),
            unexpected => {
                return Err(Error::Parsing(format!(
                    "Failed to create element at rule: {:?}",
                    unexpected
                )))
            }
        }
    }

    if element.name.is_empty() {
        Ok(None)
    } else {
        Ok(Some(Node::Element(element)))
    }
}
fn build_attribute(pairs: Pairs<Rule>) -> Result<(String, Option<String>)> {
let mut attribute = ("".to_string(), None);
for pair in pairs {
match pair.as_rule() {
Rule::attr_key => {
attribute.0 = pair.as_str().trim().to_string();
}
Rule::attr_non_quoted => {
attribute.1 = Some(pair.as_str().trim().to_string());
}
Rule::attr_quoted => {
let inner_pair = pair
.into_inner()
.into_iter()
.next()
.expect("attribute value");
match inner_pair.as_rule() {
Rule::attr_value => attribute.1 = Some(inner_pair.as_str().to_string()),
_ => {
return Err(Error::Parsing(format!(
"Failed to parse attr value: {:?}",
inner_pair.as_rule()
)))
}
}
}
_ => {
return Err(Error::Parsing(format!(
"Failed to parse attr: {:?}",
pair.as_rule()
)))
}
}
}
Ok(attribute)
}
}