html2md_rs/parser.rs
//! This module contains functions that parse an HTML string into a custom `Node` struct.
2//!
3//! The Node struct is used to represent the HTML elements and their children in a tree-like structure.
4//!
5//! With the `safe_parse_html` function, malformed HTML will return an error instead of panicking.
6//! The `parse_html` function is a wrapper around `safe_parse_html` that panics if the input is malformed. However, it is deprecated and will be removed in future versions.
7
8use crate::structs::{
9 AttributeValues, Attributes, Node,
10 NodeType::{self, *},
11};
12use std::{collections::VecDeque, fmt::Display};
13
/// Ways in which an HTML tag can be malformed.
///
/// Every variant carries the approximate index in the input at which the
/// problem was detected.
#[derive(Debug, PartialEq, Eq)]
pub enum MalformedTagError {
    /// The tag is never terminated by a closing bracket.
    MissingClosingBracket(u32),
    /// The tag does not contain a name.
    MissingTagName(u32),
}
22
/// Ways in which the attributes of an HTML tag can be malformed.
///
/// Every variant carries the approximate index in the input at which the
/// problem was detected.
#[derive(Debug, PartialEq, Eq)]
pub enum MalformedAttributeError {
    /// A quoted attribute value is missing a quotation mark.
    MissingQuotationMark(u32),
    /// A value was found without an attribute name in front of it.
    MissingAttributeName(u32),
    /// An attribute name was found without its value.
    MissingAttributeValue(u32),
}
33
34/// Errors that can occur when parsing HTML
35#[derive(Debug, PartialEq, Eq)]
36pub enum ParseHTMLError {
37 /// The tag is malformed
38 MalformedTag(String, MalformedTagError),
39 /// The attribute is malformed
40 MalformedAttribute(String, MalformedAttributeError),
41}
42
43impl Display for ParseHTMLError {
44 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
45 match self {
46 ParseHTMLError::MalformedTag(tag, error) => match error {
47 MalformedTagError::MissingClosingBracket(index) => {
48 write!(
49 f,
50 "Malformed tag: {} - Missing closing bracket at around index {}",
51 tag, index
52 )
53 }
54 MalformedTagError::MissingTagName(index) => {
55 write!(
56 f,
57 "Malformed tag: {} - Missing tag name at around index {}",
58 tag, index
59 )
60 }
61 },
62 ParseHTMLError::MalformedAttribute(attr, error) => match error {
63 MalformedAttributeError::MissingQuotationMark(index) => {
64 write!(
65 f,
66 "Malformed attribute: {} - Missing quotation mark at around index {}",
67 attr, index
68 )
69 }
70 MalformedAttributeError::MissingAttributeName(index) => {
71 write!(
72 f,
73 "Malformed attribute: {} - Missing attribute name at around index {}",
74 attr, index
75 )
76 }
77 MalformedAttributeError::MissingAttributeValue(index) => {
78 write!(
79 f,
80 "Malformed attribute: {} - Missing attribute value at around index {}",
81 attr, index
82 )
83 }
84 },
85 }
86 }
87}
88
89/// Safely parses a string of HTML into a Node struct
90///
91/// # Arguments
92///
93/// * `input` - A string slice that holds the HTML to be parsed
94///
95/// # Examples
96///
97/// ```
98/// use html2md_rs::{
99/// parser::safe_parse_html,
100/// structs::{
101/// Node,
102/// NodeType::{Div, Text},
103/// },
104/// };
105///
106/// let input = "<div>hello</div>".to_string();
107/// let parsed = safe_parse_html(input);
108/// let expected = Node {
109/// tag_name: Some(Div),
110/// value: None,
111/// within_special_tag: None,
112/// attributes: None,
113/// children: vec![Node {
114/// tag_name: Some(Text),
115/// value: Some("hello".to_string()),
116/// attributes: None,
117/// within_special_tag: None,
118/// children: Vec::new(),
119/// }],
120/// };
121///
122/// assert_eq!(parsed, Ok(expected));
123/// ```
124pub fn safe_parse_html(input: String) -> Result<Node, ParseHTMLError> {
125 // current_index is the index of the current character being processed
126 let mut current_index = 0;
127 // nodes is a vector of nodes that will be returned as an attribute of the resulting node
128 let mut nodes = Vec::new();
129 // stack is a LIFO stack of nodes that are being processed
130 let mut stack: Vec<Node> = Vec::new();
131
132 while current_index < input.len() {
133 let rest = &input[current_index..];
134 if rest.starts_with("<!") {
135 // if the current character is an exclamation mark, it's a comment or DOCTYPE
136 if rest.starts_with("<!DOCTYPE") {
137 // if the comment is a DOCTYPE, ignore it
138 current_index += rest.find('>').unwrap() + 1;
139 continue;
140 }
141 // find the closing comment tag
142 if let Some(closing_comment_index) = rest.find("-->") {
143 // if the closing comment tag is found, the comment is valid
144 // extract the comment from the rest
145 let comment = &rest[..closing_comment_index + 3];
146 // create a new node with the comment
147 let new_node = Node {
148 tag_name: Some(Comment),
149 value: Some(
150 comment
151 .trim_start_matches("<!")
152 .trim_start_matches("--")
153 .trim_end_matches("-->")
154 .to_string(),
155 ),
156 attributes: None,
157 within_special_tag: None,
158 children: Vec::new(),
159 };
160 // add the new_node to the stack
161 nodes.push(new_node);
162 // increment the current_index by the closing_comment_index + 3
163 // and continue to the next iteration
164 current_index += closing_comment_index + 3;
165 continue;
166 }
167 // if the closing comment tag is not found, the comment is malformed
168 return Err(ParseHTMLError::MalformedTag(
169 rest.to_string(),
170 MalformedTagError::MissingClosingBracket(current_index as u32),
171 ));
172 }
173
174 if rest.starts_with('<') {
175 if let Some(mut closing_index) = find_closing_bracket_index(rest) {
176 // if the tag is a self-closing tag (i.e. <tag_name ... />)
177 let self_closing = if rest.chars().nth(closing_index - 1) == Some('/') {
178 // if the last character right before the closing bracket is a forward slash, the tag is self-closing
179 // closing_index is the index of the closing bracket, so decrement it to ignore the forward slash
180 closing_index -= 1;
181 true
182 } else {
183 // if the last character right before the closing bracket is not a forward slash, the tag is not self-closing
184 false
185 };
186
187 // the tag content is the string between the opening and closing brackets
188 let tag_content = &rest[1..closing_index];
189
190 // initialize the node name and attribute map
191 let node_name;
192 let mut attribute_map = None;
193 if let Some(space_index) = tag_content.find(|c: char| c.is_whitespace()) {
194 // if the tag contains a space, split the tag into the node name and attributes
195 // space_index is the index of the first spce
196 // node_name is the tag name (i.e. <tag_name ...>)
197 node_name = &tag_content[..space_index];
198 // attributes is the string after the first space before the closing bracket
199 let attributes = &tag_content[space_index..];
200 // parse the attribute string into a map
201 match parse_tag_attributes(attributes, current_index) {
202 Ok(map) => attribute_map = map,
203 Err(err) => return Err(err),
204 }
205 } else {
206 // if the tag doesn't contain a space, the tag is the node name
207 node_name = tag_content;
208 }
209
210 if node_name.is_empty() {
211 // if the tag name is empty, the tag is malformed
212 return Err(ParseHTMLError::MalformedTag(
213 tag_content.to_string(),
214 MalformedTagError::MissingTagName(current_index as u32),
215 ));
216 }
217
218 if rest.starts_with("</") {
219 // if the tag is a closing tag, pop the last node from the stack and add it to the parent
220 match stack.pop() {
221 Some(last_node) => {
222 if stack.is_empty() {
223 // if the stack is empty, the last node is the root node
224 nodes.push(last_node);
225 } else {
226 let parent = stack.last_mut().unwrap(); // stack is not empty, so unwrap is safe
227 parent.children.push(last_node);
228 }
229 current_index += closing_index + 1;
230 continue;
231 }
232 None => {
233 // if there is nothing in the stack, the tag is malformed
234 let closing_bracket_of_closing_tag = rest.find('>');
235 return Err(ParseHTMLError::MalformedTag(
236 if let Some(index) = closing_bracket_of_closing_tag {
237 // if there is a closing bracket, return the tag with the error
238 rest[..index + 1].to_string()
239 } else {
240 rest.to_string()
241 },
242 MalformedTagError::MissingClosingBracket(current_index as u32),
243 ));
244 }
245 }
246 }
247
248 // parse thae tag name into a NodeType from the node_name string
249 let node_type = NodeType::from_tag_str(node_name);
250
251 // initialize a new node with the tag name and attribute map
252 let mut new_node = Node {
253 tag_name: Some(node_type.clone()),
254 value: None,
255 attributes: attribute_map,
256 within_special_tag: None,
257 children: Vec::new(),
258 };
259
260 if self_closing {
261 // if the tag is self-closing, add the node to the parent
262 // if a parent does not exist, add the node to the nodes vector
263 if let Some(parent) = stack.last_mut() {
264 modify_node_with_parent(&mut new_node, parent);
265 parent.children.push(new_node);
266 } else {
267 nodes.push(new_node);
268 }
269 // because the tag is self-closing, increment the current_index by the closing_index + 2
270 // and continute to the next iteration
271 current_index += closing_index + 2;
272 continue;
273 }
274 // if the tag is not self-closing
275 // add the new_node to the stack
276 if let Some(parent) = stack.last_mut() {
277 modify_node_with_parent(&mut new_node, parent);
278 }
279 stack.push(new_node);
280 // because the tag is not self-closing, increment the current_index by the closing_index + 1
281 current_index += closing_index + 1;
282 continue;
283 } else {
284 // if a closing bracket is not found, the tag is malformed
285 return Err(ParseHTMLError::MalformedTag(
286 rest.to_string(),
287 MalformedTagError::MissingClosingBracket(current_index as u32),
288 ));
289 }
290 }
291
292 // if the current character is not a '<', it's just a text
293 // if an opening bracket is not found, the rest is the content of the text
294 // else, anything upto the opening bracket is the content of the text
295 let next_opening_tag = rest.find('<').unwrap_or(rest.len());
296 let text = &rest[..next_opening_tag];
297 if text.trim().is_empty() {
298 // if text is empty or only whitespace, ignore it
299 // increment the current_index by next_opening_tag and continue to the next iteration
300 current_index += next_opening_tag;
301 continue;
302 }
303
304 // initialize new_node as text with the content of the text
305 let new_node = Node {
306 tag_name: Some(Text),
307 value: Some(text.to_string()),
308 attributes: None,
309 within_special_tag: None,
310 children: Vec::new(),
311 };
312
313 // add the new_node to the stack
314 modify_stack_with_node(&mut stack, new_node);
315
316 current_index += next_opening_tag
317 }
318
319 // if the stack is not empty, add the stack to the nodes vector
320 if !stack.is_empty() {
321 for stack_node in stack.drain(..) {
322 nodes.push(stack_node);
323 }
324 }
325
326 if nodes.len() == 1 {
327 return Ok(nodes.remove(0));
328 }
329
330 Ok(Node {
331 tag_name: None,
332 value: None,
333 attributes: None,
334 within_special_tag: None,
335 children: nodes,
336 })
337}
338
339/// Adds a new node to the stack with respect to the parent node's special tag and tag type
340///
341/// # Arguments
342///
343/// * `stack` - A mutable reference to a vector of nodes
344/// * `new_node` - A mutable reference to a node to be added to the stack
345fn modify_stack_with_node(stack: &mut Vec<Node>, mut new_node: Node) {
346 if let Some(parent) = stack.last_mut() {
347 // if the stack is not empty, add new_node to the parent
348 // modify the new_node with the parent's within_special_tag and tag type
349 modify_node_with_parent(&mut new_node, parent);
350 parent.children.push(new_node.clone());
351 return;
352 }
353 // if stack is empty, add new_node to the stack
354 stack.push(new_node.clone());
355}
356
357/// Modifies a node with the parent's within_special_tag and tag type
358///
359/// # Arguments
360///
361/// * `node` - A mutable reference to a Node to be modified
362/// * `parent` - A reference to the parent Node
363fn modify_node_with_parent(node: &mut Node, parent: &Node) {
364 if parent.within_special_tag.is_some() {
365 node.within_special_tag
366 .clone_from(&parent.within_special_tag)
367 }
368 if let Some(parent_tag_name) = &parent.tag_name {
369 if parent_tag_name.is_special_tag() {
370 if let Some(within_special_tag) = &mut node.within_special_tag {
371 within_special_tag.push(parent_tag_name.clone());
372 } else {
373 node.within_special_tag = Some(vec![parent_tag_name.clone()]);
374 }
375 }
376 }
377}
378
379/// Parses a string of HTML into a Node struct
380///
381/// Panics if the input is malformed
382///
383/// # Arguments
384///
385/// * `input` - A string slice that holds the HTML to be parsed
386///
387/// # Examples
388///
389/// ```
390/// use html2md_rs::{
391/// parser::parse_html,
392/// structs::{
393/// Node,
394/// NodeType::{Div, Text},
395/// },
396/// };
397///
398/// let input = "<div>hello</div>".to_string();
399/// let parsed = parse_html(input);
400/// let expected = Node {
401/// tag_name: Some(Div),
402/// value: None,
403/// attributes: None,
404/// within_special_tag: None,
405/// children: vec![Node {
406/// tag_name: Some(Text),
407/// value: Some("hello".to_string()),
408/// attributes: None,
409/// within_special_tag: None,
410/// children: Vec::new(),
411/// }],
412/// };
413///
414/// assert_eq!(parsed, expected);
415/// ```
416#[deprecated(
417 since = "0.7.0",
418 note = "This function is deprecated and will be removed in future versions. Please use the safe_parse_html function instead."
419)]
420pub fn parse_html(input: String) -> Node {
421 let parsed = safe_parse_html(input);
422 match parsed {
423 Ok(node) => node,
424 Err(err) => panic!("error parsing html: {:?}", err),
425 }
426}
427
428fn parse_tag_attributes(
429 tag_attributes: &str,
430 current_index: usize,
431) -> Result<Option<Attributes>, ParseHTMLError> {
432 let tag_attributes = tag_attributes.trim();
433
434 // if the input is empty or only whitespace, return None
435 if tag_attributes.is_empty() {
436 return Ok(None);
437 }
438
439 let mut attribute_map = Attributes::new();
440
441 let mut current_key = String::new();
442 let mut current_value_in_quotes = String::new();
443 let mut in_quotes = false;
444 let mut may_be_reading_non_quoted_value = false;
445
446 for char in tag_attributes.trim().chars() {
447 // iterate through each character in the trimmed tag_attributes string
448
449 if in_quotes {
450 // if we are in quotation marks, just add the character to the current_value_in_quotes
451 // except for if the character is a quotation mark, which indicates the end of the value
452 if char.eq(&'"') {
453 // if the character is a quotation mark, add the current_value_in_quotes to the attribute_map
454 // and reset the current_key and current_value_in_quotes
455 add_to_attribute_map(&mut attribute_map, ¤t_key, ¤t_value_in_quotes);
456 current_key.clear();
457 current_value_in_quotes.clear();
458 in_quotes = false;
459 continue;
460 }
461 current_value_in_quotes.push(char);
462 continue;
463 }
464
465 if char.eq(&'"') {
466 // if the character is a quotation mark, we are about to start the value
467 // we know in_quotes is false because that is checked above
468 if current_key.is_empty() {
469 // if the current_key is empty, the attribute is malformed
470 return Err(ParseHTMLError::MalformedAttribute(
471 tag_attributes.to_string(),
472 MalformedAttributeError::MissingAttributeName(current_index as u32),
473 ));
474 }
475 // set the in_quotes flag to true
476 in_quotes = true;
477 // if the character is a quotation mark, we are going to be in quotes
478 // so we don't need to keep track of non-quoted value flag
479 may_be_reading_non_quoted_value = false;
480 continue;
481 }
482
483 if char.is_whitespace() {
484 if may_be_reading_non_quoted_value {
485 if current_value_in_quotes.is_empty() {
486 // if we are reading a non-quoted value and the value is empty, we can ignore the whitespace
487 continue;
488 }
489 // if we are reading a non-quoted value, the whitespace indicates the end of the value
490 // add the value to the attribute_map
491 add_to_attribute_map(&mut attribute_map, ¤t_key, ¤t_value_in_quotes);
492 current_key.clear();
493 current_value_in_quotes.clear();
494 may_be_reading_non_quoted_value = false;
495 continue;
496 }
497 // if the character is whitespace, if could be indicating the end of a key
498 if !current_key.is_empty() {
499 // if the key has some value, add it to the attribute_map with value true
500 attribute_map.insert(current_key.clone(), AttributeValues::from(true));
501 current_key.clear();
502 continue;
503 }
504 // if the current_key is empty, the whitespace can be ignored
505 continue;
506 }
507
508 if !in_quotes && !may_be_reading_non_quoted_value && char.eq(&'=') {
509 // if the character is an equal sign, the current_key is complete
510 // if we are in quotes or reading a non-quoted value, the equal sign is part of the value
511 // and we are about to start the value
512 if current_key.is_empty() {
513 // if the current_key is empty, the attribute is malformed
514 return Err(ParseHTMLError::MalformedAttribute(
515 tag_attributes.to_string(),
516 MalformedAttributeError::MissingAttributeName(current_index as u32),
517 ));
518 }
519 // equal sign indicates the start of the value up to the next whitespace
520 may_be_reading_non_quoted_value = true;
521 continue;
522 }
523
524 if may_be_reading_non_quoted_value {
525 // if we are reading a non-quoted value, add the character to the current_value_in_quotes
526 current_value_in_quotes.push(char);
527 continue;
528 }
529
530 // otherwise, add the character to the current_key
531 current_key.push(char);
532 }
533
534 if may_be_reading_non_quoted_value && !current_value_in_quotes.is_empty() {
535 // if we are reading a non-quoted value and the value is not empty, add the value to the attribute_map
536 add_to_attribute_map(&mut attribute_map, ¤t_key, ¤t_value_in_quotes);
537 }
538
539 if in_quotes {
540 return Err(ParseHTMLError::MalformedAttribute(
541 current_value_in_quotes,
542 MalformedAttributeError::MissingQuotationMark(current_index as u32),
543 ));
544 }
545
546 // if not, return the attribute map
547 match attribute_map.is_empty() {
548 true => Ok(None),
549 false => Ok(Some(attribute_map)),
550 }
551}
552
553fn add_to_attribute_map(
554 attribute_map: &mut Attributes,
555 current_key: &str,
556 current_value_in_quotes: &str,
557) {
558 if current_key.is_empty() || current_value_in_quotes.is_empty() {
559 return;
560 }
561 attribute_map.insert(
562 current_key.to_string(),
563 AttributeValues::from(current_value_in_quotes),
564 );
565}
566
/// Finds the byte index of the `>` that closes the tag at the start of `rest`
///
/// A `>` inside a quoted attribute value is skipped (needed to fix #31).
/// A quoted value is closed only by the same quotation mark that opened it, so
/// the other kind of quotation mark may appear inside it (e.g. `alt="it's"`).
///
/// # Arguments
///
/// * `rest` - The remaining input, starting at the tag to be closed
///
/// # Returns
///
/// The byte index of the closing bracket, or `None` if there is none outside
/// of a quoted value.
fn find_closing_bracket_index(rest: &str) -> Option<usize> {
    // the quotation mark that opened the attribute value being skipped, if any
    let mut open_quote: Option<char> = None;
    for (idx, ch) in rest.char_indices() {
        match (open_quote, ch) {
            (Some(quote), _) => {
                if ch == quote {
                    open_quote = None;
                }
            }
            (None, '"') | (None, '\'') => open_quote = Some(ch),
            (None, '>') => return Some(idx),
            (None, _) => {}
        }
    }
    None
}
587
// https://github.com/izyuumi/html2md-rs/issues/25
// Whitespace between the equal sign and a quoted value must be accepted.
#[test]
fn issue_25() {
    let raw_attributes = "property=\"og:type\" content= \"website\"";

    let actual = parse_tag_attributes(raw_attributes, 0).unwrap().unwrap();

    let wanted = Attributes::from(vec![
        ("property".to_string(), AttributeValues::from("og:type")),
        ("content".to_string(), AttributeValues::from("website")),
    ]);
    assert_eq!(actual, wanted);
}
599
// https://github.com/izyuumi/html2md-rs/issues/31
// A closing bracket inside a quoted attribute value must not end the tag.
#[test]
fn issue_31() {
    let html = r#"<img src="https://exmaple.com/img.png" alt="Rust<br/>Logo"/>"#;

    let other_attributes = std::collections::HashMap::from([
        (
            "src".to_string(),
            AttributeValues::from("https://exmaple.com/img.png"),
        ),
        ("alt".to_string(), AttributeValues::from("Rust<br/>Logo")),
    ]);
    let wanted = Node {
        tag_name: Some(Unknown("img".to_string())),
        value: None,
        attributes: Some(Attributes {
            id: None,
            class: None,
            href: None,
            attributes: other_attributes,
        }),
        children: Vec::new(),
        within_special_tag: None,
    };

    let actual = safe_parse_html(html.to_string()).unwrap();
    assert_eq!(actual, wanted)
}
625
// https://github.com/izyuumi/html2md-rs/issues/36
// Tags that contain non-ASCII text or a space before `/>` must be parsed without errors.
#[test]
fn issue_36() {
    // a self-closing tag with a space in front of the slash
    let image_url =
        "https://hoerspiele.dra.de/fileadmin/www.hoerspiele.dra.de/images/vollinfo/4970918_B01.jpg";
    let self_closing_html = "<img src=\"https://hoerspiele.dra.de/fileadmin/www.hoerspiele.dra.de/images/vollinfo/4970918_B01.jpg\" />";
    let wanted = Node {
        tag_name: Some(Unknown("img".to_string())),
        value: None,
        attributes: Some(Attributes {
            id: None,
            class: None,
            href: None,
            attributes: std::collections::HashMap::from([(
                "src".to_string(),
                AttributeValues::from(image_url),
            )]),
        }),
        children: Vec::new(),
        within_special_tag: None,
    };
    let actual = safe_parse_html(self_closing_html.to_string()).unwrap();
    assert_eq!(actual, wanted);

    // a larger document with multi-byte characters inside attribute values;
    // it only has to parse without an error
    let document = r#"<!DOCTYPE html><meta http-equiv="content-type" content="text/html; charset=utf-8"><div class="column"><div class="gallery-wrap single">
 <div class="gallery-container">
 <figure class="image">
 <figure class="image">
 <img title="Illustration »Der dunkle Kongress« © ARD / Jürgen Frey"
 alt="Illustration »Der dunkle Kongress« © ARD / Jürgen Frey"
 src="https://hoerspiele.dra.de/fileadmin/www.hoerspiele.dra.de/images/vollinfo/4970918_B01.jpg">
 <figcaption class="image-caption">Illustration »Der dunkle Kongress«
© ARD / Jürgen Frey</figcaption>
</figure></div></div></div>"#;
    safe_parse_html(document.to_string()).unwrap();
}