html2md_rs/
parser.rs

//! This module contains functions which parse an HTML string into a custom Node struct.
2//!
3//! The Node struct is used to represent the HTML elements and their children in a tree-like structure.
4//!
5//! With the `safe_parse_html` function, malformed HTML will return an error instead of panicking.
6//! The `parse_html` function is a wrapper around `safe_parse_html` that panics if the input is malformed. However, it is deprecated and will be removed in future versions.
7
8use crate::structs::{
9    AttributeValues, Attributes, Node,
10    NodeType::{self, *},
11};
12use std::{collections::VecDeque, fmt::Display};
13
/// Errors that will be returned when parsing malformed HTML tags
///
/// The `u32` carried by each variant is the parser's byte offset into the input
/// at which the offending tag starts (a `usize` cast to `u32`).
#[derive(Debug, PartialEq, Eq)]
pub enum MalformedTagError {
    /// The closing bracket of the tag is missing
    ///
    /// The parser in this module also reports this variant for a comment that has
    /// no terminating `-->` and for a closing tag that has no open element to close.
    MissingClosingBracket(u32),
    /// The tag name is missing (nothing usable between `<` and `>`)
    MissingTagName(u32),
}
22
/// Errors that will be returned when parsing malformed HTML attributes
///
/// The `u32` carried by each variant is the byte offset of the enclosing tag in
/// the input (a `usize` cast to `u32`), not the offset of the attribute itself.
#[derive(Debug, PartialEq, Eq)]
pub enum MalformedAttributeError {
    /// The quotation mark of the attribute is missing
    /// (the attribute string ended while a quoted value was still open)
    MissingQuotationMark(u32),
    /// The attribute name is missing
    /// (a quotation mark or `=` was found before any attribute name)
    MissingAttributeName(u32),
    /// The attribute value is missing
    ///
    /// NOTE(review): this variant does not appear to be constructed anywhere in
    /// this module; an attribute with an empty value is currently dropped silently.
    MissingAttributeValue(u32),
}
33
/// Errors that can occur when parsing HTML
///
/// The `String` in each variant is the piece of input the error refers to; the
/// inner error says what is wrong with it and roughly where in the input it is.
#[derive(Debug, PartialEq, Eq)]
pub enum ParseHTMLError {
    /// The tag is malformed
    ///
    /// The string is the offending tag text, or the remaining input when the end
    /// of the tag could not be located.
    MalformedTag(String, MalformedTagError),
    /// The attribute is malformed
    ///
    /// The string is the attribute text of the tag, or the partially read value
    /// when a closing quotation mark is missing.
    MalformedAttribute(String, MalformedAttributeError),
}
42
43impl Display for ParseHTMLError {
44    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
45        match self {
46            ParseHTMLError::MalformedTag(tag, error) => match error {
47                MalformedTagError::MissingClosingBracket(index) => {
48                    write!(
49                        f,
50                        "Malformed tag: {} - Missing closing bracket at around index {}",
51                        tag, index
52                    )
53                }
54                MalformedTagError::MissingTagName(index) => {
55                    write!(
56                        f,
57                        "Malformed tag: {} - Missing tag name at around index {}",
58                        tag, index
59                    )
60                }
61            },
62            ParseHTMLError::MalformedAttribute(attr, error) => match error {
63                MalformedAttributeError::MissingQuotationMark(index) => {
64                    write!(
65                        f,
66                        "Malformed attribute: {} - Missing quotation mark at around index {}",
67                        attr, index
68                    )
69                }
70                MalformedAttributeError::MissingAttributeName(index) => {
71                    write!(
72                        f,
73                        "Malformed attribute: {} - Missing attribute name at around index {}",
74                        attr, index
75                    )
76                }
77                MalformedAttributeError::MissingAttributeValue(index) => {
78                    write!(
79                        f,
80                        "Malformed attribute: {} - Missing attribute value at around index {}",
81                        attr, index
82                    )
83                }
84            },
85        }
86    }
87}
88
89/// Safely parses a string of HTML into a Node struct
90///
91/// # Arguments
92///
93/// * `input` - A string slice that holds the HTML to be parsed
94///
95/// # Examples
96///
97/// ```
98/// use html2md_rs::{
99///     parser::safe_parse_html,
100///     structs::{
101///         Node,
102///         NodeType::{Div, Text},
103///     },
104/// };
105///
106/// let input = "<div>hello</div>".to_string();
107/// let parsed = safe_parse_html(input);
108/// let expected = Node {
109///     tag_name: Some(Div),
110///     value: None,
111///     within_special_tag: None,
112///     attributes: None,
113///     children: vec![Node {
114///         tag_name: Some(Text),
115///         value: Some("hello".to_string()),
116///         attributes: None,
117///         within_special_tag: None,
118///         children: Vec::new(),
119///     }],
120/// };
121///
122/// assert_eq!(parsed, Ok(expected));
123/// ```
124pub fn safe_parse_html(input: String) -> Result<Node, ParseHTMLError> {
125    // current_index is the index of the current character being processed
126    let mut current_index = 0;
127    // nodes is a vector of nodes that will be returned as an attribute of the resulting node
128    let mut nodes = Vec::new();
129    // stack is a LIFO stack of nodes that are being processed
130    let mut stack: Vec<Node> = Vec::new();
131
132    while current_index < input.len() {
133        let rest = &input[current_index..];
134        if rest.starts_with("<!") {
135            // if the current character is an exclamation mark, it's a comment or DOCTYPE
136            if rest.starts_with("<!DOCTYPE") {
137                // if the comment is a DOCTYPE, ignore it
138                current_index += rest.find('>').unwrap() + 1;
139                continue;
140            }
141            // find the closing comment tag
142            if let Some(closing_comment_index) = rest.find("-->") {
143                // if the closing comment tag is found, the comment is valid
144                // extract the comment from the rest
145                let comment = &rest[..closing_comment_index + 3];
146                // create a new node with the comment
147                let new_node = Node {
148                    tag_name: Some(Comment),
149                    value: Some(
150                        comment
151                            .trim_start_matches("<!")
152                            .trim_start_matches("--")
153                            .trim_end_matches("-->")
154                            .to_string(),
155                    ),
156                    attributes: None,
157                    within_special_tag: None,
158                    children: Vec::new(),
159                };
160                // add the new_node to the stack
161                nodes.push(new_node);
162                // increment the current_index by the closing_comment_index + 3
163                // and continue to the next iteration
164                current_index += closing_comment_index + 3;
165                continue;
166            }
167            // if the closing comment tag is not found, the comment is malformed
168            return Err(ParseHTMLError::MalformedTag(
169                rest.to_string(),
170                MalformedTagError::MissingClosingBracket(current_index as u32),
171            ));
172        }
173
174        if rest.starts_with('<') {
175            if let Some(mut closing_index) = find_closing_bracket_index(rest) {
176                // if the tag is a self-closing tag (i.e. <tag_name ... />)
177                let self_closing = if rest.chars().nth(closing_index - 1) == Some('/') {
178                    // if the last character right before the closing bracket is a forward slash, the tag is self-closing
179                    // closing_index is the index of the closing bracket, so decrement it to ignore the forward slash
180                    closing_index -= 1;
181                    true
182                } else {
183                    // if the last character right before the closing bracket is not a forward slash, the tag is not self-closing
184                    false
185                };
186
187                // the tag content is the string between the opening and closing brackets
188                let tag_content = &rest[1..closing_index];
189
190                // initialize the node name and attribute map
191                let node_name;
192                let mut attribute_map = None;
193                if let Some(space_index) = tag_content.find(|c: char| c.is_whitespace()) {
194                    // if the tag contains a space, split the tag into the node name and attributes
195                    // space_index is the index of the first spce
196                    // node_name is the tag name (i.e. <tag_name ...>)
197                    node_name = &tag_content[..space_index];
198                    // attributes is the string after the first space before the closing bracket
199                    let attributes = &tag_content[space_index..];
200                    // parse the attribute string into a map
201                    match parse_tag_attributes(attributes, current_index) {
202                        Ok(map) => attribute_map = map,
203                        Err(err) => return Err(err),
204                    }
205                } else {
206                    // if the tag doesn't contain a space, the tag is the node name
207                    node_name = tag_content;
208                }
209
210                if node_name.is_empty() {
211                    // if the tag name is empty, the tag is malformed
212                    return Err(ParseHTMLError::MalformedTag(
213                        tag_content.to_string(),
214                        MalformedTagError::MissingTagName(current_index as u32),
215                    ));
216                }
217
218                if rest.starts_with("</") {
219                    // if the tag is a closing tag, pop the last node from the stack and add it to the parent
220                    match stack.pop() {
221                        Some(last_node) => {
222                            if stack.is_empty() {
223                                // if the stack is empty, the last node is the root node
224                                nodes.push(last_node);
225                            } else {
226                                let parent = stack.last_mut().unwrap(); // stack is not empty, so unwrap is safe
227                                parent.children.push(last_node);
228                            }
229                            current_index += closing_index + 1;
230                            continue;
231                        }
232                        None => {
233                            // if there is nothing in the stack, the tag is malformed
234                            let closing_bracket_of_closing_tag = rest.find('>');
235                            return Err(ParseHTMLError::MalformedTag(
236                                if let Some(index) = closing_bracket_of_closing_tag {
237                                    // if there is a closing bracket, return the tag with the error
238                                    rest[..index + 1].to_string()
239                                } else {
240                                    rest.to_string()
241                                },
242                                MalformedTagError::MissingClosingBracket(current_index as u32),
243                            ));
244                        }
245                    }
246                }
247
248                // parse thae tag name into a NodeType from the node_name string
249                let node_type = NodeType::from_tag_str(node_name);
250
251                // initialize a new node with the tag name and attribute map
252                let mut new_node = Node {
253                    tag_name: Some(node_type.clone()),
254                    value: None,
255                    attributes: attribute_map,
256                    within_special_tag: None,
257                    children: Vec::new(),
258                };
259
260                if self_closing {
261                    // if the tag is self-closing, add the node to the parent
262                    // if a parent does not exist, add the node to the nodes vector
263                    if let Some(parent) = stack.last_mut() {
264                        modify_node_with_parent(&mut new_node, parent);
265                        parent.children.push(new_node);
266                    } else {
267                        nodes.push(new_node);
268                    }
269                    // because the tag is self-closing, increment the current_index by the closing_index + 2
270                    // and continute to the next iteration
271                    current_index += closing_index + 2;
272                    continue;
273                }
274                // if the tag is not self-closing
275                // add the new_node to the stack
276                if let Some(parent) = stack.last_mut() {
277                    modify_node_with_parent(&mut new_node, parent);
278                }
279                stack.push(new_node);
280                // because the tag is not self-closing, increment the current_index by the closing_index + 1
281                current_index += closing_index + 1;
282                continue;
283            } else {
284                // if a closing bracket is not found, the tag is malformed
285                return Err(ParseHTMLError::MalformedTag(
286                    rest.to_string(),
287                    MalformedTagError::MissingClosingBracket(current_index as u32),
288                ));
289            }
290        }
291
292        // if the current character is not a '<', it's just a text
293        // if an opening bracket is not found, the rest is the content of the text
294        // else, anything upto the opening bracket is the content of the text
295        let next_opening_tag = rest.find('<').unwrap_or(rest.len());
296        let text = &rest[..next_opening_tag];
297        if text.trim().is_empty() {
298            // if text is empty or only whitespace, ignore it
299            // increment the current_index by next_opening_tag and continue to the next iteration
300            current_index += next_opening_tag;
301            continue;
302        }
303
304        // initialize new_node as text with the content of the text
305        let new_node = Node {
306            tag_name: Some(Text),
307            value: Some(text.to_string()),
308            attributes: None,
309            within_special_tag: None,
310            children: Vec::new(),
311        };
312
313        // add the new_node to the stack
314        modify_stack_with_node(&mut stack, new_node);
315
316        current_index += next_opening_tag
317    }
318
319    // if the stack is not empty, add the stack to the nodes vector
320    if !stack.is_empty() {
321        for stack_node in stack.drain(..) {
322            nodes.push(stack_node);
323        }
324    }
325
326    if nodes.len() == 1 {
327        return Ok(nodes.remove(0));
328    }
329
330    Ok(Node {
331        tag_name: None,
332        value: None,
333        attributes: None,
334        within_special_tag: None,
335        children: nodes,
336    })
337}
338
339/// Adds a new node to the stack with respect to the parent node's special tag and tag type
340///
341/// # Arguments
342///
343/// * `stack` - A mutable reference to a vector of nodes
344/// * `new_node` - A mutable reference to a node to be added to the stack
345fn modify_stack_with_node(stack: &mut Vec<Node>, mut new_node: Node) {
346    if let Some(parent) = stack.last_mut() {
347        // if the stack is not empty, add new_node to the parent
348        // modify the new_node with the parent's within_special_tag and tag type
349        modify_node_with_parent(&mut new_node, parent);
350        parent.children.push(new_node.clone());
351        return;
352    }
353    // if stack is empty, add new_node to the stack
354    stack.push(new_node.clone());
355}
356
357/// Modifies a node with the parent's within_special_tag and tag type
358///
359/// # Arguments
360///
361/// * `node` - A mutable reference to a Node to be modified
362/// * `parent` - A reference to the parent Node
363fn modify_node_with_parent(node: &mut Node, parent: &Node) {
364    if parent.within_special_tag.is_some() {
365        node.within_special_tag
366            .clone_from(&parent.within_special_tag)
367    }
368    if let Some(parent_tag_name) = &parent.tag_name {
369        if parent_tag_name.is_special_tag() {
370            if let Some(within_special_tag) = &mut node.within_special_tag {
371                within_special_tag.push(parent_tag_name.clone());
372            } else {
373                node.within_special_tag = Some(vec![parent_tag_name.clone()]);
374            }
375        }
376    }
377}
378
379/// Parses a string of HTML into a Node struct
380///
381/// Panics if the input is malformed
382///
383/// # Arguments
384///
385/// * `input` - A string slice that holds the HTML to be parsed
386///
387/// # Examples
388///
389/// ```
390/// use html2md_rs::{
391///     parser::parse_html,
392///     structs::{
393///         Node,
394///         NodeType::{Div, Text},
395///     },
396/// };
397///
398/// let input = "<div>hello</div>".to_string();
399/// let parsed = parse_html(input);
400/// let expected = Node {
401///     tag_name: Some(Div),
402///     value: None,
403///     attributes: None,
404///     within_special_tag: None,
405///     children: vec![Node {
406///         tag_name: Some(Text),
407///         value: Some("hello".to_string()),
408///         attributes: None,
409///         within_special_tag: None,
410///         children: Vec::new(),
411///     }],
412/// };
413///
414/// assert_eq!(parsed, expected);
415/// ```
416#[deprecated(
417    since = "0.7.0",
418    note = "This function is deprecated and will be removed in future versions. Please use the safe_parse_html function instead."
419)]
420pub fn parse_html(input: String) -> Node {
421    let parsed = safe_parse_html(input);
422    match parsed {
423        Ok(node) => node,
424        Err(err) => panic!("error parsing html: {:?}", err),
425    }
426}
427
428fn parse_tag_attributes(
429    tag_attributes: &str,
430    current_index: usize,
431) -> Result<Option<Attributes>, ParseHTMLError> {
432    let tag_attributes = tag_attributes.trim();
433
434    // if the input is empty or only whitespace, return None
435    if tag_attributes.is_empty() {
436        return Ok(None);
437    }
438
439    let mut attribute_map = Attributes::new();
440
441    let mut current_key = String::new();
442    let mut current_value_in_quotes = String::new();
443    let mut in_quotes = false;
444    let mut may_be_reading_non_quoted_value = false;
445
446    for char in tag_attributes.trim().chars() {
447        // iterate through each character in the trimmed tag_attributes string
448
449        if in_quotes {
450            // if we are in quotation marks, just add the character to the current_value_in_quotes
451            // except for if the character is a quotation mark, which indicates the end of the value
452            if char.eq(&'"') {
453                // if the character is a quotation mark, add the current_value_in_quotes to the attribute_map
454                // and reset the current_key and current_value_in_quotes
455                add_to_attribute_map(&mut attribute_map, &current_key, &current_value_in_quotes);
456                current_key.clear();
457                current_value_in_quotes.clear();
458                in_quotes = false;
459                continue;
460            }
461            current_value_in_quotes.push(char);
462            continue;
463        }
464
465        if char.eq(&'"') {
466            // if the character is a quotation mark, we are about to start the value
467            // we know in_quotes is false because that is checked above
468            if current_key.is_empty() {
469                // if the current_key is empty, the attribute is malformed
470                return Err(ParseHTMLError::MalformedAttribute(
471                    tag_attributes.to_string(),
472                    MalformedAttributeError::MissingAttributeName(current_index as u32),
473                ));
474            }
475            // set the in_quotes flag to true
476            in_quotes = true;
477            // if the character is a quotation mark, we are going to be in quotes
478            // so we don't need to keep track of non-quoted value flag
479            may_be_reading_non_quoted_value = false;
480            continue;
481        }
482
483        if char.is_whitespace() {
484            if may_be_reading_non_quoted_value {
485                if current_value_in_quotes.is_empty() {
486                    // if we are reading a non-quoted value and the value is empty, we can ignore the whitespace
487                    continue;
488                }
489                // if we are reading a non-quoted value, the whitespace indicates the end of the value
490                // add the value to the attribute_map
491                add_to_attribute_map(&mut attribute_map, &current_key, &current_value_in_quotes);
492                current_key.clear();
493                current_value_in_quotes.clear();
494                may_be_reading_non_quoted_value = false;
495                continue;
496            }
497            // if the character is whitespace, if could be indicating the end of a key
498            if !current_key.is_empty() {
499                // if the key has some value, add it to the attribute_map with value true
500                attribute_map.insert(current_key.clone(), AttributeValues::from(true));
501                current_key.clear();
502                continue;
503            }
504            // if the current_key is empty, the whitespace can be ignored
505            continue;
506        }
507
508        if !in_quotes && !may_be_reading_non_quoted_value && char.eq(&'=') {
509            // if the character is an equal sign, the current_key is complete
510            // if we are in quotes or reading a non-quoted value, the equal sign is part of the value
511            // and we are about to start the value
512            if current_key.is_empty() {
513                // if the current_key is empty, the attribute is malformed
514                return Err(ParseHTMLError::MalformedAttribute(
515                    tag_attributes.to_string(),
516                    MalformedAttributeError::MissingAttributeName(current_index as u32),
517                ));
518            }
519            // equal sign indicates the start of the value up to the next whitespace
520            may_be_reading_non_quoted_value = true;
521            continue;
522        }
523
524        if may_be_reading_non_quoted_value {
525            // if we are reading a non-quoted value, add the character to the current_value_in_quotes
526            current_value_in_quotes.push(char);
527            continue;
528        }
529
530        // otherwise, add the character to the current_key
531        current_key.push(char);
532    }
533
534    if may_be_reading_non_quoted_value && !current_value_in_quotes.is_empty() {
535        // if we are reading a non-quoted value and the value is not empty, add the value to the attribute_map
536        add_to_attribute_map(&mut attribute_map, &current_key, &current_value_in_quotes);
537    }
538
539    if in_quotes {
540        return Err(ParseHTMLError::MalformedAttribute(
541            current_value_in_quotes,
542            MalformedAttributeError::MissingQuotationMark(current_index as u32),
543        ));
544    }
545
546    // if not, return the attribute map
547    match attribute_map.is_empty() {
548        true => Ok(None),
549        false => Ok(Some(attribute_map)),
550    }
551}
552
553fn add_to_attribute_map(
554    attribute_map: &mut Attributes,
555    current_key: &str,
556    current_value_in_quotes: &str,
557) {
558    if current_key.is_empty() || current_value_in_quotes.is_empty() {
559        return;
560    }
561    attribute_map.insert(
562        current_key.to_string(),
563        AttributeValues::from(current_value_in_quotes),
564    );
565}
566
/// Finds the byte index of the `>` that closes the tag at the start of `rest`
///
/// A `>` inside a quoted attribute value does not close the tag (needed to fix #31).
/// A quoted value is ended only by the same quotation mark that opened it, so the
/// other kind of quotation mark may appear inside it (e.g. `title="it's"`).
///
/// Returns `None` when no closing bracket is found outside of quotation marks.
fn find_closing_bracket_index(rest: &str) -> Option<usize> {
    // the quotation mark that opened the attribute value currently being skipped
    let mut open_quote: Option<char> = None;
    for (idx, ch) in rest.char_indices() {
        match open_quote {
            Some(quote) => {
                // inside a value only the matching quotation mark is significant
                if ch == quote {
                    open_quote = None;
                }
            }
            None => match ch {
                '"' | '\'' => open_quote = Some(ch),
                '>' => return Some(idx),
                _ => {}
            },
        }
    }
    None
}
587
// https://github.com/izyuumi/html2md-rs/issues/25
// Whitespace between `=` and the opening quotation mark must not lose the value.
#[test]
fn issue_25() {
    let raw_attributes = "property=\"og:type\" content= \"website\"";

    let actual = parse_tag_attributes(raw_attributes, 0).unwrap().unwrap();

    let wanted = Attributes::from(vec![
        ("property".to_string(), AttributeValues::from("og:type")),
        ("content".to_string(), AttributeValues::from("website")),
    ]);
    assert_eq!(actual, wanted);
}
599
// https://github.com/izyuumi/html2md-rs/issues/31
// A `>` inside a quoted attribute value must not end the tag.
#[test]
fn issue_31() {
    let html = r#"<img src="https://exmaple.com/img.png" alt="Rust<br/>Logo"/>"#.to_string();

    let actual = safe_parse_html(html).unwrap();

    let wanted_attributes = Attributes {
        id: None,
        class: None,
        href: None,
        attributes: std::collections::HashMap::from([
            (
                "src".to_string(),
                AttributeValues::from("https://exmaple.com/img.png"),
            ),
            ("alt".to_string(), AttributeValues::from("Rust<br/>Logo")),
        ]),
    };
    let wanted = Node {
        tag_name: Some(Unknown("img".to_string())),
        value: None,
        attributes: Some(wanted_attributes),
        children: Vec::new(),
        within_special_tag: None,
    };
    assert_eq!(actual, wanted)
}
625
// https://github.com/izyuumi/html2md-rs/issues/36
#[test]
fn issue_36() {
    // part 1: a self-closing tag with a space before the slash parses to a single node
    let html = "<img src=\"https://hoerspiele.dra.de/fileadmin/www.hoerspiele.dra.de/images/vollinfo/4970918_B01.jpg\" />".to_string();
    let actual = safe_parse_html(html).unwrap();
    let wanted_attributes = Attributes {
        id: None,
        class: None,
        href: None,
        attributes: std::collections::HashMap::from([(
            "src".to_string(),
            AttributeValues::from("https://hoerspiele.dra.de/fileadmin/www.hoerspiele.dra.de/images/vollinfo/4970918_B01.jpg"),
        )]),
    };
    let wanted = Node {
        tag_name: Some(Unknown("img".to_string())),
        value: None,
        attributes: Some(wanted_attributes),
        children: Vec::new(),
        within_special_tag: None,
    };
    assert_eq!(actual, wanted);

    // part 2: a larger document with non-ASCII text and multi-line attributes
    // only has to parse without returning an error
    let document = r#"<!DOCTYPE html><meta http-equiv="content-type" content="text/html; charset=utf-8"><div class="column"><div class="gallery-wrap single">
    <div class="gallery-container">
        <figure class="image">
            <figure class="image">
            <img title="Illustration »Der dunkle Kongress« © ARD / Jürgen Frey"
                 alt="Illustration »Der dunkle Kongress« © ARD / Jürgen Frey" 
                 src="https://hoerspiele.dra.de/fileadmin/www.hoerspiele.dra.de/images/vollinfo/4970918_B01.jpg">
                <figcaption class="image-caption">Illustration »Der dunkle Kongress«
© ARD / Jürgen Frey</figcaption>
</figure></div></div></div>"#.to_string();
    safe_parse_html(document).unwrap();
}
659}