1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
extern crate html5ever;
extern crate markup5ever_rcdom as rcdom;

use html5ever::{parse_document, tendril::TendrilSink};
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use std::collections::HashMap;

/// Stateless entry point for converting HTML source text into an [`HtmlElement`] tree.
///
/// The type holds no data, so it is free to copy and every instance is
/// interchangeable. `Default` is derived so the parser can be created through
/// `HtmlParser::default()` as well as `HtmlParser::new()`.
#[derive(Debug, Default, Clone, Copy)]
pub struct HtmlParser;

impl HtmlParser {
    pub fn new() -> Self {
        HtmlParser {}
    }

    pub fn parse_html(&self, input: &str) -> HtmlElement {
        let dom = parse_document(RcDom::default(), Default::default()).one(input);

        HtmlElement::from_dom(&dom.document)
    }
}

/// A simplified, owned representation of one node of a parsed HTML document.
///
/// Besides `Debug`, the type derives `Clone`, `Default`, `PartialEq` and `Eq`
/// so that parsed trees can be duplicated, built incrementally and compared
/// (for example in tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct HtmlElement {
    /// Local tag name for element nodes; `None` for the document root and for
    /// every non-element node (text and any other node kind).
    pub tag_name: Option<String>,
    /// For an element: the concatenation, in document order, of the `text` of
    /// its direct children that have no tag name. For a text node: the node's
    /// contents. Empty otherwise.
    pub text: String,
    /// For an element: its direct children that have a tag name. For the
    /// document root: every direct child node, converted as-is.
    pub children: Vec<HtmlElement>,
    /// Attribute local names mapped to their values; empty for non-element nodes.
    pub attributes: HashMap<String, String>,
}

impl HtmlElement {
    /// Builds a node that has no tag name, no children and no attributes and
    /// carries only `text`.
    fn untagged(text: String) -> Self {
        HtmlElement {
            tag_name: None,
            text,
            children: Vec::new(),
            attributes: HashMap::new(),
        }
    }

    /// Recursively converts the DOM node behind `handle` into an `HtmlElement`.
    ///
    /// * Document node: an untagged root whose `children` are all of the
    ///   node's direct children, converted one by one. Children that are not
    ///   elements are kept too, as untagged nodes.
    /// * Element node: `tag_name` and `attributes` are taken from the local
    ///   names in the DOM. Converted children that have a tag name go into
    ///   `children`; the `text` of every other converted child is appended to
    ///   this element's `text`.
    /// * Text node: an untagged node holding the text contents.
    /// * Any other node kind: an untagged node with empty text.
    fn from_dom(handle: &Handle) -> Self {
        match &handle.data {
            NodeData::Document => {
                let mut root = Self::untagged(String::new());
                root.children = handle
                    .children
                    .borrow()
                    .iter()
                    .map(Self::from_dom)
                    .collect();
                root
            }
            // Remaining fields of the element variant are not used here.
            NodeData::Element { name, attrs, .. } => {
                let attributes: HashMap<String, String> = attrs
                    .borrow()
                    .iter()
                    .map(|attr| (attr.name.local.to_string(), attr.value.to_string()))
                    .collect();

                let mut element = HtmlElement {
                    tag_name: Some(name.local.to_string()),
                    text: String::new(),
                    children: Vec::new(),
                    attributes,
                };

                for node in handle.children.borrow().iter() {
                    let converted = Self::from_dom(node);
                    if converted.tag_name.is_some() {
                        element.children.push(converted);
                    } else {
                        // Untagged child: fold its text into this element
                        // instead of keeping it as a separate child.
                        element.text.push_str(&converted.text);
                    }
                }

                element
            }
            NodeData::Text { contents } => Self::untagged(contents.borrow().to_string()),
            _ => Self::untagged(String::new()),
        }
    }
}