1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
use crate::{
    language_facts::data_manager::HTMLDataManager,
    parser::html_scanner::{Scanner, TokenType},
};
use lsp_textdocument::FullTextDocument;

use super::{
    html_document::{HTMLDocument, Node, NodeAttribute},
    html_scanner::ScannerState,
};

/// Namespace type grouping the HTML parsing entry points.
///
/// It carries no state; both functions are associated functions that delegate to
/// [`parse_html_document`].
pub struct HTMLParser;

impl HTMLParser {
    /// Parses the complete content of `document`.
    ///
    /// The text is taken from `document.get_content(None)` and the language id from
    /// `document.language_id()`; both are forwarded to [`HTMLParser::parse`] together
    /// with `data_manager`.
    pub fn parse_document(
        document: &FullTextDocument,
        data_manager: &HTMLDataManager,
    ) -> HTMLDocument {
        let text = document.get_content(None);
        let language_id = document.language_id();
        Self::parse(text, &language_id, data_manager)
    }

    /// Parses `text` as a document of language `language_id`.
    ///
    /// This is a direct forward to [`parse_html_document`]; `data_manager` is passed
    /// through unchanged.
    pub fn parse(text: &str, language_id: &str, data_manager: &HTMLDataManager) -> HTMLDocument {
        parse_html_document(text, language_id, data_manager)
    }
}

/// Builds an [`HTMLDocument`] from `text` by consuming scanner tokens one at a time.
///
/// The scanner starts at offset 0 in `ScannerState::WithinContent`. Every `StartTagOpen`
/// token creates a new [`Node`] as the last child of the node that is currently open and
/// makes it the current node. Start-tag close, self-close and end-tag tokens decide when
/// the current node is finished and control returns to its parent. Nodes that are still
/// open when the scanner reports `EOS` are marked as not closed and their `end` is set to
/// the source length.
///
/// * `text` - the source to parse.
/// * `language_id` - passed to `data_manager.get_void_elements` to obtain the void-element set.
/// * `data_manager` - supplies the void-element set and the `is_void_element` test.
///
/// Returns a document whose `roots` are the children of a synthetic root node that spans
/// the whole source.
pub fn parse_html_document(
    text: &str,
    language_id: &str,
    data_manager: &HTMLDataManager,
) -> HTMLDocument {
    let void_elements = data_manager.get_void_elements(language_id);
    let mut scanner = Scanner::new(text, 0, ScannerState::WithinContent, true);

    // Synthetic root; only its children end up in the returned document.
    let mut html_document = Node::new(0, scanner.get_source_len(), vec![]);
    // `cur` is the node currently being built; `parent_list` holds its ancestors,
    // outermost first, so `parent_list.is_empty()` means `cur` is the synthetic root.
    let mut cur: *mut Node = &mut html_document;
    let mut parent_list: Vec<*mut Node> = vec![];
    let mut end_tag_start = None;
    let mut end_tag_name = None;
    // Name of the most recent attribute together with the offset at which that name
    // token started, so a following value can be stored under the same offset.
    let mut pending_attribute = None;
    let mut token = scanner.scan();
    // SAFETY: `cur` and every pointer in `parent_list` point either at `html_document`
    // (which is not moved until after this block) or at an element of the `children`
    // vector of the node one level above it on the current ancestor path. The only
    // vector that is ever grown is `(*cur).children`, and none of the tracked pointers
    // point into it at that moment (they all refer to `cur` or its ancestors), so a
    // reallocation never invalidates a pointer that is still in use. Pointers to
    // finished children are dropped when the parent is popped and are never
    // dereferenced again.
    unsafe {
        while token != TokenType::EOS {
            match token {
                TokenType::StartTagOpen => {
                    let child =
                        Node::new(scanner.get_token_offset(), scanner.get_source_len(), vec![]);
                    let parent = &mut *cur;
                    let index = parent.children.len();
                    parent.children.push(child);
                    let child_ptr: *mut Node = &mut parent.children[index];
                    parent_list.push(cur);
                    cur = child_ptr;
                }
                TokenType::StartTag => {
                    (*cur).tag = Some(scanner.get_token_text().to_string());
                }
                TokenType::StartTagClose => {
                    // The synthetic root has no parent and is never closed by a tag token.
                    if let Some(&parent) = parent_list.last() {
                        let node = &mut *cur;
                        // May be overwritten later with the end-tag position.
                        node.end = scanner.get_token_end();
                        let return_to_parent = if scanner.get_token_length() > 0 {
                            node.start_tag_end = Some(scanner.get_token_end());
                            let is_void = match &node.tag {
                                Some(tag) => data_manager.is_void_element(tag, &void_elements),
                                None => false,
                            };
                            if is_void {
                                // Void elements have no content and no end tag.
                                node.closed = true;
                            }
                            is_void
                        } else {
                            // pseudo close token from an incomplete start tag
                            true
                        };
                        if return_to_parent {
                            parent_list.pop();
                            cur = parent;
                        }
                    }
                }
                TokenType::StartTagSelfClose => {
                    if let Some(parent) = parent_list.pop() {
                        let node = &mut *cur;
                        node.closed = true;
                        node.end = scanner.get_token_end();
                        node.start_tag_end = Some(scanner.get_token_end());
                        cur = parent;
                    }
                }
                TokenType::EndTagOpen => {
                    end_tag_start = Some(scanner.get_token_offset());
                    end_tag_name = None;
                }
                TokenType::EndTag => {
                    end_tag_name = Some(scanner.get_token_text().to_lowercase());
                }
                TokenType::EndTagClose => {
                    let wanted = end_tag_name.as_deref();
                    // Walk from the current node towards the root looking for the element
                    // this end tag closes. `depth` is the number of ancestors of `node`,
                    // so `depth == 0` means the synthetic root was reached.
                    let mut node = cur;
                    let mut depth = parent_list.len();
                    while !(*node).is_same_tag(wanted) && depth > 0 {
                        depth -= 1;
                        node = parent_list[depth];
                    }
                    // Ignore the end tag entirely when no open element matches it.
                    if depth > 0 {
                        // Everything nested deeper than the match was left unclosed; it
                        // ends where this end tag begins.
                        while parent_list.len() > depth {
                            let unclosed = &mut *cur;
                            if let Some(start) = end_tag_start {
                                unclosed.end = start;
                            }
                            unclosed.closed = false;
                            if let Some(parent) = parent_list.pop() {
                                cur = parent;
                            }
                        }
                        let matched = &mut *cur;
                        matched.closed = true;
                        matched.end_tag_start = end_tag_start;
                        matched.end = scanner.get_token_end();
                        if let Some(parent) = parent_list.pop() {
                            cur = parent;
                        }
                    }
                }
                TokenType::AttributeName => {
                    let name = scanner.get_token_text().to_string();
                    let offset = scanner.get_token_offset();
                    // Support valueless attributes such as 'checked'
                    (&mut *cur)
                        .attributes
                        .insert(name.clone(), NodeAttribute::new(None, offset));
                    pending_attribute = Some((name, offset));
                }
                TokenType::AttributeValue => {
                    // Reuse the offset recorded for the attribute name instead of
                    // deriving it from the value position, which would be off whenever
                    // whitespace surrounds the `=`.
                    if let Some((name, offset)) = pending_attribute.take() {
                        let value = scanner.get_token_text().to_string();
                        (&mut *cur)
                            .attributes
                            .insert(name, NodeAttribute::new(Some(value), offset));
                    }
                }
                _ => {}
            }
            token = scanner.scan();
        }
        // Input ended with elements still open: they run to the end of the source.
        while let Some(parent) = parent_list.pop() {
            let unclosed = &mut *cur;
            unclosed.end = scanner.get_source_len();
            unclosed.closed = false;
            cur = parent;
        }
    }
    HTMLDocument {
        roots: html_document.children,
    }
}