simple_html_parser/
parser.rs

1use super::ast::{AST, DOCTYPE};
2
/// A cursor-based parser over an owned copy of an HTML string.
///
/// The parser keeps the whole input in memory and tracks how far it has
/// read; the parsing methods advance that cursor as they consume text.
#[derive(Debug, Clone)]
pub struct Parser {
    /// The complete text being parsed.
    input: String,
    /// Byte offset into `input` of the next unread character.
    position: usize,
}
7
8impl Parser {
9    pub fn new<T: ToString>(input: T) -> Self {
10        Self { input: input.to_string(), position: 0 }
11    }
12
13    fn peek(&self) -> Option<char> {
14        self.input[self.position..].chars().next()
15    }
16
17    fn next(&mut self) {
18        if let Some(c) = self.peek() {
19            self.position += c.len_utf8();
20        }
21    }
22
23    fn consume_whitespace(&mut self) {
24        while let Some(c) = self.peek() {
25            if !c.is_whitespace() {
26                break;
27            }
28            self.next();
29        }
30    }
31
32    pub fn parse(&mut self) -> AST {
33        self.consume_whitespace();
34        let doctype = self.parse_doctype();
35        let content = self.parse_content();
36        AST::Document(doctype, Box::new(Some(content)))
37    }
38
39    fn parse_doctype(&mut self) -> Option<DOCTYPE> {
40        if self.input[self.position..].starts_with("<!DOCTYPE") {
41            while self.peek() != Some('>') && self.position < self.input.len() {
42                self.next();
43            }
44            self.next(); // Consume '>'
45            Some(DOCTYPE::new())
46        } else {
47            None
48        }
49    }
50
51    fn parse_content(&mut self) -> AST {
52        self.consume_whitespace();
53        if let Some('<') = self.peek() {
54            self.parse_tag()
55        } else {
56            self.parse_text()
57        }
58    }
59
60    fn parse_tag(&mut self) -> AST {
61        self.next(); // Consume '<'
62        let tag_name = self.parse_identifier();
63        let attributes = self.parse_attributes();
64        let children = self.parse_children(&tag_name);
65        AST::Tag(tag_name, attributes, children)
66    }
67
68    fn parse_identifier(&mut self) -> String {
69        let mut identifier = String::new();
70        while let Some(c) = self.peek() {
71            if c.is_alphanumeric() || c == '-' || c == '_' {
72                identifier.push(c);
73                self.next();
74            } else {
75                break;
76            }
77        }
78        identifier
79    }
80
81    fn parse_attributes(&mut self) -> Vec<(String, String)> {
82        let mut attributes = Vec::new();
83        self.consume_whitespace();
84        while let Some(c) = self.peek() {
85            if c == '>' || c == '/' {
86                break;
87            }
88            let name = self.parse_identifier();
89            self.consume_whitespace();
90            if self.peek() == Some('=') {
91                self.next();
92                self.consume_whitespace();
93                let value = self.parse_attribute_value();
94                attributes.push((name, value));
95            } else {
96                attributes.push((name, String::new()));
97            }
98            self.consume_whitespace();
99        }
100        attributes
101    }
102
103    fn parse_attribute_value(&mut self) -> String {
104        let mut value = String::new();
105        if self.peek() == Some('"') {
106            self.next();
107            while let Some(c) = self.peek() {
108                if c == '"' {
109                    self.next();
110                    break;
111                }
112                value.push(c);
113                self.next();
114            }
115        }
116        value
117    }
118
119    fn parse_children(&mut self, parent_tag: &str) -> Vec<AST> {
120        let mut children = Vec::new();
121        if self.peek() == Some('/') {
122            self.next(); // Consume '/'
123            if self.peek() == Some('>') {
124                self.next(); // Consume '>'
125                return children; // Self-closing tag
126            }
127        } else if self.peek() == Some('>') {
128            self.next(); // Consume '>'
129        }
130    
131        while self.position < self.input.len() {
132            self.consume_whitespace();
133    
134            if let Some('<') = self.peek() {
135                self.next(); // Consume '<'
136    
137                if let Some('/') = self.peek() {
138                    self.next(); // Consume '/'
139                    let closing_tag = self.parse_identifier();
140    
141                    if closing_tag == parent_tag {
142                        self.consume_whitespace();
143                        if let Some('>') = self.peek() {
144                            self.next(); // Consume '>'
145                            break; // Stop parsing children, as we found the closing tag
146                        }
147                    } else {
148                        // Unexpected closing tag, treat as an error or skip it
149                        while self.peek() != Some('>') && self.position < self.input.len() {
150                            self.next();
151                        }
152                        self.next(); // Consume '>'
153                    }
154                } else {
155                    // Parse as new tag
156                    self.position -= 1; // Step back to '<'
157                    children.push(self.parse_tag());
158                }
159            } else {
160                // Parse text node
161                let text = self.parse_text();
162                if let AST::String(ref s) = text {
163                    if !s.trim().is_empty() {
164                        children.push(text);
165                    }
166                }
167            }
168        }
169    
170        children
171    }
172    
173
174    fn parse_text(&mut self) -> AST {
175        let mut text = String::new();
176        while let Some(c) = self.peek() {
177            if c == '<' {
178                break;
179            }
180            text.push(c);
181            self.next();
182        }
183        AST::String(text)
184    }
185}