// simple_html_parser/parser.rs
use super::ast::{AST, DOCTYPE};

3pub struct Parser {
4 input: String,
5 position: usize,
6}
7
8impl Parser {
9 pub fn new<T: ToString>(input: T) -> Self {
10 Self { input: input.to_string(), position: 0 }
11 }
12
13 fn peek(&self) -> Option<char> {
14 self.input[self.position..].chars().next()
15 }
16
17 fn next(&mut self) {
18 if let Some(c) = self.peek() {
19 self.position += c.len_utf8();
20 }
21 }
22
23 fn consume_whitespace(&mut self) {
24 while let Some(c) = self.peek() {
25 if !c.is_whitespace() {
26 break;
27 }
28 self.next();
29 }
30 }
31
32 pub fn parse(&mut self) -> AST {
33 self.consume_whitespace();
34 let doctype = self.parse_doctype();
35 let content = self.parse_content();
36 AST::Document(doctype, Box::new(Some(content)))
37 }
38
39 fn parse_doctype(&mut self) -> Option<DOCTYPE> {
40 if self.input[self.position..].starts_with("<!DOCTYPE") {
41 while self.peek() != Some('>') && self.position < self.input.len() {
42 self.next();
43 }
44 self.next(); Some(DOCTYPE::new())
46 } else {
47 None
48 }
49 }
50
51 fn parse_content(&mut self) -> AST {
52 self.consume_whitespace();
53 if let Some('<') = self.peek() {
54 self.parse_tag()
55 } else {
56 self.parse_text()
57 }
58 }
59
60 fn parse_tag(&mut self) -> AST {
61 self.next(); let tag_name = self.parse_identifier();
63 let attributes = self.parse_attributes();
64 let children = self.parse_children(&tag_name);
65 AST::Tag(tag_name, attributes, children)
66 }
67
68 fn parse_identifier(&mut self) -> String {
69 let mut identifier = String::new();
70 while let Some(c) = self.peek() {
71 if c.is_alphanumeric() || c == '-' || c == '_' {
72 identifier.push(c);
73 self.next();
74 } else {
75 break;
76 }
77 }
78 identifier
79 }
80
81 fn parse_attributes(&mut self) -> Vec<(String, String)> {
82 let mut attributes = Vec::new();
83 self.consume_whitespace();
84 while let Some(c) = self.peek() {
85 if c == '>' || c == '/' {
86 break;
87 }
88 let name = self.parse_identifier();
89 self.consume_whitespace();
90 if self.peek() == Some('=') {
91 self.next();
92 self.consume_whitespace();
93 let value = self.parse_attribute_value();
94 attributes.push((name, value));
95 } else {
96 attributes.push((name, String::new()));
97 }
98 self.consume_whitespace();
99 }
100 attributes
101 }
102
103 fn parse_attribute_value(&mut self) -> String {
104 let mut value = String::new();
105 if self.peek() == Some('"') {
106 self.next();
107 while let Some(c) = self.peek() {
108 if c == '"' {
109 self.next();
110 break;
111 }
112 value.push(c);
113 self.next();
114 }
115 }
116 value
117 }
118
119 fn parse_children(&mut self, parent_tag: &str) -> Vec<AST> {
120 let mut children = Vec::new();
121 if self.peek() == Some('/') {
122 self.next(); if self.peek() == Some('>') {
124 self.next(); return children; }
127 } else if self.peek() == Some('>') {
128 self.next(); }
130
131 while self.position < self.input.len() {
132 self.consume_whitespace();
133
134 if let Some('<') = self.peek() {
135 self.next(); if let Some('/') = self.peek() {
138 self.next(); let closing_tag = self.parse_identifier();
140
141 if closing_tag == parent_tag {
142 self.consume_whitespace();
143 if let Some('>') = self.peek() {
144 self.next(); break; }
147 } else {
148 while self.peek() != Some('>') && self.position < self.input.len() {
150 self.next();
151 }
152 self.next(); }
154 } else {
155 self.position -= 1; children.push(self.parse_tag());
158 }
159 } else {
160 let text = self.parse_text();
162 if let AST::String(ref s) = text {
163 if !s.trim().is_empty() {
164 children.push(text);
165 }
166 }
167 }
168 }
169
170 children
171 }
172
173
174 fn parse_text(&mut self) -> AST {
175 let mut text = String::new();
176 while let Some(c) = self.peek() {
177 if c == '<' {
178 break;
179 }
180 text.push(c);
181 self.next();
182 }
183 AST::String(text)
184 }
185}