1use super::entity::validate_entity;
2use super::error::{HtmlError, HtmlErrorKind};
3use super::lexer::Cursor;
4use super::value::HtmlValue;
5
/// Tag names treated as void elements.
///
/// `parse_element` returns right after the opening tag of any of these, so no
/// children and no closing tag are looked for. Lookups go through
/// `is_void_element`, which compares ASCII-case-insensitively.
const VOID_ELEMENTS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source",
    "track", "wbr",
];
10
/// Tag names whose content is skipped by `skip_raw_text` instead of being
/// parsed as nested markup. Matched ASCII-case-insensitively.
const RAW_TEXT_ELEMENTS: &[&str] = &["script", "style"];
12
/// Nesting depth limit used by [`DEFAULT_HTML_LIMITS`].
pub const DEFAULT_MAX_HTML_DEPTH: usize = 128;
14
/// Resource limits enforced by [`HtmlParser`] while it walks the input.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct HtmlLimits {
    /// Deepest element nesting accepted; going deeper yields
    /// `HtmlErrorKind::MaxDepthExceeded`. Top-level nodes are at depth 0.
    pub max_depth: usize,
    /// Largest number of nodes (elements, text runs, comments, doctypes)
    /// accepted before `HtmlErrorKind::MaxNodeCountExceeded` is returned.
    pub max_node_count: usize,
    /// Largest number of attributes accepted on a single tag before
    /// `HtmlErrorKind::MaxAttributeCountExceeded` is returned.
    pub max_attribute_count: usize,
    /// Largest permitted length, in bytes, of a quoted attribute value's
    /// content (the bytes between the quotes).
    pub max_attribute_value_len: usize,
}
22
/// Limits installed by [`HtmlParser::new`].
pub const DEFAULT_HTML_LIMITS: HtmlLimits = HtmlLimits {
    max_depth: DEFAULT_MAX_HTML_DEPTH,
    // 262,144 nodes.
    max_node_count: 256 * 1024,
    max_attribute_count: 256,
    // 64 KiB per attribute value.
    max_attribute_value_len: 64 * 1024,
};
29
/// Recursive-descent HTML parser/validator over a borrowed byte slice.
pub struct HtmlParser<'a> {
    /// Read position within the input bytes.
    cursor: Cursor<'a>,
    /// Limits checked while parsing.
    limits: HtmlLimits,
    /// Number of nodes counted so far by `tick_node`.
    nodes_seen: usize,
}
35
36impl<'a> HtmlParser<'a> {
37 pub const fn new(bytes: &'a [u8]) -> Self {
38 Self {
39 cursor: Cursor::new(bytes),
40 limits: DEFAULT_HTML_LIMITS,
41 nodes_seen: 0,
42 }
43 }
44
45 pub const fn with_limits(mut self, limits: HtmlLimits) -> Self {
46 self.limits = limits;
47 self
48 }
49
50 pub const fn with_max_depth(mut self, max_depth: usize) -> Self {
51 self.limits.max_depth = max_depth;
52 self
53 }
54
55 pub fn parse(mut self) -> Result<HtmlValue<'a>, HtmlError> {
56 self.parse_nodes(0)?;
57 Ok(HtmlValue::Document)
58 }
59
60 pub fn validate(mut self) -> Result<(), HtmlError> {
61 self.parse_nodes(0)?;
62 Ok(())
63 }
64
65 fn tick_node(&mut self) -> Result<(), HtmlError> {
66 self.nodes_seen = self.nodes_seen.saturating_add(1);
67 if self.nodes_seen > self.limits.max_node_count {
68 return Err(HtmlError::new(
69 HtmlErrorKind::MaxNodeCountExceeded,
70 self.cursor.position(),
71 ));
72 }
73 Ok(())
74 }
75
76 fn parse_nodes(&mut self, depth: usize) -> Result<(), HtmlError> {
77 if depth > self.limits.max_depth {
78 return Err(HtmlError::new(
79 HtmlErrorKind::MaxDepthExceeded,
80 self.cursor.position(),
81 ));
82 }
83
84 while !self.cursor.is_eof() {
85 if self.cursor.peek() == Some(b'<') {
86 if self.cursor.starts_with(b"</") {
87 return Ok(());
88 }
89 if self.cursor.starts_with(b"<!--") {
90 self.parse_comment()?;
91 } else if self.cursor.starts_with(b"<!") {
92 self.parse_doctype()?;
93 } else {
94 self.parse_element(depth)?;
95 }
96 } else {
97 self.parse_text()?;
98 }
99 }
100
101 Ok(())
102 }
103
104 fn parse_comment(&mut self) -> Result<(), HtmlError> {
105 self.tick_node()?;
106 let start = self.cursor.position();
107 self.cursor.advance(4);
108
109 loop {
110 if self.cursor.is_eof() {
111 return Err(HtmlError::new(HtmlErrorKind::UnterminatedComment, start));
112 }
113 if self.cursor.starts_with(b"-->") {
114 self.cursor.advance(3);
115 return Ok(());
116 }
117 self.cursor.advance(1);
118 }
119 }
120
121 fn parse_doctype(&mut self) -> Result<(), HtmlError> {
122 self.tick_node()?;
123 let start = self.cursor.position();
124 self.cursor.advance(2);
125
126 loop {
127 if self.cursor.is_eof() {
128 return Err(HtmlError::new(HtmlErrorKind::UnterminatedDoctype, start));
129 }
130 if self.cursor.peek() == Some(b'>') {
131 self.cursor.advance(1);
132 return Ok(());
133 }
134 self.cursor.advance(1);
135 }
136 }
137
138 fn parse_text(&mut self) -> Result<(), HtmlError> {
139 self.tick_node()?;
140 while let Some(b) = self.cursor.peek() {
141 match b {
142 b'<' => break,
143 b'&' => validate_entity(&mut self.cursor)?,
144 _ => self.cursor.advance(1),
145 }
146 }
147 Ok(())
148 }
149
150 fn parse_element(&mut self, depth: usize) -> Result<(), HtmlError> {
151 self.tick_node()?;
152 let tag_start = self.cursor.position();
153 self.cursor.advance(1);
154
155 let tag_name = self.cursor.read_tag_name()?;
156
157 self.parse_attributes()?;
158 self.cursor.skip_ws();
159
160 let self_closing = self.cursor.peek() == Some(b'/');
161 if self_closing {
162 self.cursor.advance(1);
163 }
164
165 if self.cursor.peek() != Some(b'>') {
166 return Err(HtmlError::new(HtmlErrorKind::UnterminatedTag, tag_start));
167 }
168 self.cursor.advance(1);
169
170 if self_closing || is_void_element(tag_name) {
171 return Ok(());
172 }
173
174 if is_raw_text_element(tag_name) {
175 return self.skip_raw_text(tag_name, tag_start);
176 }
177
178 self.parse_nodes(depth + 1)?;
179 self.parse_closing_tag(tag_name, tag_start)
180 }
181
182 fn parse_attributes(&mut self) -> Result<(), HtmlError> {
183 let mut count = 0usize;
184
185 loop {
186 self.cursor.skip_ws();
187 match self.cursor.peek() {
188 Some(b'>') | Some(b'/') | None => return Ok(()),
189 _ => {}
190 }
191
192 self.parse_attribute()?;
193 count = count.saturating_add(1);
194 if count > self.limits.max_attribute_count {
195 return Err(HtmlError::new(
196 HtmlErrorKind::MaxAttributeCountExceeded,
197 self.cursor.position(),
198 ));
199 }
200 }
201 }
202
203 fn parse_attribute(&mut self) -> Result<(), HtmlError> {
204 self.cursor.read_while(|b| {
205 b.is_ascii_alphanumeric() || b == b'-' || b == b'_' || b == b':' || b == b'.'
206 });
207
208 self.cursor.skip_ws();
209
210 if self.cursor.peek() != Some(b'=') {
211 return Ok(());
212 }
213 self.cursor.advance(1);
214 self.cursor.skip_ws();
215
216 match self.cursor.peek() {
217 Some(b'"') => self.parse_quoted_value(b'"'),
218 Some(b'\'') => self.parse_quoted_value(b'\''),
219 _ => {
220 self.cursor.read_while(|b| {
221 !matches!(
222 b,
223 b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/' | b'"' | b'\'' | b'='
224 )
225 });
226 Ok(())
227 }
228 }
229 }
230
231 fn parse_quoted_value(&mut self, quote: u8) -> Result<(), HtmlError> {
232 let start = self.cursor.position();
233 self.cursor.advance(1);
234 let content_start = self.cursor.position();
235
236 loop {
237 match self.cursor.peek() {
238 None => return Err(HtmlError::new(HtmlErrorKind::UnterminatedAttribute, start)),
239 Some(b) if b == quote => {
240 let len = self.cursor.position() - content_start;
241 if len > self.limits.max_attribute_value_len {
242 return Err(HtmlError::new(
243 HtmlErrorKind::MaxAttributeValueLengthExceeded,
244 content_start,
245 ));
246 }
247 self.cursor.advance(1);
248 return Ok(());
249 }
250 Some(b'&') => validate_entity(&mut self.cursor)?,
251 _ => self.cursor.advance(1),
252 }
253 }
254 }
255
256 fn parse_closing_tag(&mut self, expected: &str, open_offset: usize) -> Result<(), HtmlError> {
257 if !self.cursor.starts_with(b"</") {
258 return Err(HtmlError::new(
259 HtmlErrorKind::MismatchedClosingTag,
260 self.cursor.position(),
261 ));
262 }
263 self.cursor.advance(2);
264
265 let close_name = self.cursor.read_tag_name()?;
266
267 if !eq_ignore_ascii_case(close_name, expected) {
268 return Err(HtmlError::new(
269 HtmlErrorKind::MismatchedClosingTag,
270 open_offset,
271 ));
272 }
273
274 self.cursor.skip_ws();
275
276 if self.cursor.peek() != Some(b'>') {
277 return Err(HtmlError::new(
278 HtmlErrorKind::UnterminatedTag,
279 self.cursor.position(),
280 ));
281 }
282 self.cursor.advance(1);
283 Ok(())
284 }
285
286 fn skip_raw_text(&mut self, tag_name: &str, open_offset: usize) -> Result<(), HtmlError> {
287 loop {
288 if self.cursor.is_eof() {
289 return Err(HtmlError::new(HtmlErrorKind::UnterminatedTag, open_offset));
290 }
291 if self.cursor.starts_with(b"</") {
292 let saved = self.cursor.position();
293 self.cursor.advance(2);
294 if let Ok(name) = self.cursor.read_tag_name()
295 && eq_ignore_ascii_case(name, tag_name)
296 {
297 self.cursor.skip_ws();
298 if self.cursor.peek() == Some(b'>') {
299 self.cursor.advance(1);
300 return Ok(());
301 }
302 }
303 self.cursor.advance(0);
304 if self.cursor.position() == saved + 2 {
305 self.cursor.advance(1);
306 }
307 continue;
308 }
309 self.cursor.advance(1);
310 }
311 }
312}
313
314fn is_void_element(name: &str) -> bool {
315 VOID_ELEMENTS.iter().any(|&v| eq_ignore_ascii_case(v, name))
316}
317
318fn is_raw_text_element(name: &str) -> bool {
319 RAW_TEXT_ELEMENTS
320 .iter()
321 .any(|&v| eq_ignore_ascii_case(v, name))
322}
323
/// Compares two strings for equality, treating ASCII letters that differ only
/// in case as equal. Non-ASCII bytes must match exactly.
///
/// Delegates to the standard library's `str::eq_ignore_ascii_case` instead of
/// a hand-written byte loop.
fn eq_ignore_ascii_case(a: &str, b: &str) -> bool {
    a.eq_ignore_ascii_case(b)
}
332
333pub fn parse_html(bytes: &[u8]) -> Result<HtmlValue<'_>, HtmlError> {
334 HtmlParser::new(bytes).parse()
335}
336
337pub fn parse_html_with_max_depth(
338 bytes: &[u8],
339 max_depth: usize,
340) -> Result<HtmlValue<'_>, HtmlError> {
341 HtmlParser::new(bytes).with_max_depth(max_depth).parse()
342}
343
344pub fn parse_html_with_limits(
345 bytes: &[u8],
346 limits: HtmlLimits,
347) -> Result<HtmlValue<'_>, HtmlError> {
348 HtmlParser::new(bytes).with_limits(limits).parse()
349}
350
351pub fn validate_html(bytes: &[u8]) -> Result<(), HtmlError> {
352 HtmlParser::new(bytes).validate()
353}