Skip to main content

sciforge_parser/html/
parser.rs

1use super::entity::validate_entity;
2use super::error::{HtmlError, HtmlErrorKind};
3use super::lexer::Cursor;
4use super::value::HtmlValue;
5
/// Tag names for which `parse_element` stops right after the opening tag:
/// no child nodes are parsed and no closing tag is expected. Names are
/// matched ASCII case-insensitively.
const VOID_ELEMENTS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source",
    "track", "wbr",
];

/// Tag names whose content is not parsed as nodes: `parse_element` hands
/// them to `skip_raw_text`, which scans forward to the matching closing tag.
/// Names are matched ASCII case-insensitively.
const RAW_TEXT_ELEMENTS: &[&str] = &["script", "style"];

/// Nesting limit used as `max_depth` in [`DEFAULT_HTML_LIMITS`].
pub const DEFAULT_MAX_HTML_DEPTH: usize = 128;
14
/// Resource limits applied by [`HtmlParser`] while it walks a document.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct HtmlLimits {
    /// Deepest element nesting accepted. Top-level nodes are at depth 0 and
    /// the children of an element are one level deeper than the element;
    /// going past this value yields `HtmlErrorKind::MaxDepthExceeded`.
    pub max_depth: usize,
    /// Maximum number of nodes in one parse. Every comment, doctype, text
    /// run and element counts as one node; going past this value yields
    /// `HtmlErrorKind::MaxNodeCountExceeded`.
    pub max_node_count: usize,
    /// Maximum number of attributes on a single tag; going past this value
    /// yields `HtmlErrorKind::MaxAttributeCountExceeded`.
    pub max_attribute_count: usize,
    /// Maximum length of an attribute value, measured as a difference of
    /// cursor positions over the raw input; going past this value yields
    /// `HtmlErrorKind::MaxAttributeValueLengthExceeded`.
    pub max_attribute_value_len: usize,
}
22
/// Limits that [`HtmlParser::new`] starts with: a nesting depth of
/// [`DEFAULT_MAX_HTML_DEPTH`], 256 Ki nodes, 256 attributes per tag and
/// 64 Ki per attribute value.
pub const DEFAULT_HTML_LIMITS: HtmlLimits = HtmlLimits {
    max_depth: DEFAULT_MAX_HTML_DEPTH,
    max_node_count: 256 * 1024,
    max_attribute_count: 256,
    max_attribute_value_len: 64 * 1024,
};
29
/// Single-pass HTML parser over a borrowed byte slice.
///
/// The parser consumes itself in [`HtmlParser::parse`] or
/// [`HtmlParser::validate`]; limits can be adjusted beforehand with the
/// `with_*` builder methods.
pub struct HtmlParser<'a> {
    /// Read position over the input bytes.
    cursor: Cursor<'a>,
    /// Limits checked while parsing.
    limits: HtmlLimits,
    /// Number of nodes counted so far by `tick_node`.
    nodes_seen: usize,
}
35
36impl<'a> HtmlParser<'a> {
37    pub const fn new(bytes: &'a [u8]) -> Self {
38        Self {
39            cursor: Cursor::new(bytes),
40            limits: DEFAULT_HTML_LIMITS,
41            nodes_seen: 0,
42        }
43    }
44
45    pub const fn with_limits(mut self, limits: HtmlLimits) -> Self {
46        self.limits = limits;
47        self
48    }
49
50    pub const fn with_max_depth(mut self, max_depth: usize) -> Self {
51        self.limits.max_depth = max_depth;
52        self
53    }
54
55    pub fn parse(mut self) -> Result<HtmlValue<'a>, HtmlError> {
56        self.parse_nodes(0)?;
57        Ok(HtmlValue::Document)
58    }
59
60    pub fn validate(mut self) -> Result<(), HtmlError> {
61        self.parse_nodes(0)?;
62        Ok(())
63    }
64
65    fn tick_node(&mut self) -> Result<(), HtmlError> {
66        self.nodes_seen = self.nodes_seen.saturating_add(1);
67        if self.nodes_seen > self.limits.max_node_count {
68            return Err(HtmlError::new(
69                HtmlErrorKind::MaxNodeCountExceeded,
70                self.cursor.position(),
71            ));
72        }
73        Ok(())
74    }
75
76    fn parse_nodes(&mut self, depth: usize) -> Result<(), HtmlError> {
77        if depth > self.limits.max_depth {
78            return Err(HtmlError::new(
79                HtmlErrorKind::MaxDepthExceeded,
80                self.cursor.position(),
81            ));
82        }
83
84        while !self.cursor.is_eof() {
85            if self.cursor.peek() == Some(b'<') {
86                if self.cursor.starts_with(b"</") {
87                    return Ok(());
88                }
89                if self.cursor.starts_with(b"<!--") {
90                    self.parse_comment()?;
91                } else if self.cursor.starts_with(b"<!") {
92                    self.parse_doctype()?;
93                } else {
94                    self.parse_element(depth)?;
95                }
96            } else {
97                self.parse_text()?;
98            }
99        }
100
101        Ok(())
102    }
103
104    fn parse_comment(&mut self) -> Result<(), HtmlError> {
105        self.tick_node()?;
106        let start = self.cursor.position();
107        self.cursor.advance(4);
108
109        loop {
110            if self.cursor.is_eof() {
111                return Err(HtmlError::new(HtmlErrorKind::UnterminatedComment, start));
112            }
113            if self.cursor.starts_with(b"-->") {
114                self.cursor.advance(3);
115                return Ok(());
116            }
117            self.cursor.advance(1);
118        }
119    }
120
121    fn parse_doctype(&mut self) -> Result<(), HtmlError> {
122        self.tick_node()?;
123        let start = self.cursor.position();
124        self.cursor.advance(2);
125
126        loop {
127            if self.cursor.is_eof() {
128                return Err(HtmlError::new(HtmlErrorKind::UnterminatedDoctype, start));
129            }
130            if self.cursor.peek() == Some(b'>') {
131                self.cursor.advance(1);
132                return Ok(());
133            }
134            self.cursor.advance(1);
135        }
136    }
137
138    fn parse_text(&mut self) -> Result<(), HtmlError> {
139        self.tick_node()?;
140        while let Some(b) = self.cursor.peek() {
141            match b {
142                b'<' => break,
143                b'&' => validate_entity(&mut self.cursor)?,
144                _ => self.cursor.advance(1),
145            }
146        }
147        Ok(())
148    }
149
150    fn parse_element(&mut self, depth: usize) -> Result<(), HtmlError> {
151        self.tick_node()?;
152        let tag_start = self.cursor.position();
153        self.cursor.advance(1);
154
155        let tag_name = self.cursor.read_tag_name()?;
156
157        self.parse_attributes()?;
158        self.cursor.skip_ws();
159
160        let self_closing = self.cursor.peek() == Some(b'/');
161        if self_closing {
162            self.cursor.advance(1);
163        }
164
165        if self.cursor.peek() != Some(b'>') {
166            return Err(HtmlError::new(HtmlErrorKind::UnterminatedTag, tag_start));
167        }
168        self.cursor.advance(1);
169
170        if self_closing || is_void_element(tag_name) {
171            return Ok(());
172        }
173
174        if is_raw_text_element(tag_name) {
175            return self.skip_raw_text(tag_name, tag_start);
176        }
177
178        self.parse_nodes(depth + 1)?;
179        self.parse_closing_tag(tag_name, tag_start)
180    }
181
182    fn parse_attributes(&mut self) -> Result<(), HtmlError> {
183        let mut count = 0usize;
184
185        loop {
186            self.cursor.skip_ws();
187            match self.cursor.peek() {
188                Some(b'>') | Some(b'/') | None => return Ok(()),
189                _ => {}
190            }
191
192            self.parse_attribute()?;
193            count = count.saturating_add(1);
194            if count > self.limits.max_attribute_count {
195                return Err(HtmlError::new(
196                    HtmlErrorKind::MaxAttributeCountExceeded,
197                    self.cursor.position(),
198                ));
199            }
200        }
201    }
202
203    fn parse_attribute(&mut self) -> Result<(), HtmlError> {
204        self.cursor.read_while(|b| {
205            b.is_ascii_alphanumeric() || b == b'-' || b == b'_' || b == b':' || b == b'.'
206        });
207
208        self.cursor.skip_ws();
209
210        if self.cursor.peek() != Some(b'=') {
211            return Ok(());
212        }
213        self.cursor.advance(1);
214        self.cursor.skip_ws();
215
216        match self.cursor.peek() {
217            Some(b'"') => self.parse_quoted_value(b'"'),
218            Some(b'\'') => self.parse_quoted_value(b'\''),
219            _ => {
220                self.cursor.read_while(|b| {
221                    !matches!(
222                        b,
223                        b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/' | b'"' | b'\'' | b'='
224                    )
225                });
226                Ok(())
227            }
228        }
229    }
230
231    fn parse_quoted_value(&mut self, quote: u8) -> Result<(), HtmlError> {
232        let start = self.cursor.position();
233        self.cursor.advance(1);
234        let content_start = self.cursor.position();
235
236        loop {
237            match self.cursor.peek() {
238                None => return Err(HtmlError::new(HtmlErrorKind::UnterminatedAttribute, start)),
239                Some(b) if b == quote => {
240                    let len = self.cursor.position() - content_start;
241                    if len > self.limits.max_attribute_value_len {
242                        return Err(HtmlError::new(
243                            HtmlErrorKind::MaxAttributeValueLengthExceeded,
244                            content_start,
245                        ));
246                    }
247                    self.cursor.advance(1);
248                    return Ok(());
249                }
250                Some(b'&') => validate_entity(&mut self.cursor)?,
251                _ => self.cursor.advance(1),
252            }
253        }
254    }
255
256    fn parse_closing_tag(&mut self, expected: &str, open_offset: usize) -> Result<(), HtmlError> {
257        if !self.cursor.starts_with(b"</") {
258            return Err(HtmlError::new(
259                HtmlErrorKind::MismatchedClosingTag,
260                self.cursor.position(),
261            ));
262        }
263        self.cursor.advance(2);
264
265        let close_name = self.cursor.read_tag_name()?;
266
267        if !eq_ignore_ascii_case(close_name, expected) {
268            return Err(HtmlError::new(
269                HtmlErrorKind::MismatchedClosingTag,
270                open_offset,
271            ));
272        }
273
274        self.cursor.skip_ws();
275
276        if self.cursor.peek() != Some(b'>') {
277            return Err(HtmlError::new(
278                HtmlErrorKind::UnterminatedTag,
279                self.cursor.position(),
280            ));
281        }
282        self.cursor.advance(1);
283        Ok(())
284    }
285
286    fn skip_raw_text(&mut self, tag_name: &str, open_offset: usize) -> Result<(), HtmlError> {
287        loop {
288            if self.cursor.is_eof() {
289                return Err(HtmlError::new(HtmlErrorKind::UnterminatedTag, open_offset));
290            }
291            if self.cursor.starts_with(b"</") {
292                let saved = self.cursor.position();
293                self.cursor.advance(2);
294                if let Ok(name) = self.cursor.read_tag_name()
295                    && eq_ignore_ascii_case(name, tag_name)
296                {
297                    self.cursor.skip_ws();
298                    if self.cursor.peek() == Some(b'>') {
299                        self.cursor.advance(1);
300                        return Ok(());
301                    }
302                }
303                self.cursor.advance(0);
304                if self.cursor.position() == saved + 2 {
305                    self.cursor.advance(1);
306                }
307                continue;
308            }
309            self.cursor.advance(1);
310        }
311    }
312}
313
314fn is_void_element(name: &str) -> bool {
315    VOID_ELEMENTS.iter().any(|&v| eq_ignore_ascii_case(v, name))
316}
317
318fn is_raw_text_element(name: &str) -> bool {
319    RAW_TEXT_ELEMENTS
320        .iter()
321        .any(|&v| eq_ignore_ascii_case(v, name))
322}
323
/// Compares two strings for equality while treating ASCII letters of
/// different case as equal. Non-ASCII bytes must match exactly.
///
/// Delegates to the standard library's `str::eq_ignore_ascii_case`, which
/// performs the same length check and byte-wise comparison.
fn eq_ignore_ascii_case(a: &str, b: &str) -> bool {
    a.eq_ignore_ascii_case(b)
}
332
333pub fn parse_html(bytes: &[u8]) -> Result<HtmlValue<'_>, HtmlError> {
334    HtmlParser::new(bytes).parse()
335}
336
337pub fn parse_html_with_max_depth(
338    bytes: &[u8],
339    max_depth: usize,
340) -> Result<HtmlValue<'_>, HtmlError> {
341    HtmlParser::new(bytes).with_max_depth(max_depth).parse()
342}
343
344pub fn parse_html_with_limits(
345    bytes: &[u8],
346    limits: HtmlLimits,
347) -> Result<HtmlValue<'_>, HtmlError> {
348    HtmlParser::new(bytes).with_limits(limits).parse()
349}
350
351pub fn validate_html(bytes: &[u8]) -> Result<(), HtmlError> {
352    HtmlParser::new(bytes).validate()
353}