tl/parser/
base.rs

1use super::{
2    constants,
3    handle::NodeHandle,
4    tag::{Attributes, HTMLTag, Node},
5};
6use crate::InnerNodeHandle;
7use crate::{bytes::Bytes, inline::vec::InlineVec, simd, ParseError};
8use crate::{stream::Stream, ParserOptions};
9use std::collections::HashMap;
10
/// A flat list of HTML nodes.
///
/// The parser appends every node it creates to one such list and refers to
/// nodes by their index in it (wrapped in a `NodeHandle`).
pub type Tree<'a> = Vec<Node<'a>>;
13
/// Inline vector of node handles, used as the value type of the parser's
/// class-name lookup table (one entry per node carrying that class).
///
/// NOTE(review): the `2` is presumably the number of handles kept inline
/// before `InlineVec` spills to the heap; confirm against `InlineVec`.
pub type ClassVec = InlineVec<NodeHandle, 2>;
16
/// HTML version, as declared by the document's `<!DOCTYPE>`.
///
/// The type is a plain field-less enum, so besides `PartialEq` it also
/// implements `Eq` and `Hash`; this allows it to be used as a key in hashed
/// collections and in exhaustive equality contexts.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
#[repr(C)]
pub enum HTMLVersion {
    /// HTML Version 5
    HTML5,
    /// Strict HTML 4.01
    StrictHTML401,
    /// Transitional HTML 4.01
    TransitionalHTML401,
    /// Frameset HTML 4.01
    FramesetHTML401,
}
/// The main HTML parser
///
/// Users of this library are not supposed to directly construct this struct.
/// Instead, users must call `tl::parse()` and use the returned `VDom`.
#[derive(Debug)]
pub struct Parser<'a> {
    /// The inner stream that is used to iterate through the HTML source
    pub(crate) stream: Stream<'a, u8>,
    /// Handles of the tags that are currently open, innermost last.
    ///
    /// New nodes are attached as children of the last entry; when the stack
    /// is empty they become top-level nodes instead.
    pub(crate) stack: Vec<NodeHandle>,
    /// Specified options for this HTML parser
    pub(crate) options: ParserOptions,
    /// A global collection of all HTML tags that appear in the source code
    ///
    /// HTML Nodes contain indices into this vector
    pub(crate) tags: Tree<'a>,
    /// The topmost HTML nodes
    pub(crate) ast: Vec<NodeHandle>,
    /// A HashMap that maps Tag ID to a Node ID
    pub(crate) ids: HashMap<Bytes<'a>, NodeHandle>,
    /// A HashMap that maps Tag Class to a Node ID
    pub(crate) classes: HashMap<Bytes<'a>, ClassVec>,
    /// The current HTML version, if set
    pub(crate) version: Option<HTMLVersion>,
}
54
55impl<'a> Parser<'a> {
56    pub(crate) fn new(input: &str, options: ParserOptions) -> Parser<'_> {
57        Parser {
58            stack: Vec::with_capacity(4),
59            options,
60            tags: Vec::new(),
61            stream: Stream::new(input.as_bytes()),
62            ast: Vec::new(),
63            ids: HashMap::new(),
64            classes: HashMap::new(),
65            version: None,
66        }
67    }
68
69    #[inline(always)]
70    fn register_tag(&mut self, node: Node<'a>) -> NodeHandle {
71        self.tags.push(node);
72        NodeHandle::new((self.tags.len() - 1) as u32)
73    }
74
75    #[inline(always)]
76    fn skip_whitespaces(&mut self) {
77        self.read_while2(b' ', b'\n');
78    }
79
80    fn read_to(&mut self, needle: u8) -> &'a [u8] {
81        let start = self.stream.idx;
82        let bytes = &self.stream.data()[start..];
83
84        let end = simd::find(bytes, needle).unwrap_or_else(|| self.stream.len() - start);
85
86        self.stream.idx += end;
87        self.stream.slice(start, start + end)
88    }
89
90    fn read_to3(&mut self, needle: [u8; 3]) -> &'a [u8] {
91        let start = self.stream.idx;
92        let bytes = &self.stream.data()[start..];
93
94        let end = simd::find3(bytes, needle).unwrap_or_else(|| self.stream.len() - start);
95
96        self.stream.idx += end;
97        self.stream.slice(start, start + end)
98    }
99
100    fn read_while2(&mut self, needle1: u8, needle2: u8) -> Option<()> {
101        loop {
102            let ch = self.stream.current_cpy()?;
103
104            let eq1 = ch == needle1;
105            let eq2 = ch == needle2;
106
107            if !eq1 & !eq2 {
108                return Some(());
109            }
110
111            self.stream.advance();
112        }
113    }
114
115    fn read_ident(&mut self) -> Option<&'a [u8]> {
116        let start = self.stream.idx;
117        let bytes = &self.stream.data()[start..];
118
119        // If we do not find any characters that are not identifiers
120        // then we are probably at the end of the stream
121        let end = simd::search_non_ident(bytes).unwrap_or_else(|| self.stream.len() - start);
122
123        // If we don't find any identifier characters, return `None`.
124        if end == 0 {
125            return None;
126        }
127
128        self.stream.idx += end;
129        Some(self.stream.slice(start, start + end))
130    }
131
132    fn skip_comment_with_start(&mut self, start: usize) -> &'a [u8] {
133        while !self.stream.is_eof() {
134            let idx = self.stream.idx;
135
136            if self
137                .stream
138                .slice_len(idx, constants::COMMENT.len())
139                .eq(constants::COMMENT)
140            {
141                self.stream.advance_by(constants::COMMENT.len());
142
143                let is_end_of_comment = self.stream.expect_and_skip_cond(b'>');
144
145                if is_end_of_comment {
146                    return self.stream.slice(start, self.stream.idx);
147                }
148            }
149
150            self.stream.advance();
151        }
152
153        &[]
154    }
155
156    fn parse_attribute(&mut self) -> Option<(&'a [u8], Option<&'a [u8]>)> {
157        let name = self.read_ident()?;
158        self.skip_whitespaces();
159
160        let has_value = self.stream.expect_and_skip_cond(b'=');
161        if !has_value {
162            return Some((name, None));
163        }
164
165        self.skip_whitespaces();
166
167        let value = if let Some(quote) = self.stream.expect_oneof_and_skip(b"\"'") {
168            self.read_to(quote)
169        } else {
170            self.read_to3([b' ', b'\n', b'>'])
171        };
172
173        Some((name, Some(value)))
174    }
175
176    fn parse_attributes(&mut self) -> Option<Attributes<'a>> {
177        let mut attributes = Attributes::new();
178
179        loop {
180            self.skip_whitespaces();
181
182            let cur = self.stream.current_cpy()?;
183
184            if simd::is_closing(cur) {
185                break;
186            }
187
188            if let Some((key, value)) = self.parse_attribute() {
189                let has_value = value.is_some();
190                let value: Option<Bytes<'a>> = value.map(Into::into);
191
192                match key {
193                    b"id" => attributes.id = value,
194                    b"class" => attributes.class = value,
195                    _ => attributes.raw.insert(key.into(), value),
196                };
197
198                // Only advance past the delimiter if we read a value.
199                if has_value && !simd::is_closing(self.stream.current_cpy()?) {
200                    self.stream.advance();
201                }
202            } else {
203                // No valid attribute found; skip this character.
204                self.stream.advance();
205            }
206        }
207
208        Some(attributes)
209    }
210
211    #[inline]
212    fn add_to_parent(&mut self, handle: NodeHandle) {
213        if let Some(last) = self.stack.last() {
214            let last = self
215                .tags
216                .get_mut(last.get_inner() as usize)
217                .unwrap()
218                .as_tag_mut()
219                .unwrap();
220
221            last._children.push(handle);
222        } else {
223            self.ast.push(handle);
224        }
225    }
226
227    fn read_end(&mut self) {
228        self.stream.advance();
229
230        let closing_tag_name = self.read_to(b'>');
231
232        self.stream.expect_and_skip_cond(b'>');
233
234        let closing_tag_matches_parent = self
235            .stack
236            .last()
237            .and_then(|last_handle| last_handle.get(self))
238            .and_then(|last_item| last_item.as_tag())
239            .is_some_and(|last_tag| last_tag.name() == closing_tag_name);
240
241        if !closing_tag_matches_parent {
242            return;
243        }
244
245        if let Some(handle) = self.stack.pop() {
246            let tag = self
247                .tags
248                .get_mut(handle.get_inner() as usize)
249                .unwrap()
250                .as_tag_mut()
251                .unwrap();
252
253            let ptr = self.stream.data().as_ptr() as usize;
254            let offset = tag._raw.as_ptr() as usize;
255            let offset = offset - ptr;
256
257            tag._raw = self.stream.slice(offset, self.stream.idx).into();
258
259            let (track_classes, track_ids) = (
260                self.options.is_tracking_classes(),
261                self.options.is_tracking_ids(),
262            );
263
264            if let (true, Some(bytes)) = (track_classes, &tag._attributes.class) {
265                let s = bytes
266                    .as_bytes_borrowed()
267                    .and_then(|x| std::str::from_utf8(x).ok())
268                    .map(|x| x.split_ascii_whitespace());
269
270                if let Some(s) = s {
271                    for class in s {
272                        self.classes
273                            .entry(class.into())
274                            .or_insert_with(InlineVec::new)
275                            .push(handle);
276                    }
277                }
278            }
279
280            if let (true, Some(bytes)) = (track_ids, &tag._attributes.id) {
281                self.ids.insert(bytes.clone(), handle);
282            }
283        }
284    }
285
286    #[cold]
287    #[inline(never)]
288    fn read_markdown(&mut self) -> Option<()> {
289        let start = self.stream.idx - 1; // position of the < which is needed when registering the comment
290
291        self.stream.advance(); // skip !
292
293        let is_comment = self
294            .stream
295            .slice_len(self.stream.idx, 2)
296            .eq(constants::COMMENT);
297
298        if is_comment {
299            let comment = self.skip_comment_with_start(start);
300            let comment = self.register_tag(Node::Comment(comment.into()));
301            self.add_to_parent(comment);
302        } else {
303            let tag = self.read_ident()?;
304
305            self.skip_whitespaces();
306
307            if simd::matches_case_insensitive(tag, *b"doctype") {
308                let doctype = self.read_ident()?;
309
310                let html5 = simd::matches_case_insensitive(doctype, *b"html");
311
312                if html5 {
313                    self.version = Some(HTMLVersion::HTML5);
314                }
315
316                self.skip_whitespaces();
317                self.stream.advance(); // skip >
318            }
319        }
320
321        Some(())
322    }
323
324    fn parse_tag(&mut self) -> Option<()> {
325        let start = self.stream.idx;
326
327        self.stream.advance();
328        self.skip_whitespaces();
329        let cur = self.stream.current_cpy()?;
330
331        match cur {
332            b'/' => self.read_end(),
333            b'!' => {
334                self.read_markdown();
335            }
336            _ => {
337                let name = self.read_ident()?;
338                self.skip_whitespaces();
339
340                let attr = self.parse_attributes()?;
341
342                let is_self_closing = self.stream.expect_and_skip_cond(b'/');
343
344                self.stream.expect_and_skip(b'>')?;
345
346                let this = self.register_tag(Node::Tag(HTMLTag::new(
347                    name.into(),
348                    attr,
349                    InlineVec::new(),
350                    self.stream.slice(start, self.stream.idx).into(),
351                )));
352
353                self.add_to_parent(this);
354
355                // some tags are self closing, so even though there might not be a /,
356                // we don't always want to push them to the stack
357                // e.g. <br><p>Hello</p>
358                // <p> should not be a subtag of <br>
359                if !is_self_closing && !constants::VOID_TAGS.contains(&name) {
360                    self.stack.push(this);
361                }
362            }
363        };
364
365        Some(())
366    }
367
368    pub(crate) fn parse_single(&mut self) -> Option<()> {
369        loop {
370            let cur = self.stream.current()?;
371
372            if *cur == b'<' {
373                self.parse_tag();
374            } else {
375                let raw = Node::Raw(self.read_to(b'<').into());
376                let handle = self.register_tag(raw);
377                self.add_to_parent(handle);
378            }
379        }
380    }
381
382    /// Resolves an internal Node ID obtained from a NodeHandle to a Node
383    #[inline]
384    pub fn resolve_node_id(&self, id: InnerNodeHandle) -> Option<&Node<'a>> {
385        self.tags.get(id as usize)
386    }
387
388    /// Resolves an internal Node ID obtained from a NodeHandle to a Node
389    #[inline]
390    pub fn resolve_node_id_mut(&mut self, id: InnerNodeHandle) -> Option<&mut Node<'a>> {
391        self.tags.get_mut(id as usize)
392    }
393
394    pub(crate) fn parse(&mut self) -> Result<(), ParseError> {
395        if self.stream.len() > u32::MAX as usize {
396            return Err(ParseError::InvalidLength);
397        }
398
399        while !self.stream.is_eof() {
400            self.parse_single();
401        }
402
403        Ok(())
404    }
405}