1use super::{
2 constants,
3 handle::NodeHandle,
4 tag::{Attributes, HTMLTag, Node},
5};
6use crate::InnerNodeHandle;
7use crate::{bytes::Bytes, inline::vec::InlineVec, simd, ParseError};
8use crate::{stream::Stream, ParserOptions};
9use std::collections::HashMap;
10
/// Flat storage for every node produced by the parser.
///
/// Nodes are appended in the order they are registered, and a `NodeHandle`
/// created by the parser wraps the index of its node in this vector.
pub type Tree<'a> = Vec<Node<'a>>;
13
/// Collection of handles to the nodes that share one class name; this is the
/// value type of the parser's class lookup table.
///
/// NOTE(review): the `2` is presumably the number of handles stored inline
/// before `InlineVec` spills to the heap — confirm against `InlineVec`.
pub type ClassVec = InlineVec<NodeHandle, 2>;
16
/// The HTML version a document declares through its doctype.
///
/// The parser in this file only ever assigns [`HTMLVersion::HTML5`] (when the
/// doctype name is `html`); the HTML 4.01 variants are not assigned here.
///
/// The enum is fieldless, so equality is total (`Eq`) and it can be used as a
/// key in hashed collections (`Hash`).
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
#[repr(C)]
pub enum HTMLVersion {
    /// HTML 5, i.e. `<!DOCTYPE html>`.
    HTML5,
    /// HTML 4.01 with the strict DTD.
    StrictHTML401,
    /// HTML 4.01 with the transitional DTD.
    TransitionalHTML401,
    /// HTML 4.01 with the frameset DTD.
    FramesetHTML401,
}
/// The HTML parser: the input stream plus all state built up while parsing.
///
/// The lifetime `'a` is the lifetime of the input; parsed nodes borrow from it.
#[derive(Debug)]
pub struct Parser<'a> {
    /// Byte stream over the input being parsed.
    pub(crate) stream: Stream<'a, u8>,
    /// Handles of the tags that are currently open. The last entry is the
    /// parent that newly parsed nodes are attached to.
    pub(crate) stack: Vec<NodeHandle>,
    /// Options supplied at construction; consulted to decide whether ids and
    /// classes are tracked.
    pub(crate) options: ParserOptions,
    /// Every node registered so far; a `NodeHandle` indexes into this vector.
    pub(crate) tags: Tree<'a>,
    /// Handles of the top-level nodes, i.e. nodes parsed while no tag was open.
    pub(crate) ast: Vec<NodeHandle>,
    /// Lookup table from an `id` attribute value to the tag carrying it.
    /// As far as this file shows, entries are added only when a tag's matching
    /// closing tag is read and id tracking is enabled.
    pub(crate) ids: HashMap<Bytes<'a>, NodeHandle>,
    /// Lookup table from a class name to the tags carrying it. Filled under the
    /// same conditions as `ids`, when class tracking is enabled.
    pub(crate) classes: HashMap<Bytes<'a>, ClassVec>,
    /// HTML version taken from the doctype, or `None` if none was recognised.
    pub(crate) version: Option<HTMLVersion>,
}
54
55impl<'a> Parser<'a> {
56 pub(crate) fn new(input: &str, options: ParserOptions) -> Parser<'_> {
57 Parser {
58 stack: Vec::with_capacity(4),
59 options,
60 tags: Vec::new(),
61 stream: Stream::new(input.as_bytes()),
62 ast: Vec::new(),
63 ids: HashMap::new(),
64 classes: HashMap::new(),
65 version: None,
66 }
67 }
68
69 #[inline(always)]
70 fn register_tag(&mut self, node: Node<'a>) -> NodeHandle {
71 self.tags.push(node);
72 NodeHandle::new((self.tags.len() - 1) as u32)
73 }
74
75 #[inline(always)]
76 fn skip_whitespaces(&mut self) {
77 self.read_while2(b' ', b'\n');
78 }
79
80 fn read_to(&mut self, needle: u8) -> &'a [u8] {
81 let start = self.stream.idx;
82 let bytes = &self.stream.data()[start..];
83
84 let end = simd::find(bytes, needle).unwrap_or_else(|| self.stream.len() - start);
85
86 self.stream.idx += end;
87 self.stream.slice(start, start + end)
88 }
89
90 fn read_to3(&mut self, needle: [u8; 3]) -> &'a [u8] {
91 let start = self.stream.idx;
92 let bytes = &self.stream.data()[start..];
93
94 let end = simd::find3(bytes, needle).unwrap_or_else(|| self.stream.len() - start);
95
96 self.stream.idx += end;
97 self.stream.slice(start, start + end)
98 }
99
100 fn read_while2(&mut self, needle1: u8, needle2: u8) -> Option<()> {
101 loop {
102 let ch = self.stream.current_cpy()?;
103
104 let eq1 = ch == needle1;
105 let eq2 = ch == needle2;
106
107 if !eq1 & !eq2 {
108 return Some(());
109 }
110
111 self.stream.advance();
112 }
113 }
114
115 fn read_ident(&mut self) -> Option<&'a [u8]> {
116 let start = self.stream.idx;
117 let bytes = &self.stream.data()[start..];
118
119 let end = simd::search_non_ident(bytes).unwrap_or_else(|| self.stream.len() - start);
122
123 self.stream.idx += end;
124 Some(self.stream.slice(start, start + end))
125 }
126
127 fn skip_comment_with_start(&mut self, start: usize) -> &'a [u8] {
128 while !self.stream.is_eof() {
129 let idx = self.stream.idx;
130
131 if self
132 .stream
133 .slice_len(idx, constants::COMMENT.len())
134 .eq(constants::COMMENT)
135 {
136 self.stream.advance_by(constants::COMMENT.len());
137
138 let is_end_of_comment = self.stream.expect_and_skip_cond(b'>');
139
140 if is_end_of_comment {
141 return self.stream.slice(start, self.stream.idx);
142 }
143 }
144
145 self.stream.advance();
146 }
147
148 &[]
149 }
150
151 fn parse_attribute(&mut self) -> Option<(&'a [u8], Option<&'a [u8]>)> {
152 let name = self.read_ident()?;
153 self.skip_whitespaces();
154
155 let has_value = self.stream.expect_and_skip_cond(b'=');
156 if !has_value {
157 return Some((name, None));
158 }
159
160 self.skip_whitespaces();
161
162 let value = if let Some(quote) = self.stream.expect_oneof_and_skip(b"\"'") {
163 self.read_to(quote)
164 } else {
165 self.read_to3([b' ', b'\n', b'>'])
166 };
167
168 Some((name, Some(value)))
169 }
170
171 fn parse_attributes(&mut self) -> Option<Attributes<'a>> {
172 let mut attributes = Attributes::new();
173
174 loop {
175 self.skip_whitespaces();
176
177 let cur = self.stream.current_cpy()?;
178
179 if simd::is_closing(cur) {
180 break;
181 }
182
183 if let Some((key, value)) = self.parse_attribute() {
184 let value: Option<Bytes<'a>> = value.map(Into::into);
185
186 match key {
187 b"id" => attributes.id = value,
188 b"class" => attributes.class = value,
189 _ => attributes.raw.insert(key.into(), value),
190 };
191 }
192
193 if !simd::is_closing(self.stream.current_cpy()?) {
194 self.stream.advance();
195 }
196 }
197
198 Some(attributes)
199 }
200
201 #[inline]
202 fn add_to_parent(&mut self, handle: NodeHandle) {
203 if let Some(last) = self.stack.last() {
204 let last = self
205 .tags
206 .get_mut(last.get_inner() as usize)
207 .unwrap()
208 .as_tag_mut()
209 .unwrap();
210
211 last._children.push(handle);
212 } else {
213 self.ast.push(handle);
214 }
215 }
216
217 fn read_end(&mut self) {
218 self.stream.advance();
219
220 let closing_tag_name = self.read_to(b'>');
221
222 self.stream.expect_and_skip_cond(b'>');
223
224 let closing_tag_matches_parent = self
225 .stack
226 .last()
227 .and_then(|last_handle| last_handle.get(self))
228 .and_then(|last_item| last_item.as_tag())
229 .is_some_and(|last_tag| last_tag.name() == closing_tag_name);
230
231 if !closing_tag_matches_parent {
232 return;
233 }
234
235 if let Some(handle) = self.stack.pop() {
236 let tag = self
237 .tags
238 .get_mut(handle.get_inner() as usize)
239 .unwrap()
240 .as_tag_mut()
241 .unwrap();
242
243 let ptr = self.stream.data().as_ptr() as usize;
244 let offset = tag._raw.as_ptr() as usize;
245 let offset = offset - ptr;
246
247 tag._raw = self.stream.slice(offset, self.stream.idx).into();
248
249 let (track_classes, track_ids) = (
250 self.options.is_tracking_classes(),
251 self.options.is_tracking_ids(),
252 );
253
254 if let (true, Some(bytes)) = (track_classes, &tag._attributes.class) {
255 let s = bytes
256 .as_bytes_borrowed()
257 .and_then(|x| std::str::from_utf8(x).ok())
258 .map(|x| x.split_ascii_whitespace());
259
260 if let Some(s) = s {
261 for class in s {
262 self.classes
263 .entry(class.into())
264 .or_insert_with(InlineVec::new)
265 .push(handle);
266 }
267 }
268 }
269
270 if let (true, Some(bytes)) = (track_ids, &tag._attributes.id) {
271 self.ids.insert(bytes.clone(), handle);
272 }
273 }
274 }
275
276 #[cold]
277 #[inline(never)]
278 fn read_markdown(&mut self) -> Option<()> {
279 let start = self.stream.idx - 1; self.stream.advance(); let is_comment = self
284 .stream
285 .slice_len(self.stream.idx, 2)
286 .eq(constants::COMMENT);
287
288 if is_comment {
289 let comment = self.skip_comment_with_start(start);
290 let comment = self.register_tag(Node::Comment(comment.into()));
291 self.add_to_parent(comment);
292 } else {
293 let tag = self.read_ident()?;
294
295 self.skip_whitespaces();
296
297 if simd::matches_case_insensitive(tag, *b"doctype") {
298 let doctype = self.read_ident()?;
299
300 let html5 = simd::matches_case_insensitive(doctype, *b"html");
301
302 if html5 {
303 self.version = Some(HTMLVersion::HTML5);
304 }
305
306 self.skip_whitespaces();
307 self.stream.advance(); }
309 }
310
311 Some(())
312 }
313
314 fn parse_tag(&mut self) -> Option<()> {
315 let start = self.stream.idx;
316
317 self.stream.advance();
318 self.skip_whitespaces();
319 let cur = self.stream.current_cpy()?;
320
321 match cur {
322 b'/' => self.read_end(),
323 b'!' => {
324 self.read_markdown();
325 }
326 _ => {
327 let name = self.read_ident()?;
328 self.skip_whitespaces();
329
330 let attr = self.parse_attributes()?;
331
332 let is_self_closing = self.stream.expect_and_skip_cond(b'/');
333
334 self.stream.expect_and_skip(b'>')?;
335
336 let this = self.register_tag(Node::Tag(HTMLTag::new(
337 name.into(),
338 attr,
339 InlineVec::new(),
340 self.stream.slice(start, self.stream.idx).into(),
341 )));
342
343 self.add_to_parent(this);
344
345 if !is_self_closing && !constants::VOID_TAGS.contains(&name) {
350 self.stack.push(this);
351 }
352 }
353 };
354
355 Some(())
356 }
357
358 pub(crate) fn parse_single(&mut self) -> Option<()> {
359 loop {
360 let cur = self.stream.current()?;
361
362 if *cur == b'<' {
363 self.parse_tag();
364 } else {
365 let raw = Node::Raw(self.read_to(b'<').into());
366 let handle = self.register_tag(raw);
367 self.add_to_parent(handle);
368 }
369 }
370 }
371
372 #[inline]
374 pub fn resolve_node_id(&self, id: InnerNodeHandle) -> Option<&Node<'a>> {
375 self.tags.get(id as usize)
376 }
377
378 #[inline]
380 pub fn resolve_node_id_mut(&mut self, id: InnerNodeHandle) -> Option<&mut Node<'a>> {
381 self.tags.get_mut(id as usize)
382 }
383
384 pub(crate) fn parse(&mut self) -> Result<(), ParseError> {
385 if self.stream.len() > u32::MAX as usize {
386 return Err(ParseError::InvalidLength);
387 }
388
389 while !self.stream.is_eof() {
390 self.parse_single();
391 }
392
393 Ok(())
394 }
395}