1use super::{
2 constants,
3 handle::NodeHandle,
4 tag::{Attributes, HTMLTag, Node},
5};
6use crate::InnerNodeHandle;
7use crate::{bytes::Bytes, inline::vec::InlineVec, simd, ParseError};
8use crate::{stream::Stream, ParserOptions};
9use std::collections::HashMap;
10
11pub type Tree<'a> = Vec<Node<'a>>;
13
14pub type ClassVec = InlineVec<NodeHandle, 2>;
16
/// HTML version announced by a document's doctype declaration.
///
/// The enum is fieldless, so besides `PartialEq` it also implements `Eq` and `Hash`
/// and can be used as a key in hashed collections.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
#[repr(C)]
pub enum HTMLVersion {
    /// HTML5 (`<!DOCTYPE html>`).
    HTML5,
    /// HTML 4.01 Strict.
    StrictHTML401,
    /// HTML 4.01 Transitional.
    TransitionalHTML401,
    /// HTML 4.01 Frameset.
    FramesetHTML401,
}
30#[derive(Debug)]
35pub struct Parser<'a> {
36 pub(crate) stream: Stream<'a, u8>,
38 pub(crate) stack: Vec<NodeHandle>,
39 pub(crate) options: ParserOptions,
41 pub(crate) tags: Tree<'a>,
45 pub(crate) ast: Vec<NodeHandle>,
47 pub(crate) ids: HashMap<Bytes<'a>, NodeHandle>,
49 pub(crate) classes: HashMap<Bytes<'a>, ClassVec>,
51 pub(crate) version: Option<HTMLVersion>,
53}
54
55impl<'a> Parser<'a> {
56 pub(crate) fn new(input: &str, options: ParserOptions) -> Parser<'_> {
57 Parser {
58 stack: Vec::with_capacity(4),
59 options,
60 tags: Vec::new(),
61 stream: Stream::new(input.as_bytes()),
62 ast: Vec::new(),
63 ids: HashMap::new(),
64 classes: HashMap::new(),
65 version: None,
66 }
67 }
68
69 #[inline(always)]
70 fn register_tag(&mut self, node: Node<'a>) -> NodeHandle {
71 self.tags.push(node);
72 NodeHandle::new((self.tags.len() - 1) as u32)
73 }
74
75 #[inline(always)]
76 fn skip_whitespaces(&mut self) {
77 self.read_while2(b' ', b'\n');
78 }
79
80 fn read_to(&mut self, needle: u8) -> &'a [u8] {
81 let start = self.stream.idx;
82 let bytes = &self.stream.data()[start..];
83
84 let end = simd::find(bytes, needle).unwrap_or_else(|| self.stream.len() - start);
85
86 self.stream.idx += end;
87 self.stream.slice(start, start + end)
88 }
89
90 fn read_to3(&mut self, needle: [u8; 3]) -> &'a [u8] {
91 let start = self.stream.idx;
92 let bytes = &self.stream.data()[start..];
93
94 let end = simd::find3(bytes, needle).unwrap_or_else(|| self.stream.len() - start);
95
96 self.stream.idx += end;
97 self.stream.slice(start, start + end)
98 }
99
100 fn read_while2(&mut self, needle1: u8, needle2: u8) -> Option<()> {
101 loop {
102 let ch = self.stream.current_cpy()?;
103
104 let eq1 = ch == needle1;
105 let eq2 = ch == needle2;
106
107 if !eq1 & !eq2 {
108 return Some(());
109 }
110
111 self.stream.advance();
112 }
113 }
114
115 fn read_ident(&mut self) -> Option<&'a [u8]> {
116 let start = self.stream.idx;
117 let bytes = &self.stream.data()[start..];
118
119 let end = simd::search_non_ident(bytes).unwrap_or_else(|| self.stream.len() - start);
122
123 if end == 0 {
125 return None;
126 }
127
128 self.stream.idx += end;
129 Some(self.stream.slice(start, start + end))
130 }
131
132 fn skip_comment_with_start(&mut self, start: usize) -> &'a [u8] {
133 while !self.stream.is_eof() {
134 let idx = self.stream.idx;
135
136 if self
137 .stream
138 .slice_len(idx, constants::COMMENT.len())
139 .eq(constants::COMMENT)
140 {
141 self.stream.advance_by(constants::COMMENT.len());
142
143 let is_end_of_comment = self.stream.expect_and_skip_cond(b'>');
144
145 if is_end_of_comment {
146 return self.stream.slice(start, self.stream.idx);
147 }
148 }
149
150 self.stream.advance();
151 }
152
153 &[]
154 }
155
156 fn parse_attribute(&mut self) -> Option<(&'a [u8], Option<&'a [u8]>)> {
157 let name = self.read_ident()?;
158 self.skip_whitespaces();
159
160 let has_value = self.stream.expect_and_skip_cond(b'=');
161 if !has_value {
162 return Some((name, None));
163 }
164
165 self.skip_whitespaces();
166
167 let value = if let Some(quote) = self.stream.expect_oneof_and_skip(b"\"'") {
168 self.read_to(quote)
169 } else {
170 self.read_to3([b' ', b'\n', b'>'])
171 };
172
173 Some((name, Some(value)))
174 }
175
176 fn parse_attributes(&mut self) -> Option<Attributes<'a>> {
177 let mut attributes = Attributes::new();
178
179 loop {
180 self.skip_whitespaces();
181
182 let cur = self.stream.current_cpy()?;
183
184 if simd::is_closing(cur) {
185 break;
186 }
187
188 if let Some((key, value)) = self.parse_attribute() {
189 let has_value = value.is_some();
190 let value: Option<Bytes<'a>> = value.map(Into::into);
191
192 match key {
193 b"id" => attributes.id = value,
194 b"class" => attributes.class = value,
195 _ => attributes.raw.insert(key.into(), value),
196 };
197
198 if has_value && !simd::is_closing(self.stream.current_cpy()?) {
200 self.stream.advance();
201 }
202 } else {
203 self.stream.advance();
205 }
206 }
207
208 Some(attributes)
209 }
210
211 #[inline]
212 fn add_to_parent(&mut self, handle: NodeHandle) {
213 if let Some(last) = self.stack.last() {
214 let last = self
215 .tags
216 .get_mut(last.get_inner() as usize)
217 .unwrap()
218 .as_tag_mut()
219 .unwrap();
220
221 last._children.push(handle);
222 } else {
223 self.ast.push(handle);
224 }
225 }
226
227 fn read_end(&mut self) {
228 self.stream.advance();
229
230 let closing_tag_name = self.read_to(b'>');
231
232 self.stream.expect_and_skip_cond(b'>');
233
234 let closing_tag_matches_parent = self
235 .stack
236 .last()
237 .and_then(|last_handle| last_handle.get(self))
238 .and_then(|last_item| last_item.as_tag())
239 .is_some_and(|last_tag| last_tag.name() == closing_tag_name);
240
241 if !closing_tag_matches_parent {
242 return;
243 }
244
245 if let Some(handle) = self.stack.pop() {
246 let tag = self
247 .tags
248 .get_mut(handle.get_inner() as usize)
249 .unwrap()
250 .as_tag_mut()
251 .unwrap();
252
253 let ptr = self.stream.data().as_ptr() as usize;
254 let offset = tag._raw.as_ptr() as usize;
255 let offset = offset - ptr;
256
257 tag._raw = self.stream.slice(offset, self.stream.idx).into();
258
259 let (track_classes, track_ids) = (
260 self.options.is_tracking_classes(),
261 self.options.is_tracking_ids(),
262 );
263
264 if let (true, Some(bytes)) = (track_classes, &tag._attributes.class) {
265 let s = bytes
266 .as_bytes_borrowed()
267 .and_then(|x| std::str::from_utf8(x).ok())
268 .map(|x| x.split_ascii_whitespace());
269
270 if let Some(s) = s {
271 for class in s {
272 self.classes
273 .entry(class.into())
274 .or_insert_with(InlineVec::new)
275 .push(handle);
276 }
277 }
278 }
279
280 if let (true, Some(bytes)) = (track_ids, &tag._attributes.id) {
281 self.ids.insert(bytes.clone(), handle);
282 }
283 }
284 }
285
286 #[cold]
287 #[inline(never)]
288 fn read_markdown(&mut self) -> Option<()> {
289 let start = self.stream.idx - 1; self.stream.advance(); let is_comment = self
294 .stream
295 .slice_len(self.stream.idx, 2)
296 .eq(constants::COMMENT);
297
298 if is_comment {
299 let comment = self.skip_comment_with_start(start);
300 let comment = self.register_tag(Node::Comment(comment.into()));
301 self.add_to_parent(comment);
302 } else {
303 let tag = self.read_ident()?;
304
305 self.skip_whitespaces();
306
307 if simd::matches_case_insensitive(tag, *b"doctype") {
308 let doctype = self.read_ident()?;
309
310 let html5 = simd::matches_case_insensitive(doctype, *b"html");
311
312 if html5 {
313 self.version = Some(HTMLVersion::HTML5);
314 }
315
316 self.skip_whitespaces();
317 self.stream.advance(); }
319 }
320
321 Some(())
322 }
323
324 fn parse_tag(&mut self) -> Option<()> {
325 let start = self.stream.idx;
326
327 self.stream.advance();
328 self.skip_whitespaces();
329 let cur = self.stream.current_cpy()?;
330
331 match cur {
332 b'/' => self.read_end(),
333 b'!' => {
334 self.read_markdown();
335 }
336 _ => {
337 let name = self.read_ident()?;
338 self.skip_whitespaces();
339
340 let attr = self.parse_attributes()?;
341
342 let is_self_closing = self.stream.expect_and_skip_cond(b'/');
343
344 self.stream.expect_and_skip(b'>')?;
345
346 let this = self.register_tag(Node::Tag(HTMLTag::new(
347 name.into(),
348 attr,
349 InlineVec::new(),
350 self.stream.slice(start, self.stream.idx).into(),
351 )));
352
353 self.add_to_parent(this);
354
355 if !is_self_closing && !constants::VOID_TAGS.contains(&name) {
360 self.stack.push(this);
361 }
362 }
363 };
364
365 Some(())
366 }
367
368 pub(crate) fn parse_single(&mut self) -> Option<()> {
369 loop {
370 let cur = self.stream.current()?;
371
372 if *cur == b'<' {
373 self.parse_tag();
374 } else {
375 let raw = Node::Raw(self.read_to(b'<').into());
376 let handle = self.register_tag(raw);
377 self.add_to_parent(handle);
378 }
379 }
380 }
381
382 #[inline]
384 pub fn resolve_node_id(&self, id: InnerNodeHandle) -> Option<&Node<'a>> {
385 self.tags.get(id as usize)
386 }
387
388 #[inline]
390 pub fn resolve_node_id_mut(&mut self, id: InnerNodeHandle) -> Option<&mut Node<'a>> {
391 self.tags.get_mut(id as usize)
392 }
393
394 pub(crate) fn parse(&mut self) -> Result<(), ParseError> {
395 if self.stream.len() > u32::MAX as usize {
396 return Err(ParseError::InvalidLength);
397 }
398
399 while !self.stream.is_eof() {
400 self.parse_single();
401 }
402
403 Ok(())
404 }
405}