1use std::{collections::HashMap, io, io::Read};
2
3use lssg_char_reader::CharReader;
4
5use crate::DomNode;
6
7pub fn parse_html_from_string(input: &String) -> Result<Vec<Html>, io::Error> {
8 parse_html(input.as_bytes())
9}
10
11pub fn parse_html(input: impl Read) -> Result<Vec<Html>, io::Error> {
13 let mut reader = CharReader::new(input);
14
15 let mut tokens = vec![];
16
17 loop {
18 match read_token(&mut reader)? {
19 None => break,
20 Some(t) => tokens.push(t),
21 }
22 }
23
24 let mut reduced_tokens = vec![];
26 for token in tokens.into_iter() {
27 if let Some(Html::Text { text: a }) = reduced_tokens.last_mut() {
28 if let Html::Text { text: b } = &token {
29 *a += b;
30 continue;
31 }
32 }
33 reduced_tokens.push(token)
34 }
35
36 Ok(reduced_tokens)
37}
38
/// Parses the attribute section of a start tag into a name → value map.
///
/// `start_tag_content` is the text that follows the tag name inside a start
/// tag, e.g. ` href="a.html" disabled`. Surrounding whitespace is trimmed.
///
/// Supported forms:
/// - `name` (no value) → stored with an empty string as value,
/// - `name="value"` / `name='value'` → everything up to the matching quote is
///   the value, including `=`, whitespace and the other quote character,
/// - `name=value` (unquoted) → the value runs until the next ASCII whitespace,
/// - ASCII whitespace (space, tab, LF, FF, CR) is allowed around `=` and
///   separates attributes.
///
/// When a name occurs more than once the last occurrence wins. An unterminated
/// quoted value extends to the end of the input.
///
/// # Errors
///
/// Currently never fails; the `Result` is kept so callers can use `?`.
fn attributes(start_tag_content: &str) -> Result<HashMap<String, String>, io::Error> {
    let chars: Vec<char> = start_tag_content.trim().chars().collect();
    let len = chars.len();
    let mut attributes = HashMap::new();
    let mut i = 0;

    while i < len {
        // Skip the whitespace that separates attributes.
        if chars[i].is_ascii_whitespace() {
            i += 1;
            continue;
        }

        // Attribute name: runs until whitespace or `=`. The first character is
        // always taken, so a stray leading `=` becomes part of the name and the
        // loop is guaranteed to make progress on malformed input.
        let name_start = i;
        i += 1;
        while i < len && chars[i] != '=' && !chars[i].is_ascii_whitespace() {
            i += 1;
        }
        let key: String = chars[name_start..i].iter().collect();

        // Look past optional whitespace for the `=` that introduces a value.
        let mut j = i;
        while j < len && chars[j].is_ascii_whitespace() {
            j += 1;
        }
        if j >= len || chars[j] != '=' {
            // Boolean attribute such as `disabled`.
            attributes.insert(key, String::new());
            continue;
        }
        j += 1;
        while j < len && chars[j].is_ascii_whitespace() {
            j += 1;
        }

        let value: String = match chars.get(j).copied() {
            Some(quote) if quote == '"' || quote == '\'' => {
                // Quoted value: only the matching quote character ends it.
                let value_start = j + 1;
                j = value_start;
                while j < len && chars[j] != quote {
                    j += 1;
                }
                let value = chars[value_start..j].iter().collect();
                // Step over the closing quote when there is one.
                if j < len {
                    j += 1;
                }
                value
            }
            _ => {
                // Unquoted value: ends at the next whitespace (or end of input).
                let value_start = j;
                while j < len && !chars[j].is_ascii_whitespace() {
                    j += 1;
                }
                chars[value_start..j].iter().collect()
            }
        };

        attributes.insert(key, value);
        i = j;
    }

    Ok(attributes)
}
94
95type ElementStartTag = (String, HashMap<String, String>, usize, bool);
96
97fn element_start_tag(
101 reader: &mut CharReader<impl Read>,
102) -> Result<Option<ElementStartTag>, io::Error> {
103 let mut inside_single_quotes = false;
104 let mut inside_double_quotes = false;
105 let mut i = 1;
106 while let Some(c) = reader.peek_char(i)? {
107 match c {
108 '>' if !inside_single_quotes && !inside_double_quotes => {
109 let tag_content = reader.peek_string(i + 1)?;
110
111 let mut tag = String::new();
112 for c in tag_content.chars().skip(1) {
113 match c {
114 ' ' | '\n' | '>' | '/' => break,
115 _ => tag.push(c),
116 }
117 }
118
119 let has_self_closing_slash = reader.peek_char(i - 1)? == Some('/');
121 let void_element = is_void_element(&tag);
122
123 let attributes_end = if has_self_closing_slash {
125 tag_content.len() - 2
127 } else {
128 tag_content.len() - 1
130 };
131
132 let attributes = attributes(&tag_content[tag.len() + 1..attributes_end])?;
133
134 return Ok(Some((tag, attributes, i + 1, void_element)));
135 }
136 '"' if !inside_single_quotes => inside_double_quotes = !inside_double_quotes,
137 '\'' if !inside_double_quotes => inside_single_quotes = !inside_single_quotes,
138 _ => {}
139 }
140 i += 1;
141 }
142 Ok(None)
143}
144
145fn find_matching_closing_tag(
147 reader: &mut CharReader<impl Read>,
148 tag: &str,
149 start_offset: usize,
150) -> Result<Option<usize>, io::Error> {
151 let start_tag = format!("<{}", tag);
152 let end_tag = format!("</{}>", tag);
153 let mut depth = 0;
154 let mut i = start_offset;
155 let mut in_double_quotes = false;
156 let mut in_single_quotes = false;
157
158 loop {
159 let peek_char = reader.peek_char(i)?;
161 if peek_char.is_none() {
162 return Ok(None);
163 }
164
165 let current_char = peek_char.unwrap();
166
167 match current_char {
169 '"' if !in_single_quotes => in_double_quotes = !in_double_quotes,
170 '\'' if !in_double_quotes => in_single_quotes = !in_single_quotes,
171 _ => {}
172 }
173
174 if !in_double_quotes && !in_single_quotes && current_char == '<' {
176 let start_tag_len = start_tag.len();
178 if let Ok(peek_start) = reader.peek_string_from(i, start_tag_len + 1) {
179 if peek_start.starts_with(&start_tag) {
180 if let Some(next_char) = peek_start.chars().nth(start_tag_len) {
182 if next_char == ' ' || next_char == '>' || next_char == '/' {
183 depth += 1;
184 i += start_tag_len;
185 continue;
186 }
187 }
188 }
189 }
190
191 let end_tag_len = end_tag.len();
193 if let Ok(peek_end) = reader.peek_string_from(i, end_tag_len) {
194 if peek_end == end_tag {
195 if depth == 0 {
196 return Ok(Some(i - start_offset));
197 }
198 depth -= 1;
199 i += end_tag_len;
200 continue;
201 }
202 }
203 }
204
205 i += 1;
206 }
207}
208
209type Element = (String, HashMap<String, String>, Option<String>);
210
211fn element(reader: &mut CharReader<impl Read>) -> Result<Option<Element>, io::Error> {
215 if let Some('<') = reader.peek_char(0)? {
216 if let Some((tag, attributes, tag_content_length, void_element)) =
217 element_start_tag(reader)?
218 {
219 if void_element {
221 reader.consume(tag_content_length)?;
222 return Ok(Some((tag, attributes, None)));
223 }
224
225 if let Some(content_length) =
227 find_matching_closing_tag(reader, &tag, tag_content_length)?
228 {
229 reader.consume(tag_content_length)?;
230 let content = reader.consume_string(content_length)?;
231 reader.consume(tag.len() + 3)?; return Ok(Some((tag, attributes, Some(content))));
234 }
235 }
236 }
237 Ok(None)
238}
239
240fn comment(reader: &mut CharReader<impl Read>) -> Result<Option<Html>, io::Error> {
241 if "<!--" == reader.peek_string(4)? {
242 if let Some(text) = reader.peek_until_match_exclusive_from(4, "-->")? {
243 reader.consume(4)?; let text = reader.consume_string(text.len())?;
245 reader.consume(3)?; return Ok(Some(Html::Comment { text }));
247 }
248 }
249
250 Ok(None)
251}
252
/// Returns `true` when `tag` names an element that this parser treats as void,
/// i.e. one that has no content and no end tag.
///
/// The list covers the HTML void elements plus a number of SVG elements that
/// are handled the same way. The comparison is case-sensitive.
pub fn is_void_element(tag: &str) -> bool {
    matches!(
        tag,
        // HTML
        "base"
            | "img"
            | "br"
            | "col"
            | "embed"
            | "hr"
            | "area"
            | "input"
            | "link"
            | "meta"
            | "param"
            | "source"
            | "track"
            | "wbr"
            // SVG
            | "circle"
            | "ellipse"
            | "line"
            | "path"
            | "polygon"
            | "polyline"
            | "rect"
            | "stop"
            | "use"
    )
}
264
265fn read_token(reader: &mut CharReader<impl Read>) -> Result<Option<Html>, io::Error> {
270 while let Some(c) = reader.peek_char(0)? {
271 if c == '<' {
272 if let Some(comment) = comment(reader)? {
273 return Ok(Some(comment));
274 }
275
276 if let Some((tag, attributes, content)) = element(reader)? {
277 let mut children = vec![];
278 if let Some(content) = content {
279 let mut reader = CharReader::new(content.as_bytes());
280 while let Some(html) = read_token(&mut reader)? {
281 children.push(html);
282 }
283 }
284 return Ok(Some(Html::Element {
285 tag,
286 attributes,
287 children,
288 }));
289 }
290
291 reader.consume(1)?;
293 let mut text = "<".to_string();
294 text.push_str(&reader.consume_until_exclusive(|c| c == '<')?);
295 return Ok(Some(Html::Text { text }));
296 }
297
298 let text = reader.consume_until_exclusive(|c| c == '<')?;
299 if text.chars().any(|c| c != ' ' && c != '\n') {
301 return Ok(Some(Html::Text { text }));
302 }
303 }
304
305 Ok(None)
306}
307
/// A node of the parsed HTML tree.
#[derive(Debug, Clone, PartialEq)]
pub enum Html {
    /// An HTML comment; `text` is the body between `<!--` and `-->`.
    Comment {
        text: String,
    },
    /// A run of text, including markup-like input that could not be parsed
    /// as a comment or an element.
    Text {
        text: String,
    },
    /// An element with its tag name, attributes and parsed child nodes.
    /// Void elements have no children.
    Element {
        tag: String,
        attributes: HashMap<String, String>,
        children: Vec<Html>,
    },
}
323
324impl From<DomNode> for Html {
325 fn from(value: DomNode) -> Self {
326 match &*value.kind() {
327 crate::DomNodeKind::Text { text } => Html::Text { text: text.clone() },
328 crate::DomNodeKind::Element { tag, attributes } => {
329 let children = value.children().map(|c| c.into()).collect();
330 Html::Element {
331 tag: tag.clone(),
332 attributes: attributes.clone(),
333 children,
334 }
335 }
336 }
337 }
338}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an attribute map from `(name, value)` pairs.
    pub fn to_attributes<I: IntoIterator<Item = (impl Into<String>, impl Into<String>)>>(
        arr: I,
    ) -> HashMap<String, String> {
        arr.into_iter().map(|(k, v)| (k.into(), v.into())).collect()
    }

    /// Elements with attributes, nested children and boolean attributes.
    #[test]
    fn test_html() {
        let input = r#"<a href="test.com"><i class="fa-solid fa-rss"></i>Test</a>
<button disabled></button>"#;
        let expected = vec![
            Html::Element {
                tag: "a".into(),
                attributes: to_attributes([("href", "test.com")]),
                children: vec![
                    Html::Element {
                        tag: "i".into(),
                        attributes: to_attributes([("class", "fa-solid fa-rss")]),
                        children: vec![],
                    },
                    Html::Text {
                        text: "Test".into(),
                    },
                ],
            },
            Html::Element {
                tag: "button".into(),
                attributes: to_attributes([("disabled", "")]),
                children: vec![],
            },
        ];

        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);

        // Markdown-looking text inside an element stays plain text.
        let input = r#"<div>
<a href="link.com">[other](other.com)</a>
</div>"#;
        let expected = vec![Html::Element {
            tag: "div".into(),
            attributes: HashMap::new(),
            children: vec![Html::Element {
                tag: "a".into(),
                attributes: to_attributes([("href", "link.com")]),
                children: vec![Html::Text {
                    text: "[other](other.com)".into(),
                }],
            }],
        }];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// Input that merely resembles markup must come back as one text node.
    #[test]
    fn test_text_looks_like_html() {
        let input = r#"<Lots of people say Rust > c++. even though it might be
< then c++. Who knows?
<>
<nonclosing>
This should be text
"#;
        let expected = vec![Html::Text {
            text: "<Lots of people say Rust > c++. even though it might be
< then c++. Who knows?
<>
<nonclosing>
This should be text
"
            .into(),
        }];

        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// A `>` inside a quoted attribute value must not end the start tag.
    #[test]
    fn test_js_in_attribute() {
        let input = r#"<div onclick="() => test()"></div>"#;

        let expected = vec![Html::Element {
            tag: "div".into(),
            attributes: to_attributes([("onclick", "() => test()")]),
            children: vec![],
        }];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// Nested elements of the same name are matched with the right end tag.
    #[test]
    fn test_nested_elements() {
        let input = r#"<div class="a">
 <div class="b">
 <div class="c">
 </div>
 </div>
 </div>
 "#;
        let expected = vec![Html::Element {
            tag: "div".into(),
            attributes: to_attributes([("class", "a")]),
            children: vec![Html::Element {
                tag: "div".into(),
                attributes: to_attributes([("class", "b")]),
                children: vec![Html::Element {
                    tag: "div".into(),
                    attributes: to_attributes([("class", "c")]),
                    children: vec![],
                }],
            }],
        }];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// A complete document, including a doctype and void elements.
    #[test]
    fn test_full_html_document() {
        let input = r#"<!doctype html>
<html>
 <head>
 <meta content="art,simulation,technology" name="keywords" />
 <script type="module" crossorigin src="./assets/main-B0Asn3MK.js"></script>
 <link rel="modulepreload" crossorigin href="./assets/creature-BZHPYSn1.js">
 <link rel="stylesheet" crossorigin href="./assets/main-CjrOOoWN.css">
 </head>
 <body>
 <div id="messages"></div>
 <div id="debug"></div>
 <canvas id="root">Your browser does not support the HTML canvas tag.</canvas>
 <a id="qr-link" target="_blank">
 <div id="qr"></div>
 </a>
 </body>
</html>"#;
        let expected = vec![
            Html::Text {
                text: "<!doctype html>\n".into(),
            },
            Html::Element {
                tag: "html".into(),
                attributes: HashMap::new(),
                children: vec![
                    Html::Element {
                        tag: "head".into(),
                        attributes: HashMap::new(),
                        children: vec![
                            Html::Element {
                                tag: "meta".into(),
                                attributes: to_attributes([
                                    ("content", "art,simulation,technology"),
                                    ("name", "keywords"),
                                ]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "script".into(),
                                attributes: to_attributes([
                                    ("type", "module"),
                                    ("crossorigin", ""),
                                    ("src", "./assets/main-B0Asn3MK.js"),
                                ]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "link".into(),
                                attributes: to_attributes([
                                    ("rel", "modulepreload"),
                                    ("crossorigin", ""),
                                    ("href", "./assets/creature-BZHPYSn1.js"),
                                ]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "link".into(),
                                attributes: to_attributes([
                                    ("rel", "stylesheet"),
                                    ("crossorigin", ""),
                                    ("href", "./assets/main-CjrOOoWN.css"),
                                ]),
                                children: vec![],
                            },
                        ],
                    },
                    Html::Element {
                        tag: "body".into(),
                        attributes: HashMap::new(),
                        children: vec![
                            Html::Element {
                                tag: "div".into(),
                                attributes: to_attributes([("id", "messages")]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "div".into(),
                                attributes: to_attributes([("id", "debug")]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "canvas".into(),
                                attributes: to_attributes([("id", "root")]),
                                children: vec![Html::Text {
                                    text: "Your browser does not support the HTML canvas tag."
                                        .into(),
                                }],
                            },
                            Html::Element {
                                tag: "a".into(),
                                attributes: to_attributes([
                                    ("id", "qr-link"),
                                    ("target", "_blank"),
                                ]),
                                children: vec![Html::Element {
                                    tag: "div".into(),
                                    attributes: to_attributes([("id", "qr")]),
                                    children: vec![],
                                }],
                            },
                        ],
                    },
                ],
            },
        ];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// Self-closing SVG children are parsed as childless elements.
    #[test]
    fn test_svg() {
        let input = r#"<svg xmlns="http://www.w3.org/2000/svg" width="20" viewBox="0 0 640 640" height="20"><path d="M451.5 160C434.9 160 418.8 164.5 404.7 172.7"/></svg>"#;
        let expected = vec![Html::Element {
            tag: "svg".into(),
            attributes: to_attributes([
                ("xmlns", "http://www.w3.org/2000/svg"),
                ("width", "20"),
                ("viewBox", "0 0 640 640"),
                ("height", "20"),
            ]),
            children: vec![Html::Element {
                tag: "path".into(),
                attributes: to_attributes([("d", "M451.5 160C434.9 160 418.8 164.5 404.7 172.7")]),
                children: vec![],
            }],
        }];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// Void elements parse the same with and without a trailing `/`.
    #[test]
    fn test_void_elements_with_and_without_self_closing() {
        let input = r#"<meta charset="utf-8">
<link rel="stylesheet" href="style.css">
<img src="image.jpg" alt="test">"#;
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[0], Html::Element { ref tag, .. } if tag == "meta"));
        assert!(matches!(tokens[1], Html::Element { ref tag, .. } if tag == "link"));
        assert!(matches!(tokens[2], Html::Element { ref tag, .. } if tag == "img"));

        let input = r#"<meta charset="utf-8" />
<link rel="stylesheet" href="style.css" />
<img src="image.jpg" alt="test" />"#;
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[0], Html::Element { ref tag, .. } if tag == "meta"));
        assert!(matches!(tokens[1], Html::Element { ref tag, .. } if tag == "link"));
        assert!(matches!(tokens[2], Html::Element { ref tag, .. } if tag == "img"));
    }
}