1use std::{collections::HashMap, io, io::Read};
2
3use lssg_char_reader::CharReader;
4
5use crate::DomNode;
6
7pub fn parse_html_from_string(input: &String) -> Result<Vec<Html>, io::Error> {
8 parse_html(input.as_bytes())
9}
10
11pub fn parse_html(input: impl Read) -> Result<Vec<Html>, io::Error> {
13 let mut reader = CharReader::new(input);
14
15 let mut tokens = vec![];
16
17 loop {
18 match read_token(&mut reader)? {
19 None => break,
20 Some(t) => tokens.push(t),
21 }
22 }
23
24 let mut reduced_tokens = vec![];
26 for token in tokens.into_iter() {
27 if let Some(Html::Text { text: a }) = reduced_tokens.last_mut() {
28 if let Html::Text { text: b } = &token {
29 *a += b;
30 continue;
31 }
32 }
33 reduced_tokens.push(token)
34 }
35
36 Ok(reduced_tokens)
37}
38
/// Parses the attribute section of a start tag into a name → value map.
///
/// `start_tag_content` is the text that follows the tag name inside a start tag, for example
/// ` href="a.html" disabled`. Leading and trailing whitespace is ignored.
///
/// Parsing rules:
/// - `name="value"` and `name='value'` store `value` under `name`. A value runs until the
///   *same* quote character that opened it, so the other quote character, `=`, `>` and
///   whitespace may all appear inside it.
/// - A bare `name` (no value) is stored with an empty string as its value.
/// - Any whitespace character outside a value separates attributes.
/// - An `=` that is not immediately followed by a quote is kept as part of the name, so
///   unquoted values are not split off from their name.
/// - When a name occurs more than once, the last occurrence wins.
/// - A value whose closing quote is missing extends to the end of the input.
///
/// # Errors
///
/// This function currently never fails; the `Result` return type is kept so callers can keep
/// using `?`.
fn attributes(start_tag_content: &str) -> Result<HashMap<String, String>, io::Error> {
    let mut attributes = HashMap::new();
    let mut key = String::new();
    let mut value = String::new();
    // `Some(q)` while inside a value that was opened with the quote character `q`.
    let mut open_quote: Option<char> = None;

    let mut chars = start_tag_content.trim().chars().peekable();
    while let Some(c) = chars.next() {
        if let Some(quote) = open_quote {
            if c == quote {
                // Only the matching quote ends the value; the attribute is complete here,
                // even if the next attribute follows without a separating space.
                open_quote = None;
                let name = std::mem::take(&mut key);
                let text = std::mem::take(&mut value);
                if !name.is_empty() {
                    attributes.insert(name, text);
                }
            } else {
                value.push(c);
            }
            continue;
        }

        match c {
            c if c.is_whitespace() => {
                // End of a value-less attribute such as `disabled`.
                if !key.is_empty() {
                    attributes.insert(std::mem::take(&mut key), String::new());
                }
            }
            '=' => match chars.peek().copied() {
                Some(q) if q == '"' || q == '\'' => {
                    // Skip the opening quote and remember which one has to close the value.
                    chars.next();
                    open_quote = Some(q);
                }
                _ => key.push('='),
            },
            c => key.push(c),
        }
    }

    // Flush a trailing bare attribute or a value that was never closed.
    if !key.is_empty() {
        attributes.insert(key, value);
    }

    Ok(attributes)
}
89
90type ElementStartTag = (String, HashMap<String, String>, usize, bool);
91
92fn element_start_tag(
96 reader: &mut CharReader<impl Read>,
97) -> Result<Option<ElementStartTag>, io::Error> {
98 let mut inside_single_quotes = false;
99 let mut inside_double_quotes = false;
100 let mut i = 1;
101 while let Some(c) = reader.peek_char(i)? {
102 match c {
103 '>' if !inside_single_quotes && !inside_double_quotes => {
104 let tag_content = reader.peek_string(i + 1)?;
105
106 let mut tag = String::new();
107 for c in tag_content.chars().skip(1) {
108 match c {
109 ' ' | '\n' | '>' | '/' => break,
110 _ => tag.push(c),
111 }
112 }
113
114 let has_self_closing_slash = reader.peek_char(i - 1)? == Some('/');
116 let void_element = is_void_element(&tag);
117
118 let attributes_end = if has_self_closing_slash {
120 tag_content.len() - 2
122 } else {
123 tag_content.len() - 1
125 };
126
127 let attributes = attributes(&tag_content[tag.len() + 1..attributes_end])?;
128
129 return Ok(Some((tag, attributes, i + 1, void_element)));
130 }
131 '"' if !inside_single_quotes => inside_double_quotes = !inside_double_quotes,
132 '\'' if !inside_double_quotes => inside_single_quotes = !inside_single_quotes,
133 _ => {}
134 }
135 i += 1;
136 }
137 Ok(None)
138}
139
140fn find_matching_closing_tag(
142 reader: &mut CharReader<impl Read>,
143 tag: &str,
144 start_offset: usize,
145) -> Result<Option<usize>, io::Error> {
146 let start_tag = format!("<{}", tag);
147 let end_tag = format!("</{}>", tag);
148 let mut depth = 0;
149 let mut i = start_offset;
150 let mut in_double_quotes = false;
151 let mut in_single_quotes = false;
152
153 loop {
154 let peek_char = reader.peek_char(i)?;
156 if peek_char.is_none() {
157 return Ok(None);
158 }
159
160 let current_char = peek_char.unwrap();
161
162 match current_char {
164 '"' if !in_single_quotes => in_double_quotes = !in_double_quotes,
165 '\'' if !in_double_quotes => in_single_quotes = !in_single_quotes,
166 _ => {}
167 }
168
169 if !in_double_quotes && !in_single_quotes && current_char == '<' {
171 let start_tag_len = start_tag.len();
173 if let Ok(peek_start) = reader.peek_string_from(i, start_tag_len + 1) {
174 if peek_start.starts_with(&start_tag) {
175 if let Some(next_char) = peek_start.chars().nth(start_tag_len) {
177 if next_char == ' ' || next_char == '>' || next_char == '/' {
178 depth += 1;
179 i += start_tag_len;
180 continue;
181 }
182 }
183 }
184 }
185
186 let end_tag_len = end_tag.len();
188 if let Ok(peek_end) = reader.peek_string_from(i, end_tag_len) {
189 if peek_end == end_tag {
190 if depth == 0 {
191 return Ok(Some(i - start_offset));
192 }
193 depth -= 1;
194 i += end_tag_len;
195 continue;
196 }
197 }
198 }
199
200 i += 1;
201 }
202}
203
204type Element = (String, HashMap<String, String>, Option<String>);
205
206fn element(reader: &mut CharReader<impl Read>) -> Result<Option<Element>, io::Error> {
210 if let Some('<') = reader.peek_char(0)? {
211 if let Some((tag, attributes, tag_content_length, void_element)) =
212 element_start_tag(reader)?
213 {
214 if void_element {
216 reader.consume(tag_content_length)?;
217 return Ok(Some((tag, attributes, None)));
218 }
219
220 if let Some(content_length) =
222 find_matching_closing_tag(reader, &tag, tag_content_length)?
223 {
224 reader.consume(tag_content_length)?;
225 let content = reader.consume_string(content_length)?;
226 reader.consume(tag.len() + 3)?; return Ok(Some((tag, attributes, Some(content))));
229 }
230 }
231 }
232 Ok(None)
233}
234
235fn comment(reader: &mut CharReader<impl Read>) -> Result<Option<Html>, io::Error> {
236 if "<!--" == reader.peek_string(4)? {
237 if let Some(text) = reader.peek_until_match_exclusive_from(4, "-->")? {
238 reader.consume(4)?; let text = reader.consume_string(text.len())?;
240 reader.consume(3)?; return Ok(Some(Html::Comment { text }));
242 }
243 }
244
245 Ok(None)
246}
247
/// Reports whether `tag` is handled as a void element, i.e. one without a closing tag.
///
/// The list covers the HTML void elements this parser knows about plus a set of SVG shape and
/// helper elements. The comparison is exact and case-sensitive.
pub fn is_void_element(tag: &str) -> bool {
    const HTML_VOID_TAGS: &[&str] = &[
        "base", "img", "br", "col", "embed", "hr", "area", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    const SVG_VOID_TAGS: &[&str] = &[
        "circle", "ellipse", "line", "path", "polygon", "polyline", "rect", "stop", "use",
    ];

    HTML_VOID_TAGS.contains(&tag) || SVG_VOID_TAGS.contains(&tag)
}
259
260fn read_token(reader: &mut CharReader<impl Read>) -> Result<Option<Html>, io::Error> {
265 while let Some(c) = reader.peek_char(0)? {
266 if c == '<' {
267 if let Some(comment) = comment(reader)? {
268 return Ok(Some(comment));
269 }
270
271 if let Some((tag, attributes, content)) = element(reader)? {
272 let mut children = vec![];
273 if let Some(content) = content {
274 let mut reader = CharReader::new(content.as_bytes());
275 while let Some(html) = read_token(&mut reader)? {
276 children.push(html);
277 }
278 }
279 return Ok(Some(Html::Element {
280 tag,
281 attributes,
282 children,
283 }));
284 }
285
286 reader.consume(1)?;
288 let mut text = "<".to_string();
289 text.push_str(&reader.consume_until_exclusive(|c| c == '<')?);
290 return Ok(Some(Html::Text { text }));
291 }
292
293 let text = reader.consume_until_exclusive(|c| c == '<')?;
294 if text.chars().any(|c| c != ' ' && c != '\n') {
296 return Ok(Some(Html::Text { text }));
297 }
298 }
299
300 Ok(None)
301}
302
/// A node of the simple HTML tree produced by this module's parser.
#[derive(Debug, Clone, PartialEq)]
pub enum Html {
    /// A comment; `text` is the content between `<!--` and `-->`.
    Comment {
        text: String,
    },
    /// A run of plain text, including markup-like input that could not be parsed as an
    /// element or comment.
    Text {
        text: String,
    },
    /// An element with its tag name, attribute map and parsed child nodes.
    Element {
        tag: String,
        attributes: HashMap<String, String>,
        children: Vec<Html>,
    },
}
318
319impl From<DomNode> for Html {
320 fn from(value: DomNode) -> Self {
321 match &*value.kind() {
322 crate::DomNodeKind::Text { text } => Html::Text { text: text.clone() },
323 crate::DomNodeKind::Element { tag, attributes } => {
324 let children = value.children().map(|c| c.into()).collect();
325 Html::Element {
326 tag: tag.clone(),
327 attributes: attributes.clone(),
328 children,
329 }
330 }
331 }
332 }
333}
334
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an attribute map from `(name, value)` pairs to keep expectations short.
    pub fn to_attributes<I: IntoIterator<Item = (impl Into<String>, impl Into<String>)>>(
        arr: I,
    ) -> HashMap<String, String> {
        arr.into_iter().map(|(k, v)| (k.into(), v.into())).collect()
    }

    /// Basic elements, nesting, value-less attributes and text content.
    #[test]
    fn test_html() {
        let input = r#"<a href="test.com"><i class="fa-solid fa-rss"></i>Test</a>
<button disabled></button>"#;
        let expected = vec![
            Html::Element {
                tag: "a".into(),
                attributes: to_attributes([("href", "test.com")]),
                children: vec![
                    Html::Element {
                        tag: "i".into(),
                        attributes: to_attributes([("class", "fa-solid fa-rss")]),
                        children: vec![],
                    },
                    Html::Text {
                        text: "Test".into(),
                    },
                ],
            },
            Html::Element {
                tag: "button".into(),
                attributes: to_attributes([("disabled", "")]),
                children: vec![],
            },
        ];

        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);

        // Markdown-looking text inside an element must come through as plain text.
        let input = r#"<div>
<a href="link.com">[other](other.com)</a>
</div>"#;
        let expected = vec![Html::Element {
            tag: "div".into(),
            attributes: HashMap::new(),
            children: vec![Html::Element {
                tag: "a".into(),
                attributes: to_attributes([("href", "link.com")]),
                children: vec![Html::Text {
                    text: "[other](other.com)".into(),
                }],
            }],
        }];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// Input that merely resembles markup is returned as one merged text node.
    #[test]
    fn test_text_looks_like_html() {
        let input = r#"<Lots of people say Rust > c++. even though it might be
< then c++. Who knows?
<>
<nonclosing>
This should be text
"#;
        let expected = vec![Html::Text {
            text: "<Lots of people say Rust > c++. even though it might be
< then c++. Who knows?
<>
<nonclosing>
This should be text
"
            .into(),
        }];

        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// A `>` inside a quoted attribute value must not end the start tag.
    #[test]
    fn test_js_in_attribute() {
        let input = r#"<div onclick="() => test()"></div>"#;

        let expected = vec![Html::Element {
            tag: "div".into(),
            attributes: to_attributes([("onclick", "() => test()")]),
            children: vec![],
        }];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// Same-named nested elements are matched with the correct closing tags.
    #[test]
    fn test_nested_elements() {
        let input = r#"<div class="a">
 <div class="b">
 <div class="c">
 </div>
 </div>
 </div>
 "#;
        let expected = vec![Html::Element {
            tag: "div".into(),
            attributes: to_attributes([("class", "a")]),
            children: vec![Html::Element {
                tag: "div".into(),
                attributes: to_attributes([("class", "b")]),
                children: vec![Html::Element {
                    tag: "div".into(),
                    attributes: to_attributes([("class", "c")]),
                    children: vec![],
                }],
            }],
        }];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// A complete document: doctype as text, void elements and nested body content.
    #[test]
    fn test_full_html_document() {
        let input = r#"<!doctype html>
<html>
 <head>
 <meta content="art,simulation,technology" name="keywords" />
 <script type="module" crossorigin src="./assets/main-B0Asn3MK.js"></script>
 <link rel="modulepreload" crossorigin href="./assets/creature-BZHPYSn1.js">
 <link rel="stylesheet" crossorigin href="./assets/main-CjrOOoWN.css">
 </head>
 <body>
 <div id="messages"></div>
 <div id="debug"></div>
 <canvas id="root">Your browser does not support the HTML canvas tag.</canvas>
 <a id="qr-link" target="_blank">
 <div id="qr"></div>
 </a>
 </body>
</html>"#;
        let expected = vec![
            Html::Text {
                text: "<!doctype html>\n".into(),
            },
            Html::Element {
                tag: "html".into(),
                attributes: HashMap::new(),
                children: vec![
                    Html::Element {
                        tag: "head".into(),
                        attributes: HashMap::new(),
                        children: vec![
                            Html::Element {
                                tag: "meta".into(),
                                attributes: to_attributes([
                                    ("content", "art,simulation,technology"),
                                    ("name", "keywords"),
                                ]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "script".into(),
                                attributes: to_attributes([
                                    ("type", "module"),
                                    ("crossorigin", ""),
                                    ("src", "./assets/main-B0Asn3MK.js"),
                                ]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "link".into(),
                                attributes: to_attributes([
                                    ("rel", "modulepreload"),
                                    ("crossorigin", ""),
                                    ("href", "./assets/creature-BZHPYSn1.js"),
                                ]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "link".into(),
                                attributes: to_attributes([
                                    ("rel", "stylesheet"),
                                    ("crossorigin", ""),
                                    ("href", "./assets/main-CjrOOoWN.css"),
                                ]),
                                children: vec![],
                            },
                        ],
                    },
                    Html::Element {
                        tag: "body".into(),
                        attributes: HashMap::new(),
                        children: vec![
                            Html::Element {
                                tag: "div".into(),
                                attributes: to_attributes([("id", "messages")]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "div".into(),
                                attributes: to_attributes([("id", "debug")]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "canvas".into(),
                                attributes: to_attributes([("id", "root")]),
                                children: vec![Html::Text {
                                    text: "Your browser does not support the HTML canvas tag."
                                        .into(),
                                }],
                            },
                            Html::Element {
                                tag: "a".into(),
                                attributes: to_attributes([
                                    ("id", "qr-link"),
                                    ("target", "_blank"),
                                ]),
                                children: vec![Html::Element {
                                    tag: "div".into(),
                                    attributes: to_attributes([("id", "qr")]),
                                    children: vec![],
                                }],
                            },
                        ],
                    },
                ],
            },
        ];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// Inline SVG with a self-closing `<path/>` child.
    #[test]
    fn test_svg() {
        let input = r#"<svg xmlns="http://www.w3.org/2000/svg" width="20" viewBox="0 0 640 640" height="20"><path d="M451.5 160C434.9 160 418.8 164.5 404.7 172.7"/></svg>"#;
        let expected = vec![Html::Element {
            tag: "svg".into(),
            attributes: to_attributes([
                ("xmlns", "http://www.w3.org/2000/svg"),
                ("width", "20"),
                ("viewBox", "0 0 640 640"),
                ("height", "20"),
            ]),
            children: vec![Html::Element {
                tag: "path".into(),
                attributes: to_attributes([("d", "M451.5 160C434.9 160 418.8 164.5 404.7 172.7")]),
                children: vec![],
            }],
        }];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// Void elements parse the same with and without a trailing `/`.
    #[test]
    fn test_void_elements_with_and_without_self_closing() {
        let input = r#"<meta charset="utf-8">
<link rel="stylesheet" href="style.css">
<img src="image.jpg" alt="test">"#;
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[0], Html::Element { ref tag, .. } if tag == "meta"));
        assert!(matches!(tokens[1], Html::Element { ref tag, .. } if tag == "link"));
        assert!(matches!(tokens[2], Html::Element { ref tag, .. } if tag == "img"));

        let input = r#"<meta charset="utf-8" />
<link rel="stylesheet" href="style.css" />
<img src="image.jpg" alt="test" />"#;
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[0], Html::Element { ref tag, .. } if tag == "meta"));
        assert!(matches!(tokens[1], Html::Element { ref tag, .. } if tag == "link"));
        assert!(matches!(tokens[2], Html::Element { ref tag, .. } if tag == "img"));
    }
}