virtual_dom/
html.rs

1use std::{collections::HashMap, io, io::Read};
2
3use lssg_char_reader::CharReader;
4
5use crate::DomNode;
6
7pub fn parse_html_from_string(input: &String) -> Result<Vec<Html>, io::Error> {
8    parse_html(input.as_bytes())
9}
10
11// TODO: return DomNode directly instead of parsing to intermediary representation
12pub fn parse_html(input: impl Read) -> Result<Vec<Html>, io::Error> {
13    let mut reader = CharReader::new(input);
14
15    let mut tokens = vec![];
16
17    loop {
18        match read_token(&mut reader)? {
19            None => break,
20            Some(t) => tokens.push(t),
21        }
22    }
23
24    // add texts together
25    let mut reduced_tokens = vec![];
26    for token in tokens.into_iter() {
27        if let Some(Html::Text { text: a }) = reduced_tokens.last_mut() {
28            if let Html::Text { text: b } = &token {
29                *a += b;
30                continue;
31            }
32        }
33        reduced_tokens.push(token)
34    }
35
36    Ok(reduced_tokens)
37}
38
/// Parse the attribute text of a start tag into a key/value map.
///
/// `start_tag_content` is the text between the tag name and the closing `>` (or `/>`),
/// for example ` href="a.com" disabled`. Leading and trailing whitespace is ignored.
///
/// Rules:
/// - Attributes are separated by ASCII whitespace (space, tab, line feed, form feed,
///   carriage return).
/// - A value starts at `="` or `='` and runs until the *same* quote character is seen
///   again, so the other quote character, `=` and whitespace may appear inside a value.
/// - An attribute without a quoted value (e.g. `disabled`) maps to an empty string.
/// - An `=` that is not followed by a quote is kept as part of the key, so the unquoted
///   form `a=b` is stored as key `a=b` with an empty value.
///
/// This function currently never returns `Err`; the `Result` is kept for the callers.
fn attributes(start_tag_content: &str) -> Result<HashMap<String, String>, io::Error> {
    let mut attributes = HashMap::new();
    let mut key = String::new();
    let mut value = String::new();
    // The quote character that opened the value currently being read, if any.
    let mut open_quote: Option<char> = None;

    let mut chars = start_tag_content.trim().chars().peekable();
    while let Some(c) = chars.next() {
        // Inside a quoted value everything is literal until the matching quote.
        if let Some(quote) = open_quote {
            if c == quote {
                open_quote = None;
            } else {
                value.push(c);
            }
            continue;
        }

        match c {
            // whitespace outside a value finishes the current attribute
            c if c.is_ascii_whitespace() => {
                if !key.is_empty() {
                    attributes.insert(std::mem::take(&mut key), std::mem::take(&mut value));
                }
            }
            '=' => match chars.peek().copied() {
                Some(quote) if quote == '"' || quote == '\'' => {
                    // skip the opening quote and remember which one it was
                    chars.next();
                    open_quote = Some(quote);
                }
                // '=' not followed by a quote
                _ => key.push('='),
            },
            c => key.push(c),
        }
    }

    // the last attribute is not followed by whitespace
    if !key.is_empty() {
        attributes.insert(key, value);
    }

    Ok(attributes)
}
89
90type ElementStartTag = (String, HashMap<String, String>, usize, bool);
91
92/// Get the start tag with its attributes starts after the opening tag '<'
93///
94/// returns (tag, attributes, tag_content_length, void_element)
95fn element_start_tag(
96    reader: &mut CharReader<impl Read>,
97) -> Result<Option<ElementStartTag>, io::Error> {
98    let mut inside_single_quotes = false;
99    let mut inside_double_quotes = false;
100    let mut i = 1;
101    while let Some(c) = reader.peek_char(i)? {
102        match c {
103            '>' if !inside_single_quotes && !inside_double_quotes => {
104                let tag_content = reader.peek_string(i + 1)?;
105
106                let mut tag = String::new();
107                for c in tag_content.chars().skip(1) {
108                    match c {
109                        ' ' | '\n' | '>' | '/' => break,
110                        _ => tag.push(c),
111                    }
112                }
113
114                // Check if this is a void element (with or without self-closing /)
115                let has_self_closing_slash = reader.peek_char(i - 1)? == Some('/');
116                let void_element = is_void_element(&tag);
117
118                // Calculate attributes end position
119                let attributes_end = if has_self_closing_slash {
120                    // if it has self-closing slash, exclude the / and >
121                    tag_content.len() - 2
122                } else {
123                    // otherwise just exclude the >
124                    tag_content.len() - 1
125                };
126
127                let attributes = attributes(&tag_content[tag.len() + 1..attributes_end])?;
128
129                return Ok(Some((tag, attributes, i + 1, void_element)));
130            }
131            '"' if !inside_single_quotes => inside_double_quotes = !inside_double_quotes,
132            '\'' if !inside_double_quotes => inside_single_quotes = !inside_single_quotes,
133            _ => {}
134        }
135        i += 1;
136    }
137    Ok(None)
138}
139
140/// Find the matching closing tag while respecting nesting
141fn find_matching_closing_tag(
142    reader: &mut CharReader<impl Read>,
143    tag: &str,
144    start_offset: usize,
145) -> Result<Option<usize>, io::Error> {
146    let start_tag = format!("<{}", tag);
147    let end_tag = format!("</{}>", tag);
148    let mut depth = 0;
149    let mut i = start_offset;
150    let mut in_double_quotes = false;
151    let mut in_single_quotes = false;
152
153    loop {
154        // Try to peek ahead to see if we have more content
155        let peek_char = reader.peek_char(i)?;
156        if peek_char.is_none() {
157            return Ok(None);
158        }
159
160        let current_char = peek_char.unwrap();
161
162        // Track quote state to ignore tags inside attribute values
163        match current_char {
164            '"' if !in_single_quotes => in_double_quotes = !in_double_quotes,
165            '\'' if !in_double_quotes => in_single_quotes = !in_single_quotes,
166            _ => {}
167        }
168
169        // Only look for tags when not inside quotes
170        if !in_double_quotes && !in_single_quotes && current_char == '<' {
171            // Check if we can match the start tag at position i
172            let start_tag_len = start_tag.len();
173            if let Ok(peek_start) = reader.peek_string_from(i, start_tag_len + 1) {
174                if peek_start.starts_with(&start_tag) {
175                    // Make sure it's actually a tag (followed by space, >, or /)
176                    if let Some(next_char) = peek_start.chars().nth(start_tag_len) {
177                        if next_char == ' ' || next_char == '>' || next_char == '/' {
178                            depth += 1;
179                            i += start_tag_len;
180                            continue;
181                        }
182                    }
183                }
184            }
185
186            // Check if we can match the end tag at position i
187            let end_tag_len = end_tag.len();
188            if let Ok(peek_end) = reader.peek_string_from(i, end_tag_len) {
189                if peek_end == end_tag {
190                    if depth == 0 {
191                        return Ok(Some(i - start_offset));
192                    }
193                    depth -= 1;
194                    i += end_tag_len;
195                    continue;
196                }
197            }
198        }
199
200        i += 1;
201    }
202}
203
204type Element = (String, HashMap<String, String>, Option<String>);
205
206/// parse html from start to end and return (tag, attributes, innerHtml)
207///
208/// seperated to make logic more reusable
209fn element(reader: &mut CharReader<impl Read>) -> Result<Option<Element>, io::Error> {
210    if let Some('<') = reader.peek_char(0)? {
211        if let Some((tag, attributes, tag_content_length, void_element)) =
212            element_start_tag(reader)?
213        {
214            // <{start_tag}/>
215            if void_element {
216                reader.consume(tag_content_length)?;
217                return Ok(Some((tag, attributes, None)));
218            }
219
220            // <{start_tag}>{content}</{start_tag}>
221            if let Some(content_length) =
222                find_matching_closing_tag(reader, &tag, tag_content_length)?
223            {
224                reader.consume(tag_content_length)?;
225                let content = reader.consume_string(content_length)?;
226                reader.consume(tag.len() + 3)?; // </{tag}>
227
228                return Ok(Some((tag, attributes, Some(content))));
229            }
230        }
231    }
232    Ok(None)
233}
234
235fn comment(reader: &mut CharReader<impl Read>) -> Result<Option<Html>, io::Error> {
236    if "<!--" == reader.peek_string(4)? {
237        if let Some(text) = reader.peek_until_match_exclusive_from(4, "-->")? {
238            reader.consume(4)?; // skip start
239            let text = reader.consume_string(text.len())?;
240            reader.consume(3)?; // skip end
241            return Ok(Some(Html::Comment { text }));
242        }
243    }
244
245    Ok(None)
246}
247
/// Report whether `tag` names a void element, i.e. one that can not have children
/// and therefore has no closing tag.
///
/// Besides the HTML void elements, a set of SVG shape-like elements is treated the
/// same way. The comparison is case-sensitive.
pub fn is_void_element(tag: &str) -> bool {
    matches!(
        tag,
        // HTML void elements
        "base"
            | "img"
            | "br"
            | "col"
            | "embed"
            | "hr"
            | "area"
            | "input"
            | "link"
            | "meta"
            | "param"
            | "source"
            | "track"
            | "wbr"
            // SVG void-like elements
            | "circle"
            | "ellipse"
            | "line"
            | "path"
            | "polygon"
            | "polyline"
            | "rect"
            | "stop"
            | "use"
    )
}
259
260/// A "simple" streaming html parser function. This is a fairly simplified way of parsing html
261/// ignoring a lot of edge cases and validation normally seen when parsing html.
262///
263/// **NOTE: Might return multiple Text tokens one after another.**
264fn read_token(reader: &mut CharReader<impl Read>) -> Result<Option<Html>, io::Error> {
265    while let Some(c) = reader.peek_char(0)? {
266        if c == '<' {
267            if let Some(comment) = comment(reader)? {
268                return Ok(Some(comment));
269            }
270
271            if let Some((tag, attributes, content)) = element(reader)? {
272                let mut children = vec![];
273                if let Some(content) = content {
274                    let mut reader = CharReader::new(content.as_bytes());
275                    while let Some(html) = read_token(&mut reader)? {
276                        children.push(html);
277                    }
278                }
279                return Ok(Some(Html::Element {
280                    tag,
281                    attributes,
282                    children,
283                }));
284            }
285
286            // non html opening
287            reader.consume(1)?;
288            let mut text = "<".to_string();
289            text.push_str(&reader.consume_until_exclusive(|c| c == '<')?);
290            return Ok(Some(Html::Text { text }));
291        }
292
293        let text = reader.consume_until_exclusive(|c| c == '<')?;
294        // only valid text if it contains a non whitespace character
295        if text.chars().any(|c| c != ' ' && c != '\n') {
296            return Ok(Some(Html::Text { text }));
297        }
298    }
299
300    Ok(None)
301}
302
/// Simple parsed html representation with recursively added children
#[derive(Debug, Clone, PartialEq)]
pub enum Html {
    /// An html comment; `text` is the content between `<!--` and `-->`.
    Comment {
        text: String,
    },
    /// A run of text.
    Text {
        text: String,
    },
    /// An element with its tag name, attributes and parsed child nodes.
    /// Attributes without a value map to an empty string.
    Element {
        tag: String,
        attributes: HashMap<String, String>,
        children: Vec<Html>,
    },
}
318
319impl From<DomNode> for Html {
320    fn from(value: DomNode) -> Self {
321        match &*value.kind() {
322            crate::DomNodeKind::Text { text } => Html::Text { text: text.clone() },
323            crate::DomNodeKind::Element { tag, attributes } => {
324                let children = value.children().map(|c| c.into()).collect();
325                Html::Element {
326                    tag: tag.clone(),
327                    attributes: attributes.clone(),
328                    children,
329                }
330            }
331        }
332    }
333}
334
#[cfg(test)]
mod tests {
    use super::*;

    /// Utility function to convert an iterable of key/value pairs into an attributes hashmap
    pub fn to_attributes<I: IntoIterator<Item = (impl Into<String>, impl Into<String>)>>(
        arr: I,
    ) -> HashMap<String, String> {
        arr.into_iter().map(|(k, v)| (k.into(), v.into())).collect()
    }

    /// Elements with attributes, nested children, a value-less attribute and text content.
    #[test]
    fn test_html() {
        let input = r#"<a href="test.com"><i class="fa-solid fa-rss"></i>Test</a>
<button disabled></button>"#;
        let expected = vec![
            Html::Element {
                tag: "a".into(),
                attributes: to_attributes([("href", "test.com")]),
                children: vec![
                    Html::Element {
                        tag: "i".into(),
                        attributes: to_attributes([("class", "fa-solid fa-rss")]),
                        children: vec![],
                    },
                    Html::Text {
                        text: "Test".into(),
                    },
                ],
            },
            Html::Element {
                tag: "button".into(),
                // an attribute without a value maps to an empty string
                attributes: to_attributes([("disabled", "")]),
                children: vec![],
            },
        ];

        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);

        // whitespace-only text between elements is dropped; markdown-like text is kept as is
        let input = r#"<div>
<a href="link.com">[other](other.com)</a>
</div>"#;
        let expected = vec![Html::Element {
            tag: "div".into(),
            attributes: HashMap::new(),
            children: vec![Html::Element {
                tag: "a".into(),
                attributes: to_attributes([("href", "link.com")]),
                children: vec![Html::Text {
                    text: "[other](other.com)".into(),
                }],
            }],
        }];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// Input containing `<` and `>` but no matching closing tags must come back as a
    /// single, unmodified text node.
    #[test]
    fn test_text_looks_like_html() {
        let input = r#"<Lots of people say Rust > c++. even though it might be
< then c++. Who knows? 
<>
<nonclosing>
This should be text
"#;
        let expected = vec![Html::Text {
            text: "<Lots of people say Rust > c++. even though it might be
< then c++. Who knows? 
<>
<nonclosing>
This should be text
"
            .into(),
        }];

        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// A `>` inside a quoted attribute value must not end the start tag.
    #[test]
    fn test_js_in_attribute() {
        let input = r#"<div onclick="() => test()"></div>"#;

        let expected = vec![Html::Element {
            tag: "div".into(),
            attributes: to_attributes([("onclick", "() => test()")]),
            children: vec![],
        }];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// Nested elements with the same tag name are matched to the correct closing tags.
    #[test]
    fn test_nested_elements() {
        let input = r#"<div class="a">
            <div class="b">
                <div class="c">
                </div>
            </div>
        </div>
        "#;
        let expected = vec![Html::Element {
            tag: "div".into(),
            attributes: to_attributes([("class", "a")]),
            children: vec![Html::Element {
                tag: "div".into(),
                attributes: to_attributes([("class", "b")]),
                children: vec![Html::Element {
                    tag: "div".into(),
                    attributes: to_attributes([("class", "c")]),
                    children: vec![],
                }],
            }],
        }];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// A complete document: the doctype is returned as text, void elements are parsed
    /// with and without a self-closing slash.
    #[test]
    fn test_full_html_document() {
        let input = r#"<!doctype html>
<html>
  <head>
    <meta content="art,simulation,technology" name="keywords" />
    <script type="module" crossorigin src="./assets/main-B0Asn3MK.js"></script>
    <link rel="modulepreload" crossorigin href="./assets/creature-BZHPYSn1.js">
    <link rel="stylesheet" crossorigin href="./assets/main-CjrOOoWN.css">
  </head>
  <body>
    <div id="messages"></div>
    <div id="debug"></div>
    <canvas id="root">Your browser does not support the HTML canvas tag.</canvas>
    <a id="qr-link" target="_blank">
      <div id="qr"></div>
    </a>
  </body>
</html>"#;
        let expected = vec![
            Html::Text {
                text: "<!doctype html>\n".into(),
            },
            Html::Element {
                tag: "html".into(),
                attributes: HashMap::new(),
                children: vec![
                    Html::Element {
                        tag: "head".into(),
                        attributes: HashMap::new(),
                        children: vec![
                            Html::Element {
                                tag: "meta".into(),
                                attributes: to_attributes([
                                    ("content", "art,simulation,technology"),
                                    ("name", "keywords"),
                                ]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "script".into(),
                                attributes: to_attributes([
                                    ("type", "module"),
                                    ("crossorigin", ""),
                                    ("src", "./assets/main-B0Asn3MK.js"),
                                ]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "link".into(),
                                attributes: to_attributes([
                                    ("rel", "modulepreload"),
                                    ("crossorigin", ""),
                                    ("href", "./assets/creature-BZHPYSn1.js"),
                                ]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "link".into(),
                                attributes: to_attributes([
                                    ("rel", "stylesheet"),
                                    ("crossorigin", ""),
                                    ("href", "./assets/main-CjrOOoWN.css"),
                                ]),
                                children: vec![],
                            },
                        ],
                    },
                    Html::Element {
                        tag: "body".into(),
                        attributes: HashMap::new(),
                        children: vec![
                            Html::Element {
                                tag: "div".into(),
                                attributes: to_attributes([("id", "messages")]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "div".into(),
                                attributes: to_attributes([("id", "debug")]),
                                children: vec![],
                            },
                            Html::Element {
                                tag: "canvas".into(),
                                attributes: to_attributes([("id", "root")]),
                                children: vec![Html::Text {
                                    text: "Your browser does not support the HTML canvas tag."
                                        .into(),
                                }],
                            },
                            Html::Element {
                                tag: "a".into(),
                                attributes: to_attributes([
                                    ("id", "qr-link"),
                                    ("target", "_blank"),
                                ]),
                                children: vec![Html::Element {
                                    tag: "div".into(),
                                    attributes: to_attributes([("id", "qr")]),
                                    children: vec![],
                                }],
                            },
                        ],
                    },
                ],
            },
        ];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// An inline SVG whose `path` child is self-closing and has no closing tag.
    #[test]
    fn test_svg() {
        let input = r#"<svg xmlns="http://www.w3.org/2000/svg" width="20" viewBox="0 0 640 640" height="20"><path d="M451.5 160C434.9 160 418.8 164.5 404.7 172.7"/></svg>"#;
        let expected = vec![Html::Element {
            tag: "svg".into(),
            attributes: to_attributes([
                ("xmlns", "http://www.w3.org/2000/svg"),
                ("width", "20"),
                ("viewBox", "0 0 640 640"),
                ("height", "20"),
            ]),
            children: vec![Html::Element {
                tag: "path".into(),
                attributes: to_attributes([("d", "M451.5 160C434.9 160 418.8 164.5 404.7 172.7")]),
                children: vec![],
            }],
        }];
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(expected, tokens);
    }

    /// Void elements parse to one element each, whether or not they carry a trailing `/`.
    #[test]
    fn test_void_elements_with_and_without_self_closing() {
        // Void elements without self-closing slash (HTML5 style)
        let input = r#"<meta charset="utf-8">
<link rel="stylesheet" href="style.css">
<img src="image.jpg" alt="test">"#;
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[0], Html::Element { ref tag, .. } if tag == "meta"));
        assert!(matches!(tokens[1], Html::Element { ref tag, .. } if tag == "link"));
        assert!(matches!(tokens[2], Html::Element { ref tag, .. } if tag == "img"));

        // Void elements with self-closing slash (XHTML style)
        let input = r#"<meta charset="utf-8" />
<link rel="stylesheet" href="style.css" />
<img src="image.jpg" alt="test" />"#;
        let tokens = parse_html(input.as_bytes()).unwrap();
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[0], Html::Element { ref tag, .. } if tag == "meta"));
        assert!(matches!(tokens[1], Html::Element { ref tag, .. } if tag == "link"));
        assert!(matches!(tokens[2], Html::Element { ref tag, .. } if tag == "img"));
    }
}