Skip to main content

virtual_dom/
html.rs

1use std::{collections::HashMap, io, io::Read};
2
3use lssg_char_reader::CharReader;
4
5use crate::DomNode;
6
7pub fn parse_html_from_string(input: &String) -> Result<Vec<Html>, io::Error> {
8    parse_html(input.as_bytes())
9}
10
11// TODO: return DomNode directly instead of parsing to intermediary representation
12pub fn parse_html(input: impl Read) -> Result<Vec<Html>, io::Error> {
13    let mut reader = CharReader::new(input);
14
15    let mut tokens = vec![];
16
17    loop {
18        match read_token(&mut reader)? {
19            None => break,
20            Some(t) => tokens.push(t),
21        }
22    }
23
24    // add texts together
25    let mut reduced_tokens = vec![];
26    for token in tokens.into_iter() {
27        if let Some(Html::Text { text: a }) = reduced_tokens.last_mut() {
28            if let Html::Text { text: b } = &token {
29                *a += b;
30                continue;
31            }
32        }
33        reduced_tokens.push(token)
34    }
35
36    Ok(reduced_tokens)
37}
38
/// Parse the attribute section of a start tag (everything between the tag name and the closing
/// `>` or `/>`) into a key/value map.
///
/// * `key="value"` and `key='value'` store `value` without the surrounding quotes.
/// * A bare `key` (boolean attribute) is stored with an empty string as value.
/// * Attributes are separated by ASCII whitespace (space, tab, CR, LF, form feed).
/// * Inside a quoted value every character is literal until the matching quote character, so
///   values such as `src="data:abc=="` or `onclick="x='y'"` are kept intact.
/// * An `=` that is not directly followed by a quote is kept as part of the key, i.e. unquoted
///   values (`width=20`) are not split into key and value.
///
/// The function currently never fails; the `Result` is kept for the callers' `?` chains.
fn attributes(start_tag_content: &str) -> Result<HashMap<String, String>, io::Error> {
    let mut attributes = HashMap::new();
    let mut key = String::new();
    let mut value = String::new();
    // `Some(q)` while reading a value that was opened with quote character `q`
    let mut quote_char: Option<char> = None;

    // remove whitespace before and after text
    let mut chars = start_tag_content.trim().chars().peekable();
    while let Some(c) = chars.next() {
        match quote_char {
            // the matching quote closes the value; the pair is stored at the next separator
            Some(quote) if c == quote => quote_char = None,
            // anything else inside the quotes is literal, including '=' and the other quote
            Some(_) => value.push(c),
            None if c.is_ascii_whitespace() => {
                if !key.is_empty() {
                    attributes.insert(std::mem::take(&mut key), std::mem::take(&mut value));
                }
            }
            None if c == '=' => match chars.peek() {
                Some(&quote @ '"') | Some(&quote @ '\'') => {
                    // skip the opening quote, it is not part of the value
                    chars.next();
                    quote_char = Some(quote);
                }
                // '=' not followed by a quote
                _ => key.push('='),
            },
            None => key.push(c),
        }
    }
    if !key.is_empty() {
        attributes.insert(key, value);
    }

    Ok(attributes)
}
94
95type ElementStartTag = (String, HashMap<String, String>, usize, bool);
96
97/// Get the start tag with its attributes starts after the opening tag '<'
98///
99/// returns (tag, attributes, tag_content_length, void_element)
100fn element_start_tag(
101    reader: &mut CharReader<impl Read>,
102) -> Result<Option<ElementStartTag>, io::Error> {
103    let mut inside_single_quotes = false;
104    let mut inside_double_quotes = false;
105    let mut i = 1;
106    while let Some(c) = reader.peek_char(i)? {
107        match c {
108            '>' if !inside_single_quotes && !inside_double_quotes => {
109                let tag_content = reader.peek_string(i + 1)?;
110
111                let mut tag = String::new();
112                for c in tag_content.chars().skip(1) {
113                    match c {
114                        ' ' | '\n' | '>' | '/' => break,
115                        _ => tag.push(c),
116                    }
117                }
118
119                // Check if this is a void element (with or without self-closing /)
120                let has_self_closing_slash = reader.peek_char(i - 1)? == Some('/');
121                let void_element = is_void_element(&tag);
122
123                // Calculate attributes end position
124                let attributes_end = if has_self_closing_slash {
125                    // if it has self-closing slash, exclude the / and >
126                    tag_content.len() - 2
127                } else {
128                    // otherwise just exclude the >
129                    tag_content.len() - 1
130                };
131
132                let attributes = attributes(&tag_content[tag.len() + 1..attributes_end])?;
133
134                return Ok(Some((tag, attributes, i + 1, void_element)));
135            }
136            '"' if !inside_single_quotes => inside_double_quotes = !inside_double_quotes,
137            '\'' if !inside_double_quotes => inside_single_quotes = !inside_single_quotes,
138            _ => {}
139        }
140        i += 1;
141    }
142    Ok(None)
143}
144
145/// Find the matching closing tag while respecting nesting
146fn find_matching_closing_tag(
147    reader: &mut CharReader<impl Read>,
148    tag: &str,
149    start_offset: usize,
150) -> Result<Option<usize>, io::Error> {
151    let start_tag = format!("<{}", tag);
152    let end_tag = format!("</{}>", tag);
153    let mut depth = 0;
154    let mut i = start_offset;
155    let mut in_double_quotes = false;
156    let mut in_single_quotes = false;
157
158    loop {
159        // Try to peek ahead to see if we have more content
160        let peek_char = reader.peek_char(i)?;
161        if peek_char.is_none() {
162            return Ok(None);
163        }
164
165        let current_char = peek_char.unwrap();
166
167        // Track quote state to ignore tags inside attribute values
168        match current_char {
169            '"' if !in_single_quotes => in_double_quotes = !in_double_quotes,
170            '\'' if !in_double_quotes => in_single_quotes = !in_single_quotes,
171            _ => {}
172        }
173
174        // Only look for tags when not inside quotes
175        if !in_double_quotes && !in_single_quotes && current_char == '<' {
176            // Check if we can match the start tag at position i
177            let start_tag_len = start_tag.len();
178            if let Ok(peek_start) = reader.peek_string_from(i, start_tag_len + 1) {
179                if peek_start.starts_with(&start_tag) {
180                    // Make sure it's actually a tag (followed by space, >, or /)
181                    if let Some(next_char) = peek_start.chars().nth(start_tag_len) {
182                        if next_char == ' ' || next_char == '>' || next_char == '/' {
183                            depth += 1;
184                            i += start_tag_len;
185                            continue;
186                        }
187                    }
188                }
189            }
190
191            // Check if we can match the end tag at position i
192            let end_tag_len = end_tag.len();
193            if let Ok(peek_end) = reader.peek_string_from(i, end_tag_len) {
194                if peek_end == end_tag {
195                    if depth == 0 {
196                        return Ok(Some(i - start_offset));
197                    }
198                    depth -= 1;
199                    i += end_tag_len;
200                    continue;
201                }
202            }
203        }
204
205        i += 1;
206    }
207}
208
209type Element = (String, HashMap<String, String>, Option<String>);
210
211/// parse html from start to end and return (tag, attributes, innerHtml)
212///
213/// seperated to make logic more reusable
214fn element(reader: &mut CharReader<impl Read>) -> Result<Option<Element>, io::Error> {
215    if let Some('<') = reader.peek_char(0)? {
216        if let Some((tag, attributes, tag_content_length, void_element)) =
217            element_start_tag(reader)?
218        {
219            // <{start_tag}/>
220            if void_element {
221                reader.consume(tag_content_length)?;
222                return Ok(Some((tag, attributes, None)));
223            }
224
225            // <{start_tag}>{content}</{start_tag}>
226            if let Some(content_length) =
227                find_matching_closing_tag(reader, &tag, tag_content_length)?
228            {
229                reader.consume(tag_content_length)?;
230                let content = reader.consume_string(content_length)?;
231                reader.consume(tag.len() + 3)?; // </{tag}>
232
233                return Ok(Some((tag, attributes, Some(content))));
234            }
235        }
236    }
237    Ok(None)
238}
239
240fn comment(reader: &mut CharReader<impl Read>) -> Result<Option<Html>, io::Error> {
241    if "<!--" == reader.peek_string(4)? {
242        if let Some(text) = reader.peek_until_match_exclusive_from(4, "-->")? {
243            reader.consume(4)?; // skip start
244            let text = reader.consume_string(text.len())?;
245            reader.consume(3)?; // skip end
246            return Ok(Some(Html::Comment { text }));
247        }
248    }
249
250    Ok(None)
251}
252
/// Check whether `tag` names a void element, i.e. one that can not have children and therefore
/// has no closing tag.
///
/// Besides the HTML void elements this also accepts a set of SVG shape elements that are
/// treated as void by this parser. The comparison is case-sensitive.
pub fn is_void_element(tag: &str) -> bool {
    const HTML_VOID: &[&str] = &[
        "base", "img", "br", "col", "embed", "hr", "area", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    // SVG void-like elements
    const SVG_VOID_LIKE: &[&str] = &[
        "circle", "ellipse", "line", "path", "polygon", "polyline", "rect", "stop", "use",
    ];

    HTML_VOID
        .iter()
        .chain(SVG_VOID_LIKE.iter())
        .any(|candidate| *candidate == tag)
}
264
265/// A "simple" streaming html parser function. This is a fairly simplified way of parsing html
266/// ignoring a lot of edge cases and validation normally seen when parsing html.
267///
268/// **NOTE: Might return multiple Text tokens one after another.**
269fn read_token(reader: &mut CharReader<impl Read>) -> Result<Option<Html>, io::Error> {
270    while let Some(c) = reader.peek_char(0)? {
271        if c == '<' {
272            if let Some(comment) = comment(reader)? {
273                return Ok(Some(comment));
274            }
275
276            if let Some((tag, attributes, content)) = element(reader)? {
277                let mut children = vec![];
278                if let Some(content) = content {
279                    let mut reader = CharReader::new(content.as_bytes());
280                    while let Some(html) = read_token(&mut reader)? {
281                        children.push(html);
282                    }
283                }
284                return Ok(Some(Html::Element {
285                    tag,
286                    attributes,
287                    children,
288                }));
289            }
290
291            // non html opening
292            reader.consume(1)?;
293            let mut text = "<".to_string();
294            text.push_str(&reader.consume_until_exclusive(|c| c == '<')?);
295            return Ok(Some(Html::Text { text }));
296        }
297
298        let text = reader.consume_until_exclusive(|c| c == '<')?;
299        // only valid text if it contains a non whitespace character
300        if text.chars().any(|c| c != ' ' && c != '\n') {
301            return Ok(Some(Html::Text { text }));
302        }
303    }
304
305    Ok(None)
306}
307
/// Simple parsed html representation with recursively added children
#[derive(Debug, Clone, PartialEq)]
pub enum Html {
    /// A comment; `text` is what stands between `<!--` and `-->`.
    Comment { text: String },
    /// A run of plain text.
    Text { text: String },
    /// An element with its tag name, its attributes and its already parsed child nodes.
    Element {
        tag: String,
        attributes: HashMap<String, String>,
        children: Vec<Html>,
    },
}
323
324impl From<DomNode> for Html {
325    fn from(value: DomNode) -> Self {
326        match &*value.kind() {
327            crate::DomNodeKind::Text { text } => Html::Text { text: text.clone() },
328            crate::DomNodeKind::Element { tag, attributes } => {
329                let children = value.children().map(|c| c.into()).collect();
330                Html::Element {
331                    tag: tag.clone(),
332                    attributes: attributes.clone(),
333                    children,
334                }
335            }
336        }
337    }
338}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// Utility function to convert iteratables into attributes hashmap
    pub fn to_attributes<I: IntoIterator<Item = (impl Into<String>, impl Into<String>)>>(
        arr: I,
    ) -> HashMap<String, String> {
        let mut map = HashMap::new();
        for (key, value) in arr {
            map.insert(key.into(), value.into());
        }
        map
    }

    /// Build an expected `Html::Element`.
    fn el(tag: &str, attrs: &[(&str, &str)], children: Vec<Html>) -> Html {
        Html::Element {
            tag: tag.into(),
            attributes: to_attributes(attrs.iter().copied()),
            children,
        }
    }

    /// Build an expected `Html::Text`.
    fn txt(text: &str) -> Html {
        Html::Text { text: text.into() }
    }

    /// Parse `input`, panicking on io errors.
    fn parse(input: &str) -> Vec<Html> {
        parse_html(input.as_bytes()).unwrap()
    }

    /// Tag name of every token, `None` for tokens that are not elements.
    fn element_tags(tokens: &[Html]) -> Vec<Option<&str>> {
        tokens
            .iter()
            .map(|token| match token {
                Html::Element { tag, .. } => Some(tag.as_str()),
                _ => None,
            })
            .collect()
    }

    #[test]
    fn test_html() {
        let input = concat!(
            r#"<a href="test.com"><i class="fa-solid fa-rss"></i>Test</a>"#,
            "\n",
            r#"<button disabled></button>"#,
        );
        let expected = vec![
            el(
                "a",
                &[("href", "test.com")],
                vec![
                    el("i", &[("class", "fa-solid fa-rss")], vec![]),
                    txt("Test"),
                ],
            ),
            el("button", &[("disabled", "")], vec![]),
        ];
        assert_eq!(expected, parse(input));

        let input = concat!(
            "<div>\n",
            r#"<a href="link.com">[other](other.com)</a>"#,
            "\n",
            "</div>",
        );
        let expected = vec![el(
            "div",
            &[],
            vec![el(
                "a",
                &[("href", "link.com")],
                vec![txt("[other](other.com)")],
            )],
        )];
        assert_eq!(expected, parse(input));
    }

    #[test]
    fn test_text_looks_like_html() {
        // nothing in here is a complete element, so the input must come back as one text node
        let input = concat!(
            "<Lots of people say Rust > c++. even though it might be\n",
            "< then c++. Who knows? \n",
            "<>\n",
            "<nonclosing>\n",
            "This should be text\n",
        );
        let expected = vec![txt(input)];

        assert_eq!(expected, parse(input));
    }

    #[test]
    fn test_js_in_attribute() {
        let input = r#"<div onclick="() => test()"></div>"#;

        let expected = vec![el("div", &[("onclick", "() => test()")], vec![])];
        assert_eq!(expected, parse(input));
    }

    #[test]
    fn test_nested_elements() {
        let input = r#"<div class="a">
            <div class="b">
                <div class="c">
                </div>
            </div>
        </div>
        "#;
        let innermost = el("div", &[("class", "c")], vec![]);
        let middle = el("div", &[("class", "b")], vec![innermost]);
        let expected = vec![el("div", &[("class", "a")], vec![middle])];
        assert_eq!(expected, parse(input));
    }

    #[test]
    fn test_full_html_document() {
        let input = r#"<!doctype html>
<html>
  <head>
    <meta content="art,simulation,technology" name="keywords" />
    <script type="module" crossorigin src="./assets/main-B0Asn3MK.js"></script>
    <link rel="modulepreload" crossorigin href="./assets/creature-BZHPYSn1.js">
    <link rel="stylesheet" crossorigin href="./assets/main-CjrOOoWN.css">
  </head>
  <body>
    <div id="messages"></div>
    <div id="debug"></div>
    <canvas id="root">Your browser does not support the HTML canvas tag.</canvas>
    <a id="qr-link" target="_blank">
      <div id="qr"></div>
    </a>
  </body>
</html>"#;

        let head = el(
            "head",
            &[],
            vec![
                el(
                    "meta",
                    &[
                        ("content", "art,simulation,technology"),
                        ("name", "keywords"),
                    ],
                    vec![],
                ),
                el(
                    "script",
                    &[
                        ("type", "module"),
                        ("crossorigin", ""),
                        ("src", "./assets/main-B0Asn3MK.js"),
                    ],
                    vec![],
                ),
                el(
                    "link",
                    &[
                        ("rel", "modulepreload"),
                        ("crossorigin", ""),
                        ("href", "./assets/creature-BZHPYSn1.js"),
                    ],
                    vec![],
                ),
                el(
                    "link",
                    &[
                        ("rel", "stylesheet"),
                        ("crossorigin", ""),
                        ("href", "./assets/main-CjrOOoWN.css"),
                    ],
                    vec![],
                ),
            ],
        );
        let body = el(
            "body",
            &[],
            vec![
                el("div", &[("id", "messages")], vec![]),
                el("div", &[("id", "debug")], vec![]),
                el(
                    "canvas",
                    &[("id", "root")],
                    vec![txt("Your browser does not support the HTML canvas tag.")],
                ),
                el(
                    "a",
                    &[("id", "qr-link"), ("target", "_blank")],
                    vec![el("div", &[("id", "qr")], vec![])],
                ),
            ],
        );
        let expected = vec![
            txt("<!doctype html>\n"),
            el("html", &[], vec![head, body]),
        ];
        assert_eq!(expected, parse(input));
    }

    #[test]
    fn test_svg() {
        let input = r#"<svg xmlns="http://www.w3.org/2000/svg" width="20" viewBox="0 0 640 640" height="20"><path d="M451.5 160C434.9 160 418.8 164.5 404.7 172.7"/></svg>"#;
        let path = el(
            "path",
            &[("d", "M451.5 160C434.9 160 418.8 164.5 404.7 172.7")],
            vec![],
        );
        let expected = vec![el(
            "svg",
            &[
                ("xmlns", "http://www.w3.org/2000/svg"),
                ("width", "20"),
                ("viewBox", "0 0 640 640"),
                ("height", "20"),
            ],
            vec![path],
        )];
        assert_eq!(expected, parse(input));
    }

    #[test]
    fn test_void_elements_with_and_without_self_closing() {
        let expected_tags = vec![Some("meta"), Some("link"), Some("img")];

        // Void elements without self-closing slash (HTML5 style)
        let input = concat!(
            r#"<meta charset="utf-8">"#,
            "\n",
            r#"<link rel="stylesheet" href="style.css">"#,
            "\n",
            r#"<img src="image.jpg" alt="test">"#,
        );
        let tokens = parse(input);
        assert_eq!(element_tags(&tokens), expected_tags);

        // Void elements with self-closing slash (XHTML style)
        let input = concat!(
            r#"<meta charset="utf-8" />"#,
            "\n",
            r#"<link rel="stylesheet" href="style.css" />"#,
            "\n",
            r#"<img src="image.jpg" alt="test" />"#,
        );
        let tokens = parse(input);
        assert_eq!(element_tags(&tokens), expected_tags);
    }
}