use std::{collections::HashMap, io, io::Read};
use lssg_char_reader::CharReader;
use crate::DomNode;
pub fn parse_html_from_string(input: &String) -> Result<Vec<Html>, io::Error> {
parse_html(input.as_bytes())
}
/// Parses HTML read from `input` into a list of top-level [`Html`] nodes.
///
/// Nodes are pulled from [`read_token`] until it reports the end of the
/// input. A text node that directly follows another text node is appended to
/// its predecessor, so the returned list never contains two adjacent
/// [`Html::Text`] entries.
///
/// # Errors
///
/// Returns the first [`io::Error`] raised while reading tokens.
pub fn parse_html(input: impl Read) -> Result<Vec<Html>, io::Error> {
    let mut reader = CharReader::new(input);
    let mut nodes: Vec<Html> = Vec::new();

    while let Some(token) = read_token(&mut reader)? {
        // Fold consecutive text fragments into the node that is already stored.
        if let Html::Text { text: incoming } = &token {
            if let Some(Html::Text { text: previous }) = nodes.last_mut() {
                previous.push_str(incoming);
                continue;
            }
        }
        nodes.push(token);
    }

    Ok(nodes)
}
/// Parses the attribute part of a start tag into a name → value map.
///
/// `start_tag_content` is the text between the tag name and the closing `>`
/// (or `/>`), for example `class="a b" disabled`. Leading and trailing
/// whitespace is ignored.
///
/// Supported forms:
/// - `name="value"` / `name='value'`: the value runs up to the matching quote
///   and may contain whitespace, `=` and the other kind of quote.
/// - `name=value`: an unquoted value runs up to the next ASCII whitespace.
/// - `name`: a bare attribute is stored with an empty value.
///
/// If a name occurs more than once, the last occurrence wins. A quoted value
/// that is never closed extends to the end of the input.
///
/// # Errors
///
/// Currently never fails; the `Result` return type is kept so callers do not
/// have to change.
fn attributes(start_tag_content: &str) -> Result<HashMap<String, String>, io::Error> {
    /// Which part of an attribute the scanner is currently inside.
    #[derive(Clone, Copy)]
    enum State {
        Name,
        UnquotedValue,
        QuotedValue(char),
    }

    let mut attributes = HashMap::new();
    let mut key = String::new();
    let mut value = String::new();
    let mut state = State::Name;

    let mut chars = start_tag_content.trim().chars().peekable();
    while let Some(c) = chars.next() {
        match state {
            State::Name => {
                if c.is_ascii_whitespace() {
                    // Whitespace ends a bare attribute (one without a value).
                    if !key.is_empty() {
                        attributes.insert(std::mem::take(&mut key), String::new());
                    }
                } else if c == '=' && !key.is_empty() {
                    // Only here, outside of any value, may `=` start a value;
                    // inside a value it is ordinary content.
                    state = match chars.peek() {
                        Some(&quote) if quote == '"' || quote == '\'' => {
                            chars.next();
                            State::QuotedValue(quote)
                        }
                        _ => State::UnquotedValue,
                    };
                } else {
                    key.push(c);
                }
            }
            State::UnquotedValue => {
                if c.is_ascii_whitespace() {
                    attributes.insert(std::mem::take(&mut key), std::mem::take(&mut value));
                    state = State::Name;
                } else {
                    value.push(c);
                }
            }
            State::QuotedValue(quote) => {
                if c == quote {
                    attributes.insert(std::mem::take(&mut key), std::mem::take(&mut value));
                    state = State::Name;
                } else {
                    value.push(c);
                }
            }
        }
    }

    // Flush whatever was still being read when the input ended.
    if !key.is_empty() {
        attributes.insert(key, value);
    }
    Ok(attributes)
}
/// Tag name, parsed attributes, length of the start tag in characters
/// (including `<` and `>`), and whether the tag names a void element.
type ElementStartTag = (String, HashMap<String, String>, usize, bool);

/// Looks ahead in `reader` for a start tag without consuming anything.
///
/// The scan begins at look-ahead offset 1 and stops at the first `>` that is
/// not inside a single- or double-quoted section. Returns `Ok(None)` when the
/// input ends before such a `>` is found.
///
/// # Errors
///
/// Propagates [`io::Error`] from the reader and from [`attributes`].
fn element_start_tag(
    reader: &mut CharReader<impl Read>,
) -> Result<Option<ElementStartTag>, io::Error> {
    // The quote character that is currently open, if any. While one kind of
    // quote is open the other kind is treated as ordinary content.
    let mut open_quote: Option<char> = None;
    let mut offset = 1;
    loop {
        let current = match reader.peek_char(offset)? {
            Some(c) => c,
            None => return Ok(None),
        };
        match (open_quote, current) {
            (None, '>') => break,
            (None, '"') | (None, '\'') => open_quote = Some(current),
            (Some(quote), c) if quote == c => open_quote = None,
            _ => {}
        }
        offset += 1;
    }

    // `offset` now points at the closing `>`.
    let tag_content = reader.peek_string(offset + 1)?;
    let tag: String = tag_content
        .chars()
        .skip(1)
        .take_while(|&c| !matches!(c, ' ' | '\n' | '>' | '/'))
        .collect();

    let has_self_closing_slash = reader.peek_char(offset - 1)? == Some('/');
    let void_element = is_void_element(&tag);

    // Drop the trailing `>` (and a preceding `/`) from the attribute text.
    let trailer_len = if has_self_closing_slash { 2 } else { 1 };
    let attributes_end = tag_content.len() - trailer_len;
    let parsed_attributes = attributes(&tag_content[tag.len() + 1..attributes_end])?;

    Ok(Some((tag, parsed_attributes, offset + 1, void_element)))
}
/// Finds the closing tag that belongs to an already located `<tag ...>`.
///
/// The scan starts at look-ahead offset `start_offset` (just past the start
/// tag) and does not consume input. Nested start tags with the same name
/// raise a depth counter so that only the closing tag of the outermost
/// element is reported. A nested start tag is recognised when `<tag` is
/// followed by ASCII whitespace, `>` or `/`.
///
/// Returns `Ok(Some(len))`, where `len` is the distance from `start_offset`
/// to the `<` of the matching `</tag>`, or `Ok(None)` when the input ends
/// first.
///
/// NOTE(review): quote state is tracked for every scanned character, not only
/// inside tags, so an unbalanced `'` or `"` in text content hides all tags
/// that follow it from this scan.
///
/// # Errors
///
/// Propagates [`io::Error`] from `peek_char`. A failing `peek_string_from`
/// is treated as "no tag at this position".
fn find_matching_closing_tag(
    reader: &mut CharReader<impl Read>,
    tag: &str,
    start_offset: usize,
) -> Result<Option<usize>, io::Error> {
    let start_tag = format!("<{}", tag);
    let end_tag = format!("</{}>", tag);
    let start_tag_len = start_tag.len();
    let end_tag_len = end_tag.len();

    let mut depth = 0;
    let mut i = start_offset;
    let mut in_double_quotes = false;
    let mut in_single_quotes = false;
    loop {
        let current_char = match reader.peek_char(i)? {
            Some(c) => c,
            None => return Ok(None),
        };
        match current_char {
            '"' if !in_single_quotes => in_double_quotes = !in_double_quotes,
            '\'' if !in_double_quotes => in_single_quotes = !in_single_quotes,
            _ => {}
        }
        if !in_double_quotes && !in_single_quotes && current_char == '<' {
            if let Ok(peek_start) = reader.peek_string_from(i, start_tag_len + 1) {
                if peek_start.starts_with(&start_tag) {
                    // The character right after the name decides whether this
                    // really is `<tag` and not merely a longer name such as
                    // `<tagged`. Newlines and tabs count too, matching tags
                    // whose attributes start on the next line.
                    let opens_nested = peek_start
                        .chars()
                        .nth(start_tag_len)
                        .map_or(false, |next| {
                            next.is_ascii_whitespace() || next == '>' || next == '/'
                        });
                    if opens_nested {
                        depth += 1;
                        i += start_tag_len;
                        continue;
                    }
                }
            }
            if let Ok(peek_end) = reader.peek_string_from(i, end_tag_len) {
                if peek_end == end_tag {
                    if depth == 0 {
                        return Ok(Some(i - start_offset));
                    }
                    // Closes a nested element of the same name; keep looking.
                    depth -= 1;
                    i += end_tag_len;
                    continue;
                }
            }
        }
        i += 1;
    }
}
/// Tag name, attributes and raw inner content (`None` for void elements).
type Element = (String, HashMap<String, String>, Option<String>);

/// Tries to consume one complete element from the front of `reader`.
///
/// Nothing is consumed and `Ok(None)` is returned when the input does not
/// start with `<`, when no start tag can be located, or when a non-void
/// element has no matching closing tag.
///
/// # Errors
///
/// Propagates [`io::Error`] from the reader and the helper scanners.
fn element(reader: &mut CharReader<impl Read>) -> Result<Option<Element>, io::Error> {
    if reader.peek_char(0)? != Some('<') {
        return Ok(None);
    }

    let (tag, attributes, tag_content_length, void_element) = match element_start_tag(reader)? {
        Some(start) => start,
        None => return Ok(None),
    };

    if void_element {
        // Void elements consist of the start tag only.
        reader.consume(tag_content_length)?;
        return Ok(Some((tag, attributes, None)));
    }

    let content_length = match find_matching_closing_tag(reader, &tag, tag_content_length)? {
        Some(length) => length,
        None => return Ok(None),
    };

    reader.consume(tag_content_length)?;
    let content = reader.consume_string(content_length)?;
    // Skip the closing tag: `</` + name + `>`.
    reader.consume(tag.len() + 3)?;
    Ok(Some((tag, attributes, Some(content))))
}
/// Tries to consume an HTML comment (`<!-- ... -->`) from the front of
/// `reader`.
///
/// Returns `Ok(None)` without consuming anything when the input does not
/// start with `<!--` or when no terminating `-->` is found.
///
/// # Errors
///
/// Propagates [`io::Error`] from the reader.
fn comment(reader: &mut CharReader<impl Read>) -> Result<Option<Html>, io::Error> {
    const OPENER: &str = "<!--";
    const CLOSER: &str = "-->";

    if OPENER != reader.peek_string(OPENER.len())? {
        return Ok(None);
    }

    // NOTE(review): the comment body is measured with `len()` (bytes); this
    // assumes `consume_string` counts the same way - confirm for non-ASCII.
    let body_len = match reader.peek_until_match_exclusive_from(OPENER.len(), CLOSER)? {
        Some(body) => body.len(),
        None => return Ok(None),
    };

    reader.consume(OPENER.len())?;
    let text = reader.consume_string(body_len)?;
    reader.consume(CLOSER.len())?;
    Ok(Some(Html::Comment { text }))
}
/// Returns `true` when `tag` names an element that the parser treats as
/// having no content and no closing tag.
///
/// The comparison is exact and case-sensitive. Besides the HTML void
/// elements, a handful of SVG element names are included.
pub fn is_void_element(tag: &str) -> bool {
    const HTML_VOID: &[&str] = &[
        "base", "img", "br", "col", "embed", "hr", "area", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    const SVG_CHILDLESS: &[&str] = &[
        "circle", "ellipse", "line", "path", "polygon", "polyline", "rect", "stop", "use",
    ];

    HTML_VOID.contains(&tag) || SVG_CHILDLESS.contains(&tag)
}
fn read_token(reader: &mut CharReader<impl Read>) -> Result<Option<Html>, io::Error> {
while let Some(c) = reader.peek_char(0)? {
if c == '<' {
if let Some(comment) = comment(reader)? {
return Ok(Some(comment));
}
if let Some((tag, attributes, content)) = element(reader)? {
let mut children = vec![];
if let Some(content) = content {
let mut reader = CharReader::new(content.as_bytes());
while let Some(html) = read_token(&mut reader)? {
children.push(html);
}
}
return Ok(Some(Html::Element {
tag,
attributes,
children,
}));
}
reader.consume(1)?;
let mut text = "<".to_string();
text.push_str(&reader.consume_until_exclusive(|c| c == '<')?);
return Ok(Some(Html::Text { text }));
}
let text = reader.consume_until_exclusive(|c| c == '<')?;
if text.chars().any(|c| c != ' ' && c != '\n') {
return Ok(Some(Html::Text { text }));
}
}
Ok(None)
}
/// A node of a parsed HTML document.
#[derive(Debug, Clone, PartialEq)]
pub enum Html {
/// An HTML comment; `text` holds what was found between `<!--` and `-->`.
Comment {
text: String,
},
/// A run of literal text.
Text {
text: String,
},
/// An element with its tag name, its attributes (name → value) and its
/// child nodes in document order.
Element {
tag: String,
attributes: HashMap<String, String>,
children: Vec<Html>,
},
}
impl From<DomNode> for Html {
fn from(value: DomNode) -> Self {
match &*value.kind() {
crate::DomNodeKind::Text { text } => Html::Text { text: text.clone() },
crate::DomNodeKind::Element { tag, attributes } => {
let children = value.children().map(|c| c.into()).collect();
Html::Element {
tag: tag.clone(),
attributes: attributes.clone(),
children,
}
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
/// Test helper: builds an attribute map from `(name, value)` pairs. When a
/// name is given more than once, the last value is kept.
pub fn to_attributes<I: IntoIterator<Item = (impl Into<String>, impl Into<String>)>>(
    arr: I,
) -> HashMap<String, String> {
    let mut map = HashMap::new();
    for (name, value) in arr {
        map.insert(name.into(), value.into());
    }
    map
}
/// Parses two small fragments and checks the resulting trees: nested
/// elements with text, a bare attribute, and text that looks like a Markdown
/// link inside an element.
#[test]
fn test_html() {
    let input = r#"<a href="test.com"><i class="fa-solid fa-rss"></i>Test</a>
<button disabled></button>"#;
    let expected = vec![
        Html::Element {
            tag: "a".into(),
            attributes: to_attributes([("href", "test.com")]),
            children: vec![
                Html::Element {
                    tag: "i".into(),
                    attributes: to_attributes([("class", "fa-solid fa-rss")]),
                    children: vec![],
                },
                Html::Text {
                    text: "Test".into(),
                },
            ],
        },
        Html::Element {
            tag: "button".into(),
            attributes: to_attributes([("disabled", "")]),
            children: vec![],
        },
    ];
    let tokens = parse_html(input.as_bytes()).unwrap();
    assert_eq!(expected, tokens);

    let input = r#"<div>
<a href="link.com">[other](other.com)</a>
</div>"#;
    // The link-looking text must come back verbatim as the only child of `a`.
    let link_text = Html::Text {
        text: "[other](other.com)".into(),
    };
    let expected = vec![Html::Element {
        tag: "div".into(),
        attributes: HashMap::new(),
        children: vec![Html::Element {
            tag: "a".into(),
            attributes: to_attributes([("href", "link.com")]),
            children: vec![link_text],
        }],
    }];
    let tokens = parse_html(input.as_bytes()).unwrap();
    assert_eq!(expected, tokens);
}
/// Prose that merely resembles markup (stray `<`/`>`, an empty `<>`, a tag
/// that is never closed) must come back as one single text node.
#[test]
fn test_text_looks_like_html() {
    let prose = r#"<Lots of people say Rust > c++. even though it might be
< then c++. Who knows?
<>
<nonclosing>
This should be text
"#;
    let same_prose = String::from(
        "<Lots of people say Rust > c++. even though it might be
< then c++. Who knows?
<>
<nonclosing>
This should be text
",
    );
    let expected = vec![Html::Text { text: same_prose }];

    let parsed = parse_html(prose.as_bytes()).unwrap();
    assert_eq!(expected, parsed);
}
/// A `>` inside a quoted attribute value must not end the start tag.
#[test]
fn test_js_in_attribute() {
    let markup = r#"<div onclick="() => test()"></div>"#;
    let parsed = parse_html(markup.as_bytes()).unwrap();

    let div = Html::Element {
        tag: String::from("div"),
        attributes: to_attributes([("onclick", "() => test()")]),
        children: Vec::new(),
    };
    assert_eq!(vec![div], parsed);
}
/// Elements of the same name nested three levels deep must each be paired
/// with their own closing tag; whitespace between them yields no nodes.
#[test]
fn test_nested_elements() {
    let markup = r#"<div class="a">
<div class="b">
<div class="c">
</div>
</div>
</div>
"#;
    // Small constructor so the expected tree can be written inside-out.
    let div = |class: &str, children: Vec<Html>| Html::Element {
        tag: "div".into(),
        attributes: to_attributes([("class", class)]),
        children,
    };
    let expected = vec![div("a", vec![div("b", vec![div("c", vec![])])])];

    let parsed = parse_html(markup.as_bytes()).unwrap();
    assert_eq!(expected, parsed);
}
// End-to-end check on a complete document: doctype line, `head` with void
// `meta`/`link` elements and a `script`, and a `body` with nested elements.
#[test]
fn test_full_html_document() {
let input = r#"<!doctype html>
<html>
<head>
<meta content="art,simulation,technology" name="keywords" />
<script type="module" crossorigin src="./assets/main-B0Asn3MK.js"></script>
<link rel="modulepreload" crossorigin href="./assets/creature-BZHPYSn1.js">
<link rel="stylesheet" crossorigin href="./assets/main-CjrOOoWN.css">
</head>
<body>
<div id="messages"></div>
<div id="debug"></div>
<canvas id="root">Your browser does not support the HTML canvas tag.</canvas>
<a id="qr-link" target="_blank">
<div id="qr"></div>
</a>
</body>
</html>"#;
// The doctype is not an element or comment, so it is expected as plain text
// (including its trailing newline), followed by the `html` element tree.
let expected = vec![
Html::Text {
text: "<!doctype html>\n".into(),
},
Html::Element {
tag: "html".into(),
attributes: HashMap::new(),
children: vec![
Html::Element {
tag: "head".into(),
attributes: HashMap::new(),
children: vec![
Html::Element {
tag: "meta".into(),
attributes: to_attributes([
("content", "art,simulation,technology"),
("name", "keywords"),
]),
children: vec![],
},
Html::Element {
tag: "script".into(),
attributes: to_attributes([
("type", "module"),
// Bare attributes are expected with an empty value.
("crossorigin", ""),
("src", "./assets/main-B0Asn3MK.js"),
]),
children: vec![],
},
Html::Element {
tag: "link".into(),
attributes: to_attributes([
("rel", "modulepreload"),
("crossorigin", ""),
("href", "./assets/creature-BZHPYSn1.js"),
]),
children: vec![],
},
Html::Element {
tag: "link".into(),
attributes: to_attributes([
("rel", "stylesheet"),
("crossorigin", ""),
("href", "./assets/main-CjrOOoWN.css"),
]),
children: vec![],
},
],
},
Html::Element {
tag: "body".into(),
attributes: HashMap::new(),
children: vec![
Html::Element {
tag: "div".into(),
attributes: to_attributes([("id", "messages")]),
children: vec![],
},
Html::Element {
tag: "div".into(),
attributes: to_attributes([("id", "debug")]),
children: vec![],
},
Html::Element {
tag: "canvas".into(),
attributes: to_attributes([("id", "root")]),
children: vec![Html::Text {
text: "Your browser does not support the HTML canvas tag."
.into(),
}],
},
Html::Element {
tag: "a".into(),
attributes: to_attributes([
("id", "qr-link"),
("target", "_blank"),
]),
children: vec![Html::Element {
tag: "div".into(),
attributes: to_attributes([("id", "qr")]),
children: vec![],
}],
},
],
},
],
},
];
let tokens = parse_html(input.as_bytes()).unwrap();
assert_eq!(expected, tokens);
}
/// An inline SVG with a self-closed `path` child parses into an `svg`
/// element that contains one childless `path` element.
#[test]
fn test_svg() {
    let markup = r#"<svg xmlns="http://www.w3.org/2000/svg" width="20" viewBox="0 0 640 640" height="20"><path d="M451.5 160C434.9 160 418.8 164.5 404.7 172.7"/></svg>"#;

    let path = Html::Element {
        tag: String::from("path"),
        attributes: to_attributes([("d", "M451.5 160C434.9 160 418.8 164.5 404.7 172.7")]),
        children: Vec::new(),
    };
    let svg = Html::Element {
        tag: String::from("svg"),
        attributes: to_attributes([
            ("xmlns", "http://www.w3.org/2000/svg"),
            ("width", "20"),
            ("viewBox", "0 0 640 640"),
            ("height", "20"),
        ]),
        children: vec![path],
    };

    let parsed = parse_html(markup.as_bytes()).unwrap();
    assert_eq!(vec![svg], parsed);
}
/// Void elements are recognised both in the plain form (`<meta ...>`) and in
/// the self-closing form (`<meta ... />`).
#[test]
fn test_void_elements_with_and_without_self_closing() {
    let without_slash = r#"<meta charset="utf-8">
<link rel="stylesheet" href="style.css">
<img src="image.jpg" alt="test">"#;
    let with_slash = r#"<meta charset="utf-8" />
<link rel="stylesheet" href="style.css" />
<img src="image.jpg" alt="test" />"#;

    let expected_tags = ["meta", "link", "img"];
    for markup in [without_slash, with_slash].iter() {
        let parsed = parse_html(markup.as_bytes()).unwrap();
        assert_eq!(parsed.len(), 3);
        for (node, expected_tag) in parsed.iter().zip(expected_tags.iter()) {
            assert!(matches!(node, Html::Element { tag, .. } if tag.as_str() == *expected_tag));
        }
    }
}
}