use std::borrow::Cow;
/// One lexical unit produced by [`HtmlTokenizer`].
///
/// Payloads borrow from the tokenizer's input where possible and fall back to
/// owned strings when the text had to be transformed (lower-casing, entity
/// decoding).
#[derive(Clone, Debug, PartialEq)]
pub enum HtmlToken<'a> {
/// An opening tag such as `<a href="x">` or `<br />`.
StartTag {
/// Tag name, lower-cased by the tokenizer.
name: Cow<'a, str>,
/// `(name, value)` pairs in source order. Names are lower-cased, values
/// have entity references decoded, and valueless attributes carry `""`.
attrs: Vec<(Cow<'a, str>, Cow<'a, str>)>,
/// `true` when the tag was written with `/>` or names a void element.
self_closing: bool,
},
/// A closing tag such as `</p>`.
EndTag {
/// Tag name, lower-cased by the tokenizer.
name: Cow<'a, str>,
},
/// Character data. Ordinary text has entity references decoded; CDATA
/// section contents are passed through verbatim. A stray `<` or `</` that
/// does not start a tag is also reported as text.
Text(Cow<'a, str>),
/// The text between `<!--` and `-->` (or up to end of input when the
/// comment is unterminated).
Comment(Cow<'a, str>),
/// A complete doctype declaration, including the leading `<!` and the
/// closing `>` when present.
Doctype(Cow<'a, str>),
}
/// Pull-style tokenizer that splits an HTML string into [`HtmlToken`]s.
///
/// The tokenizer is lenient: malformed markup is recovered from rather than
/// reported, and anything that cannot be read as markup is handed back as
/// text. Call [`HtmlTokenizer::next_token`] repeatedly until it returns `None`.
pub struct HtmlTokenizer<'a> {
    /// Source text; token payloads borrow from it where possible.
    input: &'a str,
    /// Byte view of `input`, used to scan for ASCII delimiters.
    bytes: &'a [u8],
    /// Byte offset into `input` of the next unread byte (never past the end).
    pos: usize,
}

impl<'a> HtmlTokenizer<'a> {
    /// Creates a tokenizer positioned at the start of `input`.
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            bytes: input.as_bytes(),
            pos: 0,
        }
    }

    /// Returns the next token, or `None` once the input is exhausted.
    pub fn next_token(&mut self) -> Option<HtmlToken<'a>> {
        match self.bytes.get(self.pos) {
            None => None,
            Some(&b'<') => self.parse_tag_or_comment(),
            Some(_) => self.parse_text(),
        }
    }

    /// Consumes everything up to the next `<` (or end of input) as text with
    /// entity references decoded. Returns `None` if nothing was consumed.
    fn parse_text(&mut self) -> Option<HtmlToken<'a>> {
        let start = self.pos;
        self.pos += self.bytes[start..]
            .iter()
            .take_while(|&&b| b != b'<')
            .count();
        if self.pos == start {
            return None;
        }
        Some(HtmlToken::Text(decode_entities(&self.input[start..self.pos])))
    }

    /// Dispatches on what follows a `<`: comment, doctype, CDATA section,
    /// end tag, or start tag. A `<` at the very end of input is plain text.
    fn parse_tag_or_comment(&mut self) -> Option<HtmlToken<'a>> {
        debug_assert_eq!(self.bytes[self.pos], b'<');
        self.pos += 1;
        let bytes = self.bytes;
        let rest = &bytes[self.pos..];
        if rest.is_empty() {
            return Some(HtmlToken::Text(Cow::Borrowed("<")));
        }
        if rest.starts_with(b"!--") {
            return self.parse_comment();
        }
        // The doctype keyword is matched case-insensitively so that spellings
        // such as `<!Doctype html>` are recognised too.
        let is_doctype = rest
            .get(..8)
            .map_or(false, |prefix| prefix.eq_ignore_ascii_case(b"!doctype"));
        if is_doctype {
            return self.parse_doctype();
        }
        if rest.starts_with(b"![CDATA[") {
            return self.parse_cdata();
        }
        if rest[0] == b'/' {
            return self.parse_end_tag();
        }
        self.parse_start_tag()
    }

    /// Parses a comment; `pos` is on the `!` of `<!--`. An unterminated
    /// comment runs to the end of input.
    fn parse_comment(&mut self) -> Option<HtmlToken<'a>> {
        self.pos += 3; // skip "!--"
        Some(HtmlToken::Comment(Cow::Borrowed(self.take_until(b"-->"))))
    }

    /// Parses a doctype declaration; `pos` is on the `!`. The token text
    /// covers the whole declaration from `<` through `>` (when present).
    fn parse_doctype(&mut self) -> Option<HtmlToken<'a>> {
        let start = self.pos - 1; // include the '<' consumed by the caller
        self.skip_past(b'>');
        Some(HtmlToken::Doctype(Cow::Borrowed(
            &self.input[start..self.pos],
        )))
    }

    /// Parses a CDATA section; `pos` is on the `!` of `<![CDATA[`. The
    /// contents are emitted verbatim as text (no entity decoding).
    fn parse_cdata(&mut self) -> Option<HtmlToken<'a>> {
        self.pos += 8; // skip "![CDATA["
        Some(HtmlToken::Text(Cow::Borrowed(self.take_until(b"]]>"))))
    }

    /// Parses an end tag; `pos` is on the `/` of `</`. Anything between the
    /// name and the closing `>` is ignored. If no name follows, `</` is
    /// reported as text and scanning resumes right after it.
    fn parse_end_tag(&mut self) -> Option<HtmlToken<'a>> {
        self.pos += 1; // skip '/'
        let after_slash = self.pos;
        self.skip_whitespace();
        let name = self.parse_tag_name();
        if name.is_empty() {
            // Rewind so the whitespace skipped above is not lost from the
            // text that follows.
            self.pos = after_slash;
            return Some(HtmlToken::Text(Cow::Borrowed("</")));
        }
        self.skip_past(b'>');
        Some(HtmlToken::EndTag {
            name: Self::lowercased(name),
        })
    }

    /// Parses a start tag and its attributes; `pos` is just after the `<`.
    /// If no name follows, `<` is reported as text and scanning resumes
    /// right after it.
    fn parse_start_tag(&mut self) -> Option<HtmlToken<'a>> {
        let after_lt = self.pos;
        self.skip_whitespace();
        let name = self.parse_tag_name();
        if name.is_empty() {
            // Rewind so the whitespace skipped above is not lost from the
            // text that follows.
            self.pos = after_lt;
            return Some(HtmlToken::Text(Cow::Borrowed("<")));
        }

        let mut attrs = Vec::new();
        let mut self_closing = false;
        loop {
            self.skip_whitespace();
            let b = match self.bytes.get(self.pos) {
                Some(&b) => b,
                None => break, // input ended inside the tag
            };
            match b {
                b'>' => {
                    self.pos += 1;
                    break;
                }
                b'/' => {
                    self.pos += 1;
                    self.skip_whitespace();
                    if self.bytes.get(self.pos) == Some(&b'>') {
                        self.pos += 1;
                        self_closing = true;
                        break;
                    }
                    // A '/' that does not close the tag is ignored so the
                    // attributes after it still belong to this tag.
                }
                _ => match self.parse_attribute() {
                    Some(attr) => attrs.push(attr),
                    // Not a valid attribute start: skip the byte and retry.
                    None => self.pos += 1,
                },
            }
        }

        if is_void_element(name) {
            self_closing = true;
        }
        Some(HtmlToken::StartTag {
            name: Self::lowercased(name),
            attrs,
            self_closing,
        })
    }

    /// Consumes the longest run of tag-name bytes at `pos` and returns it
    /// (possibly empty), borrowed from the input.
    fn parse_tag_name(&mut self) -> &'a str {
        let input = self.input;
        let start = self.pos;
        self.pos += self.bytes[start..]
            .iter()
            .take_while(|&&b| Self::is_tag_name_byte(b))
            .count();
        &input[start..self.pos]
    }

    /// Parses one attribute at `pos`.
    ///
    /// Returns `None` without consuming anything when `pos` is not on an
    /// attribute-name byte. Otherwise returns the lower-cased name and the
    /// entity-decoded value (`""` when the attribute has no value).
    fn parse_attribute(&mut self) -> Option<(Cow<'a, str>, Cow<'a, str>)> {
        let input = self.input;
        let bytes = self.bytes;

        let name_start = self.pos;
        self.pos += bytes[name_start..]
            .iter()
            .take_while(|&&b| Self::is_attr_name_byte(b))
            .count();
        if self.pos == name_start {
            return None;
        }
        let name = Self::lowercased(&input[name_start..self.pos]);

        self.skip_whitespace();
        if bytes.get(self.pos) != Some(&b'=') {
            return Some((name, Cow::Borrowed("")));
        }
        self.pos += 1; // skip '='
        self.skip_whitespace();

        let value = match bytes.get(self.pos).copied() {
            None => Cow::Borrowed(""),
            Some(quote) if quote == b'"' || quote == b'\'' => {
                self.pos += 1;
                let value_start = self.pos;
                let value_end = bytes[value_start..]
                    .iter()
                    .position(|&b| b == quote)
                    .map_or(bytes.len(), |offset| value_start + offset);
                // Step over the closing quote when there is one.
                self.pos = (value_end + 1).min(bytes.len());
                decode_entities(&input[value_start..value_end])
            }
            Some(_) => {
                let value_start = self.pos;
                while let Some(&b) = bytes.get(self.pos) {
                    // A '/' is part of the value (think `href=/a/b`) unless
                    // it is the `/>` that closes the tag.
                    let closes_tag = b == b'/' && bytes.get(self.pos + 1) == Some(&b'>');
                    if b.is_ascii_whitespace() || b == b'>' || closes_tag {
                        break;
                    }
                    self.pos += 1;
                }
                decode_entities(&input[value_start..self.pos])
            }
        };
        Some((name, value))
    }

    /// Advances `pos` past any ASCII whitespace.
    fn skip_whitespace(&mut self) {
        self.pos += self.bytes[self.pos..]
            .iter()
            .take_while(|b| b.is_ascii_whitespace())
            .count();
    }

    /// Advances `pos` to just after the next occurrence of `byte`, or to the
    /// end of input when it does not occur.
    fn skip_past(&mut self, byte: u8) {
        match self.bytes[self.pos..].iter().position(|&b| b == byte) {
            Some(offset) => self.pos += offset + 1,
            None => self.pos = self.bytes.len(),
        }
    }

    /// Returns the input from `pos` up to the first occurrence of the
    /// (non-empty, ASCII) `terminator` and moves `pos` past the terminator.
    /// Without a terminator, the rest of the input is returned.
    fn take_until(&mut self, terminator: &[u8]) -> &'a str {
        let input = self.input;
        let start = self.pos;
        let found = self.bytes[start..]
            .windows(terminator.len())
            .position(|window| window == terminator);
        match found {
            Some(offset) => {
                let end = start + offset;
                self.pos = end + terminator.len();
                &input[start..end]
            }
            None => {
                self.pos = self.bytes.len();
                &input[start..]
            }
        }
    }

    /// Lower-cases ASCII letters, borrowing when `s` has none to change.
    fn lowercased(s: &'a str) -> Cow<'a, str> {
        if s.bytes().any(|b| b.is_ascii_uppercase()) {
            Cow::Owned(s.to_ascii_lowercase())
        } else {
            Cow::Borrowed(s)
        }
    }

    /// Bytes accepted in a tag name.
    fn is_tag_name_byte(b: u8) -> bool {
        b.is_ascii_alphanumeric() || b == b'-' || b == b'_' || b == b':'
    }

    /// Bytes accepted in an attribute name (tag-name bytes plus `.` and `@`).
    fn is_attr_name_byte(b: u8) -> bool {
        Self::is_tag_name_byte(b) || b == b'.' || b == b'@'
    }
}
/// Reports whether `name` is an HTML void element (one that never has
/// content or an end tag). The comparison ignores ASCII case.
fn is_void_element(name: &str) -> bool {
    const VOID_ELEMENTS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    VOID_ELEMENTS
        .iter()
        .any(|void| void.eq_ignore_ascii_case(name))
}
fn decode_entities(s: &str) -> Cow<'_, str> {
if !s.contains('&') {
return Cow::Borrowed(s);
}
let mut result = String::with_capacity(s.len());
let mut chars = s.chars().peekable();
while let Some(c) = chars.next() {
if c == '&' {
let mut entity = String::new();
let mut found_semi = false;
for ch in chars.by_ref() {
if ch == ';' {
found_semi = true;
break;
}
if entity.len() > 10 || (!ch.is_ascii_alphanumeric() && ch != '#') {
break;
}
entity.push(ch);
}
if found_semi && let Some(decoded) = decode_entity(&entity) {
result.push(decoded);
continue;
}
result.push('&');
result.push_str(&entity);
if found_semi {
result.push(';');
}
} else {
result.push(c);
}
}
Cow::Owned(result)
}
/// Resolves the body of a character reference (the text between `&` and
/// `;`) to a character.
///
/// Bodies starting with `#` are numeric: decimal, or hexadecimal after an
/// `x`/`X`. Anything else is looked up in a fixed table of named references.
/// Returns `None` for unknown names, unparsable numbers, and numbers that
/// are not valid Unicode scalar values.
fn decode_entity(entity: &str) -> Option<char> {
    const NAMED: &[(&str, char)] = &[
        ("amp", '&'),
        ("lt", '<'),
        ("gt", '>'),
        ("quot", '"'),
        ("apos", '\''),
        ("nbsp", '\u{00A0}'),
        ("copy", '\u{00A9}'),
        ("reg", '\u{00AE}'),
        ("trade", '\u{2122}'),
        ("mdash", '\u{2014}'),
        ("ndash", '\u{2013}'),
        ("ldquo", '\u{201C}'),
        ("rdquo", '\u{201D}'),
        ("lsquo", '\u{2018}'),
        ("rsquo", '\u{2019}'),
        ("hellip", '\u{2026}'),
        ("bull", '\u{2022}'),
        ("middot", '\u{00B7}'),
        ("laquo", '\u{00AB}'),
        ("raquo", '\u{00BB}'),
        ("euro", '\u{20AC}'),
        ("pound", '\u{00A3}'),
        ("yen", '\u{00A5}'),
        ("cent", '\u{00A2}'),
        ("deg", '\u{00B0}'),
        ("plusmn", '\u{00B1}'),
        ("times", '\u{00D7}'),
        ("divide", '\u{00F7}'),
        ("frac12", '\u{00BD}'),
        ("frac14", '\u{00BC}'),
        ("frac34", '\u{00BE}'),
    ];

    match entity.strip_prefix('#') {
        Some(numeric) => {
            let parsed = match numeric.strip_prefix(|c: char| c == 'x' || c == 'X') {
                Some(hex) => u32::from_str_radix(hex, 16),
                None => numeric.parse::<u32>(),
            };
            parsed.ok().and_then(char::from_u32)
        }
        None => NAMED
            .iter()
            .find(|&&(name, _)| name == entity)
            .map(|&(_, ch)| ch),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the tokenizer to exhaustion and collects every token it yields.
    fn tokenize(html: &str) -> Vec<HtmlToken<'_>> {
        let mut tokenizer = HtmlTokenizer::new(html);
        let mut tokens = Vec::new();
        while let Some(token) = tokenizer.next_token() {
            tokens.push(token);
        }
        tokens
    }

    #[test]
    fn test_simple_tag() {
        let tokens = tokenize("<p>Hello</p>");
        assert_eq!(tokens.len(), 3);
        assert!(matches!(&tokens[0], HtmlToken::StartTag { name, .. } if name == "p"));
        assert!(matches!(&tokens[1], HtmlToken::Text(t) if t == "Hello"));
        assert!(matches!(&tokens[2], HtmlToken::EndTag { name } if name == "p"));
    }

    #[test]
    fn test_attributes() {
        let tokens = tokenize(r#"<a href="https://example.com" title='Test'>Link</a>"#);
        assert_eq!(tokens.len(), 3);
        if let HtmlToken::StartTag { name, attrs, .. } = &tokens[0] {
            assert_eq!(name, "a");
            assert_eq!(attrs.len(), 2);
            assert_eq!(attrs[0].0, "href");
            assert_eq!(attrs[0].1, "https://example.com");
            assert_eq!(attrs[1].0, "title");
            assert_eq!(attrs[1].1, "Test");
        } else {
            panic!("Expected StartTag");
        }
    }

    #[test]
    fn test_self_closing() {
        let tokens = tokenize("<br /><hr><img src='test.png' />");
        assert_eq!(tokens.len(), 3);
        assert!(
            matches!(&tokens[0], HtmlToken::StartTag { name, self_closing, .. } if name == "br" && *self_closing)
        );
        assert!(
            matches!(&tokens[1], HtmlToken::StartTag { name, self_closing, .. } if name == "hr" && *self_closing)
        );
        assert!(
            matches!(&tokens[2], HtmlToken::StartTag { name, self_closing, .. } if name == "img" && *self_closing)
        );
    }

    #[test]
    fn test_comment() {
        let tokens = tokenize("<!-- This is a comment --><p>Text</p>");
        assert_eq!(tokens.len(), 4);
        assert!(matches!(&tokens[0], HtmlToken::Comment(c) if c == " This is a comment "));
    }

    #[test]
    fn test_entities() {
        // The input must contain the escaped references; the decoded
        // characters are what the text token is expected to carry.
        let tokens = tokenize("<p>&amp; &lt; &gt; &quot;</p>");
        if let HtmlToken::Text(t) = &tokens[1] {
            assert_eq!(t, "& < > \"");
        } else {
            panic!("Expected Text");
        }
    }

    #[test]
    fn test_nested_tags() {
        let tokens = tokenize("<div><p>Hello <strong>world</strong></p></div>");
        assert_eq!(tokens.len(), 8);
    }
}