use std::collections::HashMap;
#[derive(Debug, Clone, PartialEq)]
pub enum HtmlToken {
StartTag {
name: String,
attrs: Vec<(String, String)>,
self_closing: bool,
},
EndTag {
name: String,
},
Text(String),
Comment(String),
Doctype(String),
RawText {
host_tag: String,
body: String,
},
Eof,
}
pub fn is_void_element(name: &str) -> bool {
matches!(
name,
"area"
| "base"
| "br"
| "col"
| "embed"
| "hr"
| "img"
| "input"
| "link"
| "meta"
| "param"
| "source"
| "track"
| "wbr"
)
}
pub fn tokenize(input: &str) -> Vec<HtmlToken> {
let mut t = Tokenizer::new(input);
let mut out = Vec::new();
loop {
match t.next_token() {
HtmlToken::Eof => {
out.push(HtmlToken::Eof);
return out;
},
tok => out.push(tok),
}
}
}
struct Tokenizer<'a> {
input: &'a str,
pos: usize,
}
impl<'a> Tokenizer<'a> {
fn new(input: &'a str) -> Self {
Self { input, pos: 0 }
}
fn peek(&self) -> Option<char> {
self.input[self.pos..].chars().next()
}
fn bump(&mut self) -> Option<char> {
let c = self.peek()?;
self.pos += c.len_utf8();
Some(c)
}
fn starts_with(&self, s: &str) -> bool {
self.input[self.pos..].starts_with(s)
}
fn starts_with_ignore_case(&self, s: &str) -> bool {
let mut rest = self.input[self.pos..].chars();
for needle_ch in s.chars() {
match rest.next() {
Some(c) if c.eq_ignore_ascii_case(&needle_ch) => {},
_ => return false,
}
}
true
}
fn next_token(&mut self) -> HtmlToken {
let Some(c) = self.peek() else {
return HtmlToken::Eof;
};
if c == '<' {
self.consume_tag_or_text()
} else {
self.consume_text()
}
}
fn consume_tag_or_text(&mut self) -> HtmlToken {
if self.starts_with("<!--") {
self.pos += 4;
return self.consume_comment();
}
if self.starts_with_ignore_case("<!doctype") {
return self.consume_doctype();
}
if self.starts_with("</") {
self.pos += 2;
return self.consume_end_tag();
}
if matches!(
self.input[self.pos + 1..].chars().next(),
Some(c) if c.is_ascii_alphabetic()
) {
self.pos += 1; return self.consume_start_tag();
}
self.bump();
HtmlToken::Text("<".to_string())
}
fn consume_comment(&mut self) -> HtmlToken {
let start = self.pos;
while self.pos < self.input.len() {
if self.starts_with("-->") {
let body = self.input[start..self.pos].to_string();
self.pos += 3;
return HtmlToken::Comment(body);
}
self.bump();
}
HtmlToken::Comment(self.input[start..self.pos].to_string())
}
fn consume_doctype(&mut self) -> HtmlToken {
let start = self.pos;
while let Some(c) = self.peek() {
if c == '>' {
let body = self.input[start..self.pos].to_string();
self.bump();
return HtmlToken::Doctype(body);
}
self.bump();
}
HtmlToken::Doctype(self.input[start..self.pos].to_string())
}
fn consume_end_tag(&mut self) -> HtmlToken {
let name = self.consume_tag_name();
while let Some(c) = self.peek() {
if c == '>' {
self.bump();
break;
}
self.bump();
}
HtmlToken::EndTag {
name: name.to_ascii_lowercase(),
}
}
fn consume_start_tag(&mut self) -> HtmlToken {
let name_lower = self.consume_tag_name().to_ascii_lowercase();
let mut attrs: Vec<(String, String)> = Vec::new();
let mut self_closing = false;
loop {
self.skip_html_ws();
match self.peek() {
None => break,
Some('>') => {
self.bump();
break;
},
Some('/') => {
self.bump();
self_closing = true;
continue;
},
Some(_) => {
if let Some(attr) = self.consume_attribute() {
attrs.push(attr);
} else {
self.bump();
}
},
}
}
if !self_closing && is_void_element(&name_lower) {
self_closing = true;
}
if !self_closing && (name_lower == "style" || name_lower == "script") {
return self.swallow_raw_text(name_lower, attrs);
}
HtmlToken::StartTag {
name: name_lower,
attrs,
self_closing,
}
}
fn swallow_raw_text(&mut self, host_tag: String, attrs: Vec<(String, String)>) -> HtmlToken {
let body_start = self.pos;
let needle = format!("</{host_tag}");
let mut body_end = self.pos;
while self.pos < self.input.len() {
if self.starts_with_ignore_case(&needle) {
break;
}
self.bump();
body_end = self.pos;
}
let body = self.input[body_start..body_end].to_string();
self.pos += needle.len();
while let Some(c) = self.peek() {
if c == '>' {
self.bump();
break;
}
self.bump();
}
let _ = attrs; HtmlToken::RawText { host_tag, body }
}
fn consume_tag_name(&mut self) -> String {
let start = self.pos;
while let Some(c) = self.peek() {
if c.is_ascii_alphanumeric() || c == '-' || c == '_' || c == ':' {
self.bump();
} else {
break;
}
}
self.input[start..self.pos].to_string()
}
fn consume_attribute(&mut self) -> Option<(String, String)> {
let name_start = self.pos;
while let Some(c) = self.peek() {
if c.is_ascii_alphanumeric() || c == '-' || c == '_' || c == ':' {
self.bump();
} else {
break;
}
}
let name = self.input[name_start..self.pos].to_ascii_lowercase();
if name.is_empty() {
return None;
}
self.skip_html_ws();
if self.peek() != Some('=') {
return Some((name, String::new()));
}
self.bump(); self.skip_html_ws();
let value = match self.peek() {
Some('"') => {
self.bump();
self.consume_quoted_attr_value('"')
},
Some('\'') => {
self.bump();
self.consume_quoted_attr_value('\'')
},
_ => self.consume_unquoted_attr_value(),
};
Some((name, value))
}
fn consume_quoted_attr_value(&mut self, quote: char) -> String {
let mut buf = String::new();
while let Some(c) = self.peek() {
if c == quote {
self.bump();
break;
}
if c == '&' {
buf.push_str(&self.consume_entity());
} else {
buf.push(c);
self.bump();
}
}
buf
}
fn consume_unquoted_attr_value(&mut self) -> String {
let mut buf = String::new();
while let Some(c) = self.peek() {
if c.is_ascii_whitespace() || c == '>' || c == '/' {
break;
}
if c == '&' {
buf.push_str(&self.consume_entity());
} else {
buf.push(c);
self.bump();
}
}
buf
}
fn consume_text(&mut self) -> HtmlToken {
let mut buf = String::new();
while let Some(c) = self.peek() {
if c == '<' {
break;
}
if c == '&' {
buf.push_str(&self.consume_entity());
} else {
buf.push(c);
self.bump();
}
}
HtmlToken::Text(buf)
}
fn consume_entity(&mut self) -> String {
let restore = self.pos;
self.bump(); if matches!(self.peek(), Some('#')) {
self.bump();
let hex = matches!(self.peek(), Some('x') | Some('X'));
if hex {
self.bump();
}
let mut digits = String::new();
while let Some(c) = self.peek() {
if (hex && c.is_ascii_hexdigit()) || (!hex && c.is_ascii_digit()) {
digits.push(c);
self.bump();
} else {
break;
}
}
if matches!(self.peek(), Some(';')) {
self.bump();
}
let radix = if hex { 16 } else { 10 };
if let Ok(n) = u32::from_str_radix(&digits, radix) {
if let Some(ch) = char::from_u32(n) {
return ch.to_string();
}
}
return self.input[restore..self.pos].to_string();
}
let name_start = self.pos;
while let Some(c) = self.peek() {
if c.is_ascii_alphanumeric() {
self.bump();
} else {
break;
}
}
let name = &self.input[name_start..self.pos];
let trailing_semi = matches!(self.peek(), Some(';'));
if let Some(decoded) = NAMED_ENTITIES.get(name).copied() {
if trailing_semi {
self.bump();
}
return decoded.to_string();
}
self.pos = restore;
self.bump();
"&".to_string()
}
fn skip_html_ws(&mut self) {
while let Some(c) = self.peek() {
if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\x0c' {
self.bump();
} else {
break;
}
}
}
}
lazy_static::lazy_static! {
static ref NAMED_ENTITIES: HashMap<&'static str, &'static str> = {
let mut m = HashMap::new();
m.insert("amp", "&");
m.insert("lt", "<");
m.insert("gt", ">");
m.insert("quot", "\"");
m.insert("apos", "'");
m.insert("nbsp", "\u{00A0}");
m.insert("ndash", "–");
m.insert("mdash", "—");
m.insert("hellip", "…");
m.insert("lsquo", "‘");
m.insert("rsquo", "’");
m.insert("ldquo", "“");
m.insert("rdquo", "”");
m.insert("laquo", "«");
m.insert("raquo", "»");
m.insert("bull", "•");
m.insert("middot", "·");
m.insert("para", "¶");
m.insert("sect", "§");
m.insert("sup1", "¹");
m.insert("sup2", "²");
m.insert("sup3", "³");
m.insert("frac12", "½");
m.insert("frac14", "¼");
m.insert("frac34", "¾");
m.insert("copy", "©");
m.insert("reg", "®");
m.insert("trade", "™");
m.insert("deg", "°");
m.insert("times", "×");
m.insert("divide", "÷");
m.insert("plusmn", "±");
m.insert("micro", "µ");
m.insert("euro", "€");
m.insert("pound", "£");
m.insert("yen", "¥");
m.insert("cent", "¢");
m.insert("not", "¬");
m.insert("larr", "←");
m.insert("uarr", "↑");
m.insert("rarr", "→");
m.insert("darr", "↓");
m.insert("harr", "↔");
m.insert("crarr", "↵");
m.insert("infin", "∞");
m.insert("le", "≤");
m.insert("ge", "≥");
m.insert("ne", "≠");
m.insert("equiv", "≡");
m
};
}
#[cfg(test)]
mod tests {
use super::*;
fn types(toks: &[HtmlToken]) -> Vec<&'static str> {
toks.iter()
.map(|t| match t {
HtmlToken::StartTag { .. } => "Start",
HtmlToken::EndTag { .. } => "End",
HtmlToken::Text(_) => "Text",
HtmlToken::Comment(_) => "Comment",
HtmlToken::Doctype(_) => "Doctype",
HtmlToken::RawText { .. } => "RawText",
HtmlToken::Eof => "Eof",
})
.collect()
}
#[test]
fn simple_paragraph() {
let t = tokenize("<p>hello</p>");
assert_eq!(types(&t), vec!["Start", "Text", "End", "Eof"]);
}
#[test]
fn attributes_quoted_and_unquoted() {
let t = tokenize(r#"<a href="x" class=foo data-x>"#);
match &t[0] {
HtmlToken::StartTag { name, attrs, .. } => {
assert_eq!(name, "a");
assert_eq!(attrs.len(), 3);
assert_eq!(attrs[0], ("href".into(), "x".into()));
assert_eq!(attrs[1], ("class".into(), "foo".into()));
assert_eq!(attrs[2], ("data-x".into(), String::new()));
},
_ => panic!(),
}
}
#[test]
fn void_element_implicit_self_closing() {
let t = tokenize("<img src=x>");
match &t[0] {
HtmlToken::StartTag {
name, self_closing, ..
} => {
assert_eq!(name, "img");
assert!(self_closing);
},
_ => panic!(),
}
}
#[test]
fn explicit_self_closing() {
let t = tokenize("<br/>");
match &t[0] {
HtmlToken::StartTag { self_closing, .. } => assert!(self_closing),
_ => panic!(),
}
}
#[test]
fn named_entities_decoded() {
let t = tokenize("Bread & butter — ©");
if let HtmlToken::Text(s) = &t[0] {
assert_eq!(s, "Bread & butter — \u{00A0} ©");
} else {
panic!();
}
}
#[test]
fn numeric_entities() {
let t = tokenize("A A");
if let HtmlToken::Text(s) = &t[0] {
assert_eq!(s, "A A");
} else {
panic!();
}
}
#[test]
fn comments() {
let t = tokenize("<!-- hello --> world");
assert_eq!(types(&t), vec!["Comment", "Text", "Eof"]);
if let HtmlToken::Comment(s) = &t[0] {
assert_eq!(s, " hello ");
} else {
panic!();
}
}
#[test]
fn doctype() {
let t = tokenize("<!doctype html><p>x</p>");
assert_eq!(types(&t), vec!["Doctype", "Start", "Text", "End", "Eof"]);
}
#[test]
fn style_block_treated_as_raw_text() {
let t = tokenize("<style>p { color: red < blue; }</style>");
let kinds = types(&t);
assert!(kinds.contains(&"RawText"));
if let Some(HtmlToken::RawText { host_tag, body }) = t
.iter()
.find(|tok| matches!(tok, HtmlToken::RawText { .. }))
{
assert_eq!(host_tag, "style");
assert!(body.contains("p { color: red < blue; }"));
}
}
#[test]
fn nested_tags() {
let t = tokenize("<div><p>a</p><p>b</p></div>");
let kinds = types(&t);
assert_eq!(kinds.iter().filter(|k| **k == "Start").count(), 3);
assert_eq!(kinds.iter().filter(|k| **k == "End").count(), 3);
assert_eq!(kinds.iter().filter(|k| **k == "Text").count(), 2);
}
#[test]
fn entity_in_attribute() {
let t = tokenize(r#"<a title="A & B">x</a>"#);
if let HtmlToken::StartTag { attrs, .. } = &t[0] {
assert_eq!(attrs[0].1, "A & B");
} else {
panic!();
}
}
#[test]
fn case_insensitive_tag_names() {
let t = tokenize("<DIV><P>x</P></DIV>");
match (&t[0], &t[1]) {
(HtmlToken::StartTag { name: a, .. }, HtmlToken::StartTag { name: b, .. }) => {
assert_eq!(a, "div");
assert_eq!(b, "p");
},
_ => panic!(),
}
}
#[test]
fn multibyte_codepoint_does_not_panic_on_ignore_case_lookahead() {
let _ = tokenize("<style>p::before { content: \"» \"; }</style>");
let _ = tokenize("«guillemets at start»");
let _ = tokenize("❤<p>after-heart</p>");
}
#[test]
fn lone_less_than_passes_through() {
let t = tokenize("a < b");
let s: String = t
.iter()
.filter_map(|tok| match tok {
HtmlToken::Text(s) => Some(s.clone()),
_ => None,
})
.collect::<Vec<_>>()
.join("");
assert!(s.contains("<"));
}
}