use crate::attr::{Attribute, Attributes};
use crate::entity;
use crate::node::{is_escapable_raw_text_element, is_raw_text_element};
use crate::span::{Position, Span};
use crate::token::Token;
/// Tokenizes `source` into a flat list of tokens.
///
/// Scanning begins at byte offset 0 with the position `Position::new(1, 1, 0)` and an
/// empty accumulator; the actual work is delegated to `drain_tokens`, which finishes the
/// list with a `Token::Eof`.
#[must_use]
pub fn tokenize(source: &str) -> Vec<Token> {
    let start_position = Position::new(1, 1, 0);
    drain_tokens(source.as_bytes(), source, 0, start_position, Vec::new())
}
/// Scans `source` from byte offset `pos` to the end and returns `acc` followed by every
/// token found, finished with one `Token::Eof` whose span is empty at the final position.
///
/// * `bytes` – the byte view of `source` (`tokenize` passes `source.as_bytes()`).
/// * `pos` / `position` – byte offset and line/column position at which scanning starts.
/// * `acc` – tokens already produced; new tokens are added after them.
///
/// A `<` is handed to `scan_tag_like`. When the resulting token is a raw-text start tag
/// (see `raw_text_target`), the element content is scanned with `scan_raw_text` right after
/// it. Everything else is scanned as text with `scan_text`. Text scanners may return no
/// token (empty content), in which case only the cursor advances.
///
/// This is a loop rather than self-recursion so stack usage does not grow with the number
/// of tokens, and the accumulator is moved along instead of being cloned at every step.
fn drain_tokens(
    bytes: &[u8],
    source: &str,
    pos: usize,
    position: Position,
    acc: Vec<Token>,
) -> Vec<Token> {
    let mut tokens = acc;
    let mut cursor = pos;
    let mut here = position;
    while cursor < bytes.len() {
        if at_byte(bytes, cursor, b'<') {
            let (token, after_tag, after_tag_position) =
                scan_tag_like(bytes, source, cursor, here);
            // Decide about raw text before the token is moved into the list.
            let raw_text = raw_text_target(&token);
            tokens = append(tokens, token);
            cursor = after_tag;
            here = after_tag_position;
            if let Some((name, escapable)) = raw_text {
                let (text, after_text, after_text_position) =
                    scan_raw_text(bytes, source, cursor, here, &name, escapable);
                if let Some(text) = text {
                    tokens = append(tokens, text);
                }
                cursor = after_text;
                here = after_text_position;
            }
        } else {
            let (text, after_text, after_text_position) =
                scan_text(bytes, source, cursor, here);
            if let Some(text) = text {
                tokens = append(tokens, text);
            }
            cursor = after_text;
            here = after_text_position;
        }
    }
    append(
        tokens,
        Token::Eof {
            span: Span::new(here, here),
        },
    )
}
/// Reports whether `token` opens an element whose content must be scanned as raw text.
///
/// Returns `Some((name, escapable))` for a start tag that is not flagged self-closing and
/// whose name satisfies `is_raw_text_element` or `is_escapable_raw_text_element`;
/// `escapable` is the result of the latter check. Any other token yields `None`.
fn raw_text_target(token: &Token) -> Option<(String, bool)> {
    let Token::StartTag {
        name, self_closing, ..
    } = token
    else {
        return None;
    };
    if *self_closing {
        return None;
    }
    let escapable = is_escapable_raw_text_element(name);
    (escapable || is_raw_text_element(name)).then(|| (name.clone(), escapable))
}
/// Returns `acc` with `token` added at the end.
///
/// The vector is taken by value and extended in place, so an append costs amortized
/// constant time instead of rebuilding the whole list through an iterator chain.
fn append(mut acc: Vec<Token>, token: Token) -> Vec<Token> {
    acc.push(token);
    acc
}
/// Scans character data starting at byte offset `start` up to the next `<` (or the end of
/// input).
///
/// Returns the text token (with entities decoded), the byte offset where scanning stopped,
/// and the position after the consumed text. The token is `None` when the decoded content
/// is empty. The span and the returned position are computed from the raw, undecoded
/// slice.
fn scan_text(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Option<Token>, usize, Position) {
    let stop = scan_until(bytes, start, |byte| byte == b'<');
    let slice = source.get(start..stop).unwrap_or("");
    let after = advance_position(position, slice);
    let content = decode_entities(slice);
    let token = (!content.is_empty()).then(|| Token::Text {
        content,
        span: Span::new(position, after),
    });
    (token, stop, after)
}
/// Scans the content of a raw-text element named `name`, starting at byte offset `start`.
///
/// The content runs up to the offset reported by `find_raw_text_end`. When `escapable` is
/// true the content has its entities decoded; otherwise it is copied verbatim. Returns the
/// text token (`None` if the content is empty), the offset where the content ends, and the
/// position after it, computed from the raw slice.
fn scan_raw_text(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
    name: &str,
    escapable: bool,
) -> (Option<Token>, usize, Position) {
    let stop = find_raw_text_end(bytes, source, start, name);
    let slice = source.get(start..stop).unwrap_or("");
    let after = advance_position(position, slice);
    let content = if escapable {
        decode_entities(slice)
    } else {
        slice.to_owned()
    };
    let token = (!content.is_empty()).then(|| Token::Text {
        content,
        span: Span::new(position, after),
    });
    (token, stop, after)
}
/// Returns the byte offset at which the raw-text content beginning at `start` ends.
///
/// `name` is lowercased (ASCII) and the search for the matching close tag is delegated to
/// `step_to_close_tag`.
fn find_raw_text_end(bytes: &[u8], source: &str, start: usize, name: &str) -> usize {
    let lowered = name.to_ascii_lowercase();
    step_to_close_tag(bytes, source, start, lowered.as_str())
}
/// Finds the first offset at or after `start` where a close tag for `lower_name` begins.
///
/// A match is `</` followed by the tag name (compared ASCII case-insensitively) and then a
/// tag-name terminator as defined by `is_tag_name_terminator`. Returns `bytes.len()` when
/// no such close tag exists or when `start` is already at or past the end.
///
/// Written as an iterator search rather than byte-by-byte recursion, so long raw-text
/// bodies cannot exhaust the stack.
fn step_to_close_tag(bytes: &[u8], source: &str, start: usize, lower_name: &str) -> usize {
    let name_len = lower_name.len();
    (start..bytes.len())
        .find(|&pos| {
            at_byte(bytes, pos, b'<')
                && at_byte(bytes, pos + 1, b'/')
                && matches_tag_name_ci(source, pos + 2, lower_name)
                && is_tag_name_terminator(bytes, pos + 2 + name_len)
        })
        .unwrap_or(bytes.len())
}
/// Checks whether `source` contains `lower_name` at byte offset `start`, ignoring ASCII
/// case.
///
/// Returns `false` when the range is out of bounds or does not fall on character
/// boundaries.
fn matches_tag_name_ci(source: &str, start: usize, lower_name: &str) -> bool {
    let end = start + lower_name.len();
    match source.get(start..end) {
        Some(candidate) => candidate.eq_ignore_ascii_case(lower_name),
        None => false,
    }
}
/// Tells whether the byte at `pos` ends a tag name: space, tab, line feed, carriage return,
/// `/` or `>`. Returns `false` when `pos` is out of bounds.
fn is_tag_name_terminator(bytes: &[u8], pos: usize) -> bool {
    matches!(
        bytes.get(pos).copied(),
        Some(b' ' | b'\t' | b'\n' | b'\r' | b'/' | b'>')
    )
}
/// Dispatches on the byte following the `<` at `start`.
///
/// * `!` – markup declaration (`scan_markup_declaration`)
/// * `/` – end tag (`scan_end_tag`)
/// * ASCII letter – start tag (`scan_start_tag`)
/// * anything else, including end of input – the `<` is emitted as a one-character text
///   token
///
/// Returns the token, the byte offset just past it, and the matching position.
fn scan_tag_like(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    let lookahead = bytes.get(start + 1).copied();
    match lookahead {
        Some(b'!') => scan_markup_declaration(bytes, source, start, position),
        Some(b'/') => scan_end_tag(bytes, source, start, position),
        other if is_ascii_letter(other) => scan_start_tag(bytes, source, start, position),
        _ => {
            // A lone `<` that does not start any markup is ordinary text.
            let after = advance_position(position, "<");
            let literal = Token::Text {
                content: "<".to_owned(),
                span: Span::new(position, after),
            };
            (literal, start + 1, after)
        }
    }
}
/// Scans a construct that begins with `<!` at `start`.
///
/// `<!--` is scanned as a comment, `<!doctype` (ASCII case-insensitive) as a doctype, and
/// anything else as a bogus declaration.
fn scan_markup_declaration(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    let opens_comment = source.get(start..start + 4) == Some("<!--");
    let opens_doctype = matches!(
        source.get(start..start + 9),
        Some(head) if head.eq_ignore_ascii_case("<!doctype")
    );
    if opens_comment {
        scan_comment(bytes, source, start, position)
    } else if opens_doctype {
        scan_doctype(bytes, source, start, position)
    } else {
        scan_bogus_declaration(bytes, source, start, position)
    }
}
/// Scans a comment whose `<!--` opener sits at `start`.
///
/// The comment text is everything between the opener and the offset returned by
/// `find_comment_end`. The three bytes of the `-->` closer are consumed as well, clamped to
/// the end of input for an unterminated comment. The span covers the whole construct.
fn scan_comment(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    // Skip the four bytes of `<!--`.
    let body_start = start + 4;
    let body_end = find_comment_end(bytes, body_start);
    let after_close = bytes.len().min(body_end + 3);
    let consumed = source.get(start..after_close).unwrap_or("");
    let end_position = advance_position(position, consumed);
    let text = source
        .get(body_start..body_end)
        .map_or_else(String::new, str::to_owned);
    let token = Token::Comment {
        text,
        span: Span::new(position, end_position),
    };
    (token, after_close, end_position)
}
/// Returns the offset of the first `-->` at or after `start`, or `bytes.len()` when the
/// comment is never closed (or `start` is out of range).
///
/// Implemented with a sliding three-byte window instead of per-byte recursion, so the
/// length of a comment has no effect on stack depth.
fn find_comment_end(bytes: &[u8], start: usize) -> usize {
    bytes
        .get(start..)
        .and_then(|tail| tail.windows(3).position(|window| window == b"-->"))
        .map_or(bytes.len(), |rel| start + rel)
}
/// Scans a doctype declaration whose `<!doctype` opener sits at `start`.
///
/// The body is the trimmed text between the nine-byte opener and the next `>` (or the end
/// of input); it is split into name, public id and system id by `parse_doctype_body`. The
/// closing `>` is consumed when present. The span covers the whole declaration.
fn scan_doctype(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    // Skip the nine bytes of `<!doctype`.
    let body_start = start + 9;
    let close = scan_until(bytes, body_start, |byte| byte == b'>');
    let after_close = bytes.len().min(close + 1);
    let consumed = source.get(start..after_close).unwrap_or("");
    let end_position = advance_position(position, consumed);
    let body = source.get(body_start..close).map_or("", str::trim);
    let (name, public_id, system_id) = parse_doctype_body(body);
    let token = Token::Doctype {
        name,
        public_id,
        system_id,
        span: Span::new(position, end_position),
    };
    (token, after_close, end_position)
}
/// Splits the body of a doctype declaration into `(name, public_id, system_id)`.
///
/// * `name` – the first whitespace-delimited token, ASCII-lowercased (empty if absent).
/// * After the keyword `PUBLIC` (ASCII case-insensitive) a public identifier is read,
///   optionally followed by a system identifier.
/// * After the keyword `SYSTEM` only a system identifier is read.
/// * Any other second token yields no identifiers.
///
/// Identifiers are read as quoted strings, so whitespace inside the quotes is preserved
/// (e.g. `"-//W3C//DTD XHTML 1.0 Strict//EN"`). Splitting the whole body on whitespace
/// would cut such identifiers apart.
fn parse_doctype_body(body: &str) -> (String, Option<String>, Option<String>) {
    let (raw_name, rest) = doctype_token(body);
    let (keyword, rest) = doctype_token(rest);
    let name = raw_name.to_ascii_lowercase();
    if keyword.eq_ignore_ascii_case("PUBLIC") {
        let (public_id, rest) = doctype_identifier(rest);
        // A system identifier can only follow a public identifier that was present.
        let system_id = if public_id.is_some() {
            doctype_identifier(rest).0
        } else {
            None
        };
        (name, public_id, system_id)
    } else if keyword.eq_ignore_ascii_case("SYSTEM") {
        (name, None, doctype_identifier(rest).0)
    } else {
        (name, None, None)
    }
}

/// Splits off the first ASCII-whitespace-delimited token of `text`, returning the token
/// (possibly empty) and the remainder after it.
fn doctype_token(text: &str) -> (&str, &str) {
    let trimmed = text.trim_start_matches(|c: char| c.is_ascii_whitespace());
    let end = trimmed
        .find(|c: char| c.is_ascii_whitespace())
        .unwrap_or(trimmed.len());
    trimmed.split_at(end)
}

/// Reads one doctype identifier from the front of `text`, returning it with the remainder.
///
/// A leading `"` or `'` starts a quoted identifier that runs to the matching quote, or to
/// the end of `text` when unterminated. Otherwise the next whitespace-delimited token is
/// used, with surrounding quote characters stripped. `None` means nothing was left.
fn doctype_identifier(text: &str) -> (Option<String>, &str) {
    let trimmed = text.trim_start_matches(|c: char| c.is_ascii_whitespace());
    let mut chars = trimmed.chars();
    match chars.next() {
        None => (None, trimmed),
        Some(quote @ ('"' | '\'')) => {
            let inner = chars.as_str();
            let (id, rest) = inner.split_once(quote).unwrap_or((inner, ""));
            (Some(id.to_owned()), rest)
        }
        Some(_) => {
            let (token, rest) = doctype_token(trimmed);
            let id = token.trim_matches('"').trim_matches('\'');
            (Some(id.to_owned()), rest)
        }
    }
}
/// Scans a `<!…>` construct that is neither a comment nor a doctype and reports it as a
/// comment token.
///
/// The comment text is everything after the two-byte `<!` up to the next `>` (or the end
/// of input). The closing `>` is consumed when present. The span covers the whole
/// construct.
fn scan_bogus_declaration(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    let close = scan_until(bytes, start, |byte| byte == b'>');
    let after_close = bytes.len().min(close + 1);
    let consumed = source.get(start..after_close).unwrap_or("");
    let end_position = advance_position(position, consumed);
    let text = source
        .get(start + 2..close)
        .map_or_else(String::new, str::to_owned);
    let token = Token::Comment {
        text,
        span: Span::new(position, end_position),
    };
    (token, after_close, end_position)
}
/// Scans an end tag whose `</` sits at `start`.
///
/// The name is the run of bytes accepted by `is_ascii_alphanumeric_byte` after `</`,
/// ASCII-lowercased. Everything between the name and the next `>` is skipped, and the `>`
/// is consumed when present. The span covers the whole tag.
fn scan_end_tag(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    // Skip the two bytes of `</`.
    let name_start = start + 2;
    let name_end = scan_while(bytes, name_start, is_ascii_alphanumeric_byte);
    let close = scan_until(bytes, name_end, |byte| byte == b'>');
    let after_close = bytes.len().min(close + 1);
    let consumed = source.get(start..after_close).unwrap_or("");
    let end_position = advance_position(position, consumed);
    let name = source
        .get(name_start..name_end)
        .map_or_else(String::new, str::to_ascii_lowercase);
    let token = Token::EndTag {
        name,
        span: Span::new(position, end_position),
    };
    (token, after_close, end_position)
}
/// Scans a start tag whose `<` sits at `start`.
///
/// The name is the run of bytes accepted by `is_ascii_alphanumeric_byte` after `<`,
/// ASCII-lowercased; attributes are collected by `scan_attributes`. The resulting token's
/// `self_closing` flag is set either by an explicit `/>` or when
/// `crate::node::is_void_element` accepts the name. The span covers the whole tag.
fn scan_start_tag(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    let name_start = start + 1;
    let name_end = scan_while(bytes, name_start, is_ascii_alphanumeric_byte);
    let name = source
        .get(name_start..name_end)
        .map_or_else(String::new, str::to_ascii_lowercase);
    let (attributes, close, explicit_self_close) =
        scan_attributes(bytes, source, name_end, Attributes::new());
    // `close` points at the final `>` (or the end of input); step past it.
    let after_close = bytes.len().min(close + 1);
    let consumed = source.get(start..after_close).unwrap_or("");
    let end_position = advance_position(position, consumed);
    let self_closing = explicit_self_close || crate::node::is_void_element(&name);
    let token = Token::StartTag {
        name,
        attributes,
        self_closing,
        span: Span::new(position, end_position),
    };
    (token, after_close, end_position)
}
/// Collects the attributes of a start tag, starting at byte offset `start` (just after the
/// tag name), and adds them to `acc`.
///
/// Returns `(attributes, close, self_closing)` where `close` is the offset of the tag's
/// final `>` (or `bytes.len()` if input ends first) and `self_closing` is true only when
/// the tag ends in `/>`.
///
/// Bytes that cannot start an attribute name are skipped one at a time by
/// `scan_one_attribute`. This is a loop rather than self-recursion so a tag with very many
/// attributes or stray bytes cannot exhaust the stack.
fn scan_attributes(
    bytes: &[u8],
    source: &str,
    start: usize,
    acc: Attributes,
) -> (Attributes, usize, bool) {
    let mut attributes = acc;
    let mut cursor = start;
    loop {
        let skipped = skip_whitespace(bytes, cursor);
        if at_byte(bytes, skipped, b'>') {
            return (attributes, skipped, false);
        }
        if at_byte(bytes, skipped, b'/') && at_byte(bytes, skipped + 1, b'>') {
            // Report the offset of the `>` so the caller can step past it uniformly.
            return (attributes, skipped + 1, true);
        }
        if skipped >= bytes.len() {
            return (attributes, skipped, false);
        }
        let (attr, next_pos) = scan_one_attribute(bytes, source, skipped);
        if let Some(found) = attr {
            attributes = attributes.with(found);
        }
        cursor = next_pos;
    }
}
/// Scans a single attribute starting at byte offset `start`.
///
/// Returns the attribute (if any) and the offset at which scanning should continue:
///
/// * no name bytes at `start` – `(None, start + 1)`, skipping the offending byte;
/// * a name followed (after optional whitespace) by `=` – the name with the value read by
///   `scan_attribute_value`, continuing after the value;
/// * a bare name – the name with an empty value, continuing right after the name.
///
/// Names are ASCII-lowercased.
fn scan_one_attribute(bytes: &[u8], source: &str, start: usize) -> (Option<Attribute>, usize) {
    let name_end = scan_while(bytes, start, is_attribute_name_byte);
    if name_end == start {
        return (None, start + 1);
    }
    let name = source
        .get(start..name_end)
        .map_or_else(String::new, str::to_ascii_lowercase);
    let equals_at = skip_whitespace(bytes, name_end);
    if !at_byte(bytes, equals_at, b'=') {
        return (Some(Attribute::new(name, "")), name_end);
    }
    let value_start = skip_whitespace(bytes, equals_at + 1);
    let (value, after_value) = scan_attribute_value(bytes, source, value_start);
    (Some(Attribute::new(name, value)), after_value)
}
/// Scans an attribute value starting at byte offset `start` and returns it, entity-decoded,
/// together with the offset just past the value.
///
/// A value opening with `"` or `'` runs to the matching quote (or the end of input); the
/// closing quote is consumed. Any other value is the run of bytes accepted by
/// `is_unquoted_value_byte`.
fn scan_attribute_value(bytes: &[u8], source: &str, start: usize) -> (String, usize) {
    match bytes.get(start).copied() {
        Some(quote @ (b'"' | b'\'')) => {
            let open = start + 1;
            let close = scan_until(bytes, open, |byte| byte == quote);
            let raw = source.get(open..close).unwrap_or("");
            (decode_entities(raw), bytes.len().min(close + 1))
        }
        _ => {
            let end = scan_while(bytes, start, is_unquoted_value_byte);
            let raw = source.get(start..end).unwrap_or("");
            (decode_entities(raw), end)
        }
    }
}
/// Returns `raw` with its character references decoded.
///
/// Thin entry point for `decode_entities_recursive`, starting at offset 0 with an empty
/// output buffer.
fn decode_entities(raw: &str) -> String {
    let buffer = String::new();
    decode_entities_recursive(raw, 0, buffer)
}
#[allow(clippy::needless_pass_by_value)]
fn decode_entities_recursive(raw: &str, start: usize, acc: String) -> String {
let amp_rel = raw.get(start..).and_then(|tail| tail.find('&'));
if let Some(rel) = amp_rel {
let amp_pos = start + rel;
let prefix = raw.get(start..amp_pos).unwrap_or("");
let body_end = raw
.get(amp_pos + 1..)
.and_then(|tail| tail.find(|c: char| c == ';' || c.is_whitespace() || c == '<'))
.map_or(raw.len(), |p| amp_pos + 1 + p);
let semi = raw.get(body_end..body_end + 1) == Some(";");
if semi {
let body = raw.get(amp_pos + 1..body_end).unwrap_or("");
let next_start = body_end + 1;
let decoded = entity::decode(body).unwrap_or_else(|| format!("&{body};"));
decode_entities_recursive(raw, next_start, format!("{acc}{prefix}{decoded}"))
} else {
let next_start = amp_pos + 1;
decode_entities_recursive(raw, next_start, format!("{acc}{prefix}&"))
}
} else {
let tail = raw.get(start..).unwrap_or("");
format!("{acc}{tail}")
}
}
/// Tells whether the byte at index `pos` equals `byte`; out-of-range indices yield `false`.
fn at_byte(bytes: &[u8], pos: usize, byte: u8) -> bool {
    bytes.get(pos).copied() == Some(byte)
}
/// Tells whether `byte` is present and is an ASCII letter (`a`–`z` or `A`–`Z`).
fn is_ascii_letter(byte: Option<u8>) -> bool {
    matches!(byte, Some(b'a'..=b'z' | b'A'..=b'Z'))
}
/// Tells whether `byte` may appear in a tag name: an ASCII letter, an ASCII digit, or `-`.
fn is_ascii_alphanumeric_byte(byte: u8) -> bool {
    matches!(byte, b'-' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z')
}
/// Tells whether `byte` may appear in an attribute name.
///
/// Every byte is accepted except space, tab, line feed, carriage return, `/`, `>`, `=`,
/// `"`, `'` and `<`.
fn is_attribute_name_byte(byte: u8) -> bool {
    const EXCLUDED: &[u8] = b" \t\n\r/>=\"'<";
    !EXCLUDED.contains(&byte)
}
/// Tells whether `byte` may appear in an unquoted attribute value.
///
/// Every byte is accepted except space, tab, line feed, carriage return, `>`, `<`, `"`,
/// `'`, `=` and the backtick.
fn is_unquoted_value_byte(byte: u8) -> bool {
    const EXCLUDED: &[u8] = b" \t\n\r><\"'=`";
    !EXCLUDED.contains(&byte)
}
/// Returns the index of the first byte at or after `start` for which `stop` returns true,
/// or `bytes.len()` when there is none (including when `start` is out of range).
fn scan_until(bytes: &[u8], start: usize, stop: impl Fn(u8) -> bool) -> usize {
    let len = bytes.len();
    let Some(tail) = bytes.get(start..) else {
        return len;
    };
    tail.iter()
        .position(|&byte| stop(byte))
        .map_or(len, |rel| start + rel)
}
/// Returns the index of the first byte at or after `start` for which `take` returns false,
/// or `bytes.len()` when every remaining byte is accepted (including when `start` is out
/// of range).
fn scan_while(bytes: &[u8], start: usize, take: impl Fn(u8) -> bool) -> usize {
    let mut cursor = start;
    while let Some(&byte) = bytes.get(cursor) {
        if !take(byte) {
            return cursor;
        }
        cursor += 1;
    }
    bytes.len()
}
/// Returns the index of the first byte at or after `start` that is not a space, tab, line
/// feed or carriage return (or `bytes.len()` if only such bytes remain).
fn skip_whitespace(bytes: &[u8], start: usize) -> usize {
    const WHITESPACE: [u8; 4] = [b' ', b'\t', b'\n', b'\r'];
    scan_while(bytes, start, |byte| WHITESPACE.contains(&byte))
}
/// Returns the position reached after moving from `start` across every character of
/// `text`.
///
/// A `'\n'` moves to column 1 of the next line; any other character advances the column by
/// one. The offset grows by the character's UTF-8 length (see `char_len`) in both cases.
fn advance_position(start: Position, text: &str) -> Position {
    let mut current = start;
    for ch in text.chars() {
        let offset = current.offset() + char_len(ch);
        current = match ch {
            '\n' => Position::new(current.line() + 1, 1, offset),
            _ => Position::new(current.line(), current.column() + 1, offset),
        };
    }
    current
}
/// Returns the UTF-8 encoded length of `c` in bytes as a `u32`, falling back to 1 if the
/// length does not fit.
fn char_len(c: char) -> u32 {
    match u32::try_from(c.len_utf8()) {
        Ok(len) => len,
        Err(_) => 1,
    }
}