use super::{
constants,
handle::NodeHandle,
tag::{Attributes, HTMLTag, Node},
};
use crate::InnerNodeHandle;
use crate::asm_core::{self, AsmAttrRecord, AsmNodeRecord};
use crate::inline::hashmap::InlineHashMap;
use crate::{ParseError, bytes::Bytes, inline::vec::InlineVec};
use crate::{ParserOptions, stream::Stream};
use core::mem::MaybeUninit;
/// Growable sequence used for the parser's node, stack and root storage.
///
/// The const parameter `N` is not used by this alias: the storage is always a heap `Vec`.
type StorageVec<T, const N: usize> = std::vec::Vec<T>;
/// Map used for the parser's id/class indexes; `N` is forwarded to [`InlineHashMap`].
type StorageMap<K, V, const N: usize> = InlineHashMap<K, V, N>;
fn new_vec_with_capacity<T, const N: usize>(capacity: usize) -> StorageVec<T, N> {
std::vec::Vec::with_capacity(capacity)
}
fn new_map<K, V, const N: usize>() -> StorageMap<K, V, N>
where
K: core::hash::Hash + Eq,
{
InlineHashMap::new()
}
/// Appends `value` to `vec`.
///
/// The error argument exists so call sites read the same as the fallible container
/// helpers; because the backing storage is a plain `Vec`, it is never returned and the
/// result is always `Ok(())`.
fn push_vec<T, const N: usize>(
    vec: &mut StorageVec<T, N>,
    value: T,
    _err: ParseError,
) -> Result<(), ParseError> {
    std::vec::Vec::push(vec, value);
    Ok(())
}
/// Appends `value` to a class handle list, reporting `err` if `push_handle` fails.
///
/// Whatever error `push_handle` produced is discarded in favour of `err`.
#[inline]
fn push_class_handle<const N: usize>(
    vec: &mut ClassVec<N>,
    value: NodeHandle,
    err: ParseError,
) -> Result<(), ParseError> {
    match vec.push_handle(value) {
        Ok(done) => Ok(done),
        Err(_) => Err(err),
    }
}
/// Inserts `value` under `key`, replacing any existing entry.
///
/// Returns `Ok(Some(previous))` when the key was already present, `Ok(None)` when a new
/// entry was created, and `Err(err)` when the map's `insert` reports a failure (the
/// underlying error is discarded).
fn insert_bytes_map<'a, V, const N: usize>(
    map: &mut StorageMap<Bytes<'a>, V, N>,
    key: Bytes<'a>,
    value: V,
    err: ParseError,
) -> Result<Option<V>, ParseError> {
    // Existing key: swap the value in place and hand the old one back.
    if let Some(slot) = map.get_bytes_mut(&key) {
        return Ok(Some(core::mem::replace(slot, value)));
    }
    // New key: this is the only path that can fail.
    map.insert(key, value).map_err(|_| err)?;
    Ok(None)
}
/// Flat storage for every node of a parsed document; a `NodeHandle` is an index into it.
pub type Tree<'a, const MAX_NODES: usize = 0> = StorageVec<Node<'a>, MAX_NODES>;
/// List of node handles that share one class token.
pub type ClassVec<const MAX_NODES: usize = 0> = InlineVec<NodeHandle, MAX_NODES>;
/// HTML version announced by a document's doctype.
///
/// The parser code in this file only ever assigns [`HTMLVersion::HTML5`]; the HTML 4.01
/// variants are part of the public enum but are not produced here.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so the value can be used as a
/// map key or set member; equality on a fieldless enum is always total.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
#[repr(C)]
pub enum HTMLVersion {
    /// HTML5 (`<!DOCTYPE html>`).
    HTML5,
    /// HTML 4.01 Strict.
    StrictHTML401,
    /// HTML 4.01 Transitional.
    TransitionalHTML401,
    /// HTML 4.01 Frameset.
    FramesetHTML401,
}
/// Parser state for a single HTML input.
///
/// The `MAX_*` const parameters are forwarded to the storage aliases of the matching
/// fields. `MAX_SELECTOR_NODES` is not referenced by any code visible in this file.
#[derive(Debug)]
pub struct Parser<
'a,
const MAX_NODES: usize = 0,
const MAX_STACK: usize = 0,
const MAX_ROOTS: usize = 0,
const MAX_IDS: usize = 0,
const MAX_CLASSES: usize = 0,
const MAX_SELECTOR_NODES: usize = 0,
> {
// Byte cursor over the input text.
pub(crate) stream: Stream<'a, u8>,
// Handles of tags that have been opened but not yet closed (innermost last).
pub(crate) stack: StorageVec<NodeHandle, MAX_STACK>,
// Caller-supplied options; consulted for id/class tracking.
pub(crate) options: ParserOptions,
// Every node of the document; a `NodeHandle` is an index into this vector.
pub(crate) tags: Tree<'a, MAX_NODES>,
// Handles of nodes that have no parent.
pub(crate) ast: StorageVec<NodeHandle, MAX_ROOTS>,
// `id` attribute value -> handle of the tag carrying it (filled when id tracking is on).
pub(crate) ids: StorageMap<Bytes<'a>, NodeHandle, MAX_IDS>,
// Class token -> handles of the tags listing it (filled when class tracking is on).
pub(crate) classes: StorageMap<Bytes<'a>, ClassVec<MAX_NODES>, MAX_CLASSES>,
// Version derived from the doctype, if one was recognised.
pub(crate) version: Option<HTMLVersion>,
}
#[allow(dead_code)]
impl<
'a,
const MAX_NODES: usize,
const MAX_STACK: usize,
const MAX_ROOTS: usize,
const MAX_IDS: usize,
const MAX_CLASSES: usize,
const MAX_SELECTOR_NODES: usize,
> Parser<'a, MAX_NODES, MAX_STACK, MAX_ROOTS, MAX_IDS, MAX_CLASSES, MAX_SELECTOR_NODES>
{
pub(crate) fn new(input: &'a str, options: ParserOptions) -> Self {
let (node_capacity, stack_capacity, root_capacity) = (0, 0, 0);
Parser {
stack: new_vec_with_capacity::<NodeHandle, MAX_STACK>(stack_capacity),
options,
tags: new_vec_with_capacity::<Node<'a>, MAX_NODES>(node_capacity),
stream: Stream::new(input.as_bytes()),
ast: new_vec_with_capacity::<NodeHandle, MAX_ROOTS>(root_capacity),
ids: new_map::<Bytes<'a>, NodeHandle, MAX_IDS>(),
classes: new_map::<Bytes<'a>, ClassVec<MAX_NODES>, MAX_CLASSES>(),
version: None,
}
}
/// Stores `node` in the node table and returns the handle that refers to it.
///
/// The handle is the node's index in `self.tags`.
#[inline(always)]
fn register_tag(&mut self, node: Node<'a>) -> Result<NodeHandle, ParseError> {
    // The slot the node is about to occupy is the current length.
    let slot = self.tags.len();
    push_vec::<Node<'a>, MAX_NODES>(&mut self.tags, node, ParseError::NodeCapacityExceeded)?;
    Ok(NodeHandle::new(slot as u32))
}
/// Moves the cursor past any run of spaces and line feeds at the current position.
///
/// Only `b' '` and `b'\n'` are passed to `count_while2`; tabs and carriage returns are
/// not in that set.
#[inline(always)]
fn skip_whitespaces(&mut self) {
    let remaining = &self.stream.data()[self.stream.idx..];
    let blanks = asm_core::count_while2(remaining, [b' ', b'\n']);
    self.stream.idx += blanks;
}
/// Consumes input up to `needle` and returns the consumed bytes.
///
/// When `asm_core::find` reports no match, everything up to the end of the input is
/// consumed. (Presumably the cursor is left on the needle itself, assuming `find`
/// returns the needle's offset within the slice it was given.)
fn read_to(&mut self, needle: u8) -> &'a [u8] {
    let start = self.stream.idx;
    let remaining = &self.stream.data()[start..];
    let consumed = match asm_core::find(remaining, needle) {
        Some(offset) => offset,
        None => self.stream.len() - start,
    };
    self.stream.idx += consumed;
    self.stream.slice(start, start + consumed)
}
/// Consumes an identifier at the cursor and returns its bytes.
///
/// Returns `None`, leaving the cursor where it was, when zero bytes qualify. If
/// `search_non_ident` finds no terminator, the rest of the input is taken.
fn read_ident(&mut self) -> Option<&'a [u8]> {
    let start = self.stream.idx;
    let remaining = &self.stream.data()[start..];
    let ident_len = match asm_core::search_non_ident(remaining) {
        Some(offset) => offset,
        None => self.stream.len() - start,
    };
    match ident_len {
        0 => None,
        len => {
            self.stream.idx += len;
            Some(self.stream.slice(start, start + len))
        }
    }
}
/// Skips a comment and returns its bytes, beginning at `start`.
///
/// The cursor is advanced by the offset reported by `asm_core::find_comment_end`, and
/// the returned slice runs from `start` to the new cursor position. When no comment end
/// is found the cursor is left untouched and an empty slice is returned.
fn skip_comment_with_start(&mut self, start: usize) -> &'a [u8] {
    let remaining = &self.stream.data()[self.stream.idx..];
    let Some(offset) = asm_core::find_comment_end(remaining) else {
        return &[];
    };
    self.stream.advance_by(offset);
    self.stream.slice(start, self.stream.idx)
}
/// Parses one attribute at the cursor.
///
/// Returns the attribute name and, if one was present, its value; the cursor is moved
/// to the position reported by `asm_core::parse_attr`. Returns `None` (cursor unchanged)
/// when `parse_attr` finds no attribute.
fn parse_attribute(&mut self) -> Option<(&'a [u8], Option<&'a [u8]>)> {
    let parsed = asm_core::parse_attr(self.stream.data(), self.stream.idx)?;
    self.stream.idx = parsed.next_idx;

    let name_end = parsed.name_start + parsed.name_len;
    let name = self.stream.slice(parsed.name_start, name_end);

    let value = if parsed.has_value != 0 {
        let value_end = parsed.value_start + parsed.value_len;
        Some(self.stream.slice(parsed.value_start, value_end))
    } else {
        None
    };
    Some((name, value))
}
/// Parses the attribute list of an opening tag.
///
/// Returns `Ok(Some(attributes))` once a closing byte (per `asm_core::is_closing`) is
/// reached, `Ok(None)` if the input ends first, and
/// `Err(ParseError::AttributeCapacityExceeded)` if the raw attribute map rejects an
/// insert. `id` and `class` are stored in their dedicated fields; everything else goes
/// into `attributes.raw`.
fn parse_attributes(&mut self) -> Result<Option<Attributes<'a>>, ParseError> {
    let mut attributes = Attributes::new();
    loop {
        self.skip_whitespaces();
        let Some(cur) = self.stream.current_cpy() else {
            return Ok(None);
        };
        if asm_core::is_closing(cur) {
            return Ok(Some(attributes));
        }

        // Anything that does not parse as an attribute is skipped one byte at a time.
        let Some((key, value)) = self.parse_attribute() else {
            self.stream.advance();
            continue;
        };

        let has_value = value.is_some();
        let value: Option<Bytes<'a>> = value.map(Into::into);
        match asm_core::attr_key_kind(key) {
            1 => attributes.id = value,
            2 => attributes.class = value,
            _ => {
                attributes
                    .raw
                    .insert(key.into(), value)
                    .map_err(|_| ParseError::AttributeCapacityExceeded)?;
            }
        }

        let Some(after) = self.stream.current_cpy() else {
            return Ok(None);
        };
        // After a value, step over one more byte unless it already closes the tag.
        if has_value && !asm_core::is_closing(after) {
            self.stream.advance();
        }
    }
}
/// Attaches `handle` to the innermost open tag, or to the root list when no tag is open.
///
/// # Panics
///
/// Panics if the handle on top of the stack does not resolve to a tag node.
#[inline]
fn add_to_parent(&mut self, handle: NodeHandle) -> Result<(), ParseError> {
    let Some(parent_handle) = self.stack.last() else {
        // Nothing is open: the node is a root.
        return push_vec::<NodeHandle, MAX_ROOTS>(
            &mut self.ast,
            handle,
            ParseError::RootCapacityExceeded,
        );
    };
    let parent = self
        .tags
        .get_mut(parent_handle.get_inner() as usize)
        .unwrap()
        .as_tag_mut()
        .unwrap();
    parent
        ._children
        .push_handle(handle)
        .map_err(|_| ParseError::ChildCapacityExceeded)?;
    Ok(())
}
fn read_end(&mut self) -> Result<(), ParseError> {
self.stream.advance();
let closing_tag_name = self.read_to(b'>');
if asm_core::byte_at_eq(self.stream.data(), self.stream.idx, b'>') {
self.stream.advance();
}
let closing_tag_matches_parent = self
.stack
.last()
.and_then(|last_handle| last_handle.get(self))
.and_then(|last_item| last_item.as_tag())
.is_some_and(|last_tag| {
asm_core::bytes_eq(last_tag.name().as_bytes(), closing_tag_name)
});
if !closing_tag_matches_parent {
return Ok(());
}
if let Some(handle) = self.stack.pop() {
let tag = self
.tags
.get_mut(handle.get_inner() as usize)
.unwrap()
.as_tag_mut()
.unwrap();
let ptr = self.stream.data().as_ptr() as usize;
let offset = tag._raw.as_ptr() as usize;
let offset = offset - ptr;
tag._raw = self.stream.slice(offset, self.stream.idx).into();
let (track_classes, track_ids) = (
self.options.is_tracking_classes(),
self.options.is_tracking_ids(),
);
if let (true, Some(bytes)) = (track_classes, &tag._attributes.class)
&& let Some(class_bytes) = bytes.as_bytes_borrowed()
{
let mut idx = 0;
while let Some((start, len, next)) = asm_core::next_ascii_token(class_bytes, idx) {
let key = Bytes::from(&class_bytes[start..start + len]);
if let Some(handles) = self.classes.get_bytes_mut(&key) {
push_class_handle::<MAX_NODES>(
handles,
handle,
ParseError::ClassCapacityExceeded,
)?;
} else {
let mut handles = ClassVec::<MAX_NODES>::new();
push_class_handle::<MAX_NODES>(
&mut handles,
handle,
ParseError::ClassCapacityExceeded,
)?;
insert_bytes_map::<ClassVec<MAX_NODES>, MAX_CLASSES>(
&mut self.classes,
key,
handles,
ParseError::ClassCapacityExceeded,
)?;
}
idx = next;
}
}
if let (true, Some(bytes)) = (track_ids, &tag._attributes.id) {
insert_bytes_map::<NodeHandle, MAX_IDS>(
&mut self.ids,
bytes.clone(),
handle,
ParseError::IdCapacityExceeded,
)?;
}
}
Ok(())
}
/// Handles a `<!…` construct, with the cursor on the `!`: a comment or a doctype.
///
/// * Comment: the comment is registered as a node and attached to the current parent.
/// * `doctype` (case-insensitive): the following identifier is read; if it is `html`
///   the version is set to HTML5. One byte after any trailing whitespace is skipped.
///   Returns `Ok(None)` if no identifier follows the keyword.
/// * Anything else: `Err(ParseError::UnsupportedAssemblySyntax)`.
#[cold]
#[inline(never)]
fn read_markdown(&mut self) -> Result<Option<()>, ParseError> {
    // One byte before the `!`, so the recorded comment text includes that byte too.
    let start = self.stream.idx - 1;
    self.stream.advance();

    let is_comment = self
        .stream
        .slice_len(self.stream.idx, 2)
        .eq(constants::COMMENT);
    if is_comment {
        let body = self.skip_comment_with_start(start);
        let handle = self.register_tag(Node::Comment(body.into()))?;
        self.add_to_parent(handle)?;
        return Ok(Some(()));
    }

    let Some(keyword) = self.read_ident() else {
        return Err(ParseError::UnsupportedAssemblySyntax);
    };
    self.skip_whitespaces();
    if !asm_core::matches_case_insensitive_exact(keyword, *b"doctype") {
        return Err(ParseError::UnsupportedAssemblySyntax);
    }

    let Some(doctype) = self.read_ident() else {
        return Ok(None);
    };
    if asm_core::matches_case_insensitive_exact(doctype, *b"html") {
        self.version = Some(HTMLVersion::HTML5);
    }
    self.skip_whitespaces();
    self.stream.advance();
    Ok(Some(()))
}
/// Parses one `<…>` construct starting at the cursor.
///
/// Dispatches on the first non-blank byte after the opening byte: `/` is a closing tag,
/// `!` a comment or doctype, anything else an opening tag. For an opening tag the node
/// is registered, attached to its parent, and — unless it is self-closing or a void
/// tag — pushed onto the open-tag stack.
///
/// Returns `Ok(None)` when the input ends or the tag is malformed (no name, unterminated
/// attribute list, missing `>`); in that case no node is created.
fn parse_tag(&mut self) -> Result<Option<()>, ParseError> {
    let start = self.stream.idx;
    self.stream.advance();
    self.skip_whitespaces();
    let Some(cur) = self.stream.current_cpy() else {
        return Ok(None);
    };

    if cur == b'/' {
        self.read_end()?;
        return Ok(Some(()));
    }
    if cur == b'!' {
        self.read_markdown()?;
        return Ok(Some(()));
    }

    let Some(name) = self.read_ident() else {
        return Ok(None);
    };
    self.skip_whitespaces();
    let Some(attributes) = self.parse_attributes()? else {
        return Ok(None);
    };

    let is_self_closing = asm_core::byte_at_eq(self.stream.data(), self.stream.idx, b'/');
    if is_self_closing {
        self.stream.advance();
    }
    if !asm_core::byte_at_eq(self.stream.data(), self.stream.idx, b'>') {
        return Ok(None);
    }
    self.stream.advance();

    // The raw slice initially covers just the opening tag; `read_end` extends it later.
    let node = Node::Tag(HTMLTag::new(
        name.into(),
        attributes,
        InlineVec::new(),
        self.stream.slice(start, self.stream.idx).into(),
    ));
    let this = self.register_tag(node)?;
    self.add_to_parent(this)?;

    let stays_open = !is_self_closing && !asm_core::is_void_tag(name);
    if stays_open {
        push_vec::<NodeHandle, MAX_STACK>(
            &mut self.stack,
            this,
            ParseError::StackCapacityExceeded,
        )?;
    }
    Ok(Some(()))
}
/// Consumes events from `asm_core::scan_html_event` until it reports event kind `0`.
///
/// Kind `1` is a run of raw text, which is registered as a `Node::Raw` and attached to
/// the current parent; any other non-zero kind is handed to `parse_tag`. Despite the
/// name this loops over the remaining input, and the only successful return value is
/// `Ok(None)`.
pub(crate) fn parse_single(&mut self) -> Result<Option<()>, ParseError> {
    loop {
        let (kind, start, len) = asm_core::scan_html_event(self.stream.data(), self.stream.idx);
        if kind == 0 {
            return Ok(None);
        }
        if kind == 1 {
            let end = start + len;
            self.stream.idx = end;
            let text = Node::Raw(self.stream.slice(start, end).into());
            let handle = self.register_tag(text)?;
            self.add_to_parent(handle)?;
            continue;
        }
        self.parse_tag()?;
    }
}
/// Looks up the node stored at index `id`, or `None` if the index is out of range.
#[inline]
pub fn resolve_node_id(&self, id: InnerNodeHandle) -> Option<&Node<'a>> {
    let index = id as usize;
    self.tags.get(index)
}
/// Mutable counterpart of `resolve_node_id`: the node at index `id`, or `None` if the
/// index is out of range.
#[inline]
pub fn resolve_node_id_mut(&mut self, id: InnerNodeHandle) -> Option<&mut Node<'a>> {
    let index = id as usize;
    self.tags.get_mut(index)
}
/// Parses the whole input.
///
/// # Errors
///
/// Returns `ParseError::InvalidLength` when `asm_core::len_fits_u32` rejects the input
/// length, otherwise whatever `parse_asm_document` reports.
pub(crate) fn parse(&mut self) -> Result<(), ParseError> {
    if asm_core::len_fits_u32(self.stream.len()) {
        self.parse_asm_document()
    } else {
        Err(ParseError::InvalidLength)
    }
}
fn parse_asm_document(&mut self) -> Result<(), ParseError> {
const STACK_NODE_CAP: usize = 192;
const STACK_ATTR_CAP: usize = 128;
const STACK_STACK_CAP: usize = 64;
let mut stack_nodes = [const { MaybeUninit::<AsmNodeRecord>::uninit() }; STACK_NODE_CAP];
let mut stack_attrs = [const { MaybeUninit::<AsmAttrRecord>::uninit() }; STACK_ATTR_CAP];
let mut asm_stack = [const { MaybeUninit::<u32>::uninit() }; STACK_STACK_CAP];
let mut stack_out = asm_core::AsmParseOutput::from_raw_parts(
asm_core::AsmBuffer::new(stack_nodes.as_mut_ptr().cast(), STACK_NODE_CAP),
asm_core::AsmBuffer::new(stack_attrs.as_mut_ptr().cast(), STACK_ATTR_CAP),
asm_core::AsmBuffer::new(core::ptr::null_mut(), 0),
asm_core::AsmBuffer::new(asm_stack.as_mut_ptr().cast(), STACK_STACK_CAP),
);
let stack_status = asm_core::parse_document(self.stream.data(), &mut stack_out);
match stack_status {
0 => {
let node_records = unsafe {
core::slice::from_raw_parts(stack_nodes.as_ptr().cast(), stack_out.nodes_len)
};
let attr_records = unsafe {
core::slice::from_raw_parts(stack_attrs.as_ptr().cast(), stack_out.attrs_len)
};
return self.finish_asm_document(stack_out.version, node_records, attr_records);
}
1..=4 => {}
5 => return Err(ParseError::UnsupportedAssemblySyntax),
_ => return Err(ParseError::UnsupportedAssemblySyntax),
}
let len = self.stream.len();
let max_cap = len + 1;
let mut node_cap = (len / 2).max(STACK_NODE_CAP * 2).min(max_cap);
let mut attr_cap = (len / 3).max(STACK_ATTR_CAP * 2).min(max_cap);
let mut side_cap = (len / 3).max(STACK_STACK_CAP * 2).min(max_cap);
loop {
let mut node_records = Vec::<AsmNodeRecord>::with_capacity(node_cap);
let mut attr_records = Vec::<AsmAttrRecord>::with_capacity(attr_cap);
let mut stack = Vec::<u32>::with_capacity(side_cap);
let mut out = asm_core::AsmParseOutput::from_raw_parts(
asm_core::AsmBuffer::new(node_records.as_mut_ptr(), node_records.capacity()),
asm_core::AsmBuffer::new(attr_records.as_mut_ptr(), attr_records.capacity()),
asm_core::AsmBuffer::new(core::ptr::null_mut(), 0),
asm_core::AsmBuffer::new(stack.as_mut_ptr(), stack.capacity()),
);
let status = asm_core::parse_document(self.stream.data(), &mut out);
match status {
0 => {
unsafe {
node_records.set_len(out.nodes_len);
attr_records.set_len(out.attrs_len);
}
return self.finish_asm_document(out.version, &node_records, &attr_records);
}
1..=4 if node_cap < max_cap || attr_cap < max_cap || side_cap < max_cap => {
node_cap = (node_cap.saturating_mul(2)).min(max_cap);
attr_cap = (attr_cap.saturating_mul(2)).min(max_cap);
side_cap = (side_cap.saturating_mul(2)).min(max_cap);
}
1 => return Err(ParseError::NodeCapacityExceeded),
2 => return Err(ParseError::AttributeCapacityExceeded),
3 => return Err(ParseError::RootCapacityExceeded),
4 => return Err(ParseError::StackCapacityExceeded),
5 => return Err(ParseError::UnsupportedAssemblySyntax),
_ => return Err(ParseError::UnsupportedAssemblySyntax),
}
}
}
fn finish_asm_document(
&mut self,
version: u32,
node_records: &[AsmNodeRecord],
attr_records: &[AsmAttrRecord],
) -> Result<(), ParseError> {
self.tags.clear();
self.ast.clear();
self.stack.clear();
self.version = (version == 1).then_some(HTMLVersion::HTML5);
let mut tags = Vec::<Node<'a>>::with_capacity(node_records.len());
let mut written = 0;
let tags_ptr: *mut Node<'a> = tags.as_mut_ptr();
for (idx, record) in node_records.iter().enumerate() {
let node = match record.kind {
1 => Node::Raw(self.asm_slice_fast(record.start, record.len).into()),
2 => {
let attr = self.materialize_attrs(record, attr_records);
Node::Tag(HTMLTag::new(
self.asm_slice_fast(record.name_start, record.name_len)
.into(),
attr,
InlineVec::with_capacity(record.flags as usize),
self.asm_slice_fast(record.start, record.len).into(),
))
}
3 => Node::Comment(self.asm_slice_fast(record.start, record.len).into()),
_ => {
unsafe {
tags.set_len(written);
}
return Err(ParseError::UnsupportedAssemblySyntax);
}
};
unsafe {
tags_ptr.add(idx).write(node);
}
written = idx + 1;
}
unsafe {
tags.set_len(written);
}
self.tags = tags;
self.ast = Vec::with_capacity(node_records.len().min(16));
for (idx, record) in node_records.iter().enumerate() {
let handle = NodeHandle::new(idx as u32);
if record.parent == u32::MAX {
self.ast.push(handle);
} else {
let parent = self
.tags
.get_mut(record.parent as usize)
.and_then(Node::as_tag_mut)
.ok_or(ParseError::UnsupportedAssemblySyntax)?;
if let Err(handle) = parent._children.push_inline_unchecked(handle) {
parent
._children
.push_handle(handle)
.map_err(|_| ParseError::ChildCapacityExceeded)?;
}
}
}
if self.options.is_tracking() {
self.build_tracking_indexes()?;
}
self.stream.idx = self.stream.len();
Ok(())
}
/// Returns `len` bytes of the input starting at byte offset `start`, without a bounds
/// check in release builds.
///
/// The range is only verified by `debug_assert!`; callers must pass offsets that lie
/// inside the input.
#[inline(always)]
fn asm_slice_fast(&self, start: u32, len: u32) -> &'a [u8] {
    let (offset, count) = (start as usize, len as usize);
    debug_assert!(offset <= self.stream.len());
    debug_assert!(offset + count <= self.stream.len());
    let base = self.stream.data().as_ptr();
    // SAFETY: relies on the caller supplying `offset + count <= input length` (checked
    // only in debug builds), so the range stays inside the input buffer.
    unsafe { core::slice::from_raw_parts(base.add(offset), count) }
}
/// Bounds-checked counterpart of `asm_slice_fast`.
///
/// # Errors
///
/// Returns `ParseError::UnsupportedAssemblySyntax` when `start + len` overflows or
/// reaches past the end of the input.
#[allow(dead_code)]
fn asm_slice(&self, start: usize, len: usize) -> Result<&'a [u8], ParseError> {
    match start.checked_add(len) {
        Some(end) if end <= self.stream.len() => Ok(self.stream.slice(start, end)),
        _ => Err(ParseError::UnsupportedAssemblySyntax),
    }
}
fn materialize_attrs(&self, record: &AsmNodeRecord, attrs: &[AsmAttrRecord]) -> Attributes<'a> {
let start = record.attr_start as usize;
let end = start + record.attr_count as usize;
let mut out = Attributes::new();
debug_assert!(end <= attrs.len());
for attr in &attrs[start..end] {
let key = self.asm_slice_fast(attr.name_start, attr.name_len);
let value = if attr.has_value != 0 {
Some(self.asm_slice_fast(attr.value_start, attr.value_len).into())
} else {
None
};
match attr.key_kind {
1 => out.id = value,
2 => out.class = value,
_ => {
let key = key.into();
if let Err((key, value)) = out.raw.push_inline_unchecked(key, value) {
let _ = out.raw.insert(key, value);
}
}
}
}
out
}
/// Populates the class and id lookup maps from the tags currently in `self.tags`.
///
/// For every tag node, each token of its `class` attribute (as split by
/// `asm_core::next_ascii_token`) gets the tag's handle appended to that token's list,
/// and its `id` attribute maps to the tag's handle (a later tag with the same id
/// replaces an earlier one). Each index is only filled when the corresponding tracking
/// option is enabled.
///
/// # Errors
///
/// `ClassCapacityExceeded` or `IdCapacityExceeded` when the respective container rejects
/// an entry.
fn build_tracking_indexes(&mut self) -> Result<(), ParseError> {
    let track_classes = self.options.is_tracking_classes();
    let track_ids = self.options.is_tracking_ids();

    for (position, node) in self.tags.iter().enumerate() {
        let Some(tag) = node.as_tag() else {
            continue;
        };
        let handle = NodeHandle::new(position as u32);

        if track_classes
            && let Some(bytes) = &tag._attributes.class
            && let Some(class_bytes) = bytes.as_bytes_borrowed()
        {
            let mut offset = 0;
            while let Some((start, len, next)) = asm_core::next_ascii_token(class_bytes, offset) {
                let key = Bytes::from(&class_bytes[start..start + len]);
                match self.classes.get_bytes_mut(&key) {
                    Some(existing) => push_class_handle::<MAX_NODES>(
                        existing,
                        handle,
                        ParseError::ClassCapacityExceeded,
                    )?,
                    None => {
                        let mut fresh = ClassVec::<MAX_NODES>::new();
                        push_class_handle::<MAX_NODES>(
                            &mut fresh,
                            handle,
                            ParseError::ClassCapacityExceeded,
                        )?;
                        insert_bytes_map::<ClassVec<MAX_NODES>, MAX_CLASSES>(
                            &mut self.classes,
                            key,
                            fresh,
                            ParseError::ClassCapacityExceeded,
                        )?;
                    }
                }
                offset = next;
            }
        }

        if track_ids && let Some(id) = &tag._attributes.id {
            insert_bytes_map::<NodeHandle, MAX_IDS>(
                &mut self.ids,
                id.clone(),
                handle,
                ParseError::IdCapacityExceeded,
            )?;
        }
    }
    Ok(())
}
}