use std::collections::HashMap;
/// A node in a parsed HTML tree.
#[derive(Debug, Clone)]
pub(crate) enum HtmlNode {
    /// A run of character data.
    Text(String),
    /// An element together with its attributes and child nodes.
    Element {
        /// Tag name of the element.
        tag: String,
        /// Attribute name/value pairs.
        attrs: HashMap<String, String>,
        /// Child nodes (text and elements), in the order they were added.
        children: Vec<HtmlNode>,
    },
}

impl HtmlNode {
    /// Returns the tag name of an element, or `None` for a text node.
    pub(crate) fn tag_name(&self) -> Option<&str> {
        match self {
            HtmlNode::Element { tag, .. } => Some(tag),
            HtmlNode::Text(_) => None,
        }
    }

    /// Returns the contents of a text node, or `None` for an element.
    pub(crate) fn as_text(&self) -> Option<&str> {
        match self {
            HtmlNode::Text(s) => Some(s),
            HtmlNode::Element { .. } => None,
        }
    }

    /// Looks up an attribute value by name, ignoring ASCII case.
    ///
    /// An exact-match key is preferred; only when there is none does the
    /// lookup fall back to a case-insensitive scan. Returns `None` for text
    /// nodes and for elements that do not carry the attribute.
    pub(crate) fn attr(&self, name: &str) -> Option<String> {
        match self {
            HtmlNode::Element { attrs, .. } => attrs
                .get(name)
                .or_else(|| {
                    attrs
                        .iter()
                        .find(|(key, _)| key.eq_ignore_ascii_case(name))
                        .map(|(_, value)| value)
                })
                .cloned(),
            HtmlNode::Text(_) => None,
        }
    }

    /// Iterates over the direct children that are elements, skipping text
    /// nodes. Yields nothing for a text node.
    // `dead_code` is allowed so the helper can stay available even while no
    // caller uses it.
    #[allow(dead_code)]
    pub(crate) fn child_elements(&self) -> impl Iterator<Item = &HtmlNode> {
        self.children()
            .filter(|child| matches!(child, HtmlNode::Element { .. }))
    }

    /// Iterates over all direct children (text and elements) in order.
    /// Yields nothing for a text node.
    pub(crate) fn children(&self) -> impl Iterator<Item = &HtmlNode> {
        let nodes: &[HtmlNode] = match self {
            HtmlNode::Element { children, .. } => children.as_slice(),
            HtmlNode::Text(_) => &[],
        };
        nodes.iter()
    }

    /// Concatenates the text of this node and all of its descendants, in
    /// order, without inserting any separator.
    pub(crate) fn text_content(&self) -> String {
        let mut out = String::new();
        self.append_text(&mut out);
        out
    }

    /// Appends this subtree's text to `out`; a single shared buffer avoids
    /// building an intermediate string for every level of the tree.
    fn append_text(&self, out: &mut String) {
        match self {
            HtmlNode::Text(s) => out.push_str(s),
            HtmlNode::Element { children, .. } => {
                for child in children {
                    child.append_text(out);
                }
            }
        }
    }
}
/// Parses `html` and returns the resulting tree.
///
/// The returned node is always a synthetic element tagged `root` whose
/// children are the top-level nodes found in the input.
pub(crate) fn parse_html(html: &str) -> HtmlNode {
    HtmlParser::new(html).parse_root()
}
/// An element that has been opened but not yet closed: its tag name, its
/// attributes, and the children collected so far.
type OpenElement = (String, HashMap<String, String>, Vec<HtmlNode>);

/// Lenient, non-validating HTML parser over a buffer of characters.
struct HtmlParser {
    /// The input, stored as `char`s so it can be indexed by position.
    input: Vec<char>,
    /// Index of the next unread character; never exceeds `input.len()`.
    pos: usize,
}

impl HtmlParser {
    /// Creates a parser positioned at the start of `html`.
    fn new(html: &str) -> Self {
        HtmlParser {
            input: html.chars().collect(),
            pos: 0,
        }
    }

    /// Returns the character at the cursor, or `None` at end of input.
    fn current(&self) -> Option<char> {
        self.input.get(self.pos).copied()
    }

    /// Returns the character `offset` positions past the cursor, if any.
    fn peek(&self, offset: usize) -> Option<char> {
        self.pos
            .checked_add(offset)
            .and_then(|idx| self.input.get(idx))
            .copied()
    }

    /// Moves the cursor one character forward, stopping at end of input so
    /// that `pos` can always be used as a slice bound.
    fn advance(&mut self) {
        if self.pos < self.input.len() {
            self.pos += 1;
        }
    }

    /// Skips over any run of whitespace at the cursor.
    fn skip_whitespace(&mut self) {
        while matches!(self.current(), Some(c) if c.is_whitespace()) {
            self.advance();
        }
    }

    /// Consumes characters while `keep` holds and returns them.
    fn read_while(&mut self, keep: impl Fn(char) -> bool) -> String {
        let start = self.pos;
        while matches!(self.current(), Some(c) if keep(c)) {
            self.advance();
        }
        self.input[start..self.pos].iter().collect()
    }

    /// Consumes and returns everything up to (not including) `terminator`,
    /// or up to end of input when the terminator never appears.
    fn read_until(&mut self, terminator: char) -> String {
        self.read_while(|c| c != terminator)
    }

    /// Consumes characters up to and including the next `terminator`.
    fn skip_past(&mut self, terminator: char) {
        while matches!(self.current(), Some(c) if c != terminator) {
            self.advance();
        }
        self.advance();
    }

    /// Reads a tag name (ASCII letters, digits and `-`), lower-cased.
    fn read_tag_name(&mut self) -> String {
        self.read_while(|c| c.is_ascii_alphanumeric() || c == '-')
            .to_ascii_lowercase()
    }

    /// Reads an attribute name, ASCII-lower-cased.
    ///
    /// The name runs until whitespace, `=`, `>`, `/`, `<` or a quote, so
    /// names such as `data_id`, `xlink:href` or `@click` are read whole.
    fn read_attribute_name(&mut self) -> String {
        self.read_while(|c| {
            !(c.is_whitespace() || matches!(c, '=' | '>' | '/' | '<' | '"' | '\''))
        })
        .to_ascii_lowercase()
    }

    /// Reads the optional `= value` part of an attribute.
    ///
    /// Returns an empty string when no `=` follows the name. Quoted values
    /// run to the matching quote; unquoted values run to whitespace or `>`.
    /// Character references in the value are decoded.
    fn read_attribute_value(&mut self) -> String {
        self.skip_whitespace();
        if self.current() != Some('=') {
            return String::new();
        }
        self.advance();
        self.skip_whitespace();
        let raw = match self.current() {
            Some(quote) if quote == '"' || quote == '\'' => {
                self.advance();
                let value = self.read_until(quote);
                // Step over the closing quote (no-op at end of input).
                self.advance();
                value
            }
            _ => self.read_while(|c| !c.is_whitespace() && c != '>'),
        };
        Self::decode_html_entities(&raw)
    }

    /// Reads attributes until the end of the tag.
    ///
    /// Stops with the cursor on `>`, on the `/` of a trailing `/>`, or at end
    /// of input. A later duplicate of an attribute overwrites the earlier one.
    fn read_attributes(&mut self) -> HashMap<String, String> {
        let mut attrs = HashMap::new();
        loop {
            self.skip_whitespace();
            match self.current() {
                None | Some('>') => break,
                Some('/') => {
                    if self.peek(1) == Some('>') {
                        break;
                    }
                    // A lone `/` inside the tag carries no meaning; skip it.
                    self.advance();
                    continue;
                }
                Some(_) => {}
            }
            let name = self.read_attribute_name();
            if name.is_empty() {
                // Stray `=`, quote or `<`: drop it instead of abandoning the
                // tag and leaking the remainder into a text node.
                self.advance();
                continue;
            }
            let value = self.read_attribute_value();
            attrs.insert(name, value);
        }
        attrs
    }

    /// Skips the remainder of a comment, up to and including `-->`.
    fn skip_comment_body(&mut self) {
        while self.current().is_some() {
            if self.current() == Some('-')
                && self.peek(1) == Some('-')
                && self.peek(2) == Some('>')
            {
                for _ in 0..3 {
                    self.advance();
                }
                return;
            }
            self.advance();
        }
    }

    /// Parses the markup that starts at a `<`.
    ///
    /// Returns `(tag, attributes, self_closing)` for an opening tag. Returns
    /// `None` when the cursor is not on `<`, and for comments, `<!…>` / `<?…>`
    /// declarations (all consumed entirely), closing tags, and a `<` that is
    /// not followed by a tag name (only the `<` is consumed in the last two
    /// cases).
    fn parse_tag(&mut self) -> Option<(String, HashMap<String, String>, bool)> {
        if self.current() != Some('<') {
            return None;
        }
        self.advance();
        match self.current() {
            Some('!') if self.peek(1) == Some('-') && self.peek(2) == Some('-') => {
                for _ in 0..3 {
                    self.advance();
                }
                self.skip_comment_body();
                return None;
            }
            // Doctype, processing instruction or other declaration.
            Some('!') | Some('?') => {
                self.skip_past('>');
                return None;
            }
            Some('/') => return None,
            _ => {}
        }
        let tag_name = self.read_tag_name();
        if tag_name.is_empty() {
            return None;
        }
        let attrs = self.read_attributes();
        let self_closing = self.current() == Some('/');
        if self_closing {
            self.advance();
        }
        if self.current() == Some('>') {
            self.advance();
        }
        Some((tag_name, attrs, self_closing))
    }

    /// Returns `true` for void elements, which never have a closing tag.
    fn is_self_closing_tag(tag: &str) -> bool {
        matches!(
            tag,
            "area"
                | "base"
                | "br"
                | "col"
                | "embed"
                | "hr"
                | "img"
                | "input"
                | "link"
                | "meta"
                | "source"
                | "track"
                | "wbr"
        )
    }

    /// Returns `true` for elements whose content is raw text rather than
    /// markup.
    fn is_raw_text_tag(tag: &str) -> bool {
        matches!(tag, "script" | "style")
    }

    /// Decodes the name between `&` and `;` into a character.
    ///
    /// Handles `amp`, `lt`, `gt`, `quot`, `apos`, `nbsp`, decimal `#NNN` and
    /// hexadecimal `#xHHH`. Returns `None` for anything else.
    fn decode_entity(name: &str) -> Option<char> {
        match name {
            "amp" => Some('&'),
            "lt" => Some('<'),
            "gt" => Some('>'),
            "quot" => Some('"'),
            "apos" => Some('\''),
            "nbsp" => Some('\u{00A0}'),
            _ => {
                let digits = name.strip_prefix('#')?;
                let hex = digits
                    .strip_prefix('x')
                    .or_else(|| digits.strip_prefix('X'));
                let code = match hex {
                    Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                    None => digits.parse::<u32>().ok()?,
                };
                char::from_u32(code)
            }
        }
    }

    /// Replaces character references in `text` with the characters they name.
    ///
    /// A reference is decoded only when it is recognised by
    /// [`Self::decode_entity`] and is terminated by `;` or by the end of the
    /// text. Everything else, including a bare `&` and unknown references
    /// such as `&copy;`, is copied through unchanged.
    fn decode_html_entities(text: &str) -> String {
        let mut result = String::with_capacity(text.len());
        let mut rest = text;
        while let Some(amp) = rest.find('&') {
            result.push_str(&rest[..amp]);
            let after = &rest[amp + 1..];
            // Candidate names are ASCII-only, so the byte count is a valid
            // slice boundary.
            let name_len = after
                .bytes()
                .take_while(|b| b.is_ascii_alphanumeric() || *b == b'#')
                .count();
            let name = &after[..name_len];
            let tail = &after[name_len..];
            let terminated = tail.is_empty() || tail.starts_with(';');
            match Self::decode_entity(name) {
                Some(ch) if terminated => {
                    result.push(ch);
                    rest = tail.strip_prefix(';').unwrap_or(tail);
                }
                _ => {
                    // Not a reference: keep the ampersand and rescan what
                    // follows as ordinary text.
                    result.push('&');
                    rest = after;
                }
            }
        }
        result.push_str(rest);
        result
    }

    /// Returns `true` when the cursor is on a `<` that begins a tag, a
    /// closing tag, a comment or a declaration, as opposed to a literal `<`.
    fn at_markup_start(&self) -> bool {
        self.current() == Some('<')
            && matches!(
                self.peek(1),
                Some(c) if c.is_ascii_alphabetic() || matches!(c, '/' | '!' | '?')
            )
    }

    /// Reads text up to the next piece of markup; a `<` that does not start
    /// markup is kept as part of the text.
    fn read_text(&mut self) -> String {
        let start = self.pos;
        while self.current().is_some() && !self.at_markup_start() {
            self.advance();
        }
        self.input[start..self.pos].iter().collect()
    }

    /// Returns `true` when the cursor is on `</tag` (ASCII case-insensitive)
    /// and the name is not merely a prefix of a longer tag name.
    fn at_end_tag(&self, tag: &str) -> bool {
        if self.current() != Some('<') || self.peek(1) != Some('/') {
            return false;
        }
        let mut offset = 2;
        for expected in tag.chars() {
            match self.peek(offset) {
                Some(c) if c.eq_ignore_ascii_case(&expected) => offset += 1,
                _ => return false,
            }
        }
        !matches!(self.peek(offset), Some(c) if c.is_ascii_alphanumeric() || c == '-')
    }

    /// Reads the content of a raw-text element verbatim, stopping before its
    /// closing tag or at end of input.
    fn read_raw_text(&mut self, tag: &str) -> String {
        let start = self.pos;
        while self.current().is_some() && !self.at_end_tag(tag) {
            self.advance();
        }
        self.input[start..self.pos].iter().collect()
    }

    /// Appends `node` to the children of the innermost open element.
    fn append_child(stack: &mut [OpenElement], node: HtmlNode) {
        if let Some((_, _, children)) = stack.last_mut() {
            children.push(node);
        }
    }

    /// Closes the innermost open element and attaches it to its parent. The
    /// bottom entry (the synthetic root) is never popped.
    fn close_innermost(stack: &mut Vec<OpenElement>) {
        if stack.len() < 2 {
            return;
        }
        if let Some((tag, attrs, children)) = stack.pop() {
            Self::append_child(
                stack,
                HtmlNode::Element {
                    tag,
                    attrs,
                    children,
                },
            );
        }
    }

    /// Consumes a closing tag and closes the matching open element.
    ///
    /// Elements opened after the match and still open are closed with it. A
    /// closing tag that matches nothing open is ignored, and the synthetic
    /// root can never be closed this way.
    fn parse_closing_tag(&mut self, stack: &mut Vec<OpenElement>) {
        self.advance(); // '<'
        self.advance(); // '/'
        let closing = self.read_tag_name();
        // Ignore anything else inside the end tag, but do not run into the
        // next piece of markup if the `>` is missing.
        while matches!(self.current(), Some(c) if c != '>' && c != '<') {
            self.advance();
        }
        if self.current() == Some('>') {
            self.advance();
        }
        let target = stack.iter().rposition(|(tag, _, _)| *tag == closing);
        if let Some(index) = target {
            if index > 0 {
                while stack.len() > index {
                    Self::close_innermost(stack);
                }
            }
        }
    }

    /// Parses the whole input and returns a synthetic `root` element holding
    /// the top-level nodes.
    ///
    /// Whitespace before each text run is skipped and whitespace-only text is
    /// dropped. Elements still open at end of input are closed implicitly so
    /// their content is kept.
    fn parse_root(&mut self) -> HtmlNode {
        let mut stack: Vec<OpenElement> = vec![("root".to_string(), HashMap::new(), Vec::new())];

        while self.current().is_some() {
            self.skip_whitespace();
            if !self.at_markup_start() {
                let decoded = Self::decode_html_entities(&self.read_text());
                if !decoded.trim().is_empty() {
                    Self::append_child(&mut stack, HtmlNode::Text(decoded));
                }
            } else if self.peek(1) == Some('/') {
                self.parse_closing_tag(&mut stack);
            } else if let Some((tag, attrs, self_closing)) = self.parse_tag() {
                if self_closing || Self::is_self_closing_tag(&tag) {
                    Self::append_child(
                        &mut stack,
                        HtmlNode::Element {
                            tag,
                            attrs,
                            children: Vec::new(),
                        },
                    );
                } else {
                    let mut children = Vec::new();
                    if Self::is_raw_text_tag(&tag) {
                        // Script/style bodies are not markup: take them
                        // verbatim, without entity decoding.
                        let raw = self.read_raw_text(&tag);
                        if !raw.trim().is_empty() {
                            children.push(HtmlNode::Text(raw));
                        }
                    }
                    stack.push((tag, attrs, children));
                }
            }
        }

        // Fold every element left open into its parent so nothing is lost.
        while stack.len() > 1 {
            Self::close_innermost(&mut stack);
        }
        let children = stack
            .pop()
            .map(|(_, _, children)| children)
            .unwrap_or_default();
        HtmlNode::Element {
            tag: "root".to_string(),
            attrs: HashMap::new(),
            children,
        }
    }
}