#![forbid(unsafe_code)]
#![doc = include_str!("../README.md")]
/// A single attribute parsed from an HTML tag.
///
/// Instances are produced by `extract_attributes`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlAttribute {
/// Attribute name; `extract_attributes` stores it ASCII-lowercased.
pub name: String,
/// Attribute value with any surrounding quotes removed.
/// `None` when the attribute is written without `=` (e.g. `disabled`).
pub value: Option<String>,
}
/// An HTML element described by a name and a list of attributes.
///
/// NOTE(review): nothing in the visible part of this file constructs this
/// type; presumably `name` is the tag name — confirm against callers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlElement {
/// Element name (presumably the tag name, e.g. `a` — verify against callers).
pub name: String,
/// Attributes attached to the element.
pub attributes: Vec<HtmlAttribute>,
}
/// A hyperlink found by `extract_links`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlLink {
/// Content between the opening and closing anchor tags, with nested tags
/// removed and surrounding whitespace trimmed.
pub text: String,
/// Value of the anchor's `href` attribute, exactly as written in the markup.
pub href: String,
}
/// A heading found by `extract_headings`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlHeading {
/// Heading level; `extract_headings` only produces values `1` through `6`.
pub level: u8,
/// Content between the opening and closing heading tags, with nested tags
/// removed and surrounding whitespace trimmed.
pub text: String,
}
/// Cheap heuristic that reports whether `input` appears to contain HTML markup.
///
/// The check succeeds when the whitespace-trimmed input contains a `<`
/// immediately followed by an ASCII letter, `/` or `!`, and also contains a
/// `>` somewhere. Nothing is parsed, so this is a guess rather than a
/// validation.
#[must_use]
pub fn looks_like_html(input: &str) -> bool {
    let candidate = input.trim();
    // Empty input yields no two-byte windows, so it is rejected without a
    // dedicated guard.
    let has_tag_opener = candidate.as_bytes().windows(2).any(|pair| match pair {
        [b'<', b'/' | b'!'] => true,
        [b'<', next] => next.is_ascii_alphabetic(),
        _ => false,
    });
    has_tag_opener && candidate.contains('>')
}
/// Returns `input` with every `<!-- ... -->` comment removed.
///
/// Text outside comments is copied through unchanged. A comment that is opened
/// but never closed swallows everything after its `<!--`.
#[must_use]
pub fn strip_html_comments(input: &str) -> String {
    let mut output = String::new();
    let mut rest = input;
    loop {
        // No further comment opener: whatever is left is ordinary text.
        let Some((before, after_open)) = rest.split_once("<!--") else {
            output.push_str(rest);
            break;
        };
        output.push_str(before);
        match after_open.split_once("-->") {
            Some((_comment, after_close)) => rest = after_close,
            // Unterminated comment: drop the remainder of the input.
            None => break,
        }
    }
    output
}
/// Removes everything between `<` and `>` (inclusive) from `input`.
///
/// This is a character-level filter, not a parser: every `<` starts a skipped
/// region, every `>` ends it, and neither character is ever emitted — a stray
/// `>` outside a tag is dropped as well. An unclosed `<` discards the rest of
/// the input.
#[must_use]
pub fn strip_tags_basic(input: &str) -> String {
    let mut in_tag = false;
    input
        .chars()
        .filter(|&ch| {
            if ch == '<' {
                in_tag = true;
                false
            } else if ch == '>' {
                in_tag = false;
                false
            } else {
                !in_tag
            }
        })
        .collect()
}
/// Escapes the characters that are significant in HTML text and attribute
/// values.
///
/// The replacements are `&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`,
/// `"` → `&quot;` and `'` → `&#39;`. All other characters are copied through
/// unchanged. The input is processed in a single pass, so an `&` that is
/// already part of an entity is escaped again (`&amp;` becomes `&amp;amp;`).
#[must_use]
pub fn escape_html(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for character in input.chars() {
        match character {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Decodes the basic HTML entities back into their characters.
///
/// Recognised entities are `&lt;`, `&gt;`, `&quot;`, `&amp;` and the three
/// common spellings of the apostrophe (`&#39;`, `&#x27;`, `&apos;`). Anything
/// else — including a bare `&` or an unknown entity — is copied through
/// unchanged.
///
/// Decoding is a single left-to-right pass, so the output is never decoded a
/// second time: `&amp;lt;` becomes `&lt;`, not `<`.
#[must_use]
pub fn unescape_html(input: &str) -> String {
    const ENTITIES: [(&str, char); 7] = [
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&#39;", '\''),
        ("&#x27;", '\''),
        ("&apos;", '\''),
        ("&amp;", '&'),
    ];

    let mut result = String::with_capacity(input.len());
    let mut remainder = input;
    while let Some(ampersand) = remainder.find('&') {
        result.push_str(&remainder[..ampersand]);
        let candidate = &remainder[ampersand..];
        let decoded = ENTITIES.iter().find_map(|&(entity, character)| {
            candidate
                .strip_prefix(entity)
                .map(|rest| (character, rest))
        });
        match decoded {
            Some((character, rest)) => {
                result.push(character);
                remainder = rest;
            }
            None => {
                // Not a known entity: keep the `&` literally and move past it
                // (`&` is one byte, so the slice stays on a char boundary).
                result.push('&');
                remainder = &candidate[1..];
            }
        }
    }
    result.push_str(remainder);
    result
}
/// Collects every `<a ...>...</a>` element in `input` that has an `href`
/// attribute with a value.
///
/// Tag matching is ASCII case-insensitive. For each link, `href` is the
/// attribute value as written and `text` is the anchor's inner content with
/// nested tags stripped and surrounding whitespace trimmed. Anchors without an
/// `href` value are skipped. Scanning stops at the first anchor that has no
/// closing `>` or no following `</a>`.
#[must_use]
pub fn extract_links(input: &str) -> Vec<HtmlLink> {
    // ASCII lowercasing keeps byte offsets identical, so positions found in
    // `lower` can be used to slice `input`.
    let lower = input.to_ascii_lowercase();
    let bytes = lower.as_bytes();
    let mut results = Vec::new();
    let mut search_start = 0;
    while let Some(start_offset) = lower[search_start..].find("<a") {
        let start = search_start + start_offset;
        // Require a tag-name boundary after `<a`; otherwise `<abbr>`,
        // `<article>`, `<aside>` etc. would be treated as anchors and paired
        // with the next real `</a>`, swallowing that link.
        let is_anchor = matches!(
            bytes.get(start + 2),
            Some(next) if *next == b'>' || next.is_ascii_whitespace()
        );
        if !is_anchor {
            search_start = start + 2;
            continue;
        }
        let Some(open_end_offset) = lower[start..].find('>') else {
            break;
        };
        let open_end = start + open_end_offset;
        let Some(close_offset) = lower[open_end + 1..].find("</a>") else {
            break;
        };
        let close_start = open_end + 1 + close_offset;
        let element = &input[start..=open_end];
        if let Some(href) = get_attribute(element, "href") {
            let text = strip_tags_basic(&input[open_end + 1..close_start])
                .trim()
                .to_string();
            results.push(HtmlLink { text, href });
        }
        // Resume after the closing `</a>` (4 bytes).
        search_start = close_start + 4;
    }
    results
}
/// Collects every `<h1>`–`<h6>` element in `input`, in document order.
///
/// Tag matching is ASCII case-insensitive. Each heading's `text` is its inner
/// content with nested tags stripped and surrounding whitespace trimmed. Tags
/// that merely start with `<h` (`<html>`, `<hr>`, `<h7>`, `<h1x>`) are ignored.
///
/// A heading opener whose matching `</hN>` never appears is skipped and
/// scanning continues after it, so one unclosed heading does not hide later
/// well-formed headings of other levels.
#[must_use]
pub fn extract_headings(input: &str) -> Vec<HtmlHeading> {
    // ASCII lowercasing keeps byte offsets identical, so positions found in
    // `lower` can be used to slice `input`.
    let lower = input.to_ascii_lowercase();
    let bytes = lower.as_bytes();
    let mut results = Vec::new();
    let mut search_start = 0;
    while let Some(start_offset) = lower[search_start..].find("<h") {
        let start = search_start + start_offset;
        let Some(&level_byte) = bytes.get(start + 2) else {
            break;
        };
        if !(b'1'..=b'6').contains(&level_byte) {
            search_start = start + 2;
            continue;
        }
        // The level digit must be followed by `>` or whitespace, otherwise
        // this is some other tag whose name starts with `hN`.
        if let Some(&after_level) = bytes.get(start + 3) {
            if after_level != b'>' && !after_level.is_ascii_whitespace() {
                search_start = start + 2;
                continue;
            }
        }
        let Some(open_end_offset) = lower[start..].find('>') else {
            // No `>` anywhere after this point: no later tag can be complete.
            break;
        };
        let open_end = start + open_end_offset;
        let level = level_byte - b'0';
        let close_tag = format!("</h{level}>");
        let Some(close_offset) = lower[open_end + 1..].find(&close_tag) else {
            // The close tag is level-specific, so headings of other levels may
            // still follow; skip only this opener.
            search_start = open_end + 1;
            continue;
        };
        let close_start = open_end + 1 + close_offset;
        let text = strip_tags_basic(&input[open_end + 1..close_start])
            .trim()
            .to_string();
        results.push(HtmlHeading { level, text });
        search_start = close_start + close_tag.len();
    }
    results
}
/// Returns the text of the first `<title>...</title>` element in `input`.
///
/// Matching is ASCII case-insensitive and only recognises an opening tag
/// written exactly as `<title>` (no attributes). Nested tags are stripped from
/// the result and surrounding whitespace is trimmed. Returns `None` when
/// either the opening or the closing tag is missing.
#[must_use]
pub fn extract_title(input: &str) -> Option<String> {
    // ASCII lowercasing keeps byte lengths, so lengths measured on the
    // lowercased copy are valid offsets into `input`.
    let lowered = input.to_ascii_lowercase();
    let (before_open, after_open) = lowered.split_once("<title>")?;
    let (lowered_body, _) = after_open.split_once("</title>")?;
    let body_start = before_open.len() + "<title>".len();
    let raw_title = &input[body_start..body_start + lowered_body.len()];
    Some(strip_tags_basic(raw_title).trim().to_string())
}
/// Returns the `content` attribute of the first `<meta>` tag whose `name`
/// attribute equals `name` (ASCII case-insensitively).
///
/// Returns `None` when no such tag exists, when a `<meta` opener has no
/// closing `>`, or when the first matching tag has no `content` value — later
/// tags with the same name are not consulted.
#[must_use]
pub fn extract_meta_content(input: &str, name: &str) -> Option<String> {
    // ASCII lowercasing keeps byte offsets identical between both strings.
    let lowered = input.to_ascii_lowercase();
    let mut cursor = 0;
    loop {
        // `?` ends the search as soon as there is no further (complete) tag.
        let tag_start = cursor + lowered[cursor..].find("<meta")?;
        let tag_end = tag_start + lowered[tag_start..].find('>')?;
        let tag = &input[tag_start..=tag_end];
        let name_matches = match get_attribute(tag, "name") {
            Some(found) => found.eq_ignore_ascii_case(name),
            None => false,
        };
        if name_matches {
            return get_attribute(tag, "content");
        }
        cursor = tag_end + 1;
    }
}
/// Parses the attributes of a single HTML tag such as `<a href="x" hidden>`.
///
/// `element` is trimmed and must start with `<`, otherwise an empty list is
/// returned. Leading `<`, trailing `>` and one trailing `/` are removed, the
/// first whitespace-delimited word (the tag name) is skipped, and the rest is
/// read as `name`, `name=value`, `name="value"` or `name='value'` items.
///
/// Names are ASCII-lowercased. A value is `None` when there is no `=`, an
/// empty string when nothing follows the `=`, and otherwise the text between
/// the quotes (or up to the next whitespace when unquoted). Parsing stops at
/// the first item with an empty name, e.g. a stray `=`.
#[must_use]
pub fn extract_attributes(element: &str) -> Vec<HtmlAttribute> {
    /// Advances from `from` while `keep_going` holds; returns the stop index.
    fn scan(bytes: &[u8], from: usize, keep_going: impl Fn(u8) -> bool) -> usize {
        let mut cursor = from;
        while cursor < bytes.len() && keep_going(bytes[cursor]) {
            cursor += 1;
        }
        cursor
    }

    let tag = element.trim();
    if !tag.starts_with('<') {
        return Vec::new();
    }
    let body = tag.trim_start_matches('<').trim_end_matches('>').trim();
    let body = body.strip_suffix('/').unwrap_or(body).trim_end();

    // Every stop position below is an ASCII byte or the end of the string, so
    // all slices of `body` fall on char boundaries.
    let raw = body.as_bytes();
    let total = raw.len();

    // Skip the tag name.
    let mut pos = scan(raw, 0, |byte| !byte.is_ascii_whitespace());
    let mut found = Vec::new();
    loop {
        pos = scan(raw, pos, |byte| byte.is_ascii_whitespace());
        if pos >= total {
            break;
        }

        let name_begin = pos;
        pos = scan(raw, pos, |byte| !byte.is_ascii_whitespace() && byte != b'=');
        let name = body[name_begin..pos].trim();
        if name.is_empty() {
            break;
        }

        // Whitespace is allowed on both sides of `=`.
        pos = scan(raw, pos, |byte| byte.is_ascii_whitespace());
        let value = if raw.get(pos) == Some(&b'=') {
            pos = scan(raw, pos + 1, |byte| byte.is_ascii_whitespace());
            match raw.get(pos).copied() {
                None => Some(String::new()),
                Some(quote @ (b'\'' | b'"')) => {
                    let begin = pos + 1;
                    let end = scan(raw, begin, |byte| byte != quote);
                    // Step over the closing quote when there is one.
                    pos = if end < total { end + 1 } else { end };
                    Some(body[begin..end].to_string())
                }
                Some(_) => {
                    let begin = pos;
                    pos = scan(raw, begin, |byte| !byte.is_ascii_whitespace());
                    Some(body[begin..pos].to_string())
                }
            }
        } else {
            None
        };

        found.push(HtmlAttribute {
            name: name.to_ascii_lowercase(),
            value,
        });
    }
    found
}
/// Looks up the value of attribute `name` on the HTML tag `element`.
///
/// `name` is trimmed and compared ASCII case-insensitively. Only the first
/// attribute with that name is considered: the result is `None` when the
/// attribute is absent or when that first occurrence has no value.
#[must_use]
pub fn get_attribute(element: &str, name: &str) -> Option<String> {
    let wanted = name.trim().to_ascii_lowercase();
    for attribute in extract_attributes(element) {
        if attribute.name == wanted {
            // The first match decides, even if it carries no value.
            return attribute.value;
        }
    }
    None
}