use std::fmt::{Display, Formatter};
use scraper::{ElementRef, Html, Selector};
use serde::Deserialize;
use url::Url;
/// Largest number of blank lines a single layout spacer may represent.
const MAX_LAYOUT_SPACER_LINES: u8 = 3;
/// The only manifest `version` string that `parse_index_manifest` accepts.
const INDEX_MANIFEST_VERSION: &str = "index.idx/v1";
/// Maximum manifest size in bytes (32 KiB); larger inputs are rejected before parsing.
const MAX_MANIFEST_BYTES: usize = 32 * 1024;
/// Maximum number of entries allowed in each hint list (regions, fields, forms, dates).
const MAX_MANIFEST_HINTS: usize = 64;
/// Maximum byte length of a trimmed manifest string (hint text, selectors, scope).
const MAX_MANIFEST_STRING_LEN: usize = 256;
/// Maximum number of combinator/attribute/pseudo marker characters allowed in one selector.
const MAX_MANIFEST_SELECTOR_COMPLEXITY: usize = 16;
/// A validated index manifest, as produced by `parse_index_manifest`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexManifest {
/// Manifest format version; always `INDEX_MANIFEST_VERSION` for a parsed manifest.
pub version: String,
/// URL the manifest was loaded from, copied verbatim from the caller's argument.
pub source_url: String,
/// Normalized path prefix the manifest applies to (defaults to `/`).
pub scope: String,
/// Hint about where the main content lives.
pub content: IndexContentHint,
/// Validated region hints.
pub regions: Vec<IndexRegionHint>,
/// Validated field hints.
pub fields: Vec<IndexFieldHint>,
/// Validated form hints.
pub forms: Vec<IndexFormHint>,
/// Validated date hints.
pub dates: Vec<IndexDateHint>,
}
/// Manifest hint describing the main content of a page.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct IndexContentHint {
/// Validated CSS selector for the main content; `None` when absent or blank in the manifest.
pub main_selector: Option<String>,
}
/// Manifest hint describing one page region.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexRegionHint {
/// Trimmed, non-empty role text for the region.
pub role: String,
/// Validated CSS selector for the region.
pub selector: String,
/// `collapsed` flag taken from the manifest (defaults to `false` when omitted).
pub collapsed: bool,
}
/// Manifest hint describing one named field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexFieldHint {
/// Trimmed, non-empty field name.
pub name: String,
/// Optional label; `None` when absent or blank in the manifest.
pub label: Option<String>,
}
/// Manifest hint describing one form.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexFormHint {
/// Trimmed, non-empty form name.
pub name: String,
/// Optional validated CSS selector; `None` when absent or blank in the manifest.
pub selector: Option<String>,
/// Optional free-text note; `None` when absent or blank in the manifest.
pub note: Option<String>,
}
/// Rendering style requested for a date field by a manifest date hint.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IndexDateStyle {
    /// Selected by the style string `date`.
    Date,
    /// Selected by the style strings `datetime` or `date-time`.
    DateTime,
}

impl IndexDateStyle {
    /// Parses a manifest style string.
    ///
    /// Surrounding whitespace is ignored and matching is ASCII case-insensitive.
    /// Returns `None` for any unrecognized style.
    fn parse(input: &str) -> Option<Self> {
        let name = input.trim();
        let is = |candidate: &str| name.eq_ignore_ascii_case(candidate);
        if is("date") {
            Some(Self::Date)
        } else if is("datetime") || is("date-time") {
            Some(Self::DateTime)
        } else {
            None
        }
    }
}
/// Manifest hint describing how one date field should be styled.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexDateHint {
/// Trimmed, non-empty name of the field the hint applies to.
pub field: String,
/// Parsed date style.
pub style: IndexDateStyle,
}
/// Reasons a manifest can be rejected by `parse_index_manifest`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IndexManifestError {
    /// The manifest payload is larger than the allowed byte budget.
    TooLarge {
        /// Maximum accepted size in bytes.
        max_bytes: usize,
        /// Size of the rejected payload in bytes.
        actual_bytes: usize,
    },
    /// The payload could not be deserialized; carries the parser's message.
    InvalidJson(String),
    /// The `version` field is not the supported one; carries the offending value.
    UnsupportedVersion(String),
    /// The manifest source URL failed to parse; carries the original string.
    InvalidSourceUrl(String),
    /// The page URL failed to parse; carries the original string.
    InvalidPageUrl(String),
    /// The manifest and the page are not served from the same origin.
    CrossOrigin {
        /// Source URL as given by the caller.
        source_url: String,
        /// Page URL as given by the caller.
        page_url: String,
    },
    /// The scope is empty, does not start with `/`, or is too long.
    InvalidScope(String),
    /// The page path does not start with the manifest scope.
    OutOfScope {
        /// Normalized manifest scope.
        scope: String,
        /// Path component of the page URL.
        page_path: String,
    },
    /// One of the hint lists has more entries than allowed.
    TooManyHints {
        /// Which hint list overflowed.
        kind: &'static str,
        /// Maximum number of entries per list.
        max: usize,
    },
    /// A single hint value failed validation.
    InvalidHint {
        /// Dotted name of the offending hint property.
        kind: &'static str,
        /// Human-readable explanation.
        reason: String,
    },
}

impl Display for IndexManifestError {
    /// Writes a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            // Size and format problems.
            Self::TooLarge {
                max_bytes,
                actual_bytes,
            } => write!(
                f,
                "manifest exceeds limit: {actual_bytes} bytes (max {max_bytes})"
            ),
            Self::InvalidJson(error) => write!(f, "manifest JSON is invalid: {error}"),
            Self::UnsupportedVersion(version) => {
                write!(f, "unsupported manifest version: {version}")
            }

            // URL and origin problems.
            Self::InvalidSourceUrl(url) => write!(f, "manifest source URL is invalid: {url}"),
            Self::InvalidPageUrl(url) => write!(f, "page URL is invalid: {url}"),
            Self::CrossOrigin {
                source_url,
                page_url,
            } => write!(
                f,
                "manifest source must be same-origin: {source_url} vs {page_url}"
            ),

            // Scope problems.
            Self::InvalidScope(scope) => write!(f, "manifest scope is invalid: {scope}"),
            Self::OutOfScope { scope, page_path } => {
                write!(f, "page path {page_path} is outside manifest scope {scope}")
            }

            // Hint problems.
            Self::TooManyHints { kind, max } => {
                write!(f, "manifest has too many {kind} hints (max {max})")
            }
            Self::InvalidHint { kind, reason } => {
                write!(f, "manifest {kind} hint is invalid: {reason}")
            }
        }
    }
}

impl std::error::Error for IndexManifestError {}
/// Unvalidated manifest exactly as deserialized from JSON.
///
/// Every field except `version` falls back to its default when missing.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Default)]
struct RawIndexManifest {
// Required; compared against `INDEX_MANIFEST_VERSION` by `parse_index_manifest`.
version: String,
// Optional path prefix; normalized by `normalize_scope`.
#[serde(default)]
scope: Option<String>,
#[serde(default)]
content: RawIndexContentHint,
#[serde(default)]
regions: Vec<RawIndexRegionHint>,
#[serde(default)]
fields: Vec<RawIndexFieldHint>,
#[serde(default)]
forms: Vec<RawIndexFormHint>,
#[serde(default)]
dates: Vec<RawIndexDateHint>,
}
/// Unvalidated `content` object of a manifest.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Default)]
struct RawIndexContentHint {
// Optional selector; trimmed and validated later, blank values are treated as absent.
#[serde(default)]
main_selector: Option<String>,
}
/// Unvalidated entry of the manifest `regions` list.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
struct RawIndexRegionHint {
// Required role text.
role: String,
// Required CSS selector.
selector: String,
// Optional flag; `false` when omitted.
#[serde(default)]
collapsed: bool,
}
/// Unvalidated entry of the manifest `fields` list.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
struct RawIndexFieldHint {
// Required field name.
name: String,
// Optional label; blank values are treated as absent during validation.
#[serde(default)]
label: Option<String>,
}
/// Unvalidated entry of the manifest `forms` list.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
struct RawIndexFormHint {
// Required form name.
name: String,
// Optional CSS selector; blank values are treated as absent during validation.
#[serde(default)]
selector: Option<String>,
// Optional note; blank values are treated as absent during validation.
#[serde(default)]
note: Option<String>,
}
/// Unvalidated entry of the manifest `dates` list.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
struct RawIndexDateHint {
// Required name of the field the hint applies to.
field: String,
// Required style string; interpreted by `IndexDateStyle::parse`.
style: String,
}
/// Builds the well-known manifest URL for the site hosting `page_url`.
///
/// The page URL's path is replaced with `/.well-known/index.idx` and its query
/// and fragment are removed. Returns `None` when `page_url` does not parse.
#[must_use]
pub fn well_known_index_manifest_url(page_url: &str) -> Option<String> {
    Url::parse(page_url).ok().map(|mut manifest_url| {
        manifest_url.set_path("/.well-known/index.idx");
        manifest_url.set_query(None);
        manifest_url.set_fragment(None);
        manifest_url.to_string()
    })
}
/// Finds the manifest URL advertised by a `<link rel="index-manifest" href="…">` element.
///
/// `rel` is split on whitespace and commas and compared ASCII case-insensitively.
/// The `href` is resolved against `page_url`. Links whose `href` cannot be
/// resolved are skipped, so one malformed link no longer hides a later valid one.
///
/// Returns `None` when `page_url` does not parse or no usable manifest link exists.
#[must_use]
pub fn discover_index_manifest_link_from_html(html: &str, page_url: &str) -> Option<String> {
    let base = Url::parse(page_url).ok()?;
    let document = Html::parse_document(html);
    let selector = selector("link[rel][href]")?;
    let found = document
        .select(&selector)
        .filter(|link| {
            link.value()
                .attr("rel")
                .unwrap_or_default()
                .split(|c: char| c.is_whitespace() || c == ',')
                .any(|token| token.eq_ignore_ascii_case("index-manifest"))
        })
        .filter_map(|link| link.value().attr("href"))
        // Keep scanning when an href cannot be joined instead of giving up entirely.
        .find_map(|href| base.join(href).ok())
        .map(|joined| joined.to_string());
    found
}
/// Finds the manifest URL advertised by an HTTP `Link` header value.
///
/// The header is split on `,` into link-values of the form `<target>; param=value; …`.
/// A link-value is a manifest link when one of its `rel` parameters contains the
/// token `index-manifest` (ASCII case-insensitive). The target is resolved
/// against `page_url`.
///
/// Malformed link-values (missing `<`, unresolvable target) are skipped rather
/// than aborting the search, so later well-formed entries are still considered.
/// The `rel` parameter name is matched case-insensitively and whitespace around
/// `=` is tolerated.
///
/// Note: targets or quoted parameter values containing `,` are not supported
/// by this simple splitter.
///
/// Returns `None` when `page_url` does not parse or no usable manifest link exists.
#[must_use]
pub fn discover_index_manifest_link_from_http_link_header(
    header_value: &str,
    page_url: &str,
) -> Option<String> {
    let base = Url::parse(page_url).ok()?;
    header_value.split(',').find_map(|chunk| {
        // `None` inside this closure only skips the current link-value.
        let (target_part, params_part) = chunk.trim().split_once('>')?;
        let target = target_part.trim().strip_prefix('<')?;
        let is_manifest = params_part
            .split(';')
            .filter_map(|param| {
                let (name, value) = param.split_once('=')?;
                name.trim()
                    .eq_ignore_ascii_case("rel")
                    .then(|| value.trim().trim_matches('"'))
            })
            .any(|rel| {
                rel.split_whitespace()
                    .any(|token| token.eq_ignore_ascii_case("index-manifest"))
            });
        if !is_manifest {
            return None;
        }
        base.join(target).ok().map(|joined| joined.to_string())
    })
}
pub fn parse_index_manifest(
input: &str,
source_url: &str,
page_url: &str,
) -> Result<IndexManifest, IndexManifestError> {
let actual_bytes = input.len();
if actual_bytes > MAX_MANIFEST_BYTES {
return Err(IndexManifestError::TooLarge {
max_bytes: MAX_MANIFEST_BYTES,
actual_bytes,
});
}
let source = Url::parse(source_url)
.map_err(|_| IndexManifestError::InvalidSourceUrl(source_url.to_owned()))?;
let page = Url::parse(page_url)
.map_err(|_| IndexManifestError::InvalidPageUrl(page_url.to_owned()))?;
if !same_origin(&source, &page) {
return Err(IndexManifestError::CrossOrigin {
source_url: source_url.to_owned(),
page_url: page_url.to_owned(),
});
}
let raw = serde_json::from_str::<RawIndexManifest>(input)
.map_err(|error| IndexManifestError::InvalidJson(error.to_string()))?;
if raw.version != INDEX_MANIFEST_VERSION {
return Err(IndexManifestError::UnsupportedVersion(raw.version));
}
let scope = normalize_scope(raw.scope.as_deref(), page.path())?;
if !page.path().starts_with(&scope) {
return Err(IndexManifestError::OutOfScope {
scope,
page_path: page.path().to_owned(),
});
}
if raw.regions.len() > MAX_MANIFEST_HINTS {
return Err(IndexManifestError::TooManyHints {
kind: "region",
max: MAX_MANIFEST_HINTS,
});
}
if raw.fields.len() > MAX_MANIFEST_HINTS {
return Err(IndexManifestError::TooManyHints {
kind: "field",
max: MAX_MANIFEST_HINTS,
});
}
if raw.forms.len() > MAX_MANIFEST_HINTS {
return Err(IndexManifestError::TooManyHints {
kind: "form",
max: MAX_MANIFEST_HINTS,
});
}
if raw.dates.len() > MAX_MANIFEST_HINTS {
return Err(IndexManifestError::TooManyHints {
kind: "date",
max: MAX_MANIFEST_HINTS,
});
}
let main_selector = raw
.content
.main_selector
.as_deref()
.map(str::trim)
.filter(|value| !value.is_empty())
.map(|value| validate_selector("content.main_selector", value))
.transpose()?;
let regions = raw
.regions
.into_iter()
.map(|raw| {
let role = validate_hint_text("region.role", &raw.role)?;
let selector = validate_selector("region.selector", &raw.selector)?;
Ok(IndexRegionHint {
role,
selector,
collapsed: raw.collapsed,
})
})
.collect::<Result<Vec<_>, IndexManifestError>>()?;
let fields = raw
.fields
.into_iter()
.map(|raw| {
let name = validate_hint_text("field.name", &raw.name)?;
let label = raw
.label
.as_deref()
.map(str::trim)
.filter(|value| !value.is_empty())
.map(|value| validate_hint_text("field.label", value))
.transpose()?;
Ok(IndexFieldHint { name, label })
})
.collect::<Result<Vec<_>, IndexManifestError>>()?;
let forms = raw
.forms
.into_iter()
.map(|raw| {
let name = validate_hint_text("form.name", &raw.name)?;
let selector = raw
.selector
.as_deref()
.map(str::trim)
.filter(|value| !value.is_empty())
.map(|value| validate_selector("form.selector", value))
.transpose()?;
let note = raw
.note
.as_deref()
.map(str::trim)
.filter(|value| !value.is_empty())
.map(|value| validate_hint_text("form.note", value))
.transpose()?;
Ok(IndexFormHint {
name,
selector,
note,
})
})
.collect::<Result<Vec<_>, IndexManifestError>>()?;
let dates = raw
.dates
.into_iter()
.map(|raw| {
let field = validate_hint_text("date.field", &raw.field)?;
let style = IndexDateStyle::parse(&raw.style).ok_or_else(|| {
IndexManifestError::InvalidHint {
kind: "date.style",
reason: format!("unsupported style: {}", raw.style),
}
})?;
Ok(IndexDateHint { field, style })
})
.collect::<Result<Vec<_>, IndexManifestError>>()?;
Ok(IndexManifest {
version: INDEX_MANIFEST_VERSION.to_owned(),
source_url: source_url.to_owned(),
scope,
content: IndexContentHint { main_selector },
regions,
fields,
forms,
dates,
})
}
/// Returns `true` when both URLs share scheme, host, and effective port.
///
/// The effective port is the explicit port or, failing that, the scheme's
/// known default.
fn same_origin(left: &Url, right: &Url) -> bool {
    fn origin_parts(url: &Url) -> (&str, Option<&str>, Option<u16>) {
        (url.scheme(), url.host_str(), url.port_or_known_default())
    }
    origin_parts(left) == origin_parts(right)
}
/// Normalizes the manifest scope into a trimmed path prefix.
///
/// A missing scope defaults to `/`. The scope must start with `/` and be at
/// most `MAX_MANIFEST_STRING_LEN` bytes long.
///
/// # Errors
///
/// Returns `IndexManifestError::InvalidScope` carrying the trimmed scope when
/// either rule is violated.
fn normalize_scope(scope: Option<&str>, page_path: &str) -> Result<String, IndexManifestError> {
    // The page path does not influence the result; it is only part of the signature.
    let _ = page_path;
    let candidate = scope.map_or("/", str::trim);
    // An empty scope also fails `starts_with('/')`, so one check covers both cases.
    let acceptable = candidate.starts_with('/') && candidate.len() <= MAX_MANIFEST_STRING_LEN;
    if acceptable {
        Ok(candidate.to_owned())
    } else {
        Err(IndexManifestError::InvalidScope(candidate.to_owned()))
    }
}
/// Trims a hint string and checks that its byte length is within bounds.
///
/// # Errors
///
/// Returns `IndexManifestError::InvalidHint` tagged with `kind` when the
/// trimmed text is empty or longer than `MAX_MANIFEST_STRING_LEN` bytes.
fn validate_hint_text(kind: &'static str, text: &str) -> Result<String, IndexManifestError> {
    let trimmed = text.trim();
    if (1..=MAX_MANIFEST_STRING_LEN).contains(&trimmed.len()) {
        Ok(trimmed.to_owned())
    } else {
        Err(IndexManifestError::InvalidHint {
            kind,
            reason: "text length is out of bounds".to_owned(),
        })
    }
}
/// Validates a manifest CSS selector and returns it trimmed.
///
/// The selector must pass `validate_hint_text`, contain at most
/// `MAX_MANIFEST_SELECTOR_COMPLEXITY` marker characters (`> + ~ [ ] : * #`),
/// and be accepted by `Selector::parse`.
///
/// # Errors
///
/// Returns `IndexManifestError::InvalidHint` tagged with `kind` on any failure.
fn validate_selector(kind: &'static str, selector: &str) -> Result<String, IndexManifestError> {
    const COMPLEXITY_MARKERS: [char; 8] = ['>', '+', '~', '[', ']', ':', '*', '#'];

    let value = validate_hint_text(kind, selector)?;
    let invalid = |reason: &str| IndexManifestError::InvalidHint {
        kind,
        reason: reason.to_owned(),
    };

    let complexity = value
        .chars()
        .filter(|ch| COMPLEXITY_MARKERS.contains(ch))
        .count();
    if complexity > MAX_MANIFEST_SELECTOR_COMPLEXITY {
        return Err(invalid("selector complexity exceeds limit"));
    }

    // Only the parse outcome matters; the parsed selector itself is discarded.
    let parses = Selector::parse(&value).is_ok();
    if parses {
        Ok(value)
    } else {
        Err(invalid("selector syntax is invalid"))
    }
}
/// Structured view of an HTML page, as produced by `parse_html`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlDocument {
/// The original HTML input, unmodified.
pub raw: String,
/// Page title: `<title>` text, else the Open Graph title, else the first heading.
pub title: Option<String>,
/// Headings found among the top-level entries of `nodes`.
pub headings: Vec<HtmlHeading>,
/// Links extracted from the main content root.
pub links: Vec<HtmlLink>,
/// Forms found among the top-level entries of `nodes`.
pub forms: Vec<HtmlForm>,
/// Content nodes of the main root, followed by secondary sections when the root was explicit.
pub nodes: Vec<HtmlNode>,
/// Document-level metadata.
pub metadata: HtmlMetadata,
/// Plain-text rendering derived from `nodes`.
pub body_text: String,
}
/// Document-level metadata collected by `extract_metadata`.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct HtmlMetadata {
/// `href` of the first `<link rel~="canonical">`, resolved against the base URL.
pub canonical_url: Option<String>,
/// Trimmed, non-empty `lang` attribute of `<html>`.
pub language: Option<String>,
/// Content of `<meta name="description">`.
pub description: Option<String>,
/// Content of `<meta property="og:title">`.
pub open_graph_title: Option<String>,
/// Content of `<meta property="og:description">`.
pub open_graph_description: Option<String>,
}
/// A heading extracted from the document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlHeading {
/// Heading level derived from the tag name (`h1`–`h6`).
pub level: u8,
/// Heading text.
pub text: String,
}
/// A hyperlink extracted from the document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlLink {
/// Link text.
pub text: String,
/// Link target.
pub href: String,
}
/// Role assigned to an `HtmlNode::Section`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HtmlSectionRole {
/// Main content.
Main,
/// `<nav>` elements or `role="navigation"`.
Navigation,
/// `<aside>` elements, or id/class names containing `side` or `rail`.
Aside,
/// `<footer>` elements or `role="contentinfo"`.
Footer,
/// Elements whose id/class names contain `comment`.
Comments,
/// Elements whose id/class names contain `related`.
Related,
/// Anything `section_role` cannot classify.
Unknown,
}
/// A form extracted by `extract_form`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlForm {
/// `name` attribute, else `id`, else the literal `form`.
pub name: String,
/// Upper-cased `method` attribute; `GET` when absent or blank.
pub method: String,
/// Resolved `action` attribute; falls back to the base URL, else an empty string.
pub action: String,
/// Named `input`, `textarea`, and `select` controls inside the form.
pub inputs: Vec<HtmlInput>,
/// Buttons and submit/button inputs inside the form.
pub buttons: Vec<HtmlButton>,
}
/// A named form control.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlInput {
/// Non-empty `name` attribute of the control.
pub name: String,
/// `textarea`, `select`, or the input's `type` attribute (`text` when absent or blank).
pub kind: String,
/// Current value, if any.
pub value: Option<String>,
/// Whether the `required` attribute is present.
pub required: bool,
}
/// A button found inside a form.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlButton {
/// `name` attribute, if present.
pub name: Option<String>,
/// `value` attribute, if present.
pub value: Option<String>,
/// Visible label; may be empty only when `name` is present.
pub label: String,
}
/// One unit of extracted page content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HtmlNode {
/// An `h1`–`h6` heading with non-empty text.
Heading {
level: u8,
text: String,
},
/// Paragraph text; block quotes are also emitted as paragraphs prefixed with `> `.
Paragraph(String),
/// A standalone link.
Link(HtmlLink),
/// A `ul` (`ordered == false`) or `ol` (`ordered == true`) with at least one item.
List {
ordered: bool,
items: Vec<String>,
},
/// Contents of a `pre` or `code` element.
CodeBlock {
language: Option<String>,
code: String,
},
/// A table with at least one row.
Table {
rows: Vec<Vec<String>>,
},
/// Vertical whitespace of `lines` blank lines (capped at `MAX_LAYOUT_SPACER_LINES`).
Spacer {
lines: u8,
},
/// A secondary region (navigation, aside, footer, …) with its own nodes.
Section {
role: HtmlSectionRole,
title: Option<String>,
collapsed: bool,
nodes: Vec<HtmlNode>,
},
/// An `img` element; `src` is resolved against the base URL when present.
Image {
alt: String,
src: Option<String>,
},
/// A `form` element.
Form(HtmlForm),
}
/// Parses an HTML string into a structured `HtmlDocument`.
///
/// The main content root is located first and its nodes and links extracted.
/// When that root was found through an explicit content selector, secondary
/// sections (navigation, asides, footers, …) are appended to the node list.
/// Headings and forms are collected from the top-level nodes only.
#[must_use]
pub fn parse_html(input: impl Into<String>) -> HtmlDocument {
    let raw = input.into();
    let html = Html::parse_document(&raw);
    let base_url = extract_base_url(&html);
    let base = base_url.as_ref();
    let metadata = extract_metadata(&html, base);
    let layout_rules = extract_layout_rules(&html);
    let root = main_content_root(&html);

    let (mut nodes, links) = match root.as_ref() {
        Some(root) => (
            extract_nodes(&root.element, base, &layout_rules),
            extract_links(&root.element, base),
        ),
        None => (Vec::new(), Vec::new()),
    };
    if let Some(root) = root.as_ref() {
        if root.explicit {
            nodes.extend(extract_secondary_sections(
                &html,
                &root.element,
                base,
                &layout_rules,
            ));
        }
    }

    // Single pass over the top-level nodes to pick out headings and forms.
    let mut headings = Vec::new();
    let mut forms = Vec::new();
    for node in &nodes {
        match node {
            HtmlNode::Heading { level, text } => headings.push(HtmlHeading {
                level: *level,
                text: text.clone(),
            }),
            HtmlNode::Form(form) => forms.push(form.clone()),
            _ => {}
        }
    }

    let title = extract_title(&html, &metadata, &headings);
    let body_text = body_text_from_nodes(&nodes);
    HtmlDocument {
        raw,
        title,
        headings,
        links,
        forms,
        nodes,
        metadata,
        body_text,
    }
}
/// Compiles a CSS selector, returning `None` when the query is not valid.
fn selector(query: &str) -> Option<Selector> {
    match Selector::parse(query) {
        Ok(compiled) => Some(compiled),
        Err(_) => None,
    }
}
/// The element chosen as the root of a page's main content.
struct MainContentRoot<'a> {
// Element whose subtree is treated as the main content.
element: ElementRef<'a>,
// `true` when the element matched one of the explicit content selectors,
// `false` when it was chosen by a fallback (dense region, body, first child).
explicit: bool,
}
/// Chooses the element to treat as the main content of the document.
///
/// Preference order:
/// 1. the best-scoring element matching a well-known content selector (explicit);
/// 2. the densest non-boilerplate region inside `<body>`;
/// 3. `<body>` itself;
/// 4. the first child of the root element, if it is an element.
fn main_content_root(html: &Html) -> Option<MainContentRoot<'_>> {
    const EXPLICIT_ROOT_QUERIES: [&str; 10] = [
        "main",
        "article",
        "[role=\"main\"]",
        "[itemprop=\"articleBody\"]",
        "#content",
        ".content",
        ".article",
        ".post",
        ".entry-content",
        ".markdown-body",
    ];

    let mut candidates = Vec::new();
    for query in EXPLICIT_ROOT_QUERIES {
        let Some(matcher) = selector(query) else {
            continue;
        };
        candidates.extend(html.select(&matcher).map(|element| MainContentRoot {
            element,
            explicit: true,
        }));
    }
    if let Some(best) = best_main_root(candidates) {
        return Some(best);
    }

    // No explicit candidate: fall back to heuristics, all marked non-explicit.
    let body = selector("body").and_then(|matcher| html.select(&matcher).next());
    let fallback = match body {
        Some(body) => Some(densest_body_region(&body).unwrap_or(body)),
        None => html
            .root_element()
            .first_child()
            .and_then(ElementRef::wrap),
    };
    fallback.map(|element| MainContentRoot {
        element,
        explicit: false,
    })
}
/// Picks the candidate with the highest `main_root_score`.
///
/// On ties the later candidate wins. Returns `None` for an empty list.
fn best_main_root<'a>(roots: Vec<MainContentRoot<'a>>) -> Option<MainContentRoot<'a>> {
    let mut best: Option<(i32, MainContentRoot<'a>)> = None;
    for root in roots {
        let score = main_root_score(&root.element);
        // `>=` keeps the last of several equally good candidates.
        let replaces = best.as_ref().map_or(true, |(top, _)| score >= *top);
        if replaces {
            best = Some((score, root));
        }
    }
    best.map(|(_, root)| root)
}
/// Finds the highest-scoring `main`/`article`/`section`/`div` inside `body`.
///
/// Boilerplate containers and regions scoring below 6 are ignored. On ties the
/// later region wins.
fn densest_body_region<'a>(body: &ElementRef<'a>) -> Option<ElementRef<'a>> {
    let regions = selector("main, article, section, div")?;
    body.select(&regions)
        .filter_map(|region| {
            if is_boilerplate_container(&region) {
                return None;
            }
            let score = main_root_score(&region);
            (score >= 6).then_some((score, region))
        })
        .max_by_key(|(score, _)| *score)
        .map(|(_, region)| region)
}
/// Scores how likely `root` is to contain the main content of a page.
///
/// Headings, paragraphs, lists, code, and tables add to the score, as does the
/// amount of paragraph/code text (one point per 120 characters). Links beyond
/// five per paragraph subtract up to 14 points, and every boilerplate
/// container in the subtree subtracts 3.
fn main_root_score(root: &ElementRef<'_>) -> i32 {
    let mut headings = 0i32;
    let mut paragraphs = 0i32;
    let mut lists = 0i32;
    let mut code_blocks = 0i32;
    let mut tables = 0i32;
    let mut links = 0i32;
    let mut text_chars = 0i32;
    let mut boilerplate_containers = 0i32;

    for element in root.descendants().filter_map(ElementRef::wrap) {
        if is_boilerplate_container(&element) {
            boilerplate_containers += 1;
        }
        match element.value().name() {
            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => headings += 1,
            "p" => {
                paragraphs += 1;
                text_chars += element_text(&element).chars().count() as i32;
            }
            "ul" | "ol" => lists += 1,
            "pre" | "code" => {
                code_blocks += 1;
                text_chars += code_text(&element).chars().count() as i32;
            }
            "table" => tables += 1,
            "a" => links += 1,
            _ => {}
        }
    }

    let content_score = headings * 4
        + paragraphs * 5
        + lists * 2
        + code_blocks * 5
        + tables * 3
        + text_chars / 120;
    // Only links in excess of five per paragraph count against the region.
    let link_penalty = (links - paragraphs * 5).clamp(0, 14);
    let boilerplate_penalty = boilerplate_containers * 3;
    content_score - link_penalty - boilerplate_penalty
}
/// Picks the document title: `<title>` text, else the Open Graph title, else
/// the text of the first heading.
fn extract_title(html: &Html, metadata: &HtmlMetadata, headings: &[HtmlHeading]) -> Option<String> {
    if let Some(title) = first_text(html, "title") {
        return Some(title);
    }
    if let Some(og_title) = &metadata.open_graph_title {
        return Some(og_title.clone());
    }
    headings.first().map(|heading| heading.text.clone())
}
/// Reads the document's first `<base href>` and parses it as an absolute URL.
///
/// Returns `None` when there is no such element or the `href` does not parse.
fn extract_base_url(html: &Html) -> Option<Url> {
    let base_tags = selector("base[href]")?;
    let first_base = html.select(&base_tags).next()?;
    let href = first_base.value().attr("href")?;
    Url::parse(href).ok()
}
/// Collects canonical URL, language, description, and Open Graph metadata.
fn extract_metadata(html: &Html, base_url: Option<&Url>) -> HtmlMetadata {
    let canonical_url = extract_link_href(html, "link[rel~=\"canonical\"]", base_url);
    let language = extract_language(html);
    let description = extract_meta_content(html, "meta[name=\"description\"]");
    let open_graph_title = extract_meta_content(html, "meta[property=\"og:title\"]");
    let open_graph_description = extract_meta_content(html, "meta[property=\"og:description\"]");
    HtmlMetadata {
        canonical_url,
        language,
        description,
        open_graph_title,
        open_graph_description,
    }
}
/// Returns the trimmed, non-empty `lang` attribute of the `<html>` element.
fn extract_language(html: &Html) -> Option<String> {
    let html_with_lang = selector("html[lang]")?;
    let element = html.select(&html_with_lang).next()?;
    let lang = element.value().attr("lang")?.trim();
    if lang.is_empty() {
        None
    } else {
        Some(lang.to_owned())
    }
}
/// Resolves the `href` of the first element matching `query` against `base_url`.
///
/// Returns `None` when the query is invalid, nothing matches, or the first
/// match has no `href` attribute.
fn extract_link_href(html: &Html, query: &str, base_url: Option<&Url>) -> Option<String> {
    let matcher = selector(query)?;
    let first_match = html.select(&matcher).next()?;
    let href = first_match.value().attr("href")?;
    Some(resolve_url(href, base_url))
}
fn extract_meta_content(html: &Html, query: &str) -> Option<String> {
let selector = selector(query)?;
html.select(&selector)
.next()
.and_then(|node| node.value().attr("content"))
.map(clean_text)
.filter(|text| !text.is_empty())
}
/// Returns the text of the first element matching `query`, if it is non-empty.
fn first_text(html: &Html, query: &str) -> Option<String> {
    let matcher = selector(query)?;
    let first_match = html.select(&matcher).next()?;
    let text = element_text(&first_match);
    if text.is_empty() {
        None
    } else {
        Some(text)
    }
}
/// Extracts the content nodes under `root`, without leading or trailing spacers.
fn extract_nodes(
    root: &ElementRef<'_>,
    base_url: Option<&Url>,
    layout_rules: &LayoutRules,
) -> Vec<HtmlNode> {
    let mut collected = Vec::new();
    extract_element_nodes(root, base_url, layout_rules, &mut collected);
    // Spacers are only meaningful between content, never at the edges.
    trim_layout_spacers(&mut collected);
    collected
}
fn extract_element_nodes(
element: &ElementRef<'_>,
base_url: Option<&Url>,
layout_rules: &LayoutRules,
nodes: &mut Vec<HtmlNode>,
) {
let tag = element.value().name();
if tag == "br" {
push_spacer(nodes, 1);
return;
}
if is_boilerplate_container(element) && !is_primary_content_container(element) {
return;
}
if is_content_element(tag) {
let spacing = layout_spacing(element, layout_rules);
push_spacer(nodes, spacing.before);
if let Some(node) = html_node_from_element(element, base_url) {
nodes.push(node);
}
push_spacer(nodes, spacing.after);
return;
}
let spacing = is_layout_boundary(tag).then(|| layout_spacing(element, layout_rules));
if let Some(spacing) = spacing {
push_spacer(nodes, spacing.before);
}
for child in element.children() {
if let Some(child_element) = ElementRef::wrap(child) {
extract_element_nodes(&child_element, base_url, layout_rules, nodes);
}
}
if let Some(spacing) = spacing {
push_spacer(nodes, spacing.after);
}
}
/// Converts a single content element into an `HtmlNode`.
///
/// Returns `None` for unsupported tags and for elements with nothing worth
/// keeping: empty headings/paragraphs/quotes, anchor-only or boilerplate
/// paragraphs, empty lists, blank code, and tables without rows.
fn html_node_from_element(element: &ElementRef<'_>, base_url: Option<&Url>) -> Option<HtmlNode> {
    let tag = element.value().name();
    match tag {
        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
            let text = element_text(element);
            if text.is_empty() {
                return None;
            }
            Some(HtmlNode::Heading {
                level: heading_level(tag),
                text,
            })
        }
        "p" => {
            let text = element_text(element);
            let keep = !text.is_empty()
                && !is_anchor_only_paragraph(element, &text)
                && !is_boilerplate_paragraph(&text);
            if keep {
                Some(HtmlNode::Paragraph(text))
            } else {
                None
            }
        }
        "blockquote" => {
            let text = element_text(element);
            if text.is_empty() {
                None
            } else {
                Some(HtmlNode::Paragraph(format!("> {text}")))
            }
        }
        "ul" | "ol" => {
            let items = list_items(element);
            if items.is_empty() {
                None
            } else {
                Some(HtmlNode::List {
                    ordered: tag == "ol",
                    items,
                })
            }
        }
        "pre" | "code" => {
            let code = code_text(element);
            if code.trim().is_empty() {
                None
            } else {
                Some(HtmlNode::CodeBlock {
                    language: code_language(element),
                    code,
                })
            }
        }
        "table" => {
            let rows = table_rows(element);
            if rows.is_empty() {
                None
            } else {
                Some(HtmlNode::Table { rows })
            }
        }
        "img" => {
            let src = element
                .value()
                .attr("src")
                .map(|src| resolve_url(src, base_url));
            Some(HtmlNode::Image {
                alt: image_alt(element),
                src,
            })
        }
        "form" => Some(HtmlNode::Form(extract_form(element, base_url))),
        _ => None,
    }
}
/// Collects navigation, aside, footer, related, and comment regions that lie
/// entirely outside the main content root.
///
/// Elements that are the main root itself, contain it, or are contained by it
/// are ignored. Regions that yield no nodes are dropped.
fn extract_secondary_sections(
    html: &Html,
    main_root: &ElementRef<'_>,
    base_url: Option<&Url>,
    layout_rules: &LayoutRules,
) -> Vec<HtmlNode> {
    const SECONDARY_QUERY: &str = "nav, aside, footer, [role=\"navigation\"], [role=\"contentinfo\"], .sidebar, .related, #related, .comments, #comments";

    let matcher = match selector(SECONDARY_QUERY) {
        Some(matcher) => matcher,
        None => return Vec::new(),
    };

    let mut sections = Vec::new();
    for candidate in html.select(&matcher) {
        let overlaps_main = is_descendant_of(&candidate, main_root)
            || is_descendant_of(main_root, &candidate)
            || candidate.id() == main_root.id();
        if overlaps_main {
            continue;
        }
        if let Some(section) = secondary_section_from_element(&candidate, base_url, layout_rules) {
            sections.push(section);
        }
    }
    sections
}
/// Builds a collapsed `HtmlNode::Section` from a secondary region.
///
/// When regular node extraction yields nothing, the region's links are used
/// instead. Returns `None` if the section would still be empty.
fn secondary_section_from_element(
    element: &ElementRef<'_>,
    base_url: Option<&Url>,
    layout_rules: &LayoutRules,
) -> Option<HtmlNode> {
    let mut nodes = extract_nodes(element, base_url, layout_rules);
    if nodes.is_empty() {
        // Fall back to a plain list of the region's links.
        for link in extract_links(element, base_url) {
            nodes.push(HtmlNode::Link(link));
        }
    }
    trim_layout_spacers(&mut nodes);
    if nodes.is_empty() {
        return None;
    }
    Some(HtmlNode::Section {
        role: section_role(element),
        title: section_title(element),
        collapsed: true,
        nodes,
    })
}
fn is_descendant_of(element: &ElementRef<'_>, ancestor: &ElementRef<'_>) -> bool {
let mut parent = element.parent();
while let Some(node) = parent {
if node.id() == ancestor.id() {
return true;
}
parent = node.parent();
}
false
}
/// Classifies a secondary region.
///
/// Precedence: id/class names containing `comment`, then `related`, then the
/// tag name (`nav`, `aside`, `footer`), then the `role` attribute
/// (`navigation`, `contentinfo`), then id/class names containing `side` or
/// `rail`. Anything else is `Unknown`.
fn section_role(element: &ElementRef<'_>) -> HtmlSectionRole {
    let attrs = element.value();
    let names: Vec<String> = attrs
        .attr("id")
        .into_iter()
        .chain(attrs.attr("class"))
        .flat_map(str::split_whitespace)
        .map(str::to_ascii_lowercase)
        .collect();
    let any_name_contains = |needle: &str| names.iter().any(|name| name.contains(needle));

    if any_name_contains("comment") {
        HtmlSectionRole::Comments
    } else if any_name_contains("related") {
        HtmlSectionRole::Related
    } else {
        match (attrs.name(), attrs.attr("role")) {
            ("nav", _) => HtmlSectionRole::Navigation,
            ("aside", _) => HtmlSectionRole::Aside,
            ("footer", _) => HtmlSectionRole::Footer,
            (_, Some("navigation")) => HtmlSectionRole::Navigation,
            (_, Some("contentinfo")) => HtmlSectionRole::Footer,
            _ if any_name_contains("side") || any_name_contains("rail") => HtmlSectionRole::Aside,
            _ => HtmlSectionRole::Unknown,
        }
    }
}
/// Derives a title for a secondary region.
///
/// Uses the `aria-label` attribute (or `title` when `aria-label` is absent) if
/// its cleaned text is non-empty; otherwise the text of the first heading
/// inside the region.
fn section_title(element: &ElementRef<'_>) -> Option<String> {
    let attrs = element.value();
    let labelled = attrs
        .attr("aria-label")
        .or_else(|| attrs.attr("title"))
        .map(clean_text)
        .filter(|title| !title.is_empty());
    if labelled.is_some() {
        return labelled;
    }

    let headings = selector("h1, h2, h3, h4, h5, h6")?;
    let first_heading = element.select(&headings).next()?;
    let title = element_text(&first_heading);
    if title.is_empty() {
        None
    } else {
        Some(title)
    }
}
/// Blank lines to emit around an element; both values default to zero.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
struct LayoutSpacing {
// Blank lines before the element.
before: u8,
// Blank lines after the element.
after: u8,
}
/// Spacing rules gathered from the document's `<style>` elements.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
struct LayoutRules {
// Rules in the order they were parsed.
rules: Vec<LayoutRule>,
}
/// One stylesheet rule reduced to the spacing it implies.
#[derive(Debug, Clone, PartialEq, Eq)]
struct LayoutRule {
// Which elements the rule applies to.
selector: LayoutSelector,
// Spacing derived from the rule's declarations.
spacing: LayoutSpacing,
}
/// The simple selector forms understood by the layout heuristics.
#[derive(Debug, Clone, PartialEq, Eq)]
enum LayoutSelector {
/// Matches elements by tag name (stored lower-cased).
Tag(String),
/// Matches elements carrying the given class.
Class(String),
/// Matches the element with the given `id`.
Id(String),
}
/// Returns `true` for tags that are converted directly into a content node
/// rather than being descended into.
fn is_content_element(tag: &str) -> bool {
    const CONTENT_TAGS: [&str; 15] = [
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "p",
        "blockquote",
        "ul",
        "ol",
        "pre",
        "code",
        "table",
        "img",
        "form",
    ];
    CONTENT_TAGS.contains(&tag)
}
/// Returns `true` for container tags that may contribute spacing around their children.
fn is_layout_boundary(tag: &str) -> bool {
    const BOUNDARY_TAGS: [&str; 6] = ["article", "section", "header", "footer", "aside", "div"];
    BOUNDARY_TAGS.contains(&tag)
}
fn layout_spacing(element: &ElementRef<'_>, rules: &LayoutRules) -> LayoutSpacing {
let mut spacing = default_layout_spacing(element.value().name());
if has_layout_hint_name(element) {
spacing.before = spacing.before.max(1);
spacing.after = spacing.after.max(1);
}
for rule in rules.rules_for(element) {
spacing.before = spacing.before.max(rule.spacing.before);
spacing.after = spacing.after.max(rule.spacing.after);
}
if let Some(style) = element.value().attr("style") {
let inline_spacing = spacing_from_declarations(style);
spacing.before = spacing.before.max(inline_spacing.before);
spacing.after = spacing.after.max(inline_spacing.after);
}
LayoutSpacing {
before: spacing.before.min(MAX_LAYOUT_SPACER_LINES),
after: spacing.after.min(MAX_LAYOUT_SPACER_LINES),
}
}
/// Default spacing for a tag: one line on each side of sectioning elements
/// (`article`, `section`, `header`, `footer`, `aside`), none otherwise.
fn default_layout_spacing(tag: &str) -> LayoutSpacing {
    let is_sectioning = matches!(tag, "article" | "section" | "header" | "footer" | "aside");
    let lines = u8::from(is_sectioning);
    LayoutSpacing {
        before: lines,
        after: lines,
    }
}
/// Returns `true` when any id or class name of `element` contains a
/// layout-related keyword (ASCII case-insensitive).
fn has_layout_hint_name(element: &ElementRef<'_>) -> bool {
    const LAYOUT_HINTS: [&str; 8] = [
        "section", "hero", "intro", "outro", "spacer", "block", "panel", "card",
    ];

    let attrs = element.value();
    let names = attrs
        .attr("id")
        .into_iter()
        .chain(attrs.attr("class"))
        .flat_map(str::split_whitespace);
    for name in names {
        let lowered = name.to_ascii_lowercase();
        if LAYOUT_HINTS.iter().any(|hint| lowered.contains(*hint)) {
            return true;
        }
    }
    false
}
/// Returns `true` when `element` looks like a primary content container:
/// a `main`/`article` tag, `role="main"`, or an id/class name containing a
/// content-related keyword (ASCII case-insensitive).
fn is_primary_content_container(element: &ElementRef<'_>) -> bool {
    const CONTENT_KEYWORDS: [&str; 5] = ["content", "article", "post", "entry", "markdown"];

    let attrs = element.value();
    if matches!(attrs.name(), "main" | "article") || attrs.attr("role") == Some("main") {
        return true;
    }

    let names = attrs
        .attr("id")
        .into_iter()
        .chain(attrs.attr("class"))
        .flat_map(str::split_whitespace);
    for name in names {
        let lowered = name.to_ascii_lowercase();
        if CONTENT_KEYWORDS
            .iter()
            .any(|keyword| lowered.contains(*keyword))
        {
            return true;
        }
    }
    false
}
/// Returns `true` when `element` looks like page chrome rather than content:
/// a `nav`/`footer` tag, or an id/class name containing a boilerplate keyword
/// (ASCII case-insensitive).
fn is_boilerplate_container(element: &ElementRef<'_>) -> bool {
    const BOILERPLATE_KEYWORDS: [&str; 14] = [
        "nav",
        "menu",
        "footer",
        "header",
        "sidebar",
        "related",
        "breadcrumb",
        "cookie",
        "newsletter",
        "subscribe",
        "promo",
        "advert",
        "social",
        "share",
    ];

    let attrs = element.value();
    if matches!(attrs.name(), "nav" | "footer") {
        return true;
    }

    let names = attrs
        .attr("id")
        .into_iter()
        .chain(attrs.attr("class"))
        .flat_map(str::split_whitespace);
    for name in names {
        let lowered = name.to_ascii_lowercase();
        if BOILERPLATE_KEYWORDS
            .iter()
            .any(|keyword| lowered.contains(*keyword))
        {
            return true;
        }
    }
    false
}
/// Returns `true` for short paragraphs (at most 180 characters) that contain a
/// typical boilerplate phrase, compared ASCII case-insensitively.
fn is_boilerplate_paragraph(text: &str) -> bool {
    const BOILERPLATE_PHRASES: [&str; 7] = [
        "sign up for",
        "subscribe",
        "cookie policy",
        "all rights reserved",
        "follow us",
        "share this",
        "advertisement",
    ];

    // Longer paragraphs are assumed to be real content even if a phrase appears.
    if text.chars().count() > 180 {
        return false;
    }
    let lowered = text.to_ascii_lowercase();
    BOILERPLATE_PHRASES
        .iter()
        .any(|phrase| lowered.contains(*phrase))
}
impl LayoutRules {
    /// Iterates over the rules whose selector matches `element`, in parse order.
    fn rules_for<'a>(
        &'a self,
        element: &'a ElementRef<'_>,
    ) -> impl Iterator<Item = &'a LayoutRule> {
        let applies = move |candidate: &&'a LayoutRule| candidate.selector.matches(element);
        self.rules.iter().filter(applies)
    }
}
impl LayoutSelector {
    /// Returns `true` when this selector applies to `element`.
    ///
    /// Tag selectors compare the tag name, class selectors look for an exact
    /// match among the whitespace-separated classes, and id selectors compare
    /// the `id` attribute.
    fn matches(&self, element: &ElementRef<'_>) -> bool {
        let attrs = element.value();
        match self {
            Self::Tag(tag) => attrs.name() == tag.as_str(),
            Self::Class(class) => attrs.attr("class").map_or(false, |classes| {
                classes
                    .split_whitespace()
                    .any(|candidate| candidate == class.as_str())
            }),
            Self::Id(id) => attrs.attr("id") == Some(id.as_str()),
        }
    }
}
/// Gathers spacing rules from every `<style>` element in the document.
///
/// The text nodes of each `<style>` element are joined with a space before parsing.
fn extract_layout_rules(html: &Html) -> LayoutRules {
    let style_tags = match selector("style") {
        Some(matcher) => matcher,
        None => return LayoutRules::default(),
    };
    let rules = html
        .select(&style_tags)
        .flat_map(|style| {
            let stylesheet = style.text().collect::<Vec<_>>().join(" ");
            parse_stylesheet_rules(&stylesheet)
        })
        .collect();
    LayoutRules { rules }
}
/// Extracts spacing rules from raw stylesheet text.
///
/// The text is split on `}` into rule blocks, each split once on `{` into a
/// selector list and declarations. Blocks whose declarations imply no spacing
/// are dropped; every supported selector in the comma-separated list yields
/// one rule with the block's spacing.
fn parse_stylesheet_rules(stylesheet: &str) -> Vec<LayoutRule> {
    let mut rules = Vec::new();
    for block in stylesheet.split('}') {
        let Some((selectors, declarations)) = block.split_once('{') else {
            continue;
        };
        let spacing = spacing_from_declarations(declarations);
        if (spacing.before, spacing.after) == (0, 0) {
            continue;
        }
        rules.extend(
            selectors
                .split(',')
                .filter_map(parse_layout_selector)
                .map(|selector| LayoutRule { selector, spacing }),
        );
    }
    rules
}
/// Parses a simple selector: `.class`, `#id`, or a bare tag name.
///
/// Returns `None` for empty input, for anything containing whitespace,
/// combinators, attribute brackets, or pseudo-class colons, for a bare `.` or
/// `#`, and for tag names with characters other than ASCII alphanumerics and `-`.
fn parse_layout_selector(selector: &str) -> Option<LayoutSelector> {
    const UNSUPPORTED: [char; 9] = [' ', '\t', '\n', '\r', '>', '+', '~', '[', ':'];

    let simple = selector.trim();
    if simple.is_empty() || simple.contains(&UNSUPPORTED[..]) {
        return None;
    }

    if let Some(class) = simple.strip_prefix('.') {
        return if class.is_empty() {
            None
        } else {
            Some(LayoutSelector::Class(class.to_owned()))
        };
    }
    if let Some(id) = simple.strip_prefix('#') {
        return if id.is_empty() {
            None
        } else {
            Some(LayoutSelector::Id(id.to_owned()))
        };
    }

    let is_tag_name = simple
        .chars()
        .all(|ch| ch.is_ascii_alphanumeric() || ch == '-');
    if is_tag_name {
        Some(LayoutSelector::Tag(simple.to_ascii_lowercase()))
    } else {
        None
    }
}
fn spacing_from_declarations(declarations: &str) -> LayoutSpacing {
let mut spacing = LayoutSpacing::default();
for declaration in declarations.split(';') {
let Some((property, value)) = declaration.split_once(':') else {
continue;
};
let property = property.trim().to_ascii_lowercase();
let lines = css_length_to_lines(value.trim());
if lines == 0 {
continue;
}
match property.as_str() {
"margin-top" | "padding-top" => spacing.before = spacing.before.max(lines),
"margin-bottom" | "padding-bottom" | "gap" | "row-gap" => {
spacing.after = spacing.after.max(lines);
}
"margin" | "padding" => {
spacing.before = spacing.before.max(lines);
spacing.after = spacing.after.max(lines);
}
_ => {}
}
}
spacing
}
/// Converts a CSS length into a number of blank lines (0 to `MAX_LAYOUT_SPACER_LINES`).
///
/// Only the first number in the value is considered. Thresholds per unit:
/// * `em`/`rem`: ≥ 4 → 3, ≥ 2 → 2, otherwise 1;
/// * `px`: ≥ 48 → 3, ≥ 28 → 2, ≥ 12 → 1, otherwise 0;
/// * `vh`/`vw`: ≥ 14 → 3, ≥ 8 → 2, otherwise 1;
/// * any other unit (or none): 1.
///
/// Zero literals, values without a number, and non-positive numbers yield 0.
fn css_length_to_lines(value: &str) -> u8 {
    /// Three-step scale shared by the relative units.
    fn tiered(number: f32, high: f32, mid: f32) -> u8 {
        if number >= high {
            3
        } else if number >= mid {
            2
        } else {
            1
        }
    }

    let normalized = value.trim().to_ascii_lowercase();
    let is_zero_literal = normalized == "0"
        || ["0px", "0rem", "0em"]
            .iter()
            .any(|prefix| normalized.starts_with(*prefix));
    if is_zero_literal {
        return 0;
    }

    let number = match first_css_number(&normalized) {
        Some(number) => number,
        None => return 0,
    };
    if number <= 0.0 {
        return 0;
    }

    // `contains("em")` also covers `rem`.
    let lines: u8 = if normalized.contains("em") {
        tiered(number, 4.0, 2.0)
    } else if normalized.contains("px") {
        if number >= 12.0 {
            tiered(number, 48.0, 28.0)
        } else {
            0
        }
    } else if normalized.contains("vh") || normalized.contains("vw") {
        tiered(number, 14.0, 8.0)
    } else {
        1
    };
    lines.min(MAX_LAYOUT_SPACER_LINES)
}
/// Extracts the first number from a CSS value, honouring a leading minus sign.
///
/// The number is the first maximal run of ASCII digits and `.` characters.
/// A `-` immediately before that run makes the result negative, unless the `-`
/// itself follows an identifier character (ASCII alphanumeric, `-`, or `_`),
/// as in `var(--space-4)`, where it is part of a name rather than a sign.
///
/// Previously the sign was dropped, so `-48px` was read as `48` and callers
/// could not tell negative lengths from positive ones.
///
/// Returns `None` when the value contains no digits or `.`, or when the run is
/// not a valid number (for example `1.2.3` or a lone `.`).
fn first_css_number(value: &str) -> Option<f32> {
    fn is_numeric(byte: u8) -> bool {
        byte.is_ascii_digit() || byte == b'.'
    }

    let bytes = value.as_bytes();
    let start = bytes.iter().position(|byte| is_numeric(*byte))?;
    let run_len = bytes[start..]
        .iter()
        .take_while(|byte| is_numeric(**byte))
        .count();
    // Digits and `.` are ASCII, so both ends fall on character boundaries.
    let magnitude: f32 = value.get(start..start + run_len)?.parse().ok()?;

    let has_minus = start > 0 && bytes[start - 1] == b'-';
    let minus_is_part_of_name = has_minus
        && bytes[..start - 1]
            .last()
            .map_or(false, |prev| prev.is_ascii_alphanumeric() || matches!(prev, b'-' | b'_'));
    if has_minus && !minus_is_part_of_name {
        Some(-magnitude)
    } else {
        Some(magnitude)
    }
}
/// Appends a spacer of `lines` blank lines, merging with a trailing spacer.
///
/// Zero-line spacers are ignored. When the last node is already a spacer it is
/// widened to the larger of the two values instead of adding a second one.
/// Spacers never exceed `MAX_LAYOUT_SPACER_LINES`.
fn push_spacer(nodes: &mut Vec<HtmlNode>, lines: u8) {
    let capped = lines.min(MAX_LAYOUT_SPACER_LINES);
    if capped == 0 {
        return;
    }
    match nodes.last_mut() {
        Some(HtmlNode::Spacer { lines: current }) => {
            *current = (*current).max(capped).min(MAX_LAYOUT_SPACER_LINES);
        }
        _ => nodes.push(HtmlNode::Spacer { lines: capped }),
    }
}
/// Removes every leading and trailing `HtmlNode::Spacer` from `nodes`.
///
/// Trailing spacers are cut with one `truncate` and leading spacers with one
/// `drain`, so the remaining elements are shifted at most once (the previous
/// loop called `remove(0)` per leading spacer, shifting the vector each time).
fn trim_layout_spacers(nodes: &mut Vec<HtmlNode>) {
    fn is_spacer(node: &HtmlNode) -> bool {
        matches!(node, HtmlNode::Spacer { .. })
    }

    // Keep everything up to and including the last non-spacer node.
    let keep_len = nodes
        .iter()
        .rposition(|node| !is_spacer(node))
        .map_or(0, |index| index + 1);
    nodes.truncate(keep_len);

    let leading = nodes.iter().take_while(|node| is_spacer(node)).count();
    drop(nodes.drain(..leading));
}
/// Builds an `HtmlForm` from a `<form>` element.
///
/// * `method` is upper-cased and defaults to `GET` when absent or blank.
/// * `action` is resolved against `base_url`; without an `action` attribute it
///   falls back to the base URL itself, or to an empty string.
/// * `name` comes from the `name` attribute, else `id`, else the literal `form`.
fn extract_form(form: &ElementRef<'_>, base_url: Option<&Url>) -> HtmlForm {
    let attrs = form.value();

    let method = attrs
        .attr("method")
        .map(clean_text)
        .filter(|method| !method.is_empty())
        .unwrap_or_else(|| "GET".to_owned())
        .to_ascii_uppercase();

    let action = match attrs.attr("action") {
        Some(target) => resolve_url(target, base_url),
        None => base_url.map(Url::to_string).unwrap_or_default(),
    };

    let raw_name = attrs.attr("name").or_else(|| attrs.attr("id"));
    let name = raw_name
        .map(clean_text)
        .filter(|name| !name.is_empty())
        .unwrap_or_else(|| "form".to_owned());

    let inputs = extract_inputs(form);
    let buttons = extract_buttons(form);
    HtmlForm {
        name,
        method,
        action,
        inputs,
        buttons,
    }
}
fn extract_inputs(form: &ElementRef<'_>) -> Vec<HtmlInput> {
let Some(selector) = selector("input[name], textarea[name], select[name]") else {
return Vec::new();
};
form.select(&selector)
.filter_map(|input| {
let name = input.value().attr("name").map(clean_text)?;
(!name.is_empty()).then_some(HtmlInput {
name,
kind: input_kind(&input),
value: input_value(&input),
required: input.value().attr("required").is_some(),
})
})
.collect()
}
/// Collects the buttons of a form: `<button>` elements plus `<input>`
/// elements of type `submit` or `button`.
///
/// A button is kept when it has a non-empty label or a `name` attribute.
/// Returns an empty list when the selector cannot be built.
fn extract_buttons(form: &ElementRef<'_>) -> Vec<HtmlButton> {
    let button_like = match selector("button, input[type=\"submit\"], input[type=\"button\"]") {
        Some(parsed) => parsed,
        None => return Vec::new(),
    };

    let mut buttons = Vec::new();
    for element in form.select(&button_like) {
        let attrs = element.value();
        let candidate = HtmlButton {
            name: attrs.attr("name").map(clean_text),
            value: attrs.attr("value").map(clean_text),
            label: button_label(&element),
        };
        if candidate.name.is_some() || !candidate.label.is_empty() {
            buttons.push(candidate);
        }
    }
    buttons
}
/// Names the kind of a form control.
///
/// `textarea` and `select` elements report their tag name. Any other element
/// reports its whitespace-normalised `type` attribute, or `"text"` when that
/// attribute is missing or blank.
fn input_kind(input: &ElementRef<'_>) -> String {
    let tag = input.value().name();
    if tag == "textarea" || tag == "select" {
        return tag.to_owned();
    }
    match input.value().attr("type").map(clean_text) {
        Some(kind) if !kind.is_empty() => kind,
        _ => "text".to_owned(),
    }
}
/// Reads the current value of a form control, if it has one.
///
/// * `textarea`: its text content, or `None` when that is empty.
/// * `select`: the value of the first option carrying `selected`; only when
///   no option is marked `selected` does the first option supply the value.
/// * anything else: the whitespace-normalised `value` attribute.
fn input_value(input: &ElementRef<'_>) -> Option<String> {
    match input.value().name() {
        "textarea" => Some(element_text(input)).filter(|value| !value.is_empty()),
        "select" => {
            let chosen = selector("option[selected]")
                .and_then(|parsed| input.select(&parsed).next());
            match chosen {
                // An explicitly selected option is authoritative: if it has no
                // usable value, reporting a different option's value would
                // describe a choice the page did not make.
                Some(option) => option_value(&option),
                None => selector("option")
                    .and_then(|parsed| input.select(&parsed).next())
                    .and_then(|option| option_value(&option)),
            }
        }
        _ => input.value().attr("value").map(clean_text),
    }
}
fn option_value(option: &ElementRef<'_>) -> Option<String> {
option
.value()
.attr("value")
.map(clean_text)
.or_else(|| Some(element_text(option)))
.filter(|value| !value.is_empty())
}
/// Picks a human-readable label for a button.
///
/// Preference order: the element's text, then its non-blank `value`
/// attribute, then the literal `"submit"`.
fn button_label(button: &ElementRef<'_>) -> String {
    let visible = element_text(button);
    if visible.is_empty() {
        match button.value().attr("value").map(clean_text) {
            Some(value) if !value.is_empty() => value,
            _ => "submit".to_owned(),
        }
    } else {
        visible
    }
}
/// Reports whether a paragraph's text consists solely of its link texts.
///
/// The non-empty texts of all `a[href]` descendants are joined with single
/// spaces and compared with `paragraph_text`. Returns `false` when there is
/// no link text or the selector cannot be built.
fn is_anchor_only_paragraph(element: &ElementRef<'_>, paragraph_text: &str) -> bool {
    let anchors = match selector("a[href]") {
        Some(parsed) => parsed,
        None => return false,
    };

    let mut combined = String::new();
    for link in element.select(&anchors) {
        let text = element_text(&link);
        if text.is_empty() {
            continue;
        }
        // Only non-empty texts are appended, so a non-empty buffer means a
        // previous text exists and needs a separator.
        if !combined.is_empty() {
            combined.push(' ');
        }
        combined.push_str(&text);
    }

    !combined.is_empty() && combined == paragraph_text
}
/// Converts a heading tag name such as `h3` into its level.
///
/// Anything that is not `h` followed by a number from 1 to 6 yields level 1.
fn heading_level(tag: &str) -> u8 {
    match tag.strip_prefix('h').map(str::parse::<u8>) {
        Some(Ok(level @ 1..=6)) => level,
        _ => 1,
    }
}
/// Collects every `a[href]` link under `root` that has visible text.
///
/// Each `href` is resolved with `resolve_url` against `base_url`. Links with
/// empty text are skipped. Returns an empty list when the selector cannot be
/// built.
fn extract_links(root: &ElementRef<'_>, base_url: Option<&Url>) -> Vec<HtmlLink> {
    let anchors = match selector("a[href]") {
        Some(parsed) => parsed,
        None => return Vec::new(),
    };

    let mut links = Vec::new();
    for anchor in root.select(&anchors) {
        let Some(href) = anchor.value().attr("href") else {
            continue;
        };
        let text = element_text(&anchor);
        if text.is_empty() {
            continue;
        }
        links.push(HtmlLink {
            text,
            href: resolve_url(href, base_url),
        });
    }
    links
}
/// Extracts the text of a table as rows of cells.
///
/// Every `tr` under `table` contributes the non-empty texts of its `th` and
/// `td` cells; rows left without any cell are dropped. Returns an empty list
/// when a selector cannot be built.
fn table_rows(table: &ElementRef<'_>) -> Vec<Vec<String>> {
    let (Some(rows), Some(cells)) = (selector("tr"), selector("th, td")) else {
        return Vec::new();
    };

    let mut grid = Vec::new();
    for row in table.select(&rows) {
        let mut texts = Vec::new();
        for cell in row.select(&cells) {
            let text = element_text(&cell);
            if !text.is_empty() {
                texts.push(text);
            }
        }
        if !texts.is_empty() {
            grid.push(texts);
        }
    }
    grid
}
/// Returns the texts of the `li` elements that are direct children of `list`.
///
/// Items belonging to nested lists are excluded by comparing each item's
/// parent with `list`. Items with empty text are skipped. Returns an empty
/// list when the selector cannot be built.
fn list_items(list: &ElementRef<'_>) -> Vec<String> {
    let entries = match selector("li") {
        Some(parsed) => parsed,
        None => return Vec::new(),
    };

    let mut texts = Vec::new();
    for entry in list.select(&entries) {
        let is_direct_child = matches!(
            entry.parent().and_then(ElementRef::wrap),
            Some(parent) if parent.id() == list.id()
        );
        if !is_direct_child {
            continue;
        }
        let text = element_text(&entry);
        if !text.is_empty() {
            texts.push(text);
        }
    }
    texts
}
/// Chooses a textual stand-in for an image.
///
/// Uses the first non-blank value among the `alt` and `title` attributes
/// (whitespace-normalised), or the literal `"image"` when neither has one.
fn image_alt(image: &ElementRef<'_>) -> String {
    let non_blank_attr = |key: &str| {
        image
            .value()
            .attr(key)
            .map(clean_text)
            .filter(|text| !text.is_empty())
    };

    // Each attribute is checked for blankness on its own, so `alt=""` does
    // not prevent a meaningful `title` from being used.
    non_blank_attr("alt")
        .or_else(|| non_blank_attr("title"))
        .unwrap_or_else(|| "image".to_owned())
}
/// Detects the language of a code block from its CSS classes.
///
/// The element's own classes are checked first; if none names a language
/// (see `language_from_class`), the classes of its `code` descendants are
/// checked in document order.
fn code_language(element: &ElementRef<'_>) -> Option<String> {
    // Finds the first language-bearing class in a `class` attribute value.
    fn language_in(class_attr: Option<&str>) -> Option<String> {
        class_attr
            .unwrap_or_default()
            .split_whitespace()
            .find_map(language_from_class)
    }

    if let Some(language) = language_in(element.value().attr("class")) {
        return Some(language);
    }

    let nested_code = selector("code")?;
    element
        .select(&nested_code)
        .find_map(|code| language_in(code.value().attr("class")))
}
/// Extracts a language name from a single CSS class.
///
/// Recognises the `language-` and `lang-` prefixes and returns what follows
/// them. Returns `None` for other classes or when nothing follows the prefix.
fn language_from_class(class: &str) -> Option<String> {
    let language = match class.strip_prefix("language-") {
        Some(rest) => rest,
        None => class.strip_prefix("lang-")?,
    };
    if language.is_empty() {
        None
    } else {
        Some(language.to_owned())
    }
}
/// Turns an attribute value into a URL string.
///
/// The trimmed input is returned in parsed form when it is already an
/// absolute URL. Otherwise it is joined onto `base_url`; if there is no base
/// or the join fails, the trimmed input is returned unchanged.
fn resolve_url(input: &str, base_url: Option<&Url>) -> String {
    let candidate = input.trim();
    match Url::parse(candidate) {
        Ok(absolute) => absolute.to_string(),
        Err(_) => match base_url.map(|base| base.join(candidate)) {
            Some(Ok(joined)) => joined.to_string(),
            _ => candidate.to_owned(),
        },
    }
}
/// Flattens a node list into one whitespace-normalised string of body text.
///
/// Headings, paragraphs, code blocks, list items, link texts and nested
/// sections (recursively) contribute text. Forms, spacers and every other
/// node kind contribute nothing.
fn body_text_from_nodes(nodes: &[HtmlNode]) -> String {
    // Pieces are appended with a trailing space; `clean_text` collapses the
    // separators and trims the ends at the very end.
    let mut buffer = String::new();
    for node in nodes {
        match node {
            HtmlNode::Heading { text, .. }
            | HtmlNode::Paragraph(text)
            | HtmlNode::CodeBlock { code: text, .. } => {
                buffer.push_str(text);
                buffer.push(' ');
            }
            HtmlNode::List { items, .. } => {
                for item in items {
                    buffer.push_str(item);
                    buffer.push(' ');
                }
            }
            HtmlNode::Link(link) => {
                buffer.push_str(&link.text);
                buffer.push(' ');
            }
            HtmlNode::Section {
                nodes: children, ..
            } => {
                buffer.push_str(&body_text_from_nodes(children));
                buffer.push(' ');
            }
            _ => {}
        }
    }
    clean_text(buffer)
}
/// Returns the text of an element and its descendants with all whitespace
/// runs collapsed to single spaces and the ends trimmed.
fn element_text(element: &ElementRef<'_>) -> String {
    let mut raw = String::new();
    for fragment in element.text() {
        raw.push_str(fragment);
        // Separate fragments; `clean_text` collapses any surplus spaces.
        raw.push(' ');
    }
    clean_text(raw)
}
/// Returns the text of an element and its descendants exactly as written,
/// concatenated without any whitespace changes.
fn code_text(element: &ElementRef<'_>) -> String {
    let mut verbatim = String::new();
    verbatim.extend(element.text());
    verbatim
}
/// Normalises whitespace: every run of whitespace becomes a single space and
/// leading and trailing whitespace is removed.
fn clean_text(input: impl AsRef<str>) -> String {
    let source = input.as_ref();
    let mut cleaned = String::with_capacity(source.len());
    for word in source.split_whitespace() {
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}
// Unit tests for HTML parsing (`parse_html`) and index-manifest discovery and
// validation. Each test feeds a small inline document or manifest to the
// public entry points and checks the resulting structure.
#[cfg(test)]
mod tests {
use super::{
HtmlNode, HtmlSectionRole, IndexDateStyle, discover_index_manifest_link_from_html,
discover_index_manifest_link_from_http_link_header, parse_html, parse_index_manifest,
well_known_index_manifest_url,
};
// The `<title>` element supplies the document title.
#[test]
fn extracts_title() {
let doc = parse_html("<html><title>Hello</title><body></body></html>");
assert_eq!(doc.title.as_deref(), Some("Hello"));
}
// Headings are reported in document order with their numeric level.
#[test]
fn extracts_headings_with_levels() {
let doc = parse_html("<main><h1>Main</h1><h2>Sub</h2></main>");
assert_eq!(doc.headings.len(), 2);
assert_eq!(doc.headings[0].level, 1);
assert_eq!(doc.headings[0].text, "Main");
assert_eq!(doc.headings[1].level, 2);
assert_eq!(doc.headings[1].text, "Sub");
}
// Only the link inside `<main>` is listed; the `<nav>` link is not.
#[test]
fn extracts_links_from_main_content() {
let doc = parse_html(
r#"<nav><a href="/noise">Noise</a></nav><main><a href="https://example.com">Example</a></main>"#,
);
assert_eq!(doc.links.len(), 1);
assert_eq!(doc.links[0].text, "Example");
assert_eq!(doc.links[0].href, "https://example.com/");
}
// Main content comes first; nav, aside and footer become collapsed sections
// with the Navigation, Related and Footer roles.
#[test]
fn prioritizes_explicit_main_and_collapses_secondary_regions() {
let doc = parse_html(
r#"
<body>
<nav aria-label="Site"><a href="/docs">Docs</a><a href="/about">About</a></nav>
<main><h1>Main Article</h1><p>Readable body.</p></main>
<aside class="related"><h2>Related</h2><a href="/next">Next</a></aside>
<footer><a href="/license">License</a></footer>
</body>
"#,
);
assert!(matches!(
doc.nodes.first(),
Some(HtmlNode::Heading { text, .. }) if text == "Main Article"
));
assert!(doc.nodes.iter().any(|node| matches!(
node,
HtmlNode::Section {
role: HtmlSectionRole::Navigation,
title: Some(title),
collapsed: true,
nodes
} if title == "Site" && nodes.len() == 2
)));
assert!(doc.nodes.iter().any(|node| matches!(
node,
HtmlNode::Section {
role: HtmlSectionRole::Related,
collapsed: true,
..
}
)));
assert!(doc.nodes.iter().any(|node| matches!(
node,
HtmlNode::Section {
role: HtmlSectionRole::Footer,
collapsed: true,
..
}
)));
}
// A `<main>` that only holds navigation-style links loses to a text-dense
// `<article>`: the article heading comes first and "Pricing" is absent from
// the body text.
#[test]
fn main_root_scoring_prefers_dense_article_over_chrome_like_main() {
let doc = parse_html(
r#"
<body>
<main class="top-nav">
<a href="/home">Home</a>
<a href="/pricing">Pricing</a>
<a href="/docs">Docs</a>
</main>
<article id="story">
<h1>Deep Story</h1>
<p>This paragraph carries the actual article payload for readers.</p>
<p>Second paragraph keeps the dense main-content region obvious.</p>
</article>
</body>
"#,
);
assert!(matches!(
doc.nodes.first(),
Some(HtmlNode::Heading { text, .. }) if text == "Deep Story"
));
assert!(!doc.body_text.contains("Pricing"));
}
// Newsletter and related containers inside `<main>` do not reach the body
// text, while the primary paragraph is kept.
#[test]
fn suppresses_boilerplate_containers_and_paragraphs_inside_main() {
let doc = parse_html(
r#"
<main>
<p>Primary body remains.</p>
<div class="newsletter signup">
<p>Sign up for updates and subscribe.</p>
</div>
<div class="related">
<a href="/related">Related link</a>
</div>
</main>
"#,
);
assert!(doc.nodes.iter().any(
|node| matches!(node, HtmlNode::Paragraph(text) if text == "Primary body remains.")
));
assert!(!doc.body_text.contains("Sign up for updates"));
assert!(!doc.body_text.contains("Related link"));
}
// A `<br>` between two paragraphs produces a one-line spacer node.
#[test]
fn preserves_br_boundaries_as_spacing_hints() {
let doc = parse_html("<main><p>First line.</p><br><p>Second line.</p></main>");
assert!(
doc.nodes
.iter()
.any(|node| matches!(node, HtmlNode::Spacer { lines: 1 }))
);
}
// Code blocks (with language), lists, tables and images each produce their
// own node kind.
#[test]
fn extracts_structured_nodes() {
let doc = parse_html(
r#"
<main>
<pre><code class="language-rust">fn main() {}</code></pre>
<ul><li>Read</li><li>Search</li></ul>
<table><tr><th>Name</th></tr><tr><td>Index</td></tr></table>
<img src="/logo.png" alt="Index logo">
</main>
"#,
);
assert!(matches!(
doc.nodes.first(),
Some(HtmlNode::CodeBlock {
language: Some(language),
..
}) if language == "rust"
));
assert!(
doc.nodes
.iter()
.any(|node| matches!(node, HtmlNode::List { ordered: false, items } if items == &vec!["Read".to_owned(), "Search".to_owned()]))
);
assert!(
doc.nodes
.iter()
.any(|node| matches!(node, HtmlNode::Table { rows } if rows.len() == 2))
);
assert!(
doc.nodes
.iter()
.any(|node| matches!(node, HtmlNode::Image { alt, .. } if alt == "Index logo"))
);
}
// Text inside `<pre>` and standalone `<code>` keeps its newlines and spaces.
#[test]
fn preserves_pre_and_code_text_whitespace() {
let doc = parse_html(
"<main><pre><code class=\"language-rust\">fn main() {\n println!(\"hi\");\n}</code></pre><code> raw\n block</code></main>",
);
assert!(doc.nodes.iter().any(|node| matches!(
node,
HtmlNode::CodeBlock { language: Some(language), code }
if language == "rust" && code == "fn main() {\n println!(\"hi\");\n}"
)));
assert!(doc.nodes.iter().any(|node| matches!(
node,
HtmlNode::CodeBlock { language: None, code } if code == " raw\n block"
)));
}
// A blockquote becomes a paragraph prefixed with "> ".
#[test]
fn preserves_blockquote_as_quoted_paragraph() {
let doc = parse_html("<main><blockquote>Quoted reply text.</blockquote></main>");
assert!(doc.nodes.iter().any(|node| matches!(
node,
HtmlNode::Paragraph(text) if text == "> Quoted reply text."
)));
}
// Stylesheet margins and paddings yield spacers between 1 and 3 lines,
// including a 3-line spacer, and no spacer sits at either end of the node list.
#[test]
fn extracts_bounded_layout_spacing_from_css_and_block_boundaries() {
let doc = parse_html(
r#"
<html lang="en-US">
<head>
<style>
.hero { margin-bottom: 60px; }
.chapter { padding-top: 2rem; padding-bottom: 6rem; }
p { margin-bottom: 1rem; }
</style>
</head>
<main>
<section class="hero"><h1>Landing</h1><p>Intro.</p></section>
<section class="chapter"><p>Chapter body.</p></section>
</main>
</html>
"#,
);
assert!(
doc.nodes
.iter()
.any(|node| matches!(node, HtmlNode::Spacer { lines } if (1..=3).contains(lines)))
);
assert!(
doc.nodes
.iter()
.any(|node| matches!(node, HtmlNode::Spacer { lines: 3 }))
);
assert!(matches!(
doc.nodes.first(),
Some(HtmlNode::Heading { text, .. }) if text == "Landing"
));
assert!(matches!(
doc.nodes.last(),
Some(HtmlNode::Paragraph(text)) if text == "Chapter body."
));
}
// Metadata fields are read from `<head>`, and relative canonical, link and
// image URLs are resolved against the `<base href>`.
#[test]
fn extracts_metadata_and_resolves_relative_urls_against_base() {
let doc = parse_html(
r#"
<html lang="en-US">
<head>
<base href="https://example.com/docs/">
<link rel="canonical" href="guide.html">
<meta name="description" content="A calm reader">
<meta property="og:title" content="OG Title">
<meta property="og:description" content="OG Description">
</head>
<main><a href="chapter.html">Chapter</a><img src="img.png" alt="Image"></main>
</html>
"#,
);
assert_eq!(
doc.metadata.canonical_url.as_deref(),
Some("https://example.com/docs/guide.html")
);
assert_eq!(doc.metadata.language.as_deref(), Some("en-US"));
assert_eq!(doc.metadata.description.as_deref(), Some("A calm reader"));
assert_eq!(doc.metadata.open_graph_title.as_deref(), Some("OG Title"));
assert_eq!(
doc.metadata.open_graph_description.as_deref(),
Some("OG Description")
);
assert_eq!(
doc.links.first().map(|link| link.href.as_str()),
Some("https://example.com/docs/chapter.html")
);
assert!(doc.nodes.iter().any(|node| matches!(
node,
HtmlNode::Image { src: Some(src), .. } if src == "https://example.com/docs/img.png"
)));
}
// A form is described by name, upper-cased method, resolved action, named
// inputs and buttons, and also appears as a `Form` node.
#[test]
fn extracts_forms_inputs_buttons_and_resolves_actions() {
let doc = parse_html(
r#"
<html>
<head><base href="https://example.com/docs/"></head>
<main>
<form id="search" method="get" action="../search">
<input type="search" name="q" required>
<input type="hidden" name="source" value="docs">
<button type="submit" name="go" value="1">Search</button>
</form>
</main>
</html>
"#,
);
assert_eq!(doc.forms.len(), 1);
let form = &doc.forms[0];
assert_eq!(form.name, "search");
assert_eq!(form.method, "GET");
assert_eq!(form.action, "https://example.com/search");
assert_eq!(form.inputs.len(), 2);
assert!(
form.inputs
.iter()
.any(|input| input.name == "q" && input.required)
);
assert!(
form.buttons
.iter()
.any(|button| { button.name.as_deref() == Some("go") && button.label == "Search" })
);
assert!(
doc.nodes
.iter()
.any(|node| matches!(node, HtmlNode::Form(form) if form.name == "search"))
);
}
// A select reports its `selected` option's value; without one it reports the
// first option, using the option text when there is no `value` attribute.
#[test]
fn extracts_select_values_for_form_inputs() {
let doc = parse_html(
r#"
<main>
<form id="filters" action="/search">
<select name="sort">
<option value="relevance">Relevance</option>
<option value="recent" selected>Most recent</option>
</select>
<select name="view">
<option>compact</option>
<option value="expanded">expanded</option>
</select>
</form>
</main>
"#,
);
let form = &doc.forms[0];
assert!(
form.inputs
.iter()
.any(|input| input.name == "sort" && input.value.as_deref() == Some("recent"))
);
assert!(
form.inputs
.iter()
.any(|input| input.name == "view" && input.value.as_deref() == Some("compact"))
);
}
// A paragraph consisting only of a link yields no paragraph node, but the
// link is still listed.
#[test]
fn drops_anchor_only_paragraph_nodes_but_keeps_links() {
let doc = parse_html(r#"<main><p><a href="https://example.com">Read more</a></p></main>"#);
assert!(!doc.nodes.iter().any(|node| matches!(
node,
HtmlNode::Paragraph(text) if text == "Read more"
)));
assert_eq!(doc.links.len(), 1);
}
// Unclosed tags are tolerated: parsing completes, the title is
// "Broken Still readable link" and the body text is not empty.
#[test]
fn malformed_html_does_not_panic() {
let doc = parse_html("<main><h1>Broken<p>Still readable<a href=\"/x\">link");
assert_eq!(doc.title.as_deref(), Some("Broken Still readable link"));
assert!(!doc.body_text.is_empty());
}
// The manifest URL can come from the well-known path, an HTML `<link>` whose
// `rel` includes `index-manifest`, or an HTTP `Link` header; relative
// references resolve against the page URL.
#[test]
fn index_manifest_discovery_supports_well_known_html_and_header_links() {
let page_url = "https://example.org/docs/guide";
assert_eq!(
well_known_index_manifest_url(page_url).as_deref(),
Some("https://example.org/.well-known/index.idx")
);
assert_eq!(
discover_index_manifest_link_from_html(
r#"<html><head><link rel="index-manifest preload" href="/manifests/site.idx"></head></html>"#,
page_url
)
.as_deref(),
Some("https://example.org/manifests/site.idx")
);
assert_eq!(
discover_index_manifest_link_from_http_link_header(
r#"</meta/index.idx>; rel="index-manifest"; type="application/json""#,
page_url
)
.as_deref(),
Some("https://example.org/meta/index.idx")
);
}
// A same-origin, in-scope manifest parses with all hint lists populated and
// the "date" style mapped to `IndexDateStyle::Date`.
#[test]
fn parse_index_manifest_validates_same_origin_scope_and_date_styles()
-> Result<(), Box<dyn std::error::Error>> {
let manifest = parse_index_manifest(
r#"{
"version": "index.idx/v1",
"scope": "/docs",
"content": { "main_selector": "main article" },
"regions": [{ "role": "related", "selector": "aside.related", "collapsed": true }],
"fields": [{ "name": "updated", "label": "Updated" }],
"forms": [{ "name": "search", "selector": "form.search", "note": "Public search" }],
"dates": [{ "field": "updated", "style": "date" }]
}"#,
"https://example.org/.well-known/index.idx",
"https://example.org/docs/guide",
)?;
assert_eq!(manifest.version, "index.idx/v1");
assert_eq!(manifest.scope, "/docs");
assert_eq!(
manifest.content.main_selector.as_deref(),
Some("main article")
);
assert_eq!(manifest.regions.len(), 1);
assert_eq!(manifest.fields.len(), 1);
assert_eq!(manifest.forms.len(), 1);
assert_eq!(manifest.dates.len(), 1);
assert_eq!(manifest.dates[0].style, IndexDateStyle::Date);
Ok(())
}
// Three rejections: a manifest served from another origin, a scope without a
// leading slash, and a page path outside the declared scope.
#[test]
fn parse_index_manifest_rejects_cross_origin_and_invalid_scope() {
let cross_origin = parse_index_manifest(
r#"{"version":"index.idx/v1"}"#,
"https://cdn.example.org/index.idx",
"https://example.org/docs/guide",
);
assert!(matches!(
cross_origin,
Err(super::IndexManifestError::CrossOrigin { .. })
));
let invalid_scope = parse_index_manifest(
r#"{"version":"index.idx/v1","scope":"docs"}"#,
"https://example.org/index.idx",
"https://example.org/docs/guide",
);
assert!(matches!(
invalid_scope,
Err(super::IndexManifestError::InvalidScope(_))
));
let out_of_scope = parse_index_manifest(
r#"{"version":"index.idx/v1","scope":"/blog"}"#,
"https://example.org/index.idx",
"https://example.org/docs/guide",
);
assert!(matches!(
out_of_scope,
Err(super::IndexManifestError::OutOfScope { .. })
));
}
}