use index_dom::{HtmlDocument, HtmlForm, HtmlLink, HtmlNode, HtmlSectionRole};
/// Reader-oriented view of a parsed HTML document, built by [`ReadablePage::from_html`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadablePage {
/// Page title: the document title, else the first heading's text, else `"Untitled"`.
pub title: String,
/// Text of each top-level [`ReadableNode::Paragraph`] in `nodes`, in order.
/// Paragraphs nested inside a [`ReadableNode::Section`] are not included.
pub paragraphs: Vec<String>,
/// Structured content converted from the document's nodes. When no node
/// converts and the document has body text, this holds a single paragraph
/// with that body text.
pub nodes: Vec<ReadableNode>,
/// Links cloned from the source document.
pub links: Vec<HtmlLink>,
/// Forms cloned from the source document.
pub forms: Vec<HtmlForm>,
/// Document-level metadata cloned from the source document.
pub metadata: ReadableMetadata,
}
/// One piece of structured page content, converted from an `HtmlNode`.
///
/// The converter in this file drops source nodes it considers empty, so the
/// notes on each variant describe what it lets through.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ReadableNode {
/// A heading; only produced when the heading text is non-empty.
Heading {
/// Heading level (presumably 1-6, matching `<h1>`..`<h6>`; confirm against the parser).
level: u8,
/// Heading text.
text: String,
},
/// A non-empty paragraph whose text does not equal any document link's text.
Paragraph(String),
/// A link with non-empty text.
Link(HtmlLink),
/// A list with at least one item.
List {
/// `true` for an ordered list, `false` otherwise.
ordered: bool,
/// Text of each list item.
items: Vec<String>,
},
/// A non-empty block of code.
CodeBlock {
/// Language tag, when the source provided one.
language: Option<String>,
/// The code text.
code: String,
},
/// A table with at least one row.
Table {
/// Cell text, one inner vector per row.
rows: Vec<Vec<String>>,
},
/// Vertical spacing; only produced when `lines` is greater than zero.
Spacer {
/// Size of the gap (presumably in blank lines; confirm against the renderer).
lines: u8,
},
/// A grouped region of the page; only produced when at least one child node survives conversion.
Section {
/// Role of the region within the page.
role: ReadableSectionRole,
/// Optional label for the region.
title: Option<String>,
/// Collapsed flag copied from the source node (presumably a hint that
/// the renderer should fold the section by default; confirm).
collapsed: bool,
/// Converted child nodes.
nodes: Vec<ReadableNode>,
},
/// An image that has alt text, a source, or both.
Image {
/// Alternative text; may be empty when `src` is present.
alt: String,
/// Image source, when present.
src: Option<String>,
},
/// A form, passed through unchanged.
Form(HtmlForm),
}
/// Role of a [`ReadableNode::Section`]; a one-to-one mirror of `HtmlSectionRole`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReadableSectionRole {
/// Mirrors `HtmlSectionRole::Main`.
Main,
/// Mirrors `HtmlSectionRole::Navigation`.
Navigation,
/// Mirrors `HtmlSectionRole::Aside`.
Aside,
/// Mirrors `HtmlSectionRole::Footer`.
Footer,
/// Mirrors `HtmlSectionRole::Comments`.
Comments,
/// Mirrors `HtmlSectionRole::Related`.
Related,
/// Mirrors `HtmlSectionRole::Unknown`.
Unknown,
}
/// Document-level metadata; each field is cloned from the matching field of
/// the source document's metadata and is `None` when the source has no value.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct ReadableMetadata {
/// Canonical URL of the page (presumably from `<link rel="canonical">`; confirm against the parser).
pub canonical_url: Option<String>,
/// Document language (the tests in this file populate it from `<html lang="...">`).
pub language: Option<String>,
/// Page description (the tests in this file populate it from `<meta name="description">`).
pub description: Option<String>,
/// Open Graph title (the tests in this file populate it from `<meta property="og:title">`).
pub open_graph_title: Option<String>,
/// Open Graph description (presumably from `<meta property="og:description">`; confirm against the parser).
pub open_graph_description: Option<String>,
}
impl ReadablePage {
    /// Converts a parsed [`HtmlDocument`] into its reader-oriented form.
    ///
    /// * The title is the document title, else the text of the first heading,
    ///   else the literal `"Untitled"`.
    /// * Every document node goes through `readable_node_from_html`; nodes
    ///   that convert to nothing are dropped. If nothing survives and the
    ///   document has body text, that text becomes the only paragraph.
    /// * `paragraphs` gathers the text of the top-level paragraph nodes.
    /// * Links, forms and metadata are cloned from the document.
    #[must_use]
    pub fn from_html(doc: &HtmlDocument) -> Self {
        let title = doc.title.clone().unwrap_or_else(|| {
            doc.headings
                .first()
                .map_or_else(|| "Untitled".to_owned(), |heading| heading.text.clone())
        });

        // Link texts let the node converter discard paragraphs that merely
        // repeat the text of a link.
        let anchor_texts: Vec<&str> = doc.links.iter().map(|link| link.text.as_str()).collect();

        let mut nodes = Vec::new();
        for html_node in &doc.nodes {
            if let Some(readable) = readable_node_from_html(html_node, &anchor_texts) {
                nodes.push(readable);
            }
        }

        // Fall back to the raw body text when no structured node survived.
        if nodes.is_empty() && !doc.body_text.is_empty() {
            nodes.push(ReadableNode::Paragraph(doc.body_text.clone()));
        }

        let mut paragraphs = Vec::new();
        for node in &nodes {
            if let ReadableNode::Paragraph(text) = node {
                paragraphs.push(text.clone());
            }
        }

        let source_meta = &doc.metadata;
        let metadata = ReadableMetadata {
            canonical_url: source_meta.canonical_url.clone(),
            language: source_meta.language.clone(),
            description: source_meta.description.clone(),
            open_graph_title: source_meta.open_graph_title.clone(),
            open_graph_description: source_meta.open_graph_description.clone(),
        };

        Self {
            title,
            paragraphs,
            nodes,
            links: doc.links.clone(),
            forms: doc.forms.clone(),
            metadata,
        }
    }

    /// Returns `true` when the page holds at least one content node.
    #[must_use]
    pub fn has_body(&self) -> bool {
        self.nodes.first().is_some()
    }
}
/// Converts one parsed HTML node into its readable counterpart.
///
/// Returns `None` for nodes with nothing to show: headings, paragraphs, links
/// and code blocks with empty text, lists without items, tables without rows,
/// zero-line spacers, images with neither alt text nor a source, sections
/// whose children all convert to nothing, and any node kind not handled here.
/// A paragraph whose text equals one of `link_texts` is also dropped.
/// Forms are always kept.
fn readable_node_from_html(node: &HtmlNode, link_texts: &[&str]) -> Option<ReadableNode> {
    let readable = match node {
        HtmlNode::Heading { level, text } if !text.is_empty() => ReadableNode::Heading {
            level: *level,
            text: text.clone(),
        },
        // The link itself still carries the text, so a paragraph that only
        // repeats it would be a duplicate.
        HtmlNode::Paragraph(text)
            if !(text.is_empty() || link_texts.contains(&text.as_str())) =>
        {
            ReadableNode::Paragraph(text.clone())
        }
        HtmlNode::Link(link) if !link.text.is_empty() => ReadableNode::Link(link.clone()),
        HtmlNode::List { ordered, items } if !items.is_empty() => ReadableNode::List {
            ordered: *ordered,
            items: items.clone(),
        },
        HtmlNode::CodeBlock { language, code } if !code.is_empty() => ReadableNode::CodeBlock {
            language: language.clone(),
            code: code.clone(),
        },
        HtmlNode::Table { rows } if !rows.is_empty() => ReadableNode::Table { rows: rows.clone() },
        HtmlNode::Spacer { lines } if *lines > 0 => ReadableNode::Spacer { lines: *lines },
        HtmlNode::Section {
            role,
            title,
            collapsed,
            nodes,
        } => {
            // Convert children recursively; a section left with no children is dropped.
            let mut children = Vec::new();
            for child in nodes {
                if let Some(converted) = readable_node_from_html(child, link_texts) {
                    children.push(converted);
                }
            }
            if children.is_empty() {
                return None;
            }
            ReadableNode::Section {
                role: readable_section_role(*role),
                title: title.clone(),
                collapsed: *collapsed,
                nodes: children,
            }
        }
        HtmlNode::Image { alt, src } if !alt.is_empty() || src.is_some() => ReadableNode::Image {
            alt: alt.clone(),
            src: src.clone(),
        },
        HtmlNode::Form(form) => ReadableNode::Form(form.clone()),
        // Reached when a guard above fails or the node kind is not handled.
        _ => return None,
    };
    Some(readable)
}
fn readable_section_role(role: HtmlSectionRole) -> ReadableSectionRole {
match role {
HtmlSectionRole::Main => ReadableSectionRole::Main,
HtmlSectionRole::Navigation => ReadableSectionRole::Navigation,
HtmlSectionRole::Aside => ReadableSectionRole::Aside,
HtmlSectionRole::Footer => ReadableSectionRole::Footer,
HtmlSectionRole::Comments => ReadableSectionRole::Comments,
HtmlSectionRole::Related => ReadableSectionRole::Related,
HtmlSectionRole::Unknown => ReadableSectionRole::Unknown,
}
}
#[cfg(test)]
mod tests {
    use super::{ReadableNode, ReadablePage, ReadableSectionRole};
    use index_dom::parse_html;

    /// Parses `markup` and converts the resulting document into a readable page.
    fn page_from(markup: &str) -> ReadablePage {
        let document = parse_html(markup);
        ReadablePage::from_html(&document)
    }

    /// Reports whether any top-level node of `page` satisfies `predicate`.
    fn has_node(page: &ReadablePage, predicate: impl Fn(&ReadableNode) -> bool) -> bool {
        page.nodes.iter().any(predicate)
    }

    /// An explicit `<title>` is used as the page title.
    #[test]
    fn uses_title_when_available() {
        let page = page_from("<title>Doc</title><main><p>Hello world.</p></main>");
        assert_eq!(page.title, "Doc");
    }

    /// Without a `<title>`, the first heading supplies the page title.
    #[test]
    fn falls_back_to_heading() {
        let page = page_from("<main><h1>Heading</h1><p>Hello world.</p></main>");
        assert_eq!(page.title, "Heading");
    }

    /// Paragraph text is exposed through `paragraphs`.
    #[test]
    fn extracts_body_paragraphs() {
        let page = page_from("<main><p>Hello world. This is readable.</p></main>");
        assert!(page.has_body());
        assert_eq!(page.paragraphs, ["Hello world. This is readable."]);
    }

    /// Headings, lists, code, tables, images and forms all survive conversion.
    #[test]
    fn preserves_structured_reader_nodes() {
        let page = page_from(
            r#"
<main>
<h2>Install</h2>
<ol><li>Install Rust</li><li>Run Index</li></ol>
<pre><code class="language-sh">cargo install index</code></pre>
<table><tr><th>Command</th></tr><tr><td>index</td></tr></table>
<img src="/logo.png" alt="Logo">
<form id="search" action="/search"><input name="q" required></form>
</main>
"#,
        );

        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Heading { level: 2, text } if text == "Install"
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::List { ordered: true, items }
                if *items == ["Install Rust", "Run Index"]
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::CodeBlock { language: Some(language), .. } if language == "sh"
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Table { rows } if rows.len() == 2
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Image { alt, .. } if alt == "Logo"
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Form(form) if form.name == "search"
        )));
    }

    /// Spacing derived from the page's styles shows up as a spacer node.
    #[test]
    fn preserves_layout_spacers() {
        let page = page_from(
            r#"
<head><style>.section { margin-bottom: 40px; }</style></head>
<main><section class="section"><p>First.</p></section><p>Second.</p></main>
"#,
        );

        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Spacer { lines } if *lines >= 1
        )));
    }

    /// Navigation is kept as a collapsed, titled section containing its link.
    #[test]
    fn preserves_collapsed_secondary_sections() {
        let page = page_from(
            r#"
<nav aria-label="Site"><a href="/docs">Docs</a></nav>
<main><p>Main body.</p></main>
"#,
        );

        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Section {
                role: ReadableSectionRole::Navigation,
                title: Some(title),
                collapsed: true,
                nodes,
            } if title == "Site"
                && matches!(nodes.first(), Some(ReadableNode::Link(link)) if link.text == "Docs")
        )));
    }

    /// Language, description and Open Graph title reach the page metadata.
    #[test]
    fn carries_metadata_forward() {
        let page = page_from(
            r#"
<html lang="en">
<head>
<meta name="description" content="Readable docs">
<meta property="og:title" content="Index">
</head>
<main><p>Body.</p></main>
</html>
"#,
        );

        let meta = &page.metadata;
        assert_eq!(meta.language.as_deref(), Some("en"));
        assert_eq!(meta.description.as_deref(), Some("Readable docs"));
        assert_eq!(meta.open_graph_title.as_deref(), Some("Index"));
    }

    /// A paragraph that only wraps a link yields no paragraph text, but the link is kept.
    #[test]
    fn drops_anchor_only_paragraphs_but_keeps_link() {
        let page =
            page_from(r#"<main><p><a href="https://example.com">Read more</a></p></main>"#);
        assert!(page.paragraphs.is_empty());
        assert_eq!(page.links.len(), 1);
    }
}