use std::collections::{BTreeSet, btree_map::Entry};
use index_core::{ButtonAction, Form, IndexDocument, IndexNode, Input, Link, SectionRole};
use index_readability::{ReadableNode, ReadablePage, ReadableSectionRole};
use url::Url;
/// Upper bound on the number of link nodes `EmitLinks` appends in one run;
/// candidates beyond this limit are only counted in a diagnostic section.
const MAX_EMITTED_LINKS: usize = 300;
/// One step of building an [`IndexDocument`] from a [`ReadablePage`].
pub trait Instruction {
/// Runs this step, reading from `page` and mutating `document` in place.
fn execute(&self, page: &ReadablePage, document: &mut IndexDocument);
}
/// Instruction that copies page-level metadata onto the document.
#[derive(Debug, Clone, Copy)]
pub struct ApplyMetadata;

impl Instruction for ApplyMetadata {
    /// Overwrites the canonical URL, language, description and both Open Graph
    /// fields of `document.metadata` with the values found on `page.metadata`.
    fn execute(&self, page: &ReadablePage, document: &mut IndexDocument) {
        let source = &page.metadata;
        let target = &mut document.metadata;
        target.canonical_url.clone_from(&source.canonical_url);
        target.language.clone_from(&source.language);
        target.description.clone_from(&source.description);
        target.open_graph_title.clone_from(&source.open_graph_title);
        target
            .open_graph_description
            .clone_from(&source.open_graph_description);
    }
}
/// Instruction that emits the page title as a heading.
#[derive(Debug, Clone, Copy)]
pub struct EmitTitle;

impl Instruction for EmitTitle {
    /// Pushes a level-1 heading carrying `page.title` onto `document`.
    fn execute(&self, page: &ReadablePage, document: &mut IndexDocument) {
        let text = page.title.clone();
        let heading = IndexNode::Heading { level: 1, text };
        document.push(heading);
    }
}
#[derive(Debug, Clone, Copy)]
pub struct EmitReadableNodes;
impl Instruction for EmitReadableNodes {
fn execute(&self, page: &ReadablePage, document: &mut IndexDocument) {
let mut skipped_title_heading = false;
for node in &page.nodes {
if !skipped_title_heading
&& matches!(
node,
ReadableNode::Heading { level: 1, text } if text == &page.title
)
{
skipped_title_heading = true;
continue;
}
match node {
ReadableNode::Heading { level, text } => document.push(IndexNode::Heading {
level: *level,
text: text.clone(),
}),
ReadableNode::Paragraph(text) => {
document.push(IndexNode::Paragraph(text.clone()));
}
ReadableNode::Link(link) => {
document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
}
ReadableNode::List { ordered, items } => {
document.push(IndexNode::List {
ordered: *ordered,
items: items.clone(),
});
}
ReadableNode::CodeBlock { language, code } => {
document.push(IndexNode::CodeBlock {
language: language.clone(),
code: code.clone(),
});
}
ReadableNode::Table { rows } => {
document.push(IndexNode::Table { rows: rows.clone() });
}
ReadableNode::Spacer { lines } => {
document.push(IndexNode::Spacer { lines: *lines });
}
ReadableNode::Section {
role,
title,
collapsed,
nodes,
} => {
document.push(IndexNode::Section {
role: section_role(*role),
title: title.clone(),
collapsed: *collapsed,
nodes: nodes.iter().map(index_node_from_readable).collect(),
});
}
ReadableNode::Image { alt, src } => {
document.push(IndexNode::Image {
alt: alt.clone(),
src: src.clone(),
});
}
ReadableNode::Form(form) => {
document.push(IndexNode::Form(Form {
name: form.name.clone(),
method: form.method.clone(),
action: form.action.clone(),
inputs: form
.inputs
.iter()
.map(|input| Input {
name: input.name.clone(),
kind: input.kind.clone(),
value: input.value.clone(),
required: input.required,
})
.collect(),
buttons: form
.buttons
.iter()
.map(|button| ButtonAction {
name: button.name.clone(),
value: button.value.clone(),
label: button.label.clone(),
})
.collect(),
}));
}
}
}
}
}
/// Converts one readable node into the matching index node.
///
/// Leaf variants are cloned field by field; sections are converted
/// recursively so nested content keeps its structure.
fn index_node_from_readable(node: &ReadableNode) -> IndexNode {
    match node {
        ReadableNode::Paragraph(body) => IndexNode::Paragraph(body.clone()),
        ReadableNode::Heading { level, text } => {
            let text = text.clone();
            IndexNode::Heading {
                level: *level,
                text,
            }
        }
        ReadableNode::Link(source) => {
            let link = Link::new(&source.text, &source.href);
            IndexNode::Link(link)
        }
        ReadableNode::Spacer { lines } => IndexNode::Spacer { lines: *lines },
        ReadableNode::Table { rows } => {
            let rows = rows.clone();
            IndexNode::Table { rows }
        }
        ReadableNode::List { ordered, items } => {
            let items = items.clone();
            IndexNode::List {
                ordered: *ordered,
                items,
            }
        }
        ReadableNode::CodeBlock { language, code } => {
            let (language, code) = (language.clone(), code.clone());
            IndexNode::CodeBlock { language, code }
        }
        ReadableNode::Image { alt, src } => {
            let (alt, src) = (alt.clone(), src.clone());
            IndexNode::Image { alt, src }
        }
        ReadableNode::Form(form) => {
            let converted = index_form_from_html(form);
            IndexNode::Form(converted)
        }
        ReadableNode::Section {
            role,
            title,
            collapsed,
            nodes: children,
        } => {
            // Children are converted with the same function, depth-first.
            let nodes = children.iter().map(index_node_from_readable).collect();
            IndexNode::Section {
                role: section_role(*role),
                title: title.clone(),
                collapsed: *collapsed,
                nodes,
            }
        }
    }
}
/// Copies an HTML form description into the index `Form` type, cloning the
/// form attributes and every input and button it lists.
fn index_form_from_html(form: &index_dom::HtmlForm) -> Form {
    let inputs = form
        .inputs
        .iter()
        .map(|source| {
            let name = source.name.clone();
            let kind = source.kind.clone();
            let value = source.value.clone();
            Input {
                name,
                kind,
                value,
                required: source.required,
            }
        })
        .collect();
    let buttons = form
        .buttons
        .iter()
        .map(|source| {
            let name = source.name.clone();
            let value = source.value.clone();
            let label = source.label.clone();
            ButtonAction { name, value, label }
        })
        .collect();
    Form {
        name: form.name.clone(),
        method: form.method.clone(),
        action: form.action.clone(),
        inputs,
        buttons,
    }
}
/// Translates a readable-layer section role into the index-layer role of the
/// same name; the mapping is one-to-one.
fn section_role(role: ReadableSectionRole) -> SectionRole {
    let mapped = match role {
        ReadableSectionRole::Unknown => SectionRole::Unknown,
        ReadableSectionRole::Related => SectionRole::Related,
        ReadableSectionRole::Comments => SectionRole::Comments,
        ReadableSectionRole::Footer => SectionRole::Footer,
        ReadableSectionRole::Aside => SectionRole::Aside,
        ReadableSectionRole::Navigation => SectionRole::Navigation,
        ReadableSectionRole::Main => SectionRole::Main,
    };
    mapped
}
/// Instruction that appends the page's links, de-duplicated and ranked.
#[derive(Debug, Clone, Copy)]
pub struct EmitLinks;

impl Instruction for EmitLinks {
    /// Appends up to `MAX_EMITTED_LINKS` link nodes taken from `page.links`.
    ///
    /// Links are keyed by `normalized_link_key`. Links with an empty key, or
    /// whose key already appears among the document's existing link nodes,
    /// are skipped. When several page links share a key, the one with the
    /// higher relevance score wins, and on equal score the longer text wins.
    /// Survivors are ordered by descending score, then by the position at
    /// which their target first appeared on the page. If the cap cuts links
    /// off, a collapsed "Diagnostic" section reporting the omitted count is
    /// appended.
    fn execute(&self, page: &ReadablePage, document: &mut IndexDocument) {
        let mut seen = BTreeSet::new();
        collect_existing_links(&document.nodes, &mut seen);

        let mut ranked = std::collections::BTreeMap::<String, RankedLink>::new();
        for (index, link) in page.links.iter().enumerate() {
            let normalized = normalized_link_key(&link.href);
            if normalized.is_empty() || seen.contains(&normalized) {
                continue;
            }
            let candidate = RankedLink {
                text: link.text.clone(),
                href: link.href.clone(),
                score: link_relevance_score(&link.text, &link.href),
                first_seen: index,
            };
            match ranked.entry(normalized) {
                Entry::Vacant(entry) => {
                    entry.insert(candidate);
                }
                Entry::Occupied(mut entry) => {
                    let current = entry.get();
                    let is_better = candidate.score > current.score
                        || (candidate.score == current.score
                            && candidate.text.len() > current.text.len());
                    if is_better {
                        // Keep the earliest position of this target so the
                        // document-order tie-break below is not skewed by a
                        // later duplicate supplying the better label.
                        let first_seen = current.first_seen;
                        entry.insert(RankedLink {
                            first_seen,
                            ..candidate
                        });
                    }
                }
            }
        }

        let mut ranked = ranked.into_values().collect::<Vec<_>>();
        ranked.sort_by(|left, right| {
            right
                .score
                .cmp(&left.score)
                .then(left.first_seen.cmp(&right.first_seen))
                .then(left.text.cmp(&right.text))
                .then(left.href.cmp(&right.href))
        });

        let truncated = ranked.len().saturating_sub(MAX_EMITTED_LINKS);
        for link in ranked.into_iter().take(MAX_EMITTED_LINKS) {
            document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
        }

        if truncated > 0 {
            document.push(IndexNode::Section {
                role: SectionRole::Unknown,
                title: Some("Diagnostic".to_owned()),
                collapsed: true,
                nodes: vec![
                    IndexNode::Error(format!(
                        "link list truncated: kept first {MAX_EMITTED_LINKS} unique links"
                    )),
                    IndexNode::List {
                        ordered: false,
                        items: vec![
                            format!("{truncated} additional links were omitted"),
                            "use :extract links to inspect bounded output".to_owned(),
                        ],
                    },
                ],
            });
        }
    }
}
/// A candidate link considered by `EmitLinks`, together with its ranking data.
#[derive(Debug, Clone, PartialEq, Eq)]
struct RankedLink {
// Link label, cloned from the page link.
text: String,
// Link target, cloned from the page link (not the normalized key).
href: String,
// Value returned by `link_relevance_score` for this text/href pair.
score: i32,
// Index within `page.links` recorded when this candidate was built.
first_seen: usize,
}
/// Records the normalized key of every link node in `nodes`, descending into
/// sections, so later steps can tell which targets the document already has.
fn collect_existing_links(nodes: &[IndexNode], seen: &mut BTreeSet<String>) {
    for node in nodes {
        if let IndexNode::Link(link) = node {
            let key = normalized_link_key(&link.href);
            let _ = seen.insert(key);
        } else if let IndexNode::Section {
            nodes: children, ..
        } = node
        {
            collect_existing_links(children, seen);
        }
    }
}
/// Builds the key used to decide whether two hrefs point at the same target.
///
/// Surrounding whitespace, the fragment and trailing slashes are ignored. An
/// href that parses as an absolute URL is re-serialized after those
/// adjustments; anything else is handled textually. The result is empty only
/// when the href is blank or consists solely of a fragment (for example
/// `#top`); callers treat an empty key as "no target".
fn normalized_link_key(href: &str) -> String {
    let trimmed = href.trim();
    if trimmed.is_empty() {
        return String::new();
    }
    if let Ok(mut url) = Url::parse(trimmed) {
        url.set_fragment(None);
        let path = strip_trailing_slashes(url.path()).to_owned();
        url.set_path(&path);
        return url.to_string();
    }
    // Relative href: drop the fragment textually, then apply the same slash
    // rule as the absolute branch so a bare `/` is not reduced to an empty key.
    let without_fragment = trimmed.split('#').next().unwrap_or_default();
    strip_trailing_slashes(without_fragment).to_owned()
}

/// Removes trailing `/` characters but never shortens a non-empty input to
/// nothing: a path made only of slashes collapses to a single `/`.
fn strip_trailing_slashes(path: &str) -> &str {
    let stripped = path.trim_end_matches('/');
    if stripped.is_empty() && !path.is_empty() {
        // `path` consists solely of ASCII slashes here, so byte 1 is a
        // character boundary.
        &path[..1]
    } else {
        stripped
    }
}
/// Heuristic usefulness score for a link; higher ranks earlier.
///
/// Both arguments are trimmed and ASCII-lowercased before matching.
///
/// * Returns `-100` when the text or href is empty, or when the href uses the
///   `javascript:` or `mailto:` scheme.
/// * `+8` for pagination: the text is a pagination word or starts with
///   `"page "`, or the href contains `page=`, `/page-`, `start=` or a query
///   parameter named exactly `p`.
/// * `+4` when the text mentions content such as results, threads or docs.
/// * `-5` when the text is exactly a boilerplate label such as "privacy".
/// * `+1` for labels of 18 or more characters, `-2` for labels of at most 2.
fn link_relevance_score(text: &str, href: &str) -> i32 {
    const CONTENT_KEYWORDS: [&str; 8] = [
        "result",
        "thread",
        "discussion",
        "article",
        "guide",
        "documentation",
        "reference",
        "docs",
    ];
    // `p=` only counts as a parameter named `p`, i.e. directly after a query
    // separator; a bare substring test would also match `group=`, `step=`, ...
    const SHORT_PAGE_PARAM: [&str; 3] = ["?p=", "&p=", ";p="];

    let text = text.trim().to_ascii_lowercase();
    let href = href.trim().to_ascii_lowercase();
    if text.is_empty() || href.is_empty() {
        return -100;
    }
    if href.starts_with("javascript:") || href.starts_with("mailto:") {
        return -100;
    }

    let mut score = 0i32;

    let pagination_text = matches!(
        text.as_str(),
        "next" | "prev" | "previous" | "older" | "newer" | "continue"
    ) || text.starts_with("page ");
    let pagination_href = href.contains("page=")
        || href.contains("/page-")
        || href.contains("start=")
        || SHORT_PAGE_PARAM.iter().any(|needle| href.contains(*needle));
    if pagination_text || pagination_href {
        score += 8;
    }

    if CONTENT_KEYWORDS
        .iter()
        .any(|keyword| text.contains(*keyword))
    {
        score += 4;
    }

    if matches!(
        text.as_str(),
        "privacy" | "terms" | "cookie" | "sign in" | "log in" | "register" | "login" | "help"
    ) {
        score -= 5;
    }

    // Measure the label in characters, not UTF-8 bytes, so non-ASCII labels
    // are judged by their visible length.
    let label_chars = text.chars().count();
    if label_chars >= 18 {
        score += 1;
    } else if label_chars <= 2 {
        score -= 2;
    }
    score
}
#[cfg(test)]
mod tests {
use index_core::{IndexDocument, IndexNode, SectionRole};
use index_dom::{HtmlButton, HtmlForm, HtmlInput, HtmlLink};
use index_readability::{ReadableMetadata, ReadableNode, ReadablePage, ReadableSectionRole};
use super::{ApplyMetadata, EmitLinks, EmitReadableNodes, EmitTitle, Instruction};
// Shared fixture: a page titled "Title" with one top-level node of every
// `ReadableNode` variant except `Heading` and `Link` (a link appears nested
// inside the navigation section), one entry in `links`, and all metadata
// fields populated.
fn page() -> ReadablePage {
ReadablePage {
title: "Title".to_owned(),
paragraphs: vec!["Paragraph.".to_owned()],
nodes: vec![
ReadableNode::Paragraph("Paragraph.".to_owned()),
ReadableNode::List {
ordered: false,
items: vec!["Read".to_owned(), "Search".to_owned()],
},
ReadableNode::CodeBlock {
language: Some("rust".to_owned()),
code: "fn main() {}".to_owned(),
},
ReadableNode::Table {
rows: vec![vec!["Name".to_owned()], vec!["Index".to_owned()]],
},
ReadableNode::Spacer { lines: 2 },
ReadableNode::Section {
role: ReadableSectionRole::Navigation,
title: Some("Site".to_owned()),
collapsed: true,
nodes: vec![ReadableNode::Link(HtmlLink {
text: "About".to_owned(),
href: "https://example.com/about".to_owned(),
})],
},
ReadableNode::Image {
alt: "Logo".to_owned(),
src: Some("https://example.com/logo.png".to_owned()),
},
ReadableNode::Form(HtmlForm {
name: "search".to_owned(),
method: "GET".to_owned(),
action: "https://example.com/search".to_owned(),
inputs: vec![HtmlInput {
name: "q".to_owned(),
kind: "search".to_owned(),
value: None,
required: true,
}],
buttons: Vec::new(),
}),
],
links: vec![HtmlLink {
text: "Docs".to_owned(),
href: "https://example.com/docs".to_owned(),
}],
forms: Vec::new(),
metadata: ReadableMetadata {
canonical_url: Some("https://example.com/docs".to_owned()),
language: Some("en".to_owned()),
description: Some("Docs".to_owned()),
open_graph_title: Some("OG".to_owned()),
open_graph_description: Some("OpenGraph docs".to_owned()),
},
}
}
// `ApplyMetadata` must copy every metadata field of the fixture page,
// including the Open Graph description.
#[test]
fn apply_metadata_sets_document_metadata() {
    let mut document = IndexDocument::titled("Title");
    ApplyMetadata.execute(&page(), &mut document);
    assert_eq!(
        document.metadata.canonical_url.as_deref(),
        Some("https://example.com/docs")
    );
    assert_eq!(document.metadata.language.as_deref(), Some("en"));
    assert_eq!(document.metadata.description.as_deref(), Some("Docs"));
    assert_eq!(document.metadata.open_graph_title.as_deref(), Some("OG"));
    assert_eq!(
        document.metadata.open_graph_description.as_deref(),
        Some("OpenGraph docs")
    );
}
// `EmitTitle` must emit a level-1 heading carrying the page title, not just
// any heading.
#[test]
fn emit_title_adds_heading() {
    let mut document = IndexDocument::titled("Title");
    EmitTitle.execute(&page(), &mut document);
    assert!(matches!(
        document.nodes.first(),
        Some(IndexNode::Heading { level: 1, text }) if text == "Title"
    ));
}
// Every node kind present in the fixture page must show up in the document.
#[test]
fn emit_readable_nodes_adds_structured_nodes() {
    let mut document = IndexDocument::titled("Title");
    EmitReadableNodes.execute(&page(), &mut document);

    // True when at least one top-level document node satisfies `predicate`.
    let contains = |predicate: fn(&IndexNode) -> bool| document.nodes.iter().any(predicate);

    assert!(contains(|node| matches!(node, IndexNode::Paragraph(_))));
    assert!(contains(|node| matches!(
        node,
        IndexNode::List { ordered: false, items }
            if items == &vec!["Read".to_owned(), "Search".to_owned()]
    )));
    assert!(contains(|node| matches!(node, IndexNode::CodeBlock { .. })));
    assert!(contains(|node| matches!(node, IndexNode::Table { .. })));
    assert!(contains(|node| matches!(node, IndexNode::Spacer { lines: 2 })));
    assert!(contains(|node| matches!(
        node,
        IndexNode::Section {
            role: SectionRole::Navigation,
            title: Some(title),
            collapsed: true,
            nodes
        } if title == "Site"
            && matches!(nodes.first(), Some(IndexNode::Link(link)) if link.text == "About")
    )));
    assert!(contains(|node| matches!(node, IndexNode::Image { .. })));
    assert!(contains(
        |node| matches!(node, IndexNode::Form(form) if form.name == "search")
    ));
}
// A section containing one child of every node kind must be converted with
// all of its children intact.
#[test]
fn emit_readable_nodes_maps_nested_section_contents() {
    let children = vec![
        ReadableNode::Heading {
            level: 2,
            text: "Nested".to_owned(),
        },
        ReadableNode::Paragraph("Nested paragraph.".to_owned()),
        ReadableNode::Link(HtmlLink {
            text: "Nested link".to_owned(),
            href: "https://example.com/nested".to_owned(),
        }),
        ReadableNode::List {
            ordered: true,
            items: vec!["One".to_owned()],
        },
        ReadableNode::CodeBlock {
            language: None,
            code: "let value = 1;".to_owned(),
        },
        ReadableNode::Table {
            rows: vec![vec!["cell".to_owned()]],
        },
        ReadableNode::Spacer { lines: 1 },
        ReadableNode::Section {
            role: ReadableSectionRole::Comments,
            title: None,
            collapsed: true,
            nodes: vec![ReadableNode::Paragraph("Comment.".to_owned())],
        },
        ReadableNode::Image {
            alt: "Nested image".to_owned(),
            src: None,
        },
        ReadableNode::Form(HtmlForm {
            name: "nested-form".to_owned(),
            method: "POST".to_owned(),
            action: "https://example.com/submit".to_owned(),
            inputs: vec![HtmlInput {
                name: "email".to_owned(),
                kind: "email".to_owned(),
                value: Some("reader@example.com".to_owned()),
                required: true,
            }],
            buttons: vec![HtmlButton {
                name: Some("save".to_owned()),
                value: Some("1".to_owned()),
                label: "Save".to_owned(),
            }],
        }),
    ];
    let mut page = page();
    page.nodes = vec![ReadableNode::Section {
        role: ReadableSectionRole::Aside,
        title: Some("More".to_owned()),
        collapsed: false,
        nodes: children,
    }];

    let mut document = IndexDocument::titled("Title");
    EmitReadableNodes.execute(&page, &mut document);

    let (role, title, collapsed, nodes) = match document.nodes.first() {
        Some(IndexNode::Section {
            role,
            title,
            collapsed,
            nodes,
        }) => (role, title, collapsed, nodes),
        _ => panic!("expected the first emitted node to be a section"),
    };
    assert_eq!(*role, SectionRole::Aside);
    assert_eq!(title.as_deref(), Some("More"));
    assert!(!collapsed);
    assert!(matches!(
        nodes.first(),
        Some(IndexNode::Heading { level: 2, text }) if text == "Nested"
    ));

    // True when at least one converted child satisfies `predicate`.
    let has = |predicate: fn(&IndexNode) -> bool| nodes.iter().any(predicate);

    assert!(has(
        |node| matches!(node, IndexNode::Paragraph(text) if text == "Nested paragraph.")
    ));
    assert!(has(
        |node| matches!(node, IndexNode::Link(link) if link.text == "Nested link")
    ));
    assert!(has(|node| matches!(
        node,
        IndexNode::List { ordered: true, items } if items == &vec!["One".to_owned()]
    )));
    assert!(has(|node| matches!(
        node,
        IndexNode::CodeBlock { language: None, code } if code == "let value = 1;"
    )));
    assert!(has(|node| matches!(
        node,
        IndexNode::Table { rows } if rows == &vec![vec!["cell".to_owned()]]
    )));
    assert!(has(|node| matches!(node, IndexNode::Spacer { lines: 1 })));
    assert!(has(|node| matches!(
        node,
        IndexNode::Section {
            role: SectionRole::Comments,
            collapsed: true,
            ..
        }
    )));
    assert!(has(
        |node| matches!(node, IndexNode::Image { alt, src: None } if alt == "Nested image")
    ));
    assert!(has(|node| matches!(
        node,
        IndexNode::Form(form) if form.name == "nested-form" && form.buttons.len() == 1
    )));
}
// Each readable section role must map to the index role of the same name.
#[test]
fn emit_readable_nodes_maps_all_section_roles() {
    let expectations = [
        (ReadableSectionRole::Main, SectionRole::Main),
        (ReadableSectionRole::Navigation, SectionRole::Navigation),
        (ReadableSectionRole::Aside, SectionRole::Aside),
        (ReadableSectionRole::Footer, SectionRole::Footer),
        (ReadableSectionRole::Comments, SectionRole::Comments),
        (ReadableSectionRole::Related, SectionRole::Related),
        (ReadableSectionRole::Unknown, SectionRole::Unknown),
    ];
    for (input_role, wanted_role) in expectations {
        let empty_section = ReadableNode::Section {
            role: input_role,
            title: None,
            collapsed: true,
            nodes: Vec::new(),
        };
        let mut source = page();
        source.nodes = vec![empty_section];
        let mut document = IndexDocument::titled("Title");
        EmitReadableNodes.execute(&source, &mut document);
        let first = document.nodes.first();
        assert!(matches!(
            first,
            Some(IndexNode::Section { role, .. }) if *role == wanted_role
        ));
    }
}
// A leading level-1 heading that repeats the page title must not be emitted.
#[test]
fn emit_readable_nodes_skips_duplicate_title_heading() {
    let title_heading = ReadableNode::Heading {
        level: 1,
        text: "Title".to_owned(),
    };
    let mut source = page();
    source.nodes.insert(0, title_heading);

    let mut document = IndexDocument::titled("Title");
    EmitReadableNodes.execute(&source, &mut document);

    let starts_with_title = matches!(
        document.nodes.first(),
        Some(IndexNode::Heading { level: 1, text }) if text == "Title"
    );
    assert!(!starts_with_title);
}
// With an otherwise empty document, the first emitted node is a link.
#[test]
fn emit_links_adds_link_node() {
    let source = page();
    let mut document = IndexDocument::titled("Title");
    EmitLinks.execute(&source, &mut document);
    let first = document.nodes.first();
    assert!(matches!(first, Some(IndexNode::Link(_))));
}
// 340 fresh links plus one already present in the document: the existing
// link is not repeated, 300 new ones are emitted, and a diagnostic section
// reports the truncation.
#[test]
fn emit_links_deduplicates_and_bounds_large_link_sets() {
    let existing = index_core::Link::new("Existing", "https://example.com/existing");
    let mut document = IndexDocument::titled("Title");
    document.push(IndexNode::Link(existing));

    let numbered = (0..340).map(|index| HtmlLink {
        text: format!("Link {index}"),
        href: format!("https://example.com/{index}"),
    });
    let repeated = HtmlLink {
        text: "Existing".to_owned(),
        href: "https://example.com/existing".to_owned(),
    };
    let mut source = page();
    source.links = numbered.chain(std::iter::once(repeated)).collect();

    EmitLinks.execute(&source, &mut document);

    let mut link_count = 0usize;
    for node in &document.nodes {
        if matches!(node, IndexNode::Link(_)) {
            link_count += 1;
        }
    }
    assert_eq!(link_count, 301);

    let has_diagnostic = document.nodes.iter().any(|node| {
        matches!(
            node,
            IndexNode::Section {
                title: Some(title),
                ..
            } if title == "Diagnostic"
        )
    });
    assert!(has_diagnostic);
}
// Pagination links come first and boilerplate policy links come after
// content links.
#[test]
fn emit_links_ranks_navigation_and_result_links_ahead_of_policy_links() {
    let link = |text: &str, href: &str| HtmlLink {
        text: text.to_owned(),
        href: href.to_owned(),
    };
    let mut source = page();
    source.links = vec![
        link("Privacy", "https://example.com/privacy"),
        link("Next", "https://example.com/thread?page=2"),
        link("API reference", "https://example.com/docs/api"),
    ];
    let mut document = IndexDocument::titled("Title");
    EmitLinks.execute(&source, &mut document);

    let mut emitted = Vec::new();
    for node in &document.nodes {
        if let IndexNode::Link(link) = node {
            emitted.push(link.text.clone());
        }
    }
    assert_eq!(emitted.first().map(String::as_str), Some("Next"));
    assert!(emitted.len() >= 3);
    let privacy_at = emitted.iter().position(|text| text == "Privacy");
    let reference_at = emitted.iter().position(|text| text == "API reference");
    assert!(privacy_at > reference_at);
}
// Three hrefs that differ only by fragment or trailing slash collapse into
// one link, labelled with the longest of the equally scored texts.
#[test]
fn emit_links_deduplicates_by_normalized_href_and_prefers_richer_text() {
    let variants = [
        ("Docs", "https://example.com/docs#top"),
        ("Documentation", "https://example.com/docs"),
        ("Docs mirror", "https://example.com/docs/"),
    ];
    let mut source = page();
    source.links = variants
        .iter()
        .map(|(text, href)| HtmlLink {
            text: (*text).to_owned(),
            href: (*href).to_owned(),
        })
        .collect();
    let mut document = IndexDocument::titled("Title");
    EmitLinks.execute(&source, &mut document);

    let mut docs_links = Vec::new();
    for node in &document.nodes {
        if let IndexNode::Link(link) = node {
            if link.href.starts_with("https://example.com/docs") {
                docs_links.push(link.text.as_str());
            }
        }
    }
    assert_eq!(docs_links.len(), 1);
    assert_eq!(docs_links[0], "Documentation");
}
}