use readabilityrs::{
MarkdownOptions, Readability, ReadabilityOptions,
markdown::options::{HeadingStyle, LinkStyle},
};
use regex::Regex;
use std::sync::LazyLock;
use thiserror::Error;
use url::Url;
/// Errors reported by the extractor and its image/table/caption handling.
///
/// Each variant's `#[error]` string is its `Display` output; variants with a
/// `#[source]` field also expose the underlying error through
/// `std::error::Error::source`.
#[derive(Debug, Error)]
pub enum ExtractorError {
/// `readabilityrs` reported a failure; the payload is that error rendered as text.
#[error("readabilityrs: {0}")]
Readability(String),
/// Metadata extraction failed; the payload is a textual description.
#[error("metadata extraction failed: {0}")]
Metadata(String),
/// An I/O error involving the output directory at `path`.
#[error("output directory error at {path}: {source}")]
Output {
path: String,
#[source]
source: std::io::Error,
},
/// Writing the table identified by `ordinal` to `path` failed.
#[error("could not write table {ordinal} to {path}: {source}")]
TableWrite {
ordinal: usize,
path: String,
#[source]
source: std::io::Error,
},
/// Downloading the image at `url` failed with a `reqwest` error.
#[error("could not download image at {url}: {source}")]
ImageDownload {
url: String,
#[source]
source: reqwest::Error,
},
/// Writing an image to `path` failed.
#[error("could not write image at {path}: {source}")]
ImageWrite {
path: String,
#[source]
source: std::io::Error,
},
/// The image URL `url` could not be parsed.
#[error("invalid image url {url}: {source}")]
ImageUrlInvalid {
url: String,
#[source]
source: url::ParseError,
},
/// The SSRF policy rejected the image URL `url`.
#[error("ssrf policy blocked image url {url}: {source}")]
ImageSsrf {
url: String,
#[source]
source: crate::fetcher::ssrf::SsrfError,
},
/// The captioner called `name` returned an error.
#[error("captioner `{name}` failed: {source}")]
CaptionerCall {
name: String,
// The VLM error is stored boxed.
#[source]
source: Box<crate::vlm::VlmError>,
},
/// `images.mode = caption` was requested but no captioner is configured.
#[error("no captioner configured for images.mode = caption")]
CaptionerNotConfigured,
}
/// The outcome of extracting one HTML document.
///
/// The descriptive fields come either from the readability article or from the
/// separately extracted metadata; the full metadata is also kept in `metadata`.
#[derive(Debug, Clone)]
pub struct ExtractedDoc {
/// Document title, if one was found.
pub title: Option<String>,
/// Document body rendered as Markdown, after link absolutization.
pub body_md: String,
/// Document language, if one was found.
pub language: Option<String>,
/// Author line, taken from the readability article or from extracted metadata.
pub byline: Option<String>,
/// Short summary, taken from the readability article or from extracted metadata.
pub excerpt: Option<String>,
/// Site name as reported by the readability article; `None` when no article was found.
pub site_name: Option<String>,
/// Publication time as a string, if one was found.
pub published_time: Option<String>,
/// Image reference for the document, if one was found.
pub image: Option<String>,
/// Metadata extracted from the HTML independently of readability.
pub metadata: crate::extractor::metadata::ExtractedMetadata,
/// Number of `char`s of text inside the parsed `<body>` of the input HTML
/// (or of the whole input when no `<body>` element is selected).
pub raw_html_text_len: usize,
}
/// Builds the Markdown rendering options used by every conversion in this file.
///
/// Selects ATX headings, `-` bullets, backtick code fences, `*` / `**` for
/// emphasis and strong text, inline links, and sets `preserve_complex_tables`.
fn rover_markdown_options() -> MarkdownOptions {
    let strong_delimiter = String::from("**");
    MarkdownOptions {
        heading_style: HeadingStyle::Atx,
        link_style: LinkStyle::Inline,
        preserve_complex_tables: true,
        bullet_char: '-',
        code_fence: '`',
        emphasis_delimiter: '*',
        strong_delimiter,
    }
}
/// Extracts the readable content of `html` as Markdown, together with its metadata.
///
/// The base URL used for readability and for link absolutization is whatever
/// `read_base_href` returns for the document, or `base_url` when it returns `None`.
///
/// When `readabilityrs` finds an article, its fields are used and any field it
/// leaves empty is filled from the separately extracted metadata where a
/// counterpart exists (`title`, `language`, `byline` ← author,
/// `excerpt` ← description, `published_time`, `image`). When it finds no
/// article, the whole `<body>` is converted to Markdown directly and the
/// descriptive fields come from the metadata alone.
///
/// # Errors
///
/// Returns [`ExtractorError::Readability`] when `Readability::new` fails.
pub fn extract_full(html: &str, base_url: &Url) -> Result<ExtractedDoc, ExtractorError> {
    let effective_base =
        crate::extractor::base_href::read_base_href(html).unwrap_or_else(|| base_url.clone());
    let metadata = crate::extractor::metadata::extract(html, &effective_base);
    let raw_html_text_len = approximate_html_text_len(html);

    let opts = ReadabilityOptions::builder()
        .output_markdown(true)
        .markdown_options(rover_markdown_options())
        .build();
    let readability = Readability::new(html, Some(effective_base.as_str()), Some(opts))
        .map_err(|e| ExtractorError::Readability(e.to_string()))?;

    let (body_md, title, language, byline, excerpt, site_name, published_time, image) =
        match readability.parse() {
            Some(article) => (
                article.markdown_content.unwrap_or_default(),
                article.title.or_else(|| metadata.title.clone()),
                article.lang.or_else(|| metadata.language.clone()),
                // Same metadata fallbacks the no-article branch uses, so an
                // article without these fields does not lose information the
                // metadata pass already found.
                article.byline.or_else(|| metadata.author.clone()),
                article.excerpt.or_else(|| metadata.description.clone()),
                article.site_name,
                article
                    .published_time
                    .or_else(|| metadata.published.clone()),
                article.image.or_else(|| metadata.image.clone()),
            ),
            None => {
                tracing::debug!(
                    target: "rover::extractor",
                    url = %effective_base,
                    "readabilityrs found no article; using direct body→markdown fallback"
                );
                (
                    fallback_body_markdown(html),
                    metadata.title.clone().or_else(|| read_title_tag(html)),
                    metadata.language.clone(),
                    metadata.author.clone(),
                    metadata.description.clone(),
                    None,
                    metadata.published.clone(),
                    metadata.image.clone(),
                )
            }
        };

    // Links are rewritten against the effective base on both paths.
    let body_md = crate::extractor::links::absolutize(&body_md, &effective_base);

    Ok(ExtractedDoc {
        title,
        body_md,
        language,
        byline,
        excerpt,
        site_name,
        published_time,
        image,
        metadata,
        raw_html_text_len,
    })
}
/// Lazily compiled regexes, one per non-content tag (`script`, `style`,
/// `noscript`, `template`, `svg`), each matching an opening tag through the
/// nearest matching closing tag, case-insensitively and across line breaks.
static NONCONTENT_BLOCKS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
    let mut compiled = Vec::new();
    for t in &["script", "style", "noscript", "template", "svg"] {
        let pattern = format!(r"(?is)<{t}\b[^>]*>.*?</{t}>");
        compiled.push(Regex::new(&pattern).unwrap());
    }
    compiled
});
/// Lazily compiled regex matching one HTML comment: `<!--` through the nearest
/// `-->`, with `.` allowed to span line breaks.
static HTML_COMMENT: LazyLock<Regex> = LazyLock::new(|| {
    let compiled = Regex::new(r"(?s)<!--.*?-->");
    compiled.unwrap()
});
/// Converts the document's `<body>` straight to Markdown, bypassing readability.
///
/// Takes the inner HTML of the first `<body>` element (or the raw input when
/// none is selected), removes HTML comments and then every block matched by
/// `NONCONTENT_BLOCKS`, converts the remainder with the shared Markdown
/// options, and returns the result with surrounding whitespace trimmed.
fn fallback_body_markdown(html: &str) -> String {
    let parsed = scraper::Html::parse_document(html);
    let body_selector = scraper::Selector::parse("body").unwrap();
    let body_html = match parsed.select(&body_selector).next() {
        Some(body) => body.inner_html(),
        None => html.to_string(),
    };

    // Comments go first, then each non-content block pattern in declaration order.
    let without_comments = HTML_COMMENT.replace_all(&body_html, "").into_owned();
    let cleaned = NONCONTENT_BLOCKS
        .iter()
        .fold(without_comments, |acc, re| {
            re.replace_all(&acc, "").into_owned()
        });

    let markdown = readabilityrs::markdown::html_to_markdown(&cleaned, &rover_markdown_options());
    markdown.trim().to_string()
}
/// Returns the trimmed text of the first element matching the `title` selector.
///
/// Yields `None` when no such element exists or when its text is empty after
/// trimming.
fn read_title_tag(html: &str) -> Option<String> {
    let parsed = scraper::Html::parse_document(html);
    let title_selector = scraper::Selector::parse("title").ok()?;
    let element = parsed.select(&title_selector).next()?;

    let text: String = element.text().collect();
    let trimmed = text.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
/// Convenience wrapper around [`extract_full`] for callers that may not have a URL.
///
/// Uses `base_url` when provided and `about:blank` otherwise.
///
/// # Errors
///
/// Propagates whatever [`extract_full`] returns.
pub fn extract(html: &str, base_url: Option<&Url>) -> Result<ExtractedDoc, ExtractorError> {
    let base = match base_url {
        Some(url) => url.clone(),
        None => Url::parse("about:blank").unwrap(),
    };
    extract_full(html, &base)
}
/// Counts the `char`s of all text nodes under the first `<body>` element.
///
/// When no `<body>` element is selected, the `char` count of the raw input is
/// returned instead.
fn approximate_html_text_len(html: &str) -> usize {
    let parsed = scraper::Html::parse_document(html);
    let body_selector = scraper::Selector::parse("body").unwrap();
    match parsed.select(&body_selector).next() {
        Some(body) => body
            .text()
            .map(|chunk| chunk.chars().count())
            .sum::<usize>(),
        None => html.chars().count(),
    }
}
// Unit tests for the public `extract` entry point.
#[cfg(test)]
mod tests {
use super::*;
// Article-shaped fixture: an <article> with a heading, a deliberately long
// first paragraph (the fixture text itself explains why), and a second
// paragraph containing one relative and one absolute link.
const SAMPLE_HTML: &str = r#"<!doctype html>
<html lang="en">
<head>
<title>Sample Article About How To Do The Thing</title>
<meta http-equiv="Content-Language" content="en" />
</head>
<body>
<article>
<h1>Sample Article About How To Do The Thing</h1>
<h2>How to do the thing</h2>
<p>This is a long paragraph of body content. It needs to be substantial enough that
readabilityrs identifies it as the article. Otherwise the extractor will fall back
to no-article, which is what we want to avoid in this test. The content has to
cross the default character threshold of 500 characters, so we need a few sentences
of filler. Here is more filler. Lorem ipsum dolor sit amet, consectetur adipiscing
elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.</p>
<p>Second paragraph with a <a href="/relative">relative link</a> and a <a href="https://example.com/abs">absolute link</a>.</p>
</article>
</body>
</html>
"#;
// The title is populated and the body contains both the heading and paragraph text.
#[test]
fn extracts_title_and_body() {
let url = Url::parse("https://example.com/page").unwrap();
let doc = extract(SAMPLE_HTML, Some(&url)).expect("extract ok");
assert!(doc.title.unwrap().contains("Sample Article"));
assert!(doc.body_md.contains("How to do the thing"));
assert!(doc.body_md.contains("filler"));
}
// The <h2> is rendered as an ATX (`##`) heading.
#[test]
fn produces_atx_headings() {
let url = Url::parse("https://example.com/page").unwrap();
let doc = extract(SAMPLE_HTML, Some(&url)).expect("extract ok");
assert!(doc.body_md.contains("## How to do the thing"));
}
// The language of the fixture is reported as "en".
#[test]
fn captures_language() {
let url = Url::parse("https://example.com/page").unwrap();
let doc = extract(SAMPLE_HTML, Some(&url)).expect("extract ok");
assert_eq!(doc.language.as_deref(), Some("en"));
}
// A page consisting of a single heading must extract without error and keep
// the heading text (presumably via the no-article fallback — not asserted here).
#[test]
fn trivial_heading_only_doc_does_not_error() {
let url = Url::parse("http://127.0.0.1/").unwrap();
let doc = extract(
"<html><head><title>Hi</title></head><body><h1>Hello loopback</h1></body></html>",
Some(&url),
)
.expect("trivial doc should extract, not error");
assert!(
doc.body_md.contains("Hello loopback"),
"body should contain the heading text, got: {:?}",
doc.body_md
);
}
// A page with one short paragraph must extract without error and keep its text.
#[test]
fn short_paragraph_doc_does_not_error() {
let url = Url::parse("https://example.com/").unwrap();
let doc = extract(
"<html><head><title>Note</title></head><body><p>A short note.</p></body></html>",
Some(&url),
)
.expect("short doc should extract, not error");
assert!(
doc.body_md.contains("A short note."),
"body should contain the paragraph text, got: {:?}",
doc.body_md
);
}
// A page with an empty <body> must extract without error and still report its title.
#[test]
fn empty_body_doc_does_not_error() {
let url = Url::parse("https://example.com/").unwrap();
let doc = extract(
"<html><head><title>Empty</title></head><body></body></html>",
Some(&url),
)
.expect("empty doc should extract, not error");
assert_eq!(doc.title.as_deref(), Some("Empty"));
}
}