1use readabilityrs::{
9 MarkdownOptions, Readability, ReadabilityOptions,
10 markdown::options::{HeadingStyle, LinkStyle},
11};
12use regex::Regex;
13use std::sync::LazyLock;
14use thiserror::Error;
15use url::Url;
16
17#[derive(Debug, Error)]
18pub enum ExtractorError {
19 #[error("readabilityrs: {0}")]
20 Readability(String),
21
22 #[error("metadata extraction failed: {0}")]
23 Metadata(String),
24
25 #[error("output directory error at {path}: {source}")]
26 Output {
27 path: String,
28 #[source]
29 source: std::io::Error,
30 },
31
32 #[error("could not write table {ordinal} to {path}: {source}")]
33 TableWrite {
34 ordinal: usize,
35 path: String,
36 #[source]
37 source: std::io::Error,
38 },
39
40 #[error("could not download image at {url}: {source}")]
41 ImageDownload {
42 url: String,
43 #[source]
44 source: reqwest::Error,
45 },
46
47 #[error("could not write image at {path}: {source}")]
48 ImageWrite {
49 path: String,
50 #[source]
51 source: std::io::Error,
52 },
53
54 #[error("invalid image url {url}: {source}")]
55 ImageUrlInvalid {
56 url: String,
57 #[source]
58 source: url::ParseError,
59 },
60
61 #[error("ssrf policy blocked image url {url}: {source}")]
62 ImageSsrf {
63 url: String,
64 #[source]
65 source: crate::fetcher::ssrf::SsrfError,
66 },
67
68 #[error("captioner `{name}` failed: {source}")]
69 CaptionerCall {
70 name: String,
71 #[source]
72 source: Box<crate::vlm::VlmError>,
73 },
74
75 #[error("no captioner configured for images.mode = caption")]
76 CaptionerNotConfigured,
77}
78
79#[derive(Debug, Clone)]
81pub struct ExtractedDoc {
82 pub title: Option<String>,
83 pub body_md: String,
84 pub language: Option<String>,
85 pub byline: Option<String>,
86 pub excerpt: Option<String>,
87 pub site_name: Option<String>,
88 pub published_time: Option<String>,
89 pub image: Option<String>,
90 pub metadata: crate::extractor::metadata::ExtractedMetadata,
91 pub raw_html_text_len: usize,
92}
93
94fn rover_markdown_options() -> MarkdownOptions {
97 MarkdownOptions {
98 heading_style: HeadingStyle::Atx,
99 bullet_char: '-',
100 code_fence: '`',
101 emphasis_delimiter: '*',
102 strong_delimiter: "**".to_string(),
103 link_style: LinkStyle::Inline,
104 preserve_complex_tables: true,
105 }
106}
107
108pub fn extract_full(html: &str, base_url: &Url) -> Result<ExtractedDoc, ExtractorError> {
116 let effective_base =
118 crate::extractor::base_href::read_base_href(html).unwrap_or_else(|| base_url.clone());
119 let metadata = crate::extractor::metadata::extract(html, &effective_base);
120 let raw_html_text_len = approximate_html_text_len(html);
121
122 let opts = ReadabilityOptions::builder()
124 .output_markdown(true)
125 .markdown_options(rover_markdown_options())
126 .build();
127 let readability = Readability::new(html, Some(effective_base.as_str()), Some(opts))
128 .map_err(|e| ExtractorError::Readability(e.to_string()))?;
129
130 let (mut body_md, title, language, byline, excerpt, site_name, published_time, image) =
137 match readability.parse() {
138 Some(article) => (
139 article.markdown_content.unwrap_or_default(),
140 article.title.or_else(|| metadata.title.clone()),
141 article.lang.or_else(|| metadata.language.clone()),
142 article.byline,
143 article.excerpt,
144 article.site_name,
145 article
146 .published_time
147 .or_else(|| metadata.published.clone()),
148 article.image.or_else(|| metadata.image.clone()),
149 ),
150 None => {
151 tracing::debug!(
152 target: "rover::extractor",
153 url = %effective_base,
154 "readabilityrs found no article; using direct body→markdown fallback"
155 );
156 (
157 fallback_body_markdown(html),
158 metadata.title.clone().or_else(|| read_title_tag(html)),
159 metadata.language.clone(),
160 metadata.author.clone(),
161 metadata.description.clone(),
162 None,
163 metadata.published.clone(),
164 metadata.image.clone(),
165 )
166 }
167 };
168
169 body_md = crate::extractor::links::absolutize(&body_md, &effective_base);
171
172 Ok(ExtractedDoc {
173 title,
174 body_md,
175 language,
176 byline,
177 excerpt,
178 site_name,
179 published_time,
180 image,
181 metadata,
182 raw_html_text_len,
183 })
184}
185
186static NONCONTENT_BLOCKS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
190 ["script", "style", "noscript", "template", "svg"]
191 .iter()
192 .map(|t| Regex::new(&format!(r"(?is)<{t}\b[^>]*>.*?</{t}>")).unwrap())
195 .collect()
196});
197
198static HTML_COMMENT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?s)<!--.*?-->").unwrap());
199
200fn fallback_body_markdown(html: &str) -> String {
205 let doc = scraper::Html::parse_document(html);
206 let body_sel = scraper::Selector::parse("body").unwrap();
207 let body_html = doc
208 .select(&body_sel)
209 .next()
210 .map(|b| b.inner_html())
211 .unwrap_or_else(|| html.to_string());
212
213 let mut cleaned = HTML_COMMENT.replace_all(&body_html, "").into_owned();
214 for re in NONCONTENT_BLOCKS.iter() {
215 cleaned = re.replace_all(&cleaned, "").into_owned();
216 }
217
218 readabilityrs::markdown::html_to_markdown(&cleaned, &rover_markdown_options())
219 .trim()
220 .to_string()
221}
222
223fn read_title_tag(html: &str) -> Option<String> {
227 let doc = scraper::Html::parse_document(html);
228 let sel = scraper::Selector::parse("title").ok()?;
229 doc.select(&sel)
230 .next()
231 .map(|t| t.text().collect::<String>().trim().to_string())
232 .filter(|s| !s.is_empty())
233}
234
235pub fn extract(html: &str, base_url: Option<&Url>) -> Result<ExtractedDoc, ExtractorError> {
237 let base = base_url
238 .cloned()
239 .unwrap_or_else(|| Url::parse("about:blank").unwrap());
240 extract_full(html, &base)
241}
242
243fn approximate_html_text_len(html: &str) -> usize {
247 let doc = scraper::Html::parse_document(html);
248 let body_sel = scraper::Selector::parse("body").unwrap();
249 doc.select(&body_sel)
250 .next()
251 .map(|b| b.text().map(|t| t.chars().count()).sum())
252 .unwrap_or_else(|| html.chars().count())
253}
254
#[cfg(test)]
mod tests {
    use super::*;

    /// Article-shaped fixture: a title, an `<h2>`, a long paragraph and a
    /// paragraph containing one relative and one absolute link.
    const SAMPLE_HTML: &str = r#"<!doctype html>
<html lang="en">
<head>
 <title>Sample Article About How To Do The Thing</title>
 <meta http-equiv="Content-Language" content="en" />
</head>
<body>
 <article>
 <h1>Sample Article About How To Do The Thing</h1>
 <h2>How to do the thing</h2>
 <p>This is a long paragraph of body content. It needs to be substantial enough that
 readabilityrs identifies it as the article. Otherwise the extractor will fall back
 to no-article, which is what we want to avoid in this test. The content has to
 cross the default character threshold of 500 characters, so we need a few sentences
 of filler. Here is more filler. Lorem ipsum dolor sit amet, consectetur adipiscing
 elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.</p>
 <p>Second paragraph with a <a href="/relative">relative link</a> and a <a href="https://example.com/abs">absolute link</a>.</p>
 </article>
</body>
</html>
"#;

    /// Parses a URL literal used by a test.
    fn page_url(raw: &str) -> Url {
        Url::parse(raw).unwrap()
    }

    /// Runs the extractor over [`SAMPLE_HTML`] at a fixed page URL.
    fn sample_doc() -> ExtractedDoc {
        let url = page_url("https://example.com/page");
        extract(SAMPLE_HTML, Some(&url)).expect("extract ok")
    }

    #[test]
    fn extracts_title_and_body() {
        let doc = sample_doc();
        let title = doc.title.unwrap();
        assert!(title.contains("Sample Article"));
        for needle in ["How to do the thing", "filler"] {
            assert!(doc.body_md.contains(needle));
        }
    }

    #[test]
    fn produces_atx_headings() {
        let body = sample_doc().body_md;
        assert!(body.contains("## How to do the thing"));
    }

    #[test]
    fn captures_language() {
        let doc = sample_doc();
        assert_eq!(doc.language.as_deref(), Some("en"));
    }

    /// A document with nothing but a heading must still yield that heading.
    #[test]
    fn trivial_heading_only_doc_does_not_error() {
        let html =
            "<html><head><title>Hi</title></head><body><h1>Hello loopback</h1></body></html>";
        let url = page_url("http://127.0.0.1/");
        let doc = extract(html, Some(&url)).expect("trivial doc should extract, not error");
        assert!(
            doc.body_md.contains("Hello loopback"),
            "body should contain the heading text, got: {:?}",
            doc.body_md
        );
    }

    /// A single short paragraph must still yield its text.
    #[test]
    fn short_paragraph_doc_does_not_error() {
        let html = "<html><head><title>Note</title></head><body><p>A short note.</p></body></html>";
        let url = page_url("https://example.com/");
        let doc = extract(html, Some(&url)).expect("short doc should extract, not error");
        assert!(
            doc.body_md.contains("A short note."),
            "body should contain the paragraph text, got: {:?}",
            doc.body_md
        );
    }

    /// An empty body must not fail, and the title must still be reported.
    #[test]
    fn empty_body_doc_does_not_error() {
        let html = "<html><head><title>Empty</title></head><body></body></html>";
        let url = page_url("https://example.com/");
        let doc = extract(html, Some(&url)).expect("empty doc should extract, not error");
        assert_eq!(doc.title.as_deref(), Some("Empty"));
    }
}