Skip to main content

rover/extractor/
pipeline.rs

1//! Content extraction pipeline (PRD §6.1).
2//!
3//! `bytes → charset_detect → utf8 → readabilityrs → markdown_postprocess`.
4//!
5//! Charset detection is the fetcher's job (see `fetcher::charset`); this
6//! module receives a UTF-8 string and returns markdown.
7
8use readabilityrs::{
9    MarkdownOptions, Readability, ReadabilityOptions,
10    markdown::options::{HeadingStyle, LinkStyle},
11};
12use regex::Regex;
13use std::sync::LazyLock;
14use thiserror::Error;
15use url::Url;
16
17#[derive(Debug, Error)]
18pub enum ExtractorError {
19    #[error("readabilityrs: {0}")]
20    Readability(String),
21
22    #[error("metadata extraction failed: {0}")]
23    Metadata(String),
24
25    #[error("output directory error at {path}: {source}")]
26    Output {
27        path: String,
28        #[source]
29        source: std::io::Error,
30    },
31
32    #[error("could not write table {ordinal} to {path}: {source}")]
33    TableWrite {
34        ordinal: usize,
35        path: String,
36        #[source]
37        source: std::io::Error,
38    },
39
40    #[error("could not download image at {url}: {source}")]
41    ImageDownload {
42        url: String,
43        #[source]
44        source: reqwest::Error,
45    },
46
47    #[error("could not write image at {path}: {source}")]
48    ImageWrite {
49        path: String,
50        #[source]
51        source: std::io::Error,
52    },
53
54    #[error("invalid image url {url}: {source}")]
55    ImageUrlInvalid {
56        url: String,
57        #[source]
58        source: url::ParseError,
59    },
60
61    #[error("ssrf policy blocked image url {url}: {source}")]
62    ImageSsrf {
63        url: String,
64        #[source]
65        source: crate::fetcher::ssrf::SsrfError,
66    },
67
68    #[error("captioner `{name}` failed: {source}")]
69    CaptionerCall {
70        name: String,
71        #[source]
72        source: Box<crate::vlm::VlmError>,
73    },
74
75    #[error("no captioner configured for images.mode = caption")]
76    CaptionerNotConfigured,
77}
78
79/// Successfully extracted article.
80#[derive(Debug, Clone)]
81pub struct ExtractedDoc {
82    pub title: Option<String>,
83    pub body_md: String,
84    pub language: Option<String>,
85    pub byline: Option<String>,
86    pub excerpt: Option<String>,
87    pub site_name: Option<String>,
88    pub published_time: Option<String>,
89    pub image: Option<String>,
90    pub metadata: crate::extractor::metadata::ExtractedMetadata,
91    pub raw_html_text_len: usize,
92}
93
94/// Build the markdown options Rover prefers (PRD §6.1: ATX headings, backtick
95/// fences, dash bullets, inline links).
96fn rover_markdown_options() -> MarkdownOptions {
97    MarkdownOptions {
98        heading_style: HeadingStyle::Atx,
99        bullet_char: '-',
100        code_fence: '`',
101        emphasis_delimiter: '*',
102        strong_delimiter: "**".to_string(),
103        link_style: LinkStyle::Inline,
104        preserve_complex_tables: true,
105    }
106}
107
108/// Extract the article from `html`, resolving relative links against `base_url`.
109///
110/// Runs the two-pass M4 shape:
111///   1. Pre-pass on raw HTML — read `<base href>` and extract structured
112///      metadata (JSON-LD / OG / Twitter / `<html lang>` / canonical).
113///   2. readabilityrs main pass against the effective base.
114///   3. Post-pass — absolutize relative links/images in the markdown body.
115pub fn extract_full(html: &str, base_url: &Url) -> Result<ExtractedDoc, ExtractorError> {
116    // Pre-pass: base href + metadata, on raw HTML.
117    let effective_base =
118        crate::extractor::base_href::read_base_href(html).unwrap_or_else(|| base_url.clone());
119    let metadata = crate::extractor::metadata::extract(html, &effective_base);
120    let raw_html_text_len = approximate_html_text_len(html);
121
122    // readabilityrs main pass.
123    let opts = ReadabilityOptions::builder()
124        .output_markdown(true)
125        .markdown_options(rover_markdown_options())
126        .build();
127    let readability = Readability::new(html, Some(effective_base.as_str()), Some(opts))
128        .map_err(|e| ExtractorError::Readability(e.to_string()))?;
129
130    // readabilityrs returns `None` when it can't isolate an article — which
131    // happens for short or boilerplate-only pages (a bare heading, a one-line
132    // note, an error page). Rover is an agent's browser, not an article reader,
133    // so it must still return the page content rather than failing the fetch.
134    // Fall back to a direct body→markdown conversion, sourcing the descriptive
135    // fields from the metadata pre-pass.
136    let (mut body_md, title, language, byline, excerpt, site_name, published_time, image) =
137        match readability.parse() {
138            Some(article) => (
139                article.markdown_content.unwrap_or_default(),
140                article.title.or_else(|| metadata.title.clone()),
141                article.lang.or_else(|| metadata.language.clone()),
142                article.byline,
143                article.excerpt,
144                article.site_name,
145                article
146                    .published_time
147                    .or_else(|| metadata.published.clone()),
148                article.image.or_else(|| metadata.image.clone()),
149            ),
150            None => {
151                tracing::debug!(
152                    target: "rover::extractor",
153                    url = %effective_base,
154                    "readabilityrs found no article; using direct body→markdown fallback"
155                );
156                (
157                    fallback_body_markdown(html),
158                    metadata.title.clone().or_else(|| read_title_tag(html)),
159                    metadata.language.clone(),
160                    metadata.author.clone(),
161                    metadata.description.clone(),
162                    None,
163                    metadata.published.clone(),
164                    metadata.image.clone(),
165                )
166            }
167        };
168
169    // Post-pass: absolutize links/images against the effective base.
170    body_md = crate::extractor::links::absolutize(&body_md, &effective_base);
171
172    Ok(ExtractedDoc {
173        title,
174        body_md,
175        language,
176        byline,
177        excerpt,
178        site_name,
179        published_time,
180        image,
181        metadata,
182        raw_html_text_len,
183    })
184}
185
186/// Non-content blocks whose inner text must never bleed into the fallback
187/// markdown. The markdown converter's default arm recurses into unknown tags
188/// and emits their text, so `<script>`/`<style>`/etc. are stripped first.
189static NONCONTENT_BLOCKS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
190    ["script", "style", "noscript", "template", "svg"]
191        .iter()
192        // The `regex` crate has no backreferences, so each tag gets its own
193        // open-tag…close-tag pattern (case-insensitive, dot-matches-newline).
194        .map(|t| Regex::new(&format!(r"(?is)<{t}\b[^>]*>.*?</{t}>")).unwrap())
195        .collect()
196});
197
198static HTML_COMMENT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?s)<!--.*?-->").unwrap());
199
200/// Best-effort `<body>`→markdown conversion used when readabilityrs finds no
201/// article. Extracts the body, strips non-content blocks (scripts, styles,
202/// inline SVG, comments) so they can't leak as text, then converts what
203/// remains. Returns an empty string for a contentless document.
204fn fallback_body_markdown(html: &str) -> String {
205    let doc = scraper::Html::parse_document(html);
206    let body_sel = scraper::Selector::parse("body").unwrap();
207    let body_html = doc
208        .select(&body_sel)
209        .next()
210        .map(|b| b.inner_html())
211        .unwrap_or_else(|| html.to_string());
212
213    let mut cleaned = HTML_COMMENT.replace_all(&body_html, "").into_owned();
214    for re in NONCONTENT_BLOCKS.iter() {
215        cleaned = re.replace_all(&cleaned, "").into_owned();
216    }
217
218    readabilityrs::markdown::html_to_markdown(&cleaned, &rover_markdown_options())
219        .trim()
220        .to_string()
221}
222
223/// Read the document's `<title>` element text — the last-resort title for the
224/// no-article fallback (the metadata pre-pass only reads OG/Twitter/JSON-LD
225/// titles, not the plain `<title>` tag).
226fn read_title_tag(html: &str) -> Option<String> {
227    let doc = scraper::Html::parse_document(html);
228    let sel = scraper::Selector::parse("title").ok()?;
229    doc.select(&sel)
230        .next()
231        .map(|t| t.text().collect::<String>().trim().to_string())
232        .filter(|s| !s.is_empty())
233}
234
235/// Backwards-compatible wrapper for callers that don't have a base `Url`.
236pub fn extract(html: &str, base_url: Option<&Url>) -> Result<ExtractedDoc, ExtractorError> {
237    let base = base_url
238        .cloned()
239        .unwrap_or_else(|| Url::parse("about:blank").unwrap());
240    extract_full(html, &base)
241}
242
243/// Approximate the visible-text length of `html` by counting characters
244/// in the `<body>`'s text descendants. Falls back to the full input length
245/// when no `<body>` is present (defends against fragment HTML).
246fn approximate_html_text_len(html: &str) -> usize {
247    let doc = scraper::Html::parse_document(html);
248    let body_sel = scraper::Selector::parse("body").unwrap();
249    doc.select(&body_sel)
250        .next()
251        .map(|b| b.text().map(|t| t.chars().count()).sum())
252        .unwrap_or_else(|| html.chars().count())
253}
254
#[cfg(test)]
mod tests {
    use super::*;

    /// Article-shaped fixture, long enough for readabilityrs to accept it as
    /// an article, with one relative and one absolute link.
    const SAMPLE_HTML: &str = r#"<!doctype html>
<html lang="en">
<head>
  <title>Sample Article About How To Do The Thing</title>
  <meta http-equiv="Content-Language" content="en" />
</head>
<body>
  <article>
    <h1>Sample Article About How To Do The Thing</h1>
    <h2>How to do the thing</h2>
    <p>This is a long paragraph of body content. It needs to be substantial enough that
       readabilityrs identifies it as the article. Otherwise the extractor will fall back
       to no-article, which is what we want to avoid in this test. The content has to
       cross the default character threshold of 500 characters, so we need a few sentences
       of filler. Here is more filler. Lorem ipsum dolor sit amet, consectetur adipiscing
       elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.</p>
    <p>Second paragraph with a <a href="/relative">relative link</a> and a <a href="https://example.com/abs">absolute link</a>.</p>
  </article>
</body>
</html>
"#;

    /// Parse a literal test URL.
    fn url(raw: &str) -> Url {
        Url::parse(raw).unwrap()
    }

    /// Extract [`SAMPLE_HTML`] as if it had been fetched from a fixed https URL.
    fn sample_doc() -> ExtractedDoc {
        let page = url("https://example.com/page");
        extract(SAMPLE_HTML, Some(&page)).expect("extract ok")
    }

    #[test]
    fn extracts_title_and_body() {
        let doc = sample_doc();
        let title = doc.title.unwrap();
        assert!(title.contains("Sample Article"));
        for needle in ["How to do the thing", "filler"] {
            assert!(doc.body_md.contains(needle));
        }
    }

    #[test]
    fn produces_atx_headings() {
        // ATX heading is `## Heading`, not the Setext underline form.
        let body = sample_doc().body_md;
        assert!(body.contains("## How to do the thing"));
    }

    #[test]
    fn captures_language() {
        let doc = sample_doc();
        assert_eq!(doc.language.as_deref(), Some("en"));
    }

    #[test]
    fn trivial_heading_only_doc_does_not_error() {
        // A lone heading is too short for readabilityrs to call it an article;
        // the extractor must hand the content back anyway instead of failing
        // the fetch.
        let html =
            "<html><head><title>Hi</title></head><body><h1>Hello loopback</h1></body></html>";
        let page = url("http://127.0.0.1/");
        let doc = extract(html, Some(&page)).expect("trivial doc should extract, not error");
        assert!(
            doc.body_md.contains("Hello loopback"),
            "body should contain the heading text, got: {:?}",
            doc.body_md
        );
    }

    #[test]
    fn short_paragraph_doc_does_not_error() {
        let html =
            "<html><head><title>Note</title></head><body><p>A short note.</p></body></html>";
        let page = url("https://example.com/");
        let doc = extract(html, Some(&page)).expect("short doc should extract, not error");
        assert!(
            doc.body_md.contains("A short note."),
            "body should contain the paragraph text, got: {:?}",
            doc.body_md
        );
    }

    #[test]
    fn empty_body_doc_does_not_error() {
        // A document with nothing in its body still produces a doc (with an
        // empty body) rather than an error that would fail the fetch.
        let html = "<html><head><title>Empty</title></head><body></body></html>";
        let page = url("https://example.com/");
        let doc = extract(html, Some(&page)).expect("empty doc should extract, not error");
        assert_eq!(doc.title.as_deref(), Some("Empty"));
    }
}