Skip to main content

servo_fetch/
extract.rs

1//! Content extraction — converts raw HTML into readable Markdown or structured JSON.
2
3use std::borrow::Cow;
4use std::collections::HashMap;
5use std::fmt::{self, Write as _};
6
7use dom_query::Document;
8use dom_smoothie::Readability;
9use htmd::HtmlToMarkdown;
10use serde::Serialize;
11use servo::accesskit::{Node, NodeId};
12
13use crate::layout::{self, LayoutElement};
14use crate::visibility::{self, A11yIndex, VisibilityPolicy};
15
16/// Errors that can occur during content extraction.
17#[derive(Debug, thiserror::Error)]
18#[non_exhaustive]
19pub enum ExtractError {
20    /// Failed to format Markdown output.
21    #[error("markdown formatting failed")]
22    Fmt(#[from] fmt::Error),
23    /// Failed to serialize JSON output.
24    #[error("JSON serialization failed")]
25    Json(#[from] serde_json::Error),
26    /// The provided CSS selector is invalid.
27    #[error("invalid CSS selector")]
28    InvalidSelector,
29}
30
31/// Structured article data for JSON output.
32#[derive(Serialize)]
33#[non_exhaustive]
34pub struct ArticleData {
35    /// Page title.
36    pub title: String,
37    /// Raw HTML content extracted by Readability.
38    pub content: String,
39    /// Readable text content (Markdown).
40    pub text_content: String,
41    /// Author or byline, if detected.
42    #[serde(skip_serializing_if = "Option::is_none")]
43    pub byline: Option<String>,
44    /// Short excerpt or description.
45    #[serde(skip_serializing_if = "Option::is_none")]
46    pub excerpt: Option<String>,
47    /// Document language (e.g. "en").
48    #[serde(skip_serializing_if = "Option::is_none")]
49    pub lang: Option<String>,
50    /// Canonical URL.
51    #[serde(skip_serializing_if = "Option::is_none")]
52    pub url: Option<String>,
53}
54
55/// Extract text content from a PDF byte slice, or an empty string on failure.
56#[must_use]
57pub fn extract_pdf(data: &[u8]) -> String {
58    match pdf_extract::extract_text_from_mem(data) {
59        Ok(text) => text,
60        Err(e) => {
61            tracing::warn!(error = %e, "PDF text extraction failed");
62            String::new()
63        }
64    }
65}
66
67/// Input parameters for content extraction.
68#[non_exhaustive]
69pub struct ExtractInput<'a> {
70    /// Raw HTML of the page.
71    pub html: &'a str,
72    /// URL of the page (used for resolving relative links).
73    pub url: &'a str,
74    /// JSON-serialized layout data from the injected JS, if available.
75    pub layout_json: Option<&'a str>,
76    /// JSON-serialized visibility data from the injected JS, if available.
77    pub visibility_json: Option<&'a str>,
78    /// AccessKit accessibility tree, if available.
79    pub a11y: Option<&'a HashMap<NodeId, Node>>,
80    /// `document.body.innerText` fallback, if available.
81    pub inner_text: Option<&'a str>,
82    /// CSS selector to extract a specific section instead of using Readability.
83    pub selector: Option<&'a str>,
84    /// Visibility policy controlling which hidden content is stripped.
85    pub visibility: VisibilityPolicy,
86}
87
88impl<'a> ExtractInput<'a> {
89    /// Create a new `ExtractInput` with required fields.
90    #[must_use]
91    pub fn new(html: &'a str, url: &'a str) -> Self {
92        Self {
93            html,
94            url,
95            layout_json: None,
96            visibility_json: None,
97            a11y: None,
98            inner_text: None,
99            selector: None,
100            visibility: VisibilityPolicy::default(),
101        }
102    }
103
104    /// Set the layout JSON data.
105    #[must_use]
106    pub fn with_layout_json(mut self, layout_json: Option<&'a str>) -> Self {
107        self.layout_json = layout_json;
108        self
109    }
110
111    /// Set the visibility JSON data.
112    #[must_use]
113    pub fn with_visibility_json(mut self, visibility_json: Option<&'a str>) -> Self {
114        self.visibility_json = visibility_json;
115        self
116    }
117
118    /// Set the typed accessibility tree.
119    #[must_use]
120    pub fn with_a11y(mut self, a11y: Option<&'a HashMap<NodeId, Node>>) -> Self {
121        self.a11y = a11y;
122        self
123    }
124
125    /// Set the inner text fallback.
126    #[must_use]
127    pub fn with_inner_text(mut self, inner_text: Option<&'a str>) -> Self {
128        self.inner_text = inner_text;
129        self
130    }
131
132    /// Set the CSS selector for targeted extraction.
133    #[must_use]
134    pub fn with_selector(mut self, selector: Option<&'a str>) -> Self {
135        self.selector = selector;
136        self
137    }
138
139    /// Set the visibility policy.
140    #[must_use]
141    pub fn with_visibility(mut self, policy: VisibilityPolicy) -> Self {
142        self.visibility = policy;
143        self
144    }
145}
146
147/// Extract readable content as Markdown text.
148pub fn extract_text(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
149    if let Some(selector) = input.selector {
150        return extract_by_selector(input, selector);
151    }
152    let article = parse_article(input);
153
154    let mut out = String::new();
155    if !article.title.is_empty() {
156        writeln!(out, "# {}\n", article.title)?;
157    }
158    if let Some(ref byline) = article.byline {
159        writeln!(out, "*{}*\n", byline.replace('*', r"\*"))?;
160    }
161    if let Some(ref excerpt) = article.excerpt {
162        writeln!(out, "> {excerpt}\n")?;
163    }
164    write!(out, "{}", article.text_content)?;
165    Ok(clean_markdown(&out))
166}
167
168/// Extract readable content as JSON.
169pub fn extract_json(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
170    if let Some(selector) = input.selector {
171        let text = extract_by_selector(input, selector)?;
172        let data = ArticleData {
173            title: String::new(),
174            content: String::new(),
175            text_content: text,
176            byline: None,
177            excerpt: None,
178            lang: None,
179            url: Some(input.url.to_string()),
180        };
181        return Ok(serde_json::to_string_pretty(&data)?);
182    }
183    let article = parse_article(input);
184    let data = ArticleData {
185        title: article.title,
186        content: article.content,
187        text_content: article.text_content,
188        byline: article.byline,
189        excerpt: article.excerpt,
190        lang: article.lang,
191        url: Some(input.url.to_string()),
192    };
193    Ok(serde_json::to_string_pretty(&data)?)
194}
195
/// Internal result of `parse_article`, shared by the text and JSON outputs.
struct ParsedArticle {
    /// Page title (may be empty).
    title: String,
    /// Article HTML; empty when the plain-text fallback was used.
    content: String,
    /// Markdown or plain-text rendering of the article body.
    text_content: String,
    /// Author / byline, when one was found.
    byline: Option<String>,
    /// Short summary, when one was found.
    excerpt: Option<String>,
    /// Document language, when one was found.
    lang: Option<String>,
}
204
/// Report whether `text` contains one of the phrases that mark the
/// "Application error: a client-side exception has occurred" page.
///
/// The match is a plain substring test, so the phrase may appear anywhere
/// in the text.
fn is_nextjs_error_page(text: &str) -> bool {
    const MARKERS: [&str; 2] = ["client-side exception has occurred", "Application error: a"];

    let body = text.trim();
    MARKERS.iter().any(|marker| body.contains(*marker))
}
209
210fn parse_article(input: &ExtractInput<'_>) -> ParsedArticle {
211    let filtered = filter(input);
212
213    let doc = Document::from(filtered.as_ref());
214    if let Ok(mut readability) = Readability::with_document(doc, Some(input.url), None) {
215        if let Ok(article) = readability.parse() {
216            if !is_nextjs_error_page(&article.text_content) {
217                let converter = HtmlToMarkdown::builder().build();
218                let markdown = converter
219                    .convert(&article.content)
220                    .unwrap_or_else(|_| article.text_content.to_string());
221                return ParsedArticle {
222                    title: article.title.clone(),
223                    content: article.content.to_string(),
224                    text_content: markdown,
225                    byline: article.byline.clone(),
226                    excerpt: article.excerpt.clone(),
227                    lang: article.lang,
228                };
229            }
230        }
231    }
232
233    // Readability failed or returned an error page — fall back to the filtered
234    // document's text content.
235    let doc = Document::from(filtered.as_ref());
236    doc.select("script, style, noscript").remove();
237    let title = doc.select("title").text().to_string();
238    let filtered_text = doc.select("body").text().to_string();
239    let body_text = if filtered_text.trim().is_empty() {
240        input.inner_text.filter(|s| !s.trim().is_empty()).map_or_else(
241            || {
242                tracing::warn!(r#"could not extract content; try --js "document.body.innerText" for JS-heavy sites"#);
243                String::new()
244            },
245            String::from,
246        )
247    } else {
248        filtered_text
249    };
250    ParsedArticle {
251        title,
252        content: String::new(),
253        text_content: body_text,
254        byline: None,
255        excerpt: None,
256        lang: None,
257    }
258}
259
260fn extract_by_selector(input: &ExtractInput<'_>, selector: &str) -> Result<String, ExtractError> {
261    let matcher = dom_query::Matcher::new(selector).map_err(|_| ExtractError::InvalidSelector)?;
262    let filtered = filter(input);
263    let doc = Document::from(filtered.as_ref());
264    let selected = doc.select_matcher(&matcher);
265    let fragment = selected.html();
266    if fragment.is_empty() {
267        return Ok(String::new());
268    }
269    let converter = HtmlToMarkdown::builder().skip_tags(vec!["script", "style"]).build();
270    let markdown = converter
271        .convert(&fragment)
272        .unwrap_or_else(|_| selected.text().to_string());
273    Ok(clean_markdown(&markdown))
274}
275
276fn filter<'a>(input: &'a ExtractInput<'a>) -> Cow<'a, str> {
277    let mut selectors: Vec<String> = Vec::new();
278
279    if let Some(lj) = input.layout_json
280        && let Ok(els) = serde_json::from_str::<Vec<LayoutElement>>(lj)
281    {
282        selectors.extend(layout::selectors_to_strip(&els));
283    }
284
285    let a11y_index = input.a11y.map(A11yIndex::new);
286
287    selectors.extend(visibility::selectors_to_strip(
288        input.visibility,
289        a11y_index.as_ref(),
290        input.visibility_json,
291    ));
292
293    let needs_attr_cleanup = input.visibility_json.is_some() || input.html.contains("data-vf-id=");
294    if selectors.is_empty() && !needs_attr_cleanup {
295        return Cow::Borrowed(input.html);
296    }
297
298    let doc = Document::from(input.html);
299    for sel in &selectors {
300        doc.select(sel).remove();
301    }
302    if needs_attr_cleanup {
303        doc.select("[data-vf-id]").remove_attr("data-vf-id");
304    }
305    Cow::Owned(doc.html().to_string())
306}
307
/// Limit consecutive blank lines to at most two.
///
/// Lines containing only whitespace count as blank and are emitted as empty
/// lines. Every emitted line is terminated with `\n`, so non-empty output
/// always ends with a newline.
fn clean_markdown(input: &str) -> String {
    const MAX_BLANK_RUN: u8 = 2;

    let mut cleaned = String::with_capacity(input.len());
    let mut blank_run = 0u8;

    for line in input.lines() {
        if line.trim().is_empty() {
            blank_run = blank_run.saturating_add(1);
            if blank_run > MAX_BLANK_RUN {
                // Already emitted the allowed number of blank lines.
                continue;
            }
        } else {
            blank_run = 0;
            cleaned.push_str(line);
        }
        cleaned.push('\n');
    }

    cleaned
}
326
#[cfg(test)]
mod tests {
    use super::*;

    // --- Next.js error-page detection -------------------------------------

    #[test]
    fn is_nextjs_error_page_detects_nextjs() {
        let banner = "Application error: a client-side exception has occurred";
        assert!(is_nextjs_error_page(banner));
    }

    #[test]
    fn is_nextjs_error_page_ignores_normal_content() {
        let ordinary = [
            "This article discusses error handling in Rust.",
            "A long page about many topics that happens to mention errors somewhere in the middle of a paragraph.",
        ];
        for text in ordinary {
            assert!(!is_nextjs_error_page(text));
        }
    }

    // --- Markdown clean-up --------------------------------------------------

    #[test]
    fn clean_markdown_collapses_blank_lines() {
        // Four blank lines between the two lines shrink to two.
        let cleaned = clean_markdown("line1\n\n\n\n\nline2\n");
        assert_eq!(cleaned, "line1\n\n\nline2\n");
    }

    #[test]
    fn clean_markdown_preserves_single_blank() {
        let markdown = "a\n\nb\n";
        assert_eq!(clean_markdown(markdown), "a\n\nb\n");
    }

    #[test]
    fn clean_markdown_no_trailing_newline() {
        // Output is always newline-terminated, even if the input was not.
        let cleaned = clean_markdown("line1\nline2");
        assert_eq!(cleaned, "line1\nline2\n");
    }

    // --- HTML filtering -----------------------------------------------------

    #[test]
    fn filter_off_policy_keeps_visible_content() {
        let page = ExtractInput::new("<html><body>hello</body></html>", "").with_visibility(VisibilityPolicy::off());
        let filtered = filter(&page);
        assert!(filtered.contains("hello"));
    }

    #[test]
    fn filter_strips_footer() {
        let html = r#"<html><body><footer style="position:static">nav</footer><p>content</p></body></html>"#;
        let layout = r#"[{"tag":"FOOTER","role":null,"w":1280,"h":100,"position":"static"}]"#;
        let page = ExtractInput::new(html, "")
            .with_layout_json(Some(layout))
            .with_visibility(VisibilityPolicy::off());

        let filtered = filter(&page);

        assert!(!filtered.contains("<footer"));
        assert!(filtered.contains("content"));
    }

    #[test]
    fn filter_strips_visibility_flagged_element() {
        let html = r#"<html><body><p data-vf-id="1">drop</p><p data-vf-id="2">keep</p></body></html>"#;
        let visibility = r#"[{"id":"1","flags":16}]"#;
        let page = ExtractInput::new(html, "")
            .with_visibility_json(Some(visibility))
            .with_visibility(VisibilityPolicy::moderate());

        let filtered = filter(&page);

        assert!(!filtered.contains("drop"));
        assert!(filtered.contains("keep"));
    }

    #[test]
    fn filter_removes_data_vf_id_from_output() {
        let html = r#"<html><body><p data-vf-id="1">keep</p></body></html>"#;
        let page = ExtractInput::new(html, "")
            .with_layout_json(Some("[]"))
            .with_visibility(VisibilityPolicy::off());

        let filtered = filter(&page);

        assert!(!filtered.contains("data-vf-id"));
    }

    // --- Builder ------------------------------------------------------------

    #[test]
    fn extract_input_builder() {
        let built = ExtractInput::new("<html></html>", "https://example.com")
            .with_layout_json(Some("[]"))
            .with_visibility_json(Some(r"[]"))
            .with_inner_text(Some("hello"))
            .with_selector(Some("article"))
            .with_visibility(VisibilityPolicy::strict());

        assert_eq!(built.layout_json, Some("[]"));
        assert_eq!(built.visibility_json, Some("[]"));
        assert_eq!(built.inner_text, Some("hello"));
        assert_eq!(built.selector, Some("article"));
    }
}