// servo_fetch/extract.rs

//! Content extraction — converts raw HTML into readable Markdown or structured JSON.

use std::borrow::Cow;
use std::fmt::Write;

use dom_query::Document;
use dom_smoothie::Readability;
use htmd::HtmlToMarkdown;
use serde::Serialize;

use crate::layout::{self, LayoutElement};

13/// Errors that can occur during content extraction.
14#[derive(Debug, thiserror::Error)]
15#[non_exhaustive]
16pub enum ExtractError {
17    /// Failed to format Markdown output.
18    #[error("markdown formatting failed")]
19    Fmt(#[from] std::fmt::Error),
20    /// Failed to serialize JSON output.
21    #[error("JSON serialization failed")]
22    Json(#[from] serde_json::Error),
23    /// The provided CSS selector is invalid.
24    #[error("invalid CSS selector")]
25    InvalidSelector,
26}
27
28/// Structured article data for JSON output.
29#[derive(Serialize)]
30#[non_exhaustive]
31pub struct ArticleData {
32    /// Page title.
33    pub title: String,
34    /// Raw HTML content extracted by Readability.
35    pub content: String,
36    /// Readable text content (Markdown).
37    pub text_content: String,
38    /// Author or byline, if detected.
39    #[serde(skip_serializing_if = "Option::is_none")]
40    pub byline: Option<String>,
41    /// Short excerpt or description.
42    #[serde(skip_serializing_if = "Option::is_none")]
43    pub excerpt: Option<String>,
44    /// Document language (e.g. "en").
45    #[serde(skip_serializing_if = "Option::is_none")]
46    pub lang: Option<String>,
47    /// Canonical URL.
48    #[serde(skip_serializing_if = "Option::is_none")]
49    pub url: Option<String>,
50}
51
52/// Extract text content from a PDF byte slice, or an empty string on failure.
53#[must_use]
54pub fn extract_pdf(data: &[u8]) -> String {
55    match pdf_extract::extract_text_from_mem(data) {
56        Ok(text) => text,
57        Err(e) => {
58            tracing::warn!(error = %e, "PDF text extraction failed");
59            String::new()
60        }
61    }
62}
63
/// Input parameters for content extraction.
///
/// Build one with [`ExtractInput::new`] and the `with_*` methods.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ExtractInput<'a> {
    /// Raw HTML of the page.
    pub html: &'a str,
    /// URL of the page (handed to Readability and echoed in JSON output).
    pub url: &'a str,
    /// JSON-serialized layout data from the injected JS, if available.
    pub layout_json: Option<&'a str>,
    /// `document.body.innerText` fallback, if available.
    pub inner_text: Option<&'a str>,
    /// CSS selector to extract a specific section instead of using Readability.
    pub selector: Option<&'a str>,
}

79impl<'a> ExtractInput<'a> {
80    /// Create a new `ExtractInput` with required fields.
81    #[must_use]
82    pub fn new(html: &'a str, url: &'a str) -> Self {
83        Self {
84            html,
85            url,
86            layout_json: None,
87            inner_text: None,
88            selector: None,
89        }
90    }
91
92    /// Set the layout JSON data.
93    #[must_use]
94    pub fn with_layout_json(mut self, layout_json: Option<&'a str>) -> Self {
95        self.layout_json = layout_json;
96        self
97    }
98
99    /// Set the inner text fallback.
100    #[must_use]
101    pub fn with_inner_text(mut self, inner_text: Option<&'a str>) -> Self {
102        self.inner_text = inner_text;
103        self
104    }
105
106    /// Set the CSS selector for targeted extraction.
107    #[must_use]
108    pub fn with_selector(mut self, selector: Option<&'a str>) -> Self {
109        self.selector = selector;
110        self
111    }
112}
113
114/// Extract readable content as Markdown text.
115///
116/// # Errors
117/// Returns [`ExtractError::Fmt`] if Markdown assembly fails.
118pub fn extract_text(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
119    if let Some(selector) = input.selector {
120        return extract_by_selector(input.html, input.layout_json, selector);
121    }
122    let article = parse_article(input.html, input.url, input.layout_json, input.inner_text);
123
124    let mut out = String::new();
125    if !article.title.is_empty() {
126        writeln!(out, "# {}\n", article.title)?;
127    }
128    if let Some(ref byline) = article.byline {
129        writeln!(out, "*{}*\n", byline.replace('*', r"\*"))?;
130    }
131    if let Some(ref excerpt) = article.excerpt {
132        writeln!(out, "> {excerpt}\n")?;
133    }
134    write!(out, "{}", article.text_content)?;
135    Ok(clean_markdown(&out))
136}
137
138/// Extract readable content as JSON.
139///
140/// # Errors
141/// Returns [`ExtractError::Json`] if JSON serialization fails.
142pub fn extract_json(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
143    if let Some(selector) = input.selector {
144        let text = extract_by_selector(input.html, input.layout_json, selector)?;
145        let data = ArticleData {
146            title: String::new(),
147            content: String::new(),
148            text_content: text,
149            byline: None,
150            excerpt: None,
151            lang: None,
152            url: Some(input.url.to_string()),
153        };
154        return Ok(serde_json::to_string_pretty(&data)?);
155    }
156    let article = parse_article(input.html, input.url, input.layout_json, input.inner_text);
157    let data = ArticleData {
158        title: article.title,
159        content: article.content,
160        text_content: article.text_content,
161        byline: article.byline,
162        excerpt: article.excerpt,
163        lang: article.lang,
164        url: Some(input.url.to_string()),
165    };
166    Ok(serde_json::to_string_pretty(&data)?)
167}
168
/// Internal result of article parsing, shared by the text and JSON outputs.
#[derive(Debug)]
struct ParsedArticle {
    /// Page title; may be empty.
    title: String,
    /// Article HTML from Readability; empty on the fallback path.
    content: String,
    /// Markdown rendering of `content`, or the plain `innerText` fallback.
    text_content: String,
    /// Author or byline, if detected.
    byline: Option<String>,
    /// Short excerpt or description, if detected.
    excerpt: Option<String>,
    /// Document language, if detected.
    lang: Option<String>,
}

/// Returns `true` when `text` contains one of the marker phrases of the
/// Next.js client-side error screen.
///
/// Matching is a case-sensitive substring search, so surrounding whitespace or
/// extra text around a marker does not matter.
fn is_nextjs_error_page(text: &str) -> bool {
    const MARKERS: [&str; 2] = ["client-side exception has occurred", "Application error: a"];
    MARKERS.iter().any(|marker| text.contains(*marker))
}

183fn parse_article(html: &str, url: &str, layout_json: Option<&str>, inner_text: Option<&str>) -> ParsedArticle {
184    let filtered = filter(html, layout_json);
185
186    let doc = Document::from(filtered.as_ref());
187    if let Ok(mut readability) = Readability::with_document(doc, Some(url), None) {
188        if let Ok(article) = readability.parse() {
189            if !is_nextjs_error_page(&article.text_content) {
190                let converter = HtmlToMarkdown::builder().build();
191                let markdown = converter
192                    .convert(&article.content)
193                    .unwrap_or_else(|_| article.text_content.to_string());
194                return ParsedArticle {
195                    title: article.title.clone(),
196                    content: article.content.to_string(),
197                    text_content: markdown,
198                    byline: article.byline.clone(),
199                    excerpt: article.excerpt.clone(),
200                    lang: article.lang,
201                };
202            }
203        }
204    }
205
206    // Readability failed or returned an error page — fall back to innerText.
207    let doc = Document::from(filtered.as_ref());
208    let title = doc.select("title").text().to_string();
209    let body_text = inner_text.filter(|s| !s.trim().is_empty()).map_or_else(
210        || {
211            tracing::warn!(r#"could not extract content; try --js "document.body.innerText" for JS-heavy sites"#);
212            String::new()
213        },
214        String::from,
215    );
216    ParsedArticle {
217        title,
218        content: String::new(),
219        text_content: body_text,
220        byline: None,
221        excerpt: None,
222        lang: None,
223    }
224}
225
226fn extract_by_selector(html: &str, layout_json: Option<&str>, selector: &str) -> Result<String, ExtractError> {
227    let matcher = dom_query::Matcher::new(selector).map_err(|_| ExtractError::InvalidSelector)?;
228    let filtered = filter(html, layout_json);
229    let doc = Document::from(filtered.as_ref());
230    let selected = doc.select_matcher(&matcher);
231    let fragment = selected.html();
232    if fragment.is_empty() {
233        return Ok(String::new());
234    }
235    let converter = HtmlToMarkdown::builder().skip_tags(vec!["script", "style"]).build();
236    let markdown = converter
237        .convert(&fragment)
238        .unwrap_or_else(|_| selected.text().to_string());
239    Ok(clean_markdown(&markdown))
240}
241
242fn filter<'a>(html: &'a str, layout_json: Option<&str>) -> Cow<'a, str> {
243    layout_json
244        .and_then(|lj| serde_json::from_str::<Vec<LayoutElement>>(lj).ok())
245        .map_or(Cow::Borrowed(html), |els| {
246            let sels = layout::selectors_to_strip(&els);
247            if sels.is_empty() {
248                return Cow::Borrowed(html);
249            }
250            let doc = Document::from(html);
251            for sel in &sels {
252                doc.select(sel).remove();
253            }
254            Cow::Owned(doc.html().to_string())
255        })
256}
257
/// Collapse runs of three or more blank lines down to two.
///
/// Lines consisting only of whitespace count as blank and are emitted as empty
/// lines. Every emitted line is terminated with `\n`, so non-empty output
/// always ends with a newline even when `input` does not.
fn clean_markdown(input: &str) -> String {
    // Maximum number of consecutive blank lines kept in the output.
    const MAX_BLANK_RUN: u8 = 2;

    let mut cleaned = String::with_capacity(input.len());
    let mut blank_run = 0u8;
    for line in input.lines() {
        if line.trim().is_empty() {
            // Saturate so a very long run cannot wrap the counter back under the limit.
            blank_run = blank_run.saturating_add(1);
            if blank_run > MAX_BLANK_RUN {
                continue;
            }
        } else {
            blank_run = 0;
            cleaned.push_str(line);
        }
        cleaned.push('\n');
    }
    cleaned
}

#[cfg(test)]
mod tests {
    use super::*;

    // --- is_nextjs_error_page -------------------------------------------------

    #[test]
    fn is_nextjs_error_page_detects_nextjs() {
        assert!(is_nextjs_error_page(
            "Application error: a client-side exception has occurred"
        ));
    }

    #[test]
    fn is_nextjs_error_page_detects_marker_inside_other_text() {
        assert!(is_nextjs_error_page("  \nApplication error: a server-side exception\n"));
    }

    #[test]
    fn is_nextjs_error_page_ignores_normal_content() {
        assert!(!is_nextjs_error_page("This article discusses error handling in Rust."));
        assert!(!is_nextjs_error_page(
            "A long page about many topics that happens to mention errors somewhere in the middle of a paragraph."
        ));
    }

    // --- clean_markdown -------------------------------------------------------

    #[test]
    fn clean_markdown_collapses_blank_lines() {
        assert_eq!(clean_markdown("line1\n\n\n\n\nline2\n"), "line1\n\n\nline2\n");
    }

    #[test]
    fn clean_markdown_preserves_single_blank() {
        assert_eq!(clean_markdown("a\n\nb\n"), "a\n\nb\n");
    }

    #[test]
    fn clean_markdown_no_trailing_newline() {
        assert_eq!(clean_markdown("line1\nline2"), "line1\nline2\n");
    }

    #[test]
    fn clean_markdown_empty_input() {
        assert_eq!(clean_markdown(""), "");
    }

    #[test]
    fn clean_markdown_treats_whitespace_only_lines_as_blank() {
        assert_eq!(clean_markdown("a\n   \n\t\n \nb"), "a\n\n\nb\n");
    }

    // --- filter ---------------------------------------------------------------

    #[test]
    fn filter_without_layout_returns_original() {
        let html = "<html><body>hello</body></html>";
        assert_eq!(filter(html, None).as_ref(), html);
    }

    #[test]
    fn filter_strips_footer() {
        let html = r#"<html><body><footer style="position:static">nav</footer><p>content</p></body></html>"#;
        let layout = r#"[{"tag":"FOOTER","role":null,"w":1280,"h":100,"position":"static"}]"#;
        let filtered = filter(html, Some(layout));
        assert!(!filtered.contains("<footer"));
        assert!(filtered.contains("content"));
    }

    // --- ExtractInput ---------------------------------------------------------

    #[test]
    fn extract_input_builder() {
        let input = ExtractInput::new("<html></html>", "https://example.com")
            .with_layout_json(Some("[]"))
            .with_inner_text(Some("hello"))
            .with_selector(Some("article"));
        assert_eq!(input.layout_json, Some("[]"));
        assert_eq!(input.inner_text, Some("hello"));
        assert_eq!(input.selector, Some("article"));
    }
}