Skip to main content

servo_fetch/
extract.rs

1//! Content extraction — converts raw HTML into readable Markdown or structured JSON.
2
3use std::borrow::Cow;
4use std::fmt::Write;
5
6use dom_query::Document;
7use dom_smoothie::Readability;
8use htmd::HtmlToMarkdown;
9use serde::Serialize;
10
11use crate::layout::{self, LayoutElement};
12
13/// Errors that can occur during content extraction.
14#[derive(Debug, thiserror::Error)]
15#[non_exhaustive]
16pub enum ExtractError {
17    /// Failed to format Markdown output.
18    #[error("markdown formatting failed")]
19    Fmt(#[from] std::fmt::Error),
20    /// Failed to serialize JSON output.
21    #[error("JSON serialization failed")]
22    Json(#[from] serde_json::Error),
23}
24
25/// Structured article data for JSON output.
26#[derive(Serialize)]
27#[non_exhaustive]
28pub struct ArticleData {
29    /// Page title.
30    pub title: String,
31    /// Raw HTML content extracted by Readability.
32    pub content: String,
33    /// Readable text content (Markdown).
34    pub text_content: String,
35    /// Author or byline, if detected.
36    #[serde(skip_serializing_if = "Option::is_none")]
37    pub byline: Option<String>,
38    /// Short excerpt or description.
39    #[serde(skip_serializing_if = "Option::is_none")]
40    pub excerpt: Option<String>,
41    /// Document language (e.g. "en").
42    #[serde(skip_serializing_if = "Option::is_none")]
43    pub lang: Option<String>,
44    /// Canonical URL.
45    #[serde(skip_serializing_if = "Option::is_none")]
46    pub url: Option<String>,
47}
48
49/// Extract text content from a PDF byte slice.
50///
51/// Returns the extracted text, or an empty string if extraction fails.
52#[must_use]
53pub fn extract_pdf(data: &[u8]) -> String {
54    match pdf_extract::extract_text_from_mem(data) {
55        Ok(text) => text,
56        Err(e) => {
57            eprintln!("warning: PDF text extraction failed: {e}");
58            String::new()
59        }
60    }
61}
62
/// Input parameters for content extraction.
///
/// Build one with [`ExtractInput::new`] and then set the optional fields
/// directly; all of them default to `None`.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ExtractInput<'a> {
    /// Raw HTML of the page.
    pub html: &'a str,
    /// URL of the page (used for resolving relative links).
    pub url: &'a str,
    /// JSON-serialized layout data from the injected JS, if available.
    pub layout_json: Option<&'a str>,
    /// `document.body.innerText` fallback, if available.
    pub inner_text: Option<&'a str>,
    /// CSS selector to extract a specific section instead of using Readability.
    pub selector: Option<&'a str>,
}

impl<'a> ExtractInput<'a> {
    /// Create a new `ExtractInput` from the two required fields.
    ///
    /// `layout_json`, `inner_text` and `selector` all start out as `None`.
    #[must_use]
    pub fn new(html: &'a str, url: &'a str) -> Self {
        Self {
            html,
            url,
            layout_json: None,
            inner_text: None,
            selector: None,
        }
    }
}
91
92/// Extract readable content as Markdown text.
93///
94/// # Errors
95///
96/// Returns [`ExtractError::Fmt`] if the Markdown assembly fails.
97pub fn extract_text(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
98    if let Some(selector) = input.selector {
99        return Ok(extract_by_selector(input.html, input.layout_json, selector));
100    }
101    let article = parse_article(input.html, input.url, input.layout_json, input.inner_text);
102
103    let mut out = String::new();
104    if !article.title.is_empty() {
105        writeln!(out, "# {}\n", article.title)?;
106    }
107    if let Some(ref byline) = article.byline {
108        writeln!(out, "*{}*\n", byline.replace('*', r"\*"))?;
109    }
110    if let Some(ref excerpt) = article.excerpt {
111        writeln!(out, "> {excerpt}\n")?;
112    }
113    write!(out, "{}", article.text_content)?;
114    Ok(clean_markdown(&out))
115}
116
117/// Extract readable content as JSON.
118///
119/// # Errors
120///
121/// Returns [`ExtractError::Json`] if JSON serialization fails.
122pub fn extract_json(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
123    if let Some(selector) = input.selector {
124        let text = extract_by_selector(input.html, input.layout_json, selector);
125        let data = ArticleData {
126            title: String::new(),
127            content: String::new(),
128            text_content: text,
129            byline: None,
130            excerpt: None,
131            lang: None,
132            url: Some(input.url.to_string()),
133        };
134        return Ok(serde_json::to_string_pretty(&data)?);
135    }
136    let article = parse_article(input.html, input.url, input.layout_json, input.inner_text);
137    let data = ArticleData {
138        title: article.title,
139        content: article.content,
140        text_content: article.text_content,
141        byline: article.byline,
142        excerpt: article.excerpt,
143        lang: article.lang,
144        url: Some(input.url.to_string()),
145    };
146    Ok(serde_json::to_string_pretty(&data)?)
147}
148
/// Internal result of `parse_article`, shared by the Markdown and JSON
/// output paths.
struct ParsedArticle {
    /// Page title; may be empty.
    title: String,
    /// Article HTML; empty when the inner-text fallback was used.
    content: String,
    /// Article body as Markdown, or plain inner text on the fallback path.
    text_content: String,
    /// Author or byline, if one was detected.
    byline: Option<String>,
    /// Short excerpt or description, if one was detected.
    excerpt: Option<String>,
    /// Document language, if one was detected.
    lang: Option<String>,
}
157
/// Report whether `text` contains one of the marker phrases used to recognise
/// a Next.js "Application error" page.
///
/// The match is a case-sensitive substring search anywhere in `text`.
// NOTE(review): an article that merely quotes one of these phrases is also
// flagged — confirm whether a length or position check is wanted.
fn is_nextjs_error_page(text: &str) -> bool {
    const MARKERS: [&str; 2] = ["client-side exception has occurred", "Application error: a"];
    MARKERS.iter().any(|&marker| text.contains(marker))
}
162
163fn parse_article(html: &str, url: &str, layout_json: Option<&str>, inner_text: Option<&str>) -> ParsedArticle {
164    let filtered = filter(html, layout_json);
165
166    let doc = Document::from(filtered.as_ref());
167    if let Ok(mut readability) = Readability::with_document(doc, Some(url), None) {
168        if let Ok(article) = readability.parse() {
169            if !is_nextjs_error_page(&article.text_content) {
170                let converter = HtmlToMarkdown::builder().build();
171                let markdown = converter
172                    .convert(&article.content)
173                    .unwrap_or_else(|_| article.text_content.to_string());
174                return ParsedArticle {
175                    title: article.title.clone(),
176                    content: article.content.to_string(),
177                    text_content: markdown,
178                    byline: article.byline.clone(),
179                    excerpt: article.excerpt.clone(),
180                    lang: article.lang.clone(),
181                };
182            }
183        }
184    }
185
186    // Readability failed or returned an error page — fall back to innerText.
187    let doc = Document::from(filtered.as_ref());
188    let title = doc.select("title").text().to_string();
189    let body_text = inner_text.filter(|s| !s.trim().is_empty()).map_or_else(
190        || {
191            eprintln!(
192                "warning: could not extract content. \
193                 Try --js \"document.body.innerText\" for JS-heavy sites."
194            );
195            String::new()
196        },
197        String::from,
198    );
199    ParsedArticle {
200        title,
201        content: String::new(),
202        text_content: body_text,
203        byline: None,
204        excerpt: None,
205        lang: None,
206    }
207}
208
209fn extract_by_selector(html: &str, layout_json: Option<&str>, selector: &str) -> String {
210    let filtered = filter(html, layout_json);
211    let doc = Document::from(filtered.as_ref());
212    let selected = doc.select(selector);
213    let fragment = selected.html();
214    if fragment.is_empty() {
215        return String::new();
216    }
217    let converter = HtmlToMarkdown::builder().skip_tags(vec!["script", "style"]).build();
218    let markdown = converter
219        .convert(&fragment)
220        .unwrap_or_else(|_| selected.text().to_string());
221    clean_markdown(&markdown)
222}
223
224fn filter<'a>(html: &'a str, layout_json: Option<&str>) -> Cow<'a, str> {
225    layout_json
226        .and_then(|lj| serde_json::from_str::<Vec<LayoutElement>>(lj).ok())
227        .map_or(Cow::Borrowed(html), |els| {
228            let sels = layout::selectors_to_strip(&els);
229            if sels.is_empty() {
230                return Cow::Borrowed(html);
231            }
232            let doc = Document::from(html);
233            for sel in &sels {
234                doc.select(sel).remove();
235            }
236            Cow::Owned(doc.html().to_string())
237        })
238}
239
// Limit every run of blank (empty or whitespace-only) lines to at most two.
//
// Non-blank lines are copied verbatim, blank lines are emitted as bare
// newlines, and every emitted line is terminated with `\n`.
fn clean_markdown(input: &str) -> String {
    const MAX_BLANK_RUN: usize = 2;

    let mut cleaned = String::with_capacity(input.len());
    let mut blanks_emitted = 0usize;
    for line in input.lines() {
        if !line.trim().is_empty() {
            blanks_emitted = 0;
            cleaned.push_str(line);
            cleaned.push('\n');
        } else if blanks_emitted < MAX_BLANK_RUN {
            blanks_emitted += 1;
            cleaned.push('\n');
        }
        // Any further blank line in the current run is dropped.
    }
    cleaned
}
258
#[cfg(test)]
mod tests {
    use super::*;

    // --- is_nextjs_error_page ---

    #[test]
    fn is_nextjs_error_page_detects_nextjs() {
        let banner = "Application error: a client-side exception has occurred";
        assert!(is_nextjs_error_page(banner));
    }

    #[test]
    fn is_nextjs_error_page_ignores_normal_content() {
        let short = "This article discusses error handling in Rust.";
        let long =
            "A long page about many topics that happens to mention errors somewhere in the middle of a paragraph.";
        assert!(!is_nextjs_error_page(short));
        assert!(!is_nextjs_error_page(long));
    }

    // --- clean_markdown ---

    #[test]
    fn clean_markdown_collapses_blank_lines() {
        // Four blank lines between the two text lines shrink to two.
        let collapsed = clean_markdown("line1\n\n\n\n\nline2\n");
        assert_eq!(collapsed, "line1\n\n\nline2\n");
    }

    #[test]
    fn clean_markdown_preserves_single_blank() {
        let unchanged = "a\n\nb\n";
        assert_eq!(clean_markdown(unchanged), unchanged);
    }

    // --- filter ---

    #[test]
    fn filter_without_layout_returns_original() {
        let page = "<html><body>hello</body></html>";
        let filtered = filter(page, None);
        assert_eq!(filtered.as_ref(), page);
    }

    #[test]
    fn filter_strips_footer() {
        let page = r#"<html><body><footer style="position:static">nav</footer><p>content</p></body></html>"#;
        let layout_data = r#"[{"tag":"FOOTER","role":null,"w":1280,"h":100,"position":"static"}]"#;
        let filtered = filter(page, Some(layout_data));
        assert!(!filtered.contains("<footer"));
        assert!(filtered.contains("content"));
    }
}