Skip to main content

servo_fetch/
extract.rs

1//! Content extraction — converts raw HTML into readable Markdown or structured JSON.
2
3use std::borrow::Cow;
4use std::fmt::Write;
5
6use dom_query::Document;
7use dom_smoothie::Readability;
8use htmd::HtmlToMarkdown;
9use serde::Serialize;
10
11use crate::layout::{self, LayoutElement};
12
13/// Errors that can occur during content extraction.
14#[derive(Debug, thiserror::Error)]
15#[non_exhaustive]
16pub enum ExtractError {
17    /// Failed to format Markdown output.
18    #[error("markdown formatting failed")]
19    Fmt(#[from] std::fmt::Error),
20    /// Failed to serialize JSON output.
21    #[error("JSON serialization failed")]
22    Json(#[from] serde_json::Error),
23}
24
25/// Structured article data for JSON output.
26#[derive(Serialize)]
27#[non_exhaustive]
28pub struct ArticleData {
29    /// Page title.
30    pub title: String,
31    /// Raw HTML content extracted by Readability.
32    pub content: String,
33    /// Readable text content (Markdown).
34    pub text_content: String,
35    /// Author or byline, if detected.
36    #[serde(skip_serializing_if = "Option::is_none")]
37    pub byline: Option<String>,
38    /// Short excerpt or description.
39    #[serde(skip_serializing_if = "Option::is_none")]
40    pub excerpt: Option<String>,
41    /// Document language (e.g. "en").
42    #[serde(skip_serializing_if = "Option::is_none")]
43    pub lang: Option<String>,
44    /// Canonical URL.
45    #[serde(skip_serializing_if = "Option::is_none")]
46    pub url: Option<String>,
47}
48
49/// Extract text content from a PDF byte slice, or an empty string on failure.
50#[must_use]
51pub fn extract_pdf(data: &[u8]) -> String {
52    match pdf_extract::extract_text_from_mem(data) {
53        Ok(text) => text,
54        Err(e) => {
55            tracing::warn!(error = %e, "PDF text extraction failed");
56            String::new()
57        }
58    }
59}
60
/// Input parameters for content extraction.
///
/// Every field borrows from the caller, so the value is cheap to clone.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ExtractInput<'a> {
    /// Raw HTML of the page.
    pub html: &'a str,
    /// URL of the page (used for resolving relative links).
    pub url: &'a str,
    /// JSON-serialized layout data from the injected JS, if available.
    pub layout_json: Option<&'a str>,
    /// `document.body.innerText` fallback, if available.
    pub inner_text: Option<&'a str>,
    /// CSS selector to extract a specific section instead of using Readability.
    pub selector: Option<&'a str>,
}
75
76impl<'a> ExtractInput<'a> {
77    /// Create a new `ExtractInput` with required fields.
78    #[must_use]
79    pub fn new(html: &'a str, url: &'a str) -> Self {
80        Self {
81            html,
82            url,
83            layout_json: None,
84            inner_text: None,
85            selector: None,
86        }
87    }
88
89    /// Set the layout JSON data.
90    #[must_use]
91    pub fn with_layout_json(mut self, layout_json: Option<&'a str>) -> Self {
92        self.layout_json = layout_json;
93        self
94    }
95
96    /// Set the inner text fallback.
97    #[must_use]
98    pub fn with_inner_text(mut self, inner_text: Option<&'a str>) -> Self {
99        self.inner_text = inner_text;
100        self
101    }
102
103    /// Set the CSS selector for targeted extraction.
104    #[must_use]
105    pub fn with_selector(mut self, selector: Option<&'a str>) -> Self {
106        self.selector = selector;
107        self
108    }
109}
110
111/// Extract readable content as Markdown text.
112///
113/// # Errors
114/// Returns [`ExtractError::Fmt`] if Markdown assembly fails.
115pub fn extract_text(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
116    if let Some(selector) = input.selector {
117        return Ok(extract_by_selector(input.html, input.layout_json, selector));
118    }
119    let article = parse_article(input.html, input.url, input.layout_json, input.inner_text);
120
121    let mut out = String::new();
122    if !article.title.is_empty() {
123        writeln!(out, "# {}\n", article.title)?;
124    }
125    if let Some(ref byline) = article.byline {
126        writeln!(out, "*{}*\n", byline.replace('*', r"\*"))?;
127    }
128    if let Some(ref excerpt) = article.excerpt {
129        writeln!(out, "> {excerpt}\n")?;
130    }
131    write!(out, "{}", article.text_content)?;
132    Ok(clean_markdown(&out))
133}
134
135/// Extract readable content as JSON.
136///
137/// # Errors
138/// Returns [`ExtractError::Json`] if JSON serialization fails.
139pub fn extract_json(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
140    if let Some(selector) = input.selector {
141        let text = extract_by_selector(input.html, input.layout_json, selector);
142        let data = ArticleData {
143            title: String::new(),
144            content: String::new(),
145            text_content: text,
146            byline: None,
147            excerpt: None,
148            lang: None,
149            url: Some(input.url.to_string()),
150        };
151        return Ok(serde_json::to_string_pretty(&data)?);
152    }
153    let article = parse_article(input.html, input.url, input.layout_json, input.inner_text);
154    let data = ArticleData {
155        title: article.title,
156        content: article.content,
157        text_content: article.text_content,
158        byline: article.byline,
159        excerpt: article.excerpt,
160        lang: article.lang,
161        url: Some(input.url.to_string()),
162    };
163    Ok(serde_json::to_string_pretty(&data)?)
164}
165
/// Internal result of article parsing, shared by the Markdown and JSON outputs.
struct ParsedArticle {
    /// Page title; may be empty.
    title: String,
    /// Article HTML, or empty when no article HTML was produced.
    content: String,
    /// Body text used for output (Markdown or a plain-text fallback).
    text_content: String,
    /// Author or byline, when one was detected.
    byline: Option<String>,
    /// Short excerpt or description, when one was detected.
    excerpt: Option<String>,
    /// Document language, when one was detected.
    lang: Option<String>,
}
174
/// Report whether `text` contains one of the phrases of the Next.js
/// "Application error" crash banner.
///
/// The match is a plain, case-sensitive substring search anywhere in `text`.
fn is_nextjs_error_page(text: &str) -> bool {
    const MARKERS: [&str; 2] = ["client-side exception has occurred", "Application error: a"];
    MARKERS.iter().any(|marker| text.contains(marker))
}
179
180fn parse_article(html: &str, url: &str, layout_json: Option<&str>, inner_text: Option<&str>) -> ParsedArticle {
181    let filtered = filter(html, layout_json);
182
183    let doc = Document::from(filtered.as_ref());
184    if let Ok(mut readability) = Readability::with_document(doc, Some(url), None) {
185        if let Ok(article) = readability.parse() {
186            if !is_nextjs_error_page(&article.text_content) {
187                let converter = HtmlToMarkdown::builder().build();
188                let markdown = converter
189                    .convert(&article.content)
190                    .unwrap_or_else(|_| article.text_content.to_string());
191                return ParsedArticle {
192                    title: article.title.clone(),
193                    content: article.content.to_string(),
194                    text_content: markdown,
195                    byline: article.byline.clone(),
196                    excerpt: article.excerpt.clone(),
197                    lang: article.lang,
198                };
199            }
200        }
201    }
202
203    // Readability failed or returned an error page — fall back to innerText.
204    let doc = Document::from(filtered.as_ref());
205    let title = doc.select("title").text().to_string();
206    let body_text = inner_text.filter(|s| !s.trim().is_empty()).map_or_else(
207        || {
208            tracing::warn!(r#"could not extract content; try --js "document.body.innerText" for JS-heavy sites"#);
209            String::new()
210        },
211        String::from,
212    );
213    ParsedArticle {
214        title,
215        content: String::new(),
216        text_content: body_text,
217        byline: None,
218        excerpt: None,
219        lang: None,
220    }
221}
222
223fn extract_by_selector(html: &str, layout_json: Option<&str>, selector: &str) -> String {
224    let filtered = filter(html, layout_json);
225    let doc = Document::from(filtered.as_ref());
226    let selected = doc.select(selector);
227    let fragment = selected.html();
228    if fragment.is_empty() {
229        return String::new();
230    }
231    let converter = HtmlToMarkdown::builder().skip_tags(vec!["script", "style"]).build();
232    let markdown = converter
233        .convert(&fragment)
234        .unwrap_or_else(|_| selected.text().to_string());
235    clean_markdown(&markdown)
236}
237
238fn filter<'a>(html: &'a str, layout_json: Option<&str>) -> Cow<'a, str> {
239    layout_json
240        .and_then(|lj| serde_json::from_str::<Vec<LayoutElement>>(lj).ok())
241        .map_or(Cow::Borrowed(html), |els| {
242            let sels = layout::selectors_to_strip(&els);
243            if sels.is_empty() {
244                return Cow::Borrowed(html);
245            }
246            let doc = Document::from(html);
247            for sel in &sels {
248                doc.select(sel).remove();
249            }
250            Cow::Owned(doc.html().to_string())
251        })
252}
253
/// Collapse runs of 3+ blank lines down to 2.
///
/// A line counts as blank when it is empty or whitespace-only; blank lines
/// that are kept are emitted as bare newlines. Every emitted line ends with
/// `\n`, so input without a trailing newline gains one.
fn clean_markdown(input: &str) -> String {
    const MAX_BLANK_RUN: usize = 2;

    let mut cleaned = String::with_capacity(input.len());
    let mut blank_run = 0usize;
    for line in input.lines() {
        if line.trim().is_empty() {
            blank_run += 1;
            if blank_run > MAX_BLANK_RUN {
                // Third or later blank line in a row: drop it.
                continue;
            }
        } else {
            blank_run = 0;
            cleaned.push_str(line);
        }
        cleaned.push('\n');
    }
    cleaned
}
272
#[cfg(test)]
mod tests {
    use super::*;

    /// The Next.js crash banner is recognised.
    #[test]
    fn is_nextjs_error_page_detects_nextjs() {
        let banner = "Application error: a client-side exception has occurred";
        assert!(is_nextjs_error_page(banner));
    }

    /// Ordinary prose that merely talks about errors is not flagged.
    #[test]
    fn is_nextjs_error_page_ignores_normal_content() {
        let ordinary = [
            "This article discusses error handling in Rust.",
            "A long page about many topics that happens to mention errors somewhere in the middle of a paragraph.",
        ];
        for text in ordinary {
            assert!(!is_nextjs_error_page(text), "false positive for {text:?}");
        }
    }

    /// Four blank lines between two text lines shrink to two.
    #[test]
    fn clean_markdown_collapses_blank_lines() {
        assert_eq!(clean_markdown("line1\n\n\n\n\nline2\n"), "line1\n\n\nline2\n");
    }

    /// A single blank line is left alone.
    #[test]
    fn clean_markdown_preserves_single_blank() {
        let text = "a\n\nb\n";
        assert_eq!(clean_markdown(text), text);
    }

    /// Without layout data the HTML passes through unchanged.
    #[test]
    fn filter_without_layout_returns_original() {
        let html = "<html><body>hello</body></html>";
        assert_eq!(filter(html, None).as_ref(), html);
    }

    /// A footer reported by the layout data is removed; the content stays.
    #[test]
    fn filter_strips_footer() {
        let html = r#"<html><body><footer style="position:static">nav</footer><p>content</p></body></html>"#;
        let layout = r#"[{"tag":"FOOTER","role":null,"w":1280,"h":100,"position":"static"}]"#;
        let stripped = filter(html, Some(layout));
        assert!(!stripped.contains("<footer"));
        assert!(stripped.contains("content"));
    }

    /// Each builder method stores the value it was given.
    #[test]
    fn extract_input_builder() {
        let base = ExtractInput::new("<html></html>", "https://example.com");
        let built = base
            .with_layout_json(Some("[]"))
            .with_inner_text(Some("hello"))
            .with_selector(Some("article"));
        assert_eq!(built.layout_json, Some("[]"));
        assert_eq!(built.inner_text, Some("hello"));
        assert_eq!(built.selector, Some("article"));
    }

    /// Input lacking a final newline gains one.
    #[test]
    fn clean_markdown_no_trailing_newline() {
        assert_eq!(clean_markdown("line1\nline2"), "line1\nline2\n");
    }
}