1use std::borrow::Cow;
4use std::collections::HashMap;
5use std::fmt::{self, Write as _};
6
7use dom_query::Document;
8use dom_smoothie::Readability;
9use htmd::HtmlToMarkdown;
10use serde::Serialize;
11use servo::accesskit::{Node, NodeId};
12
13use crate::layout::{self, LayoutElement};
14use crate::visibility::{self, A11yIndex, VisibilityPolicy};
15
/// Errors produced by the extraction entry points in this module.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum ExtractError {
    /// Writing formatted text into the Markdown output buffer failed.
    #[error("markdown formatting failed")]
    Fmt(#[from] fmt::Error),
    /// Serializing the extracted article data to JSON failed.
    #[error("JSON serialization failed")]
    Json(#[from] serde_json::Error),
    /// The caller-supplied CSS selector could not be parsed.
    #[error("invalid CSS selector")]
    InvalidSelector,
}
30
31#[derive(Serialize)]
33#[non_exhaustive]
34pub struct ArticleData {
35 pub title: String,
37 pub content: String,
39 pub text_content: String,
41 #[serde(skip_serializing_if = "Option::is_none")]
43 pub byline: Option<String>,
44 #[serde(skip_serializing_if = "Option::is_none")]
46 pub excerpt: Option<String>,
47 #[serde(skip_serializing_if = "Option::is_none")]
49 pub lang: Option<String>,
50 #[serde(skip_serializing_if = "Option::is_none")]
52 pub url: Option<String>,
53}
54
55#[must_use]
57pub fn extract_pdf(data: &[u8]) -> String {
58 match pdf_extract::extract_text_from_mem(data) {
59 Ok(text) => text,
60 Err(e) => {
61 tracing::warn!(error = %e, "PDF text extraction failed");
62 String::new()
63 }
64 }
65}
66
/// Borrowed inputs for a single extraction run.
///
/// Build one with [`ExtractInput::new`] and refine it with the `with_*`
/// methods.
#[non_exhaustive]
pub struct ExtractInput<'a> {
    /// HTML source of the page to extract from.
    pub html: &'a str,
    /// URL of the page; handed to the readability pass and echoed in JSON output.
    pub url: &'a str,
    /// Optional JSON array of layout elements used to pick elements to strip.
    pub layout_json: Option<&'a str>,
    /// Optional JSON visibility data forwarded to the visibility filter.
    pub visibility_json: Option<&'a str>,
    /// Optional accessibility nodes, indexed for the visibility filter.
    pub a11y: Option<&'a HashMap<NodeId, Node>>,
    /// Optional text used as a fallback when no body text survives filtering.
    pub inner_text: Option<&'a str>,
    /// Optional CSS selector restricting extraction to the matched content.
    pub selector: Option<&'a str>,
    /// Policy forwarded to the visibility filter.
    pub visibility: VisibilityPolicy,
}
87
88impl<'a> ExtractInput<'a> {
89 #[must_use]
91 pub fn new(html: &'a str, url: &'a str) -> Self {
92 Self {
93 html,
94 url,
95 layout_json: None,
96 visibility_json: None,
97 a11y: None,
98 inner_text: None,
99 selector: None,
100 visibility: VisibilityPolicy::default(),
101 }
102 }
103
104 #[must_use]
106 pub fn with_layout_json(mut self, layout_json: Option<&'a str>) -> Self {
107 self.layout_json = layout_json;
108 self
109 }
110
111 #[must_use]
113 pub fn with_visibility_json(mut self, visibility_json: Option<&'a str>) -> Self {
114 self.visibility_json = visibility_json;
115 self
116 }
117
118 #[must_use]
120 pub fn with_a11y(mut self, a11y: Option<&'a HashMap<NodeId, Node>>) -> Self {
121 self.a11y = a11y;
122 self
123 }
124
125 #[must_use]
127 pub fn with_inner_text(mut self, inner_text: Option<&'a str>) -> Self {
128 self.inner_text = inner_text;
129 self
130 }
131
132 #[must_use]
134 pub fn with_selector(mut self, selector: Option<&'a str>) -> Self {
135 self.selector = selector;
136 self
137 }
138
139 #[must_use]
141 pub fn with_visibility(mut self, policy: VisibilityPolicy) -> Self {
142 self.visibility = policy;
143 self
144 }
145}
146
147pub fn extract_text(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
149 if let Some(selector) = input.selector {
150 return extract_by_selector(input, selector);
151 }
152 let article = parse_article(input);
153
154 let mut out = String::new();
155 if !article.title.is_empty() {
156 writeln!(out, "# {}\n", article.title)?;
157 }
158 if let Some(ref byline) = article.byline {
159 writeln!(out, "*{}*\n", byline.replace('*', r"\*"))?;
160 }
161 if let Some(ref excerpt) = article.excerpt {
162 writeln!(out, "> {excerpt}\n")?;
163 }
164 write!(out, "{}", article.text_content)?;
165 Ok(clean_markdown(&out))
166}
167
168pub fn extract_json(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
170 if let Some(selector) = input.selector {
171 let text = extract_by_selector(input, selector)?;
172 let data = ArticleData {
173 title: String::new(),
174 content: String::new(),
175 text_content: text,
176 byline: None,
177 excerpt: None,
178 lang: None,
179 url: Some(input.url.to_string()),
180 };
181 return Ok(serde_json::to_string_pretty(&data)?);
182 }
183 let article = parse_article(input);
184 let data = ArticleData {
185 title: article.title,
186 content: article.content,
187 text_content: article.text_content,
188 byline: article.byline,
189 excerpt: article.excerpt,
190 lang: article.lang,
191 url: Some(input.url.to_string()),
192 };
193 Ok(serde_json::to_string_pretty(&data)?)
194}
195
/// Internal result of `parse_article`, shared by the text and JSON outputs.
struct ParsedArticle {
    /// Article or document title; may be empty.
    title: String,
    /// Article HTML from the readability pass; empty on the fallback path.
    content: String,
    /// Markdown for the article, or plain body text on the fallback path.
    text_content: String,
    /// Author line, when the readability pass reported one.
    byline: Option<String>,
    /// Short summary, when the readability pass reported one.
    excerpt: Option<String>,
    /// Document language, when the readability pass reported one.
    lang: Option<String>,
}
204
/// Reports whether `text` contains one of the phrases shown on the Next.js
/// client-side error screen.
///
/// The match is a case-sensitive substring search anywhere in `text`.
fn is_nextjs_error_page(text: &str) -> bool {
    const MARKERS: [&str; 2] = [
        "client-side exception has occurred",
        "Application error: a",
    ];
    MARKERS.iter().any(|marker| text.contains(*marker))
}
209
210fn parse_article(input: &ExtractInput<'_>) -> ParsedArticle {
211 let filtered = filter(input);
212
213 let doc = Document::from(filtered.as_ref());
214 if let Ok(mut readability) = Readability::with_document(doc, Some(input.url), None) {
215 if let Ok(article) = readability.parse() {
216 if !is_nextjs_error_page(&article.text_content) {
217 let converter = HtmlToMarkdown::builder().build();
218 let markdown = converter
219 .convert(&article.content)
220 .unwrap_or_else(|_| article.text_content.to_string());
221 return ParsedArticle {
222 title: article.title.clone(),
223 content: article.content.to_string(),
224 text_content: markdown,
225 byline: article.byline.clone(),
226 excerpt: article.excerpt.clone(),
227 lang: article.lang,
228 };
229 }
230 }
231 }
232
233 let doc = Document::from(filtered.as_ref());
236 doc.select("script, style, noscript").remove();
237 let title = doc.select("title").text().to_string();
238 let filtered_text = doc.select("body").text().to_string();
239 let body_text = if filtered_text.trim().is_empty() {
240 input.inner_text.filter(|s| !s.trim().is_empty()).map_or_else(
241 || {
242 tracing::warn!(r#"could not extract content; try --js "document.body.innerText" for JS-heavy sites"#);
243 String::new()
244 },
245 String::from,
246 )
247 } else {
248 filtered_text
249 };
250 ParsedArticle {
251 title,
252 content: String::new(),
253 text_content: body_text,
254 byline: None,
255 excerpt: None,
256 lang: None,
257 }
258}
259
260fn extract_by_selector(input: &ExtractInput<'_>, selector: &str) -> Result<String, ExtractError> {
261 let matcher = dom_query::Matcher::new(selector).map_err(|_| ExtractError::InvalidSelector)?;
262 let filtered = filter(input);
263 let doc = Document::from(filtered.as_ref());
264 let selected = doc.select_matcher(&matcher);
265 let fragment = selected.html();
266 if fragment.is_empty() {
267 return Ok(String::new());
268 }
269 let converter = HtmlToMarkdown::builder().skip_tags(vec!["script", "style"]).build();
270 let markdown = converter
271 .convert(&fragment)
272 .unwrap_or_else(|_| selected.text().to_string());
273 Ok(clean_markdown(&markdown))
274}
275
276fn filter<'a>(input: &'a ExtractInput<'a>) -> Cow<'a, str> {
277 let mut selectors: Vec<String> = Vec::new();
278
279 if let Some(lj) = input.layout_json
280 && let Ok(els) = serde_json::from_str::<Vec<LayoutElement>>(lj)
281 {
282 selectors.extend(layout::selectors_to_strip(&els));
283 }
284
285 let a11y_index = input.a11y.map(A11yIndex::new);
286
287 selectors.extend(visibility::selectors_to_strip(
288 input.visibility,
289 a11y_index.as_ref(),
290 input.visibility_json,
291 ));
292
293 let needs_attr_cleanup = input.visibility_json.is_some() || input.html.contains("data-vf-id=");
294 if selectors.is_empty() && !needs_attr_cleanup {
295 return Cow::Borrowed(input.html);
296 }
297
298 let doc = Document::from(input.html);
299 for sel in &selectors {
300 doc.select(sel).remove();
301 }
302 if needs_attr_cleanup {
303 doc.select("[data-vf-id]").remove_attr("data-vf-id");
304 }
305 Cow::Owned(doc.html().to_string())
306}
307
/// Normalizes blank lines in Markdown output.
///
/// Lines that are empty or whitespace-only are emitted as empty lines, and at
/// most two of them are kept in a row. Every kept line is terminated with
/// `\n`, so a missing trailing newline is added and `\r\n` endings become
/// `\n`.
fn clean_markdown(input: &str) -> String {
    /// Longest run of consecutive blank lines that is kept.
    const MAX_BLANK_RUN: usize = 2;

    let mut cleaned = String::with_capacity(input.len());
    let mut blank_run = 0usize;
    for line in input.lines() {
        if line.trim().is_empty() {
            blank_run += 1;
            if blank_run > MAX_BLANK_RUN {
                continue;
            }
        } else {
            blank_run = 0;
            cleaned.push_str(line);
        }
        cleaned.push('\n');
    }
    cleaned
}
326
#[cfg(test)]
mod tests {
    use super::*;

    // --- Next.js error-page detection ---

    #[test]
    fn is_nextjs_error_page_detects_nextjs() {
        let banner = "Application error: a client-side exception has occurred";
        assert!(is_nextjs_error_page(banner));
    }

    #[test]
    fn is_nextjs_error_page_ignores_normal_content() {
        let ordinary = [
            "This article discusses error handling in Rust.",
            "A long page about many topics that happens to mention errors somewhere in the middle of a paragraph.",
        ];
        for text in ordinary {
            assert!(!is_nextjs_error_page(text));
        }
    }

    // --- Markdown clean-up ---

    #[test]
    fn clean_markdown_collapses_blank_lines() {
        assert_eq!(
            clean_markdown("line1\n\n\n\n\nline2\n"),
            "line1\n\n\nline2\n"
        );
    }

    #[test]
    fn clean_markdown_preserves_single_blank() {
        let markdown = "a\n\nb\n";
        assert_eq!(clean_markdown(markdown), markdown);
    }

    #[test]
    fn clean_markdown_no_trailing_newline() {
        assert_eq!(clean_markdown("line1\nline2"), "line1\nline2\n");
    }

    // --- HTML filtering ---

    #[test]
    fn filter_off_policy_keeps_visible_content() {
        let page = ExtractInput::new("<html><body>hello</body></html>", "")
            .with_visibility(VisibilityPolicy::off());
        assert!(filter(&page).contains("hello"));
    }

    #[test]
    fn filter_strips_footer() {
        let page = ExtractInput::new(
            r#"<html><body><footer style="position:static">nav</footer><p>content</p></body></html>"#,
            "",
        )
        .with_layout_json(Some(
            r#"[{"tag":"FOOTER","role":null,"w":1280,"h":100,"position":"static"}]"#,
        ))
        .with_visibility(VisibilityPolicy::off());

        let filtered = filter(&page);
        assert!(!filtered.contains("<footer"));
        assert!(filtered.contains("content"));
    }

    #[test]
    fn filter_strips_visibility_flagged_element() {
        let page = ExtractInput::new(
            r#"<html><body><p data-vf-id="1">drop</p><p data-vf-id="2">keep</p></body></html>"#,
            "",
        )
        .with_visibility_json(Some(r#"[{"id":"1","flags":16}]"#))
        .with_visibility(VisibilityPolicy::moderate());

        let filtered = filter(&page);
        assert!(!filtered.contains("drop"));
        assert!(filtered.contains("keep"));
    }

    #[test]
    fn filter_removes_data_vf_id_from_output() {
        let page = ExtractInput::new(
            r#"<html><body><p data-vf-id="1">keep</p></body></html>"#,
            "",
        )
        .with_layout_json(Some("[]"))
        .with_visibility(VisibilityPolicy::off());

        assert!(!filter(&page).contains("data-vf-id"));
    }

    // --- Builder ---

    #[test]
    fn extract_input_builder() {
        let built = ExtractInput::new("<html></html>", "https://example.com")
            .with_layout_json(Some("[]"))
            .with_visibility_json(Some("[]"))
            .with_inner_text(Some("hello"))
            .with_selector(Some("article"))
            .with_visibility(VisibilityPolicy::strict());

        assert_eq!(built.layout_json, Some("[]"));
        assert_eq!(built.visibility_json, Some("[]"));
        assert_eq!(built.inner_text, Some("hello"));
        assert_eq!(built.selector, Some("article"));
    }
}