1use std::borrow::Cow;
4use std::fmt::Write;
5
6use dom_query::Document;
7use dom_smoothie::Readability;
8use htmd::HtmlToMarkdown;
9use serde::Serialize;
10
11use crate::layout::{self, LayoutElement};
12
13#[derive(Debug, thiserror::Error)]
15#[non_exhaustive]
16pub enum ExtractError {
17 #[error("markdown formatting failed")]
19 Fmt(#[from] std::fmt::Error),
20 #[error("JSON serialization failed")]
22 Json(#[from] serde_json::Error),
23}
24
25#[derive(Serialize)]
27#[non_exhaustive]
28pub struct ArticleData {
29 pub title: String,
31 pub content: String,
33 pub text_content: String,
35 #[serde(skip_serializing_if = "Option::is_none")]
37 pub byline: Option<String>,
38 #[serde(skip_serializing_if = "Option::is_none")]
40 pub excerpt: Option<String>,
41 #[serde(skip_serializing_if = "Option::is_none")]
43 pub lang: Option<String>,
44 #[serde(skip_serializing_if = "Option::is_none")]
46 pub url: Option<String>,
47}
48
49#[must_use]
51pub fn extract_pdf(data: &[u8]) -> String {
52 match pdf_extract::extract_text_from_mem(data) {
53 Ok(text) => text,
54 Err(e) => {
55 tracing::warn!(error = %e, "PDF text extraction failed");
56 String::new()
57 }
58 }
59}
60
/// Borrowed inputs for a single extraction run.
///
/// Build one with [`ExtractInput::new`] and refine it with the `with_*`
/// methods; every field borrows from the caller for `'a`.
// `Debug` and `Clone` are derived so the input can be logged and reused.
// `Copy` is deliberately not derived: the type is `#[non_exhaustive]`, and a
// future non-`Copy` field would otherwise be a breaking change.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ExtractInput<'a> {
    /// Raw HTML of the page.
    pub html: &'a str,
    /// URL of the page; handed to the article parser and echoed in JSON output.
    pub url: &'a str,
    /// Optional JSON array of layout elements used to strip markup before
    /// extraction.
    pub layout_json: Option<&'a str>,
    /// Optional pre-rendered page text, used as the body when article parsing
    /// yields nothing usable.
    pub inner_text: Option<&'a str>,
    /// Optional CSS selector; when set, only the matching markup is extracted.
    pub selector: Option<&'a str>,
}

impl<'a> ExtractInput<'a> {
    /// Creates an input for `html` fetched from `url` with every optional
    /// field unset.
    #[must_use]
    pub fn new(html: &'a str, url: &'a str) -> Self {
        Self {
            html,
            url,
            layout_json: None,
            inner_text: None,
            selector: None,
        }
    }

    /// Sets (or clears, with `None`) the layout JSON.
    #[must_use]
    pub fn with_layout_json(mut self, layout_json: Option<&'a str>) -> Self {
        self.layout_json = layout_json;
        self
    }

    /// Sets (or clears, with `None`) the fallback page text.
    #[must_use]
    pub fn with_inner_text(mut self, inner_text: Option<&'a str>) -> Self {
        self.inner_text = inner_text;
        self
    }

    /// Sets (or clears, with `None`) the CSS selector.
    #[must_use]
    pub fn with_selector(mut self, selector: Option<&'a str>) -> Self {
        self.selector = selector;
        self
    }
}
110
111pub fn extract_text(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
116 if let Some(selector) = input.selector {
117 return Ok(extract_by_selector(input.html, input.layout_json, selector));
118 }
119 let article = parse_article(input.html, input.url, input.layout_json, input.inner_text);
120
121 let mut out = String::new();
122 if !article.title.is_empty() {
123 writeln!(out, "# {}\n", article.title)?;
124 }
125 if let Some(ref byline) = article.byline {
126 writeln!(out, "*{}*\n", byline.replace('*', r"\*"))?;
127 }
128 if let Some(ref excerpt) = article.excerpt {
129 writeln!(out, "> {excerpt}\n")?;
130 }
131 write!(out, "{}", article.text_content)?;
132 Ok(clean_markdown(&out))
133}
134
135pub fn extract_json(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
140 if let Some(selector) = input.selector {
141 let text = extract_by_selector(input.html, input.layout_json, selector);
142 let data = ArticleData {
143 title: String::new(),
144 content: String::new(),
145 text_content: text,
146 byline: None,
147 excerpt: None,
148 lang: None,
149 url: Some(input.url.to_string()),
150 };
151 return Ok(serde_json::to_string_pretty(&data)?);
152 }
153 let article = parse_article(input.html, input.url, input.layout_json, input.inner_text);
154 let data = ArticleData {
155 title: article.title,
156 content: article.content,
157 text_content: article.text_content,
158 byline: article.byline,
159 excerpt: article.excerpt,
160 lang: article.lang,
161 url: Some(input.url.to_string()),
162 };
163 Ok(serde_json::to_string_pretty(&data)?)
164}
165
/// Internal result of parsing a page, before it is rendered as Markdown or
/// JSON.
struct ParsedArticle {
    /// Article title; may be empty.
    title: String,
    /// Article content as HTML; empty when the fallback path was taken.
    content: String,
    /// Body text: Markdown from the article content, or the fallback text.
    text_content: String,
    /// Author line, if one was found.
    byline: Option<String>,
    /// Short summary, if one was found.
    excerpt: Option<String>,
    /// Content language, if one was found.
    lang: Option<String>,
}
174
/// Reports whether `text` contains one of the marker phrases of a Next.js
/// application-error page.
///
/// The match is a case-sensitive substring search anywhere in `text`.
fn is_nextjs_error_page(text: &str) -> bool {
    const MARKERS: [&str; 2] = [
        "client-side exception has occurred",
        "Application error: a",
    ];
    MARKERS.iter().any(|&marker| text.contains(marker))
}
179
180fn parse_article(html: &str, url: &str, layout_json: Option<&str>, inner_text: Option<&str>) -> ParsedArticle {
181 let filtered = filter(html, layout_json);
182
183 let doc = Document::from(filtered.as_ref());
184 if let Ok(mut readability) = Readability::with_document(doc, Some(url), None) {
185 if let Ok(article) = readability.parse() {
186 if !is_nextjs_error_page(&article.text_content) {
187 let converter = HtmlToMarkdown::builder().build();
188 let markdown = converter
189 .convert(&article.content)
190 .unwrap_or_else(|_| article.text_content.to_string());
191 return ParsedArticle {
192 title: article.title.clone(),
193 content: article.content.to_string(),
194 text_content: markdown,
195 byline: article.byline.clone(),
196 excerpt: article.excerpt.clone(),
197 lang: article.lang,
198 };
199 }
200 }
201 }
202
203 let doc = Document::from(filtered.as_ref());
205 let title = doc.select("title").text().to_string();
206 let body_text = inner_text.filter(|s| !s.trim().is_empty()).map_or_else(
207 || {
208 tracing::warn!(r#"could not extract content; try --js "document.body.innerText" for JS-heavy sites"#);
209 String::new()
210 },
211 String::from,
212 );
213 ParsedArticle {
214 title,
215 content: String::new(),
216 text_content: body_text,
217 byline: None,
218 excerpt: None,
219 lang: None,
220 }
221}
222
223fn extract_by_selector(html: &str, layout_json: Option<&str>, selector: &str) -> String {
224 let filtered = filter(html, layout_json);
225 let doc = Document::from(filtered.as_ref());
226 let selected = doc.select(selector);
227 let fragment = selected.html();
228 if fragment.is_empty() {
229 return String::new();
230 }
231 let converter = HtmlToMarkdown::builder().skip_tags(vec!["script", "style"]).build();
232 let markdown = converter
233 .convert(&fragment)
234 .unwrap_or_else(|_| selected.text().to_string());
235 clean_markdown(&markdown)
236}
237
238fn filter<'a>(html: &'a str, layout_json: Option<&str>) -> Cow<'a, str> {
239 layout_json
240 .and_then(|lj| serde_json::from_str::<Vec<LayoutElement>>(lj).ok())
241 .map_or(Cow::Borrowed(html), |els| {
242 let sels = layout::selectors_to_strip(&els);
243 if sels.is_empty() {
244 return Cow::Borrowed(html);
245 }
246 let doc = Document::from(html);
247 for sel in &sels {
248 doc.select(sel).remove();
249 }
250 Cow::Owned(doc.html().to_string())
251 })
252}
253
/// Normalizes blank lines in `input`.
///
/// Lines that are empty or whitespace-only are written as empty lines, and
/// any run of them is capped at two. Every other line is copied unchanged.
/// Each emitted line ends with `\n`, so non-empty output always ends with a
/// newline even if the input did not.
fn clean_markdown(input: &str) -> String {
    /// Longest run of consecutive blank lines kept in the output.
    const MAX_BLANK_RUN: usize = 2;

    let mut cleaned = String::with_capacity(input.len());
    let mut blank_run = 0usize;

    for line in input.lines() {
        if line.trim().is_empty() {
            if blank_run < MAX_BLANK_RUN {
                blank_run += 1;
                cleaned.push('\n');
            }
            continue;
        }
        blank_run = 0;
        cleaned.push_str(line);
        cleaned.push('\n');
    }

    cleaned
}
272
#[cfg(test)]
mod tests {
    use super::*;

    // --- is_nextjs_error_page -------------------------------------------

    #[test]
    fn is_nextjs_error_page_detects_nextjs() {
        let banner = "Application error: a client-side exception has occurred";
        assert!(is_nextjs_error_page(banner));
    }

    #[test]
    fn is_nextjs_error_page_ignores_normal_content() {
        let short = "This article discusses error handling in Rust.";
        let long =
            "A long page about many topics that happens to mention errors somewhere in the middle of a paragraph.";
        assert!(!is_nextjs_error_page(short));
        assert!(!is_nextjs_error_page(long));
    }

    // --- clean_markdown -------------------------------------------------

    #[test]
    fn clean_markdown_collapses_blank_lines() {
        // Four blank lines between the two text lines shrink to two.
        let cleaned = clean_markdown("line1\n\n\n\n\nline2\n");
        assert_eq!(cleaned, "line1\n\n\nline2\n");
    }

    #[test]
    fn clean_markdown_preserves_single_blank() {
        let text = "a\n\nb\n";
        assert_eq!(clean_markdown(text), "a\n\nb\n");
    }

    #[test]
    fn clean_markdown_no_trailing_newline() {
        // A final line without a terminator still gets one in the output.
        let cleaned = clean_markdown("line1\nline2");
        assert_eq!(cleaned, "line1\nline2\n");
    }

    // --- filter ---------------------------------------------------------

    #[test]
    fn filter_without_layout_returns_original() {
        let html = "<html><body>hello</body></html>";
        let filtered = filter(html, None);
        assert_eq!(filtered.as_ref(), html);
    }

    #[test]
    fn filter_strips_footer() {
        let html = r#"<html><body><footer style="position:static">nav</footer><p>content</p></body></html>"#;
        let layout = r#"[{"tag":"FOOTER","role":null,"w":1280,"h":100,"position":"static"}]"#;
        let filtered = filter(html, Some(layout));
        assert!(!filtered.contains("<footer"));
        assert!(filtered.contains("content"));
    }

    // --- ExtractInput ---------------------------------------------------

    #[test]
    fn extract_input_builder() {
        let built = ExtractInput::new("<html></html>", "https://example.com")
            .with_layout_json(Some("[]"))
            .with_inner_text(Some("hello"))
            .with_selector(Some("article"));
        assert_eq!(built.layout_json, Some("[]"));
        assert_eq!(built.inner_text, Some("hello"));
        assert_eq!(built.selector, Some("article"));
    }
}