1use std::borrow::Cow;
4use std::fmt::Write;
5
6use dom_query::Document;
7use dom_smoothie::Readability;
8use htmd::HtmlToMarkdown;
9use serde::Serialize;
10
11use crate::layout::{self, LayoutElement};
12
13#[derive(Debug, thiserror::Error)]
15#[non_exhaustive]
16pub enum ExtractError {
17 #[error("markdown formatting failed")]
19 Fmt(#[from] std::fmt::Error),
20 #[error("JSON serialization failed")]
22 Json(#[from] serde_json::Error),
23}
24
25#[derive(Serialize)]
27#[non_exhaustive]
28pub struct ArticleData {
29 pub title: String,
31 pub content: String,
33 pub text_content: String,
35 #[serde(skip_serializing_if = "Option::is_none")]
37 pub byline: Option<String>,
38 #[serde(skip_serializing_if = "Option::is_none")]
40 pub excerpt: Option<String>,
41 #[serde(skip_serializing_if = "Option::is_none")]
43 pub lang: Option<String>,
44 #[serde(skip_serializing_if = "Option::is_none")]
46 pub url: Option<String>,
47}
48
49#[must_use]
53pub fn extract_pdf(data: &[u8]) -> String {
54 match pdf_extract::extract_text_from_mem(data) {
55 Ok(text) => text,
56 Err(e) => {
57 eprintln!("warning: PDF text extraction failed: {e}");
58 String::new()
59 }
60 }
61}
62
/// Borrowed inputs for a single extraction run.
///
/// Create one with [`ExtractInput::new`] and refine it with the `with_*`
/// builder methods. The struct only holds string slices, so it is cheap to
/// copy and can be reused after being passed through a builder chain.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct ExtractInput<'a> {
    /// Raw HTML of the page.
    pub html: &'a str,
    /// URL of the page; handed to Readability and echoed in JSON output.
    pub url: &'a str,
    /// Optional JSON array of layout elements; selectors derived from it are
    /// removed from the HTML before extraction.
    pub layout_json: Option<&'a str>,
    /// Optional page text used as the body when article extraction fails.
    pub inner_text: Option<&'a str>,
    /// Optional CSS selector; when set, only the selected fragment is
    /// extracted.
    pub selector: Option<&'a str>,
}

impl<'a> ExtractInput<'a> {
    /// Creates an input with only the mandatory `html` and `url` set; every
    /// optional field starts as `None`.
    #[must_use]
    pub fn new(html: &'a str, url: &'a str) -> Self {
        Self {
            html,
            url,
            layout_json: None,
            inner_text: None,
            selector: None,
        }
    }

    /// Returns the input with `layout_json` replaced.
    #[must_use]
    pub fn with_layout_json(self, layout_json: Option<&'a str>) -> Self {
        Self { layout_json, ..self }
    }

    /// Returns the input with `inner_text` replaced.
    #[must_use]
    pub fn with_inner_text(self, inner_text: Option<&'a str>) -> Self {
        Self { inner_text, ..self }
    }

    /// Returns the input with `selector` replaced.
    #[must_use]
    pub fn with_selector(self, selector: Option<&'a str>) -> Self {
        Self { selector, ..self }
    }
}
112
113pub fn extract_text(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
119 if let Some(selector) = input.selector {
120 return Ok(extract_by_selector(input.html, input.layout_json, selector));
121 }
122 let article = parse_article(input.html, input.url, input.layout_json, input.inner_text);
123
124 let mut out = String::new();
125 if !article.title.is_empty() {
126 writeln!(out, "# {}\n", article.title)?;
127 }
128 if let Some(ref byline) = article.byline {
129 writeln!(out, "*{}*\n", byline.replace('*', r"\*"))?;
130 }
131 if let Some(ref excerpt) = article.excerpt {
132 writeln!(out, "> {excerpt}\n")?;
133 }
134 write!(out, "{}", article.text_content)?;
135 Ok(clean_markdown(&out))
136}
137
138pub fn extract_json(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
144 if let Some(selector) = input.selector {
145 let text = extract_by_selector(input.html, input.layout_json, selector);
146 let data = ArticleData {
147 title: String::new(),
148 content: String::new(),
149 text_content: text,
150 byline: None,
151 excerpt: None,
152 lang: None,
153 url: Some(input.url.to_string()),
154 };
155 return Ok(serde_json::to_string_pretty(&data)?);
156 }
157 let article = parse_article(input.html, input.url, input.layout_json, input.inner_text);
158 let data = ArticleData {
159 title: article.title,
160 content: article.content,
161 text_content: article.text_content,
162 byline: article.byline,
163 excerpt: article.excerpt,
164 lang: article.lang,
165 url: Some(input.url.to_string()),
166 };
167 Ok(serde_json::to_string_pretty(&data)?)
168}
169
/// Internal result of `parse_article`, shared by the text and JSON renderers.
struct ParsedArticle {
    /// Article title, or the document `<title>` text on the fallback path.
    title: String,
    /// Article HTML from Readability; empty on the fallback path.
    content: String,
    /// Markdown body, or the caller-supplied page text on the fallback path.
    text_content: String,
    /// Author line, when Readability found one.
    byline: Option<String>,
    /// Short summary, when Readability found one.
    excerpt: Option<String>,
    /// Content language, when Readability found one.
    lang: Option<String>,
}
178
/// Reports whether `text` contains one of the marker phrases this module
/// treats as a Next.js client-side error page.
///
/// The check is a case-sensitive substring match anywhere in the text.
fn is_nextjs_error_page(text: &str) -> bool {
    const MARKERS: [&str; 2] = [
        "client-side exception has occurred",
        "Application error: a",
    ];

    let body = text.trim();
    MARKERS.iter().any(|marker| body.contains(*marker))
}
183
184fn parse_article(html: &str, url: &str, layout_json: Option<&str>, inner_text: Option<&str>) -> ParsedArticle {
185 let filtered = filter(html, layout_json);
186
187 let doc = Document::from(filtered.as_ref());
188 if let Ok(mut readability) = Readability::with_document(doc, Some(url), None) {
189 if let Ok(article) = readability.parse() {
190 if !is_nextjs_error_page(&article.text_content) {
191 let converter = HtmlToMarkdown::builder().build();
192 let markdown = converter
193 .convert(&article.content)
194 .unwrap_or_else(|_| article.text_content.to_string());
195 return ParsedArticle {
196 title: article.title.clone(),
197 content: article.content.to_string(),
198 text_content: markdown,
199 byline: article.byline.clone(),
200 excerpt: article.excerpt.clone(),
201 lang: article.lang,
202 };
203 }
204 }
205 }
206
207 let doc = Document::from(filtered.as_ref());
209 let title = doc.select("title").text().to_string();
210 let body_text = inner_text.filter(|s| !s.trim().is_empty()).map_or_else(
211 || {
212 eprintln!(
213 "warning: could not extract content. \
214 Try --js \"document.body.innerText\" for JS-heavy sites."
215 );
216 String::new()
217 },
218 String::from,
219 );
220 ParsedArticle {
221 title,
222 content: String::new(),
223 text_content: body_text,
224 byline: None,
225 excerpt: None,
226 lang: None,
227 }
228}
229
230fn extract_by_selector(html: &str, layout_json: Option<&str>, selector: &str) -> String {
231 let filtered = filter(html, layout_json);
232 let doc = Document::from(filtered.as_ref());
233 let selected = doc.select(selector);
234 let fragment = selected.html();
235 if fragment.is_empty() {
236 return String::new();
237 }
238 let converter = HtmlToMarkdown::builder().skip_tags(vec!["script", "style"]).build();
239 let markdown = converter
240 .convert(&fragment)
241 .unwrap_or_else(|_| selected.text().to_string());
242 clean_markdown(&markdown)
243}
244
245fn filter<'a>(html: &'a str, layout_json: Option<&str>) -> Cow<'a, str> {
246 layout_json
247 .and_then(|lj| serde_json::from_str::<Vec<LayoutElement>>(lj).ok())
248 .map_or(Cow::Borrowed(html), |els| {
249 let sels = layout::selectors_to_strip(&els);
250 if sels.is_empty() {
251 return Cow::Borrowed(html);
252 }
253 let doc = Document::from(html);
254 for sel in &sels {
255 doc.select(sel).remove();
256 }
257 Cow::Owned(doc.html().to_string())
258 })
259}
260
/// Normalizes blank lines in Markdown text.
///
/// Lines that are empty or whitespace-only count as blank and are emitted as
/// bare newlines; at most two consecutive blank lines are kept. Every
/// non-blank line is copied verbatim and terminated with `\n`, so non-empty
/// output always ends with a newline.
fn clean_markdown(input: &str) -> String {
    const MAX_BLANK_RUN: u8 = 2;

    let mut cleaned = String::with_capacity(input.len());
    let mut blank_run: u8 = 0;

    for line in input.lines() {
        let is_blank = line.trim().is_empty();
        blank_run = if is_blank { blank_run.saturating_add(1) } else { 0 };

        if is_blank && blank_run > MAX_BLANK_RUN {
            // Already emitted the maximum run of blank lines; drop the rest.
            continue;
        }
        if !is_blank {
            cleaned.push_str(line);
        }
        cleaned.push('\n');
    }

    cleaned
}
279
#[cfg(test)]
mod tests {
    use super::*;

    /// The full Next.js crash message is recognised as an error page.
    #[test]
    fn is_nextjs_error_page_detects_nextjs() {
        let message = "Application error: a client-side exception has occurred";
        assert!(is_nextjs_error_page(message));
    }

    /// Ordinary prose that merely talks about errors is not flagged.
    #[test]
    fn is_nextjs_error_page_ignores_normal_content() {
        let samples = [
            "This article discusses error handling in Rust.",
            "A long page about many topics that happens to mention errors somewhere in the middle of a paragraph.",
        ];
        for sample in samples.iter().copied() {
            assert!(!is_nextjs_error_page(sample));
        }
    }

    /// Runs of more than two blank lines are cut down to two.
    #[test]
    fn clean_markdown_collapses_blank_lines() {
        let cleaned = clean_markdown("line1\n\n\n\n\nline2\n");
        assert_eq!(cleaned, "line1\n\n\nline2\n");
    }

    /// A single blank line between paragraphs is left alone.
    #[test]
    fn clean_markdown_preserves_single_blank() {
        let input = "a\n\nb\n";
        let cleaned = clean_markdown(input);
        assert_eq!(cleaned, "a\n\nb\n");
    }

    /// Without layout data the HTML is returned unchanged.
    #[test]
    fn filter_without_layout_returns_original() {
        let html = "<html><body>hello</body></html>";
        let filtered = filter(html, None);
        assert_eq!(filtered.as_ref(), html);
    }

    /// A footer described in the layout data is removed; content stays.
    #[test]
    fn filter_strips_footer() {
        let html = r#"<html><body><footer style="position:static">nav</footer><p>content</p></body></html>"#;
        let layout = r#"[{"tag":"FOOTER","role":null,"w":1280,"h":100,"position":"static"}]"#;
        let filtered = filter(html, Some(layout));
        assert!(!filtered.contains("<footer"));
        assert!(filtered.contains("content"));
    }

    /// Each builder method stores the value it was given.
    #[test]
    fn extract_input_builder() {
        let base = ExtractInput::new("<html></html>", "https://example.com");
        let input = base
            .with_layout_json(Some("[]"))
            .with_inner_text(Some("hello"))
            .with_selector(Some("article"));
        assert_eq!(input.layout_json, Some("[]"));
        assert_eq!(input.inner_text, Some("hello"));
        assert_eq!(input.selector, Some("article"));
    }
}