1use std::borrow::Cow;
4use std::fmt::Write;
5
6use dom_query::Document;
7use dom_smoothie::Readability;
8use htmd::HtmlToMarkdown;
9use serde::Serialize;
10
11use crate::layout::{self, LayoutElement};
12
/// Errors that the extraction entry points in this module can return.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum ExtractError {
    /// Writing formatted markdown into the output buffer failed.
    #[error("markdown formatting failed")]
    Fmt(#[from] std::fmt::Error),
    /// Serializing the extracted article to JSON failed.
    #[error("JSON serialization failed")]
    Json(#[from] serde_json::Error),
    /// The caller-supplied CSS selector could not be compiled into a matcher.
    #[error("invalid CSS selector")]
    InvalidSelector,
}
27
28#[derive(Serialize)]
30#[non_exhaustive]
31pub struct ArticleData {
32 pub title: String,
34 pub content: String,
36 pub text_content: String,
38 #[serde(skip_serializing_if = "Option::is_none")]
40 pub byline: Option<String>,
41 #[serde(skip_serializing_if = "Option::is_none")]
43 pub excerpt: Option<String>,
44 #[serde(skip_serializing_if = "Option::is_none")]
46 pub lang: Option<String>,
47 #[serde(skip_serializing_if = "Option::is_none")]
49 pub url: Option<String>,
50}
51
52#[must_use]
54pub fn extract_pdf(data: &[u8]) -> String {
55 match pdf_extract::extract_text_from_mem(data) {
56 Ok(text) => text,
57 Err(e) => {
58 tracing::warn!(error = %e, "PDF text extraction failed");
59 String::new()
60 }
61 }
62}
63
/// Borrowed inputs for a single extraction run.
///
/// Every field is a borrowed string slice, so the value is cheap to copy.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct ExtractInput<'a> {
    /// Raw HTML of the page.
    pub html: &'a str,
    /// URL of the page; handed to the readability pass and echoed in JSON output.
    pub url: &'a str,
    /// Optional JSON array of layout elements used to strip page chrome
    /// before extraction.
    pub layout_json: Option<&'a str>,
    /// Optional pre-rendered body text, used as a fallback when no article
    /// can be parsed from the HTML.
    pub inner_text: Option<&'a str>,
    /// Optional CSS selector; when set, only the matching content is extracted.
    pub selector: Option<&'a str>,
}
78
79impl<'a> ExtractInput<'a> {
80 #[must_use]
82 pub fn new(html: &'a str, url: &'a str) -> Self {
83 Self {
84 html,
85 url,
86 layout_json: None,
87 inner_text: None,
88 selector: None,
89 }
90 }
91
92 #[must_use]
94 pub fn with_layout_json(mut self, layout_json: Option<&'a str>) -> Self {
95 self.layout_json = layout_json;
96 self
97 }
98
99 #[must_use]
101 pub fn with_inner_text(mut self, inner_text: Option<&'a str>) -> Self {
102 self.inner_text = inner_text;
103 self
104 }
105
106 #[must_use]
108 pub fn with_selector(mut self, selector: Option<&'a str>) -> Self {
109 self.selector = selector;
110 self
111 }
112}
113
114pub fn extract_text(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
119 if let Some(selector) = input.selector {
120 return extract_by_selector(input.html, input.layout_json, selector);
121 }
122 let article = parse_article(input.html, input.url, input.layout_json, input.inner_text);
123
124 let mut out = String::new();
125 if !article.title.is_empty() {
126 writeln!(out, "# {}\n", article.title)?;
127 }
128 if let Some(ref byline) = article.byline {
129 writeln!(out, "*{}*\n", byline.replace('*', r"\*"))?;
130 }
131 if let Some(ref excerpt) = article.excerpt {
132 writeln!(out, "> {excerpt}\n")?;
133 }
134 write!(out, "{}", article.text_content)?;
135 Ok(clean_markdown(&out))
136}
137
138pub fn extract_json(input: &ExtractInput<'_>) -> Result<String, ExtractError> {
143 if let Some(selector) = input.selector {
144 let text = extract_by_selector(input.html, input.layout_json, selector)?;
145 let data = ArticleData {
146 title: String::new(),
147 content: String::new(),
148 text_content: text,
149 byline: None,
150 excerpt: None,
151 lang: None,
152 url: Some(input.url.to_string()),
153 };
154 return Ok(serde_json::to_string_pretty(&data)?);
155 }
156 let article = parse_article(input.html, input.url, input.layout_json, input.inner_text);
157 let data = ArticleData {
158 title: article.title,
159 content: article.content,
160 text_content: article.text_content,
161 byline: article.byline,
162 excerpt: article.excerpt,
163 lang: article.lang,
164 url: Some(input.url.to_string()),
165 };
166 Ok(serde_json::to_string_pretty(&data)?)
167}
168
/// Internal result of `parse_article`, shared by the text and JSON outputs.
struct ParsedArticle {
    /// Article title, or the document `<title>` text in the fallback path.
    title: String,
    /// Article content from the readability pass; empty in the fallback path.
    content: String,
    /// Markdown rendition of the content, or the caller-supplied inner text
    /// in the fallback path.
    text_content: String,
    /// Author line, when the readability pass reported one.
    byline: Option<String>,
    /// Summary, when the readability pass reported one.
    excerpt: Option<String>,
    /// Language, when the readability pass reported one.
    lang: Option<String>,
}
177
/// Reports whether `text` contains one of the phrases used to recognize a
/// Next.js client-side crash page. Matching is case-sensitive.
fn is_nextjs_error_page(text: &str) -> bool {
    const MARKERS: [&str; 2] = [
        "client-side exception has occurred",
        "Application error: a",
    ];
    MARKERS.iter().any(|marker| text.contains(marker))
}
182
183fn parse_article(html: &str, url: &str, layout_json: Option<&str>, inner_text: Option<&str>) -> ParsedArticle {
184 let filtered = filter(html, layout_json);
185
186 let doc = Document::from(filtered.as_ref());
187 if let Ok(mut readability) = Readability::with_document(doc, Some(url), None) {
188 if let Ok(article) = readability.parse() {
189 if !is_nextjs_error_page(&article.text_content) {
190 let converter = HtmlToMarkdown::builder().build();
191 let markdown = converter
192 .convert(&article.content)
193 .unwrap_or_else(|_| article.text_content.to_string());
194 return ParsedArticle {
195 title: article.title.clone(),
196 content: article.content.to_string(),
197 text_content: markdown,
198 byline: article.byline.clone(),
199 excerpt: article.excerpt.clone(),
200 lang: article.lang,
201 };
202 }
203 }
204 }
205
206 let doc = Document::from(filtered.as_ref());
208 let title = doc.select("title").text().to_string();
209 let body_text = inner_text.filter(|s| !s.trim().is_empty()).map_or_else(
210 || {
211 tracing::warn!(r#"could not extract content; try --js "document.body.innerText" for JS-heavy sites"#);
212 String::new()
213 },
214 String::from,
215 );
216 ParsedArticle {
217 title,
218 content: String::new(),
219 text_content: body_text,
220 byline: None,
221 excerpt: None,
222 lang: None,
223 }
224}
225
226fn extract_by_selector(html: &str, layout_json: Option<&str>, selector: &str) -> Result<String, ExtractError> {
227 let matcher = dom_query::Matcher::new(selector).map_err(|_| ExtractError::InvalidSelector)?;
228 let filtered = filter(html, layout_json);
229 let doc = Document::from(filtered.as_ref());
230 let selected = doc.select_matcher(&matcher);
231 let fragment = selected.html();
232 if fragment.is_empty() {
233 return Ok(String::new());
234 }
235 let converter = HtmlToMarkdown::builder().skip_tags(vec!["script", "style"]).build();
236 let markdown = converter
237 .convert(&fragment)
238 .unwrap_or_else(|_| selected.text().to_string());
239 Ok(clean_markdown(&markdown))
240}
241
242fn filter<'a>(html: &'a str, layout_json: Option<&str>) -> Cow<'a, str> {
243 layout_json
244 .and_then(|lj| serde_json::from_str::<Vec<LayoutElement>>(lj).ok())
245 .map_or(Cow::Borrowed(html), |els| {
246 let sels = layout::selectors_to_strip(&els);
247 if sels.is_empty() {
248 return Cow::Borrowed(html);
249 }
250 let doc = Document::from(html);
251 for sel in &sels {
252 doc.select(sel).remove();
253 }
254 Cow::Owned(doc.html().to_string())
255 })
256}
257
/// Normalizes blank lines in markdown text.
///
/// Lines that are empty or whitespace-only count as blank and are emitted as
/// bare newlines. At most two consecutive blank lines are kept; further ones
/// in the same run are dropped. Every emitted line ends with `\n`, so input
/// without a trailing newline gains one.
fn clean_markdown(input: &str) -> String {
    const MAX_BLANK_RUN: usize = 2;

    let mut cleaned = String::with_capacity(input.len());
    let mut blank_run = 0usize;
    for line in input.lines() {
        let is_blank = line.trim().is_empty();
        blank_run = if is_blank { blank_run.saturating_add(1) } else { 0 };
        if is_blank && blank_run > MAX_BLANK_RUN {
            continue;
        }
        if !is_blank {
            cleaned.push_str(line);
        }
        cleaned.push('\n');
    }
    cleaned
}
276
#[cfg(test)]
mod tests {
    use super::*;

    /// The full Next.js crash banner is recognized as an error page.
    #[test]
    fn is_nextjs_error_page_detects_nextjs() {
        let banner = "Application error: a client-side exception has occurred";
        assert!(is_nextjs_error_page(banner));
    }

    /// Ordinary prose that merely mentions errors is not flagged.
    #[test]
    fn is_nextjs_error_page_ignores_normal_content() {
        let ordinary = [
            "This article discusses error handling in Rust.",
            "A long page about many topics that happens to mention errors somewhere in the middle of a paragraph.",
        ];
        for text in &ordinary {
            assert!(!is_nextjs_error_page(text));
        }
    }

    /// Runs of more than two blank lines are cut down to two.
    #[test]
    fn clean_markdown_collapses_blank_lines() {
        assert_eq!(clean_markdown("line1\n\n\n\n\nline2\n"), "line1\n\n\nline2\n");
    }

    /// A single blank line between paragraphs is left untouched.
    #[test]
    fn clean_markdown_preserves_single_blank() {
        let markdown = "a\n\nb\n";
        let cleaned = clean_markdown(markdown);
        assert_eq!(cleaned, "a\n\nb\n");
    }

    /// Without layout information the HTML comes back unchanged.
    #[test]
    fn filter_without_layout_returns_original() {
        let html = "<html><body>hello</body></html>";
        assert_eq!(filter(html, None).as_ref(), html);
    }

    /// A footer described by the layout is removed while content is kept.
    #[test]
    fn filter_strips_footer() {
        let layout = r#"[{"tag":"FOOTER","role":null,"w":1280,"h":100,"position":"static"}]"#;
        let html = r#"<html><body><footer style="position:static">nav</footer><p>content</p></body></html>"#;
        let stripped = filter(html, Some(layout));
        assert!(!stripped.contains("<footer"));
        assert!(stripped.contains("content"));
    }

    /// Each builder method stores the value it was given.
    #[test]
    fn extract_input_builder() {
        let base = ExtractInput::new("<html></html>", "https://example.com");
        let built = base
            .with_layout_json(Some("[]"))
            .with_inner_text(Some("hello"))
            .with_selector(Some("article"));
        assert_eq!(built.layout_json, Some("[]"));
        assert_eq!(built.inner_text, Some("hello"));
        assert_eq!(built.selector, Some("article"));
    }

    /// Input lacking a final newline gains one in the output.
    #[test]
    fn clean_markdown_no_trailing_newline() {
        assert_eq!(clean_markdown("line1\nline2"), "line1\nline2\n");
    }
}