1use reqwest::Client;
7use url::Url;
8
9use super::types::{ContentFormat, PageContent};
10
/// Desktop-Chrome-style `User-Agent` string sent with page requests.
pub(crate) const USER_AGENT: &str = concat!(
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ",
    "AppleWebKit/537.36 (KHTML, like Gecko) ",
    "Chrome/131.0.0.0 Safari/537.36",
);

/// `Accept` header value: markdown is preferred, followed by plain text, HTML,
/// XHTML, and finally any other media type (descending `q` weights).
pub(crate) const ACCEPT_HEADER: &str = concat!(
    "text/markdown,",
    "text/plain;q=0.9,",
    "text/html;q=0.8,",
    "application/xhtml+xml;q=0.7,",
    "*/*;q=0.5",
);
16
17pub async fn fetch_and_extract(client: &Client, url: &str) -> Result<PageContent, ReadError> {
19 let parsed_url = Url::parse(url).map_err(|e| ReadError::InvalidUrl(e.to_string()))?;
20
21 if super::youtube::is_youtube_url(&parsed_url) {
22 return super::youtube::fetch_and_extract(client, url)
23 .await
24 .map_err(|err| ReadError::Youtube(err.to_string()));
25 }
26
27 let requested_url = url.to_string();
28
29 let response = client
30 .get(url)
31 .header("User-Agent", USER_AGENT)
32 .header("Accept", ACCEPT_HEADER)
33 .header("Accept-Language", "en-US,en;q=0.9")
34 .send()
35 .await
36 .map_err(|e| ReadError::Fetch(e.to_string()))?;
37
38 let status_code = response.status().as_u16();
39 if !response.status().is_success() {
40 return Err(ReadError::HttpStatus(
41 status_code,
42 response
43 .status()
44 .canonical_reason()
45 .unwrap_or("Unknown")
46 .to_string(),
47 ));
48 }
49
50 let content_type = response
51 .headers()
52 .get("content-type")
53 .and_then(|v| v.to_str().ok())
54 .unwrap_or("")
55 .to_string();
56
57 let format_received = detect_content_format(&content_type);
58
59 let is_text = content_type.is_empty()
61 || content_type.contains("text/")
62 || content_type.contains("application/json")
63 || content_type.contains("application/xml")
64 || content_type.contains("application/xhtml")
65 || content_type.contains("application/javascript")
66 || content_type.contains("+xml")
67 || content_type.contains("+json");
68 if !is_text {
69 return Err(ReadError::NotHtml(content_type));
70 }
71
72 let final_url = response.url().to_string();
73 let was_redirected = final_url != requested_url;
74 let html = response
75 .text()
76 .await
77 .map_err(|e| ReadError::Fetch(e.to_string()))?;
78 let raw_body_bytes = html.len();
79
80 if html.len() < 100 {
81 return Err(ReadError::InsufficientContent);
82 }
83
84 let meta = ResponseMeta {
86 requested_url,
87 status_code,
88 content_type: if content_type.is_empty() {
89 None
90 } else {
91 Some(content_type.clone())
92 },
93 format_received,
94 was_redirected,
95 raw_body_bytes,
96 };
97
98 match format_received {
99 ContentFormat::Markdown | ContentFormat::PlainText => {
100 let cleaned = clean_text(&html);
101 let mut page = PageContent {
102 title: None,
103 content_length: cleaned.len(),
104 text: cleaned,
105 url: final_url,
106 requested_url: meta.requested_url,
107 status_code: meta.status_code,
108 content_type: meta.content_type,
109 format_received: meta.format_received,
110 was_redirected: meta.was_redirected,
111 raw_body_bytes: meta.raw_body_bytes,
112 diagnostics: Vec::new(),
113 };
114 page.diagnostics = diagnose(&page, "");
115 Ok(page)
116 }
117 ContentFormat::Html => {
118 let mut page = extract_readable(&html, &final_url)?;
119 page.requested_url = meta.requested_url;
120 page.status_code = meta.status_code;
121 page.content_type = meta.content_type;
122 page.format_received = meta.format_received;
123 page.was_redirected = meta.was_redirected;
124 page.raw_body_bytes = meta.raw_body_bytes;
125 page.diagnostics = diagnose(&page, &html);
126 Ok(page)
127 }
128 }
129}
130
/// Response-level facts gathered by `fetch_and_extract` before the body is
/// converted, then copied onto the resulting `PageContent`.
struct ResponseMeta {
    /// URL string as supplied by the caller.
    requested_url: String,
    /// Numeric HTTP status of the response.
    status_code: u16,
    /// `Content-Type` header value; `None` when no usable value was received.
    content_type: Option<String>,
    /// Body format derived from the `Content-Type` header.
    format_received: ContentFormat,
    /// Whether the final response URL differs from the requested URL.
    was_redirected: bool,
    /// Length in bytes of the response body text.
    raw_body_bytes: usize,
}
140
141fn extract_readable(html: &str, url: &str) -> Result<PageContent, ReadError> {
143 use readability_rust::Readability;
144
145 let mut parser = Readability::new_with_base_uri(html, url, None)
146 .map_err(|e| ReadError::Parse(format!("{e}")))?;
147
148 let article = parser.parse().ok_or(ReadError::NoContent)?;
149
150 let title = article.title.clone();
151
152 let text = article
155 .text_content
156 .as_deref()
157 .or(article.content.as_deref())
158 .unwrap_or("")
159 .to_string();
160
161 if text.len() < 50 {
162 return Err(ReadError::InsufficientContent);
163 }
164
165 Ok(PageContent {
166 content_length: text.len(),
167 title,
168 text: clean_text(&text),
169 url: url.to_string(),
170 requested_url: url.to_string(),
172 status_code: 200,
173 content_type: None,
174 format_received: ContentFormat::Html,
175 was_redirected: false,
176 raw_body_bytes: 0,
177 diagnostics: Vec::new(),
178 })
179}
180
181pub fn diagnose(page: &PageContent, raw_html: &str) -> Vec<String> {
182 let mut warnings = Vec::new();
183 let text_lower = page.text.to_lowercase();
184 let html_lower = raw_html.to_lowercase();
185
186 let short_text = page.content_length < 500;
187 let has_loading_indicator = ["loading...", "loading documentation"]
188 .iter()
189 .any(|needle| text_lower.contains(needle));
190 let has_noscript = html_lower.contains("<noscript");
191 let nav_link_count = html_lower.matches("<nav").count()
192 + html_lower.matches("<a ").count()
193 + html_lower.matches("<a>").count();
194 let has_nav_shell_pattern = short_text && nav_link_count >= 8;
195 if short_text && (has_loading_indicator || has_noscript || has_nav_shell_pattern) {
196 warnings.push(
197 "Page appears to be a client-rendered shell. Content may require JavaScript."
198 .to_string(),
199 );
200 }
201
202 let very_short_text = page.content_length < 300;
203 let has_soft_404_indicator = [
204 "page not found",
205 "can't find that page",
206 "404",
207 "doesn't exist",
208 "has been moved",
209 ]
210 .iter()
211 .any(|needle| text_lower.contains(needle));
212 if page.status_code == 200 && very_short_text && has_soft_404_indicator {
213 warnings
214 .push("Page appears to be a soft 404 (HTTP 200 but error page content).".to_string());
215 }
216
217 if page.raw_body_bytes > 20 * 1024 && page.content_length < 2 * 1024 {
218 warnings.push(format!(
219 "Large page ({} bytes) but only {} chars extracted. Content may be incomplete.",
220 page.raw_body_bytes, page.content_length
221 ));
222 }
223
224 if page.raw_body_bytes > 100 * 1024
225 && (page.content_length as f64) < (page.raw_body_bytes as f64 * 0.1)
226 {
227 let pct = ((page.content_length as f64 / page.raw_body_bytes as f64) * 100.0).round();
228 warnings.push(format!(
229 "Significant content may have been lost during extraction ({}% of response retained).",
230 pct as usize
231 ));
232 }
233
234 warnings
235}
236
/// Normalises whitespace in `text`.
///
/// Every line is trimmed, runs of blank lines are capped at two, and leading and
/// trailing whitespace is stripped from the final result. Lines are joined with `\n`.
fn clean_text(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut consecutive_blanks = 0u32;

    for line in text.lines().map(str::trim) {
        if !line.is_empty() {
            consecutive_blanks = 0;
            out.push_str(line);
        } else {
            consecutive_blanks += 1;
            if consecutive_blanks > 2 {
                // Third or later blank line in a row: drop it.
                continue;
            }
        }
        out.push('\n');
    }

    out.trim().to_string()
}
258
259fn detect_content_format(content_type: &str) -> ContentFormat {
260 let content_type = content_type.to_ascii_lowercase();
261
262 if content_type.contains("text/markdown") || content_type.contains("text/x-markdown") {
263 ContentFormat::Markdown
264 } else if content_type.contains("text/html") || content_type.contains("application/xhtml+xml") {
265 ContentFormat::Html
266 } else {
267 ContentFormat::PlainText
268 }
269}
270
/// Failure modes of fetching a page and extracting readable content from it.
///
/// Implements [`std::error::Error`], so it can be boxed as `Box<dyn Error>` or
/// converted by `?` into error types that accept any standard error.
#[derive(Debug)]
pub enum ReadError {
    /// The supplied URL could not be parsed; carries the parser's message.
    InvalidUrl(String),
    /// Sending the request or reading the response body failed.
    Fetch(String),
    /// The server answered with a non-success status (code, canonical reason).
    HttpStatus(u16, String),
    /// The response `Content-Type` is not a textual type; carries the header value.
    NotHtml(String),
    /// The readability parser could not be set up for the document.
    Parse(String),
    /// The readability parser produced no article.
    NoContent,
    /// The response body or the extracted text was too short to be useful.
    InsufficientContent,
    /// The YouTube-specific extractor failed; carries its message.
    Youtube(String),
}

impl std::fmt::Display for ReadError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidUrl(msg) => write!(f, "Invalid URL: {msg}"),
            Self::Fetch(msg) => write!(f, "Fetch failed: {msg}"),
            Self::HttpStatus(code, reason) => write!(f, "HTTP {code} {reason}"),
            Self::NotHtml(ct) => write!(f, "Not an HTML page (content-type: {ct})"),
            Self::Parse(msg) => write!(f, "Parse error: {msg}"),
            Self::NoContent => f.write_str("Could not extract readable content from page"),
            Self::InsufficientContent => f.write_str("Page returned insufficient content"),
            Self::Youtube(msg) => write!(f, "YouTube extraction failed: {msg}"),
        }
    }
}

// Every variant carries only plain data (no wrapped source error), so the default
// trait methods are sufficient.
impl std::error::Error for ReadError {}
297
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a non-redirected HTTP 200 HTML `PageContent` whose `content_length`
    /// matches `text`, so each `diagnose` test only spells out what actually varies.
    fn html_page(title: &str, url: &str, text: &str, raw_body_bytes: usize) -> PageContent {
        PageContent {
            title: Some(title.to_string()),
            text: text.to_string(),
            url: url.to_string(),
            content_length: text.len(),
            requested_url: url.to_string(),
            status_code: 200,
            content_type: Some("text/html".to_string()),
            format_received: ContentFormat::Html,
            was_redirected: false,
            raw_body_bytes,
            diagnostics: Vec::new(),
        }
    }

    #[test]
    fn accept_header_prefers_markdown() {
        assert_eq!(
            ACCEPT_HEADER,
            "text/markdown,text/plain;q=0.9,text/html;q=0.8,application/xhtml+xml;q=0.7,*/*;q=0.5"
        );
    }

    #[test]
    fn detect_content_format_treats_markdown_as_markdown() {
        assert_eq!(
            detect_content_format("text/markdown; charset=utf-8"),
            ContentFormat::Markdown
        );
    }

    #[test]
    fn detect_content_format_treats_html_as_html() {
        assert_eq!(
            detect_content_format("text/html; charset=utf-8"),
            ContentFormat::Html
        );
        // Header values are matched case-insensitively.
        assert_eq!(
            detect_content_format("Application/XHTML+XML"),
            ContentFormat::Html
        );
    }

    #[test]
    fn detect_content_format_treats_plain_text_as_plain_text() {
        assert_eq!(
            detect_content_format("text/plain; charset=utf-8"),
            ContentFormat::PlainText
        );
        assert_eq!(
            detect_content_format("application/json"),
            ContentFormat::PlainText
        );
    }

    #[test]
    fn markdown_and_plain_text_skip_readability_cleaning_path() {
        let markdown = "# Title\n\n\nParagraph";
        let cleaned_markdown = clean_text(markdown);
        assert_eq!(cleaned_markdown, "# Title\n\n\nParagraph");
        assert_eq!(
            detect_content_format("text/markdown"),
            ContentFormat::Markdown
        );

        let plain = "  hello  \n\n\nworld  ";
        let cleaned_plain = clean_text(plain);
        assert_eq!(cleaned_plain, "hello\n\n\nworld");
        assert_eq!(
            detect_content_format("text/plain"),
            ContentFormat::PlainText
        );
    }

    #[test]
    fn clean_text_collapses_blank_lines() {
        let input = "Hello\n\n\n\n\nWorld\n\nFoo";
        let cleaned = clean_text(input);
        assert!(cleaned.starts_with("Hello\n"));
        assert!(cleaned.contains("World"));
        assert!(!cleaned.contains("\n\n\n\n"));
        // Pin the exact result: four blank lines collapse to two, shorter runs survive.
        assert_eq!(cleaned, "Hello\n\n\nWorld\n\nFoo");
    }

    #[test]
    fn clean_text_trims_lines() {
        let input = "  hello  \n  world  ";
        let cleaned = clean_text(input);
        assert_eq!(cleaned, "hello\nworld");
    }

    #[test]
    fn extract_readable_from_html() {
        let html = r#"
            <html>
            <head><title>Test Article</title></head>
            <body>
                <nav>Skip this navigation</nav>
                <article>
                    <h1>Test Article Title</h1>
                    <p>This is the main content of the article. It has enough text to be
                    considered readable content by the readability algorithm. We need to make
                    sure there is sufficient content here for the extraction to work properly.
                    The readability algorithm looks for substantial blocks of text content.</p>
                    <p>Here is another paragraph with more substantial content to ensure that
                    the extraction algorithm has enough material to work with. This paragraph
                    adds additional context and information that would be typical in a real
                    web article about some topic.</p>
                </article>
                <footer>Copyright 2024</footer>
            </body>
            </html>"#;

        let result = extract_readable(html, "https://example.com/test");
        match result {
            Ok(page) => {
                assert!(page.text.contains("main content"));
                assert!(!page.text.contains("Skip this navigation"));
                assert_eq!(page.url, "https://example.com/test");
                assert_eq!(page.requested_url, "https://example.com/test");
                assert_eq!(page.status_code, 200);
                assert!(!page.was_redirected);
                assert_eq!(page.raw_body_bytes, 0);
                assert!(page.content_type.is_none());
                assert!(page.diagnostics.is_empty());
            }
            Err(ReadError::InsufficientContent) | Err(ReadError::NoContent) => {
                // NOTE(review): presumably tolerated because readability extraction is
                // heuristic and may decline a small fixture - confirm this is intended.
            }
            Err(e) => panic!("Unexpected error: {e}"),
        }
    }

    #[test]
    fn response_metadata_can_be_applied_after_extraction() {
        let html = r#"
            <html>
            <head><title>Redirected Article</title></head>
            <body>
                <article>
                    <p>This article has enough body text to survive readability extraction and
                    prove that metadata can be preserved when the requested URL differs from
                    the final URL after redirects.</p>
                    <p>Additional text keeps the extractor happy and representative of a real page.</p>
                </article>
            </body>
            </html>"#;

        let mut page = extract_readable(html, "https://example.com/final").unwrap();
        page.requested_url = "https://example.com/start".to_string();
        page.status_code = 200;
        page.content_type = Some("text/html; charset=utf-8".to_string());
        page.format_received = ContentFormat::Html;
        page.was_redirected = true;
        page.raw_body_bytes = html.len();

        assert_eq!(page.url, "https://example.com/final");
        assert_eq!(page.requested_url, "https://example.com/start");
        assert_eq!(page.status_code, 200);
        assert_eq!(
            page.content_type.as_deref(),
            Some("text/html; charset=utf-8")
        );
        assert!(page.was_redirected);
        assert_eq!(page.raw_body_bytes, html.len());
    }

    #[test]
    fn diagnose_spa_shell_from_loading_text() {
        let page = html_page(
            "Docs",
            "https://example.com/docs",
            "Loading documentation...",
            2_000,
        );

        let warnings = diagnose(
            &page,
            "<html><body><noscript>Enable JS</noscript></body></html>",
        );
        assert!(warnings.iter().any(|w| w.contains("client-rendered shell")));
    }

    #[test]
    fn diagnose_soft_404_with_http_200() {
        let page = html_page(
            "Missing",
            "https://example.com/missing",
            "Page not found. The page has been moved.",
            1_500,
        );

        let warnings = diagnose(&page, "<html><body>404</body></html>");
        assert!(warnings.iter().any(|w| w.contains("soft 404")));
    }

    #[test]
    fn diagnose_does_not_flag_normal_page() {
        let text = "This is a normal documentation page with enough content to explain installation, configuration, and usage in detail. It includes several paragraphs of useful information for readers and should not be treated as a shell or error page. Extra explanation here keeps it comfortably above the short-content heuristics and avoids false positives.";
        let page = html_page("Guide", "https://example.com/guide", text, 8_000);

        let warnings = diagnose(
            &page,
            "<html><body><article>real docs</article></body></html>",
        );
        assert!(warnings.is_empty());
    }

    #[test]
    fn diagnose_low_extraction_ratio_warning() {
        let page = html_page(
            "Big Page",
            "https://example.com/big",
            "A short extracted summary.",
            150_000,
        );

        let warnings = diagnose(&page, "<html></html>");
        assert!(warnings.iter().any(|w| w.contains("Large page")));
        assert!(warnings
            .iter()
            .any(|w| w.contains("Significant content may have been lost")));
    }
}