1use reqwest::Client;
7use url::Url;
8
9use super::types::{ContentFormat, ExtractionQuality, PageContent};
10
/// `User-Agent` value sent with every page fetch; it mimics a desktop Chrome
/// browser on macOS.
pub(crate) const USER_AGENT: &str = concat!(
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ",
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
);

/// `Accept` value sent with every page fetch. Markdown is ranked first, then
/// plain text, then HTML/XHTML, with everything else as a last resort.
pub(crate) const ACCEPT_HEADER: &str = concat!(
    "text/markdown,",
    "text/plain;q=0.9,",
    "text/html;q=0.8,",
    "application/xhtml+xml;q=0.7,",
    "*/*;q=0.5",
);

/// Upper bound, in bytes, on a response body this module accepts (5 MiB).
const MAX_RESPONSE_BYTES: u64 = 5 * 1024 * 1024;
17
18pub async fn fetch_and_extract(client: &Client, url: &str) -> Result<PageContent, ReadError> {
20 let parsed_url = validate_url(url)?;
21
22 if super::youtube::is_youtube_url(&parsed_url) {
23 return super::youtube::fetch_and_extract(client, url)
24 .await
25 .map_err(|err| ReadError::Youtube(err.to_string()));
26 }
27
28 let requested_url = url.to_string();
29
30 let response = client
31 .get(url)
32 .header("User-Agent", USER_AGENT)
33 .header("Accept", ACCEPT_HEADER)
34 .header("Accept-Language", "en-US,en;q=0.9")
35 .send()
36 .await
37 .map_err(|e| ReadError::Fetch(e.to_string()))?;
38
39 let status_code = response.status().as_u16();
40 if !response.status().is_success() {
41 return Err(ReadError::HttpStatus(
42 status_code,
43 response
44 .status()
45 .canonical_reason()
46 .unwrap_or("Unknown")
47 .to_string(),
48 ));
49 }
50
51 let content_type = response
52 .headers()
53 .get("content-type")
54 .and_then(|v| v.to_str().ok())
55 .unwrap_or("")
56 .to_string();
57
58 let format_received = detect_content_format(&content_type);
59
60 let is_text = content_type.is_empty()
62 || content_type.contains("text/")
63 || content_type.contains("application/json")
64 || content_type.contains("application/xml")
65 || content_type.contains("application/xhtml")
66 || content_type.contains("application/javascript")
67 || content_type.contains("+xml")
68 || content_type.contains("+json");
69 if !is_text {
70 return Err(ReadError::NotHtml(content_type));
71 }
72
73 let final_url = response.url().to_string();
74 validate_url(&final_url)?;
75 let was_redirected = final_url != requested_url;
76 if let Some(content_length) = response.content_length() {
77 if content_length > MAX_RESPONSE_BYTES {
78 return Err(ReadError::ResponseTooLarge(content_length));
79 }
80 }
81 let bytes = response
82 .bytes()
83 .await
84 .map_err(|e| ReadError::Fetch(e.to_string()))?;
85 if bytes.len() as u64 > MAX_RESPONSE_BYTES {
86 return Err(ReadError::ResponseTooLarge(bytes.len() as u64));
87 }
88 let raw_body_bytes = bytes.len();
89 let html = String::from_utf8_lossy(&bytes).into_owned();
90
91 if html.len() < 100 {
92 return Err(ReadError::InsufficientContent);
93 }
94
95 let meta = ResponseMeta {
97 requested_url,
98 status_code,
99 content_type: if content_type.is_empty() {
100 None
101 } else {
102 Some(content_type.clone())
103 },
104 format_received,
105 was_redirected,
106 raw_body_bytes,
107 };
108
109 match format_received {
110 ContentFormat::Markdown | ContentFormat::PlainText => {
111 let cleaned = clean_text(&html);
112 let mut page = PageContent {
113 title: None,
114 content_length: cleaned.len(),
115 text: cleaned,
116 url: final_url,
117 requested_url: meta.requested_url,
118 status_code: meta.status_code,
119 content_type: meta.content_type,
120 format_received: meta.format_received,
121 was_redirected: meta.was_redirected,
122 raw_body_bytes: meta.raw_body_bytes,
123 diagnostics: Vec::new(),
124 quality: ExtractionQuality::Good,
125 quality_reasons: Vec::new(),
126 };
127 page.diagnostics = diagnose(&page, "");
128 apply_quality(&mut page);
129 Ok(page)
130 }
131 ContentFormat::Html => {
132 let mut page = extract_readable(&html, &final_url)?;
133 page.requested_url = meta.requested_url;
134 page.status_code = meta.status_code;
135 page.content_type = meta.content_type;
136 page.format_received = meta.format_received;
137 page.was_redirected = meta.was_redirected;
138 page.raw_body_bytes = meta.raw_body_bytes;
139 page.diagnostics = diagnose(&page, &html);
140 apply_quality(&mut page);
141 Ok(page)
142 }
143 }
144}
145
146struct ResponseMeta {
148 requested_url: String,
149 status_code: u16,
150 content_type: Option<String>,
151 format_received: ContentFormat,
152 was_redirected: bool,
153 raw_body_bytes: usize,
154}
155
156fn extract_readable(html: &str, url: &str) -> Result<PageContent, ReadError> {
158 use readability_rust::Readability;
159
160 let mut parser = Readability::new_with_base_uri(html, url, None)
161 .map_err(|e| ReadError::Parse(format!("{e}")))?;
162
163 let article = parser.parse().ok_or(ReadError::NoContent)?;
164
165 let title = article.title.clone();
166
167 let text = article
170 .text_content
171 .as_deref()
172 .or(article.content.as_deref())
173 .unwrap_or("")
174 .to_string();
175
176 if text.len() < 50 {
177 return Err(ReadError::InsufficientContent);
178 }
179
180 Ok(PageContent {
181 content_length: text.len(),
182 title,
183 text: clean_text(&text),
184 url: url.to_string(),
185 requested_url: url.to_string(),
187 status_code: 200,
188 content_type: None,
189 format_received: ContentFormat::Html,
190 was_redirected: false,
191 raw_body_bytes: 0,
192 diagnostics: Vec::new(),
193 quality: ExtractionQuality::Good,
194 quality_reasons: Vec::new(),
195 })
196}
197
198fn validate_url(url: &str) -> Result<Url, ReadError> {
199 let parsed = Url::parse(url).map_err(|e| ReadError::InvalidUrl(e.to_string()))?;
200 match parsed.scheme() {
201 "http" | "https" => {}
202 scheme => {
203 return Err(ReadError::UnsafeUrl(format!(
204 "unsupported URL scheme: {scheme}"
205 )));
206 }
207 }
208
209 let Some(host) = parsed.host_str() else {
210 return Err(ReadError::UnsafeUrl("missing URL host".to_string()));
211 };
212 let host = host.trim_end_matches('.').to_ascii_lowercase();
213 if matches!(host.as_str(), "localhost" | "metadata.google.internal") {
214 return Err(ReadError::UnsafeUrl(format!("blocked host: {host}")));
215 }
216 if host.ends_with(".localhost") || host.ends_with(".local") {
217 return Err(ReadError::UnsafeUrl(format!("blocked local host: {host}")));
218 }
219 if let Ok(ip) = host.parse::<std::net::IpAddr>() {
220 if is_blocked_ip(ip) {
221 return Err(ReadError::UnsafeUrl(format!(
222 "blocked private address: {ip}"
223 )));
224 }
225 } else if let Some(ip) = parsed.host().and_then(|host| match host {
226 url::Host::Ipv4(ip) => Some(std::net::IpAddr::V4(ip)),
227 url::Host::Ipv6(ip) => Some(std::net::IpAddr::V6(ip)),
228 url::Host::Domain(_) => None,
229 }) {
230 if is_blocked_ip(ip) {
231 return Err(ReadError::UnsafeUrl(format!(
232 "blocked private address: {ip}"
233 )));
234 }
235 }
236
237 Ok(parsed)
238}
239
/// Returns `true` when `ip` must not be contacted.
///
/// IPv4 addresses are checked by `is_blocked_ipv4`. IPv6 addresses are blocked
/// when they are loopback, unspecified, unique-local, link-local, multicast or
/// documentation addresses. An IPv4-mapped IPv6 address (`::ffff:a.b.c.d`) is
/// judged by the IPv4 address it carries, so writing a private IPv4 target in
/// IPv6 notation does not get around the IPv4 rules.
fn is_blocked_ip(ip: std::net::IpAddr) -> bool {
    match ip {
        std::net::IpAddr::V4(ip) => is_blocked_ipv4(ip),
        std::net::IpAddr::V6(ip) => {
            if let Some(mapped) = ip.to_ipv4_mapped() {
                return is_blocked_ipv4(mapped);
            }
            ip.is_loopback()
                || ip.is_unspecified()
                || ip.is_unique_local()
                || ip.is_unicast_link_local()
                || ip.is_multicast()
                || is_documentation_ipv6(ip)
        }
    }
}

/// Returns `true` for IPv4 addresses that must not be contacted: private,
/// loopback, link-local (which includes the 169.254.169.254 metadata address),
/// shared address space (100.64.0.0/10), broadcast, documentation,
/// unspecified, the whole 0.0.0.0/8 block, and everything from 224.0.0.0
/// upwards (multicast and reserved).
fn is_blocked_ipv4(ip: std::net::Ipv4Addr) -> bool {
    let [first, second, ..] = ip.octets();
    ip.is_private()
        || ip.is_loopback()
        || ip.is_link_local()
        || ip.is_broadcast()
        || ip.is_unspecified()
        || is_documentation_ipv4(ip)
        || first == 0
        || first >= 224
        // 100.64.0.0/10: the top two bits of the second octet are `01`.
        || (first == 100 && second & 0b1100_0000 == 0b0100_0000)
}

/// Returns `true` for the IPv4 documentation ranges 192.0.2.0/24,
/// 198.51.100.0/24 and 203.0.113.0/24.
fn is_documentation_ipv4(ip: std::net::Ipv4Addr) -> bool {
    matches!(
        ip.octets(),
        [192, 0, 2, _] | [198, 51, 100, _] | [203, 0, 113, _]
    )
}

/// Returns `true` for the IPv6 documentation prefix 2001:db8::/32.
fn is_documentation_ipv6(ip: std::net::Ipv6Addr) -> bool {
    matches!(ip.segments(), [0x2001, 0x0db8, ..])
}
273
274fn apply_quality(page: &mut PageContent) {
275 let mut reasons = Vec::new();
276 if page.content_length < 300 {
277 reasons.push("short_content".to_string());
278 }
279 if !page.diagnostics.is_empty() {
280 reasons.push("diagnostics".to_string());
281 }
282 if page.raw_body_bytes > 100 * 1024
283 && (page.content_length as f64) < (page.raw_body_bytes as f64 * 0.1)
284 {
285 reasons.push("low_extraction_ratio".to_string());
286 }
287
288 page.quality = if reasons
289 .iter()
290 .any(|reason| reason == "low_extraction_ratio")
291 || reasons.len() >= 2
292 {
293 ExtractionQuality::Poor
294 } else if reasons.is_empty() {
295 ExtractionQuality::Good
296 } else {
297 ExtractionQuality::Partial
298 };
299 page.quality_reasons = reasons;
300}
301
302pub fn diagnose(page: &PageContent, raw_html: &str) -> Vec<String> {
303 let mut warnings = Vec::new();
304 let text_lower = page.text.to_lowercase();
305 let html_lower = raw_html.to_lowercase();
306
307 let short_text = page.content_length < 500;
308 let has_loading_indicator = ["loading...", "loading documentation"]
309 .iter()
310 .any(|needle| text_lower.contains(needle));
311 let has_noscript = html_lower.contains("<noscript");
312 let nav_link_count = html_lower.matches("<nav").count()
313 + html_lower.matches("<a ").count()
314 + html_lower.matches("<a>").count();
315 let has_nav_shell_pattern = short_text && nav_link_count >= 8;
316 if short_text && (has_loading_indicator || has_noscript || has_nav_shell_pattern) {
317 warnings.push(
318 "Page appears to be a client-rendered shell. Content may require JavaScript."
319 .to_string(),
320 );
321 }
322
323 let very_short_text = page.content_length < 300;
324 let has_soft_404_indicator = [
325 "page not found",
326 "can't find that page",
327 "404",
328 "doesn't exist",
329 "has been moved",
330 ]
331 .iter()
332 .any(|needle| text_lower.contains(needle));
333 if page.status_code == 200 && very_short_text && has_soft_404_indicator {
334 warnings
335 .push("Page appears to be a soft 404 (HTTP 200 but error page content).".to_string());
336 }
337
338 if page.raw_body_bytes > 20 * 1024 && page.content_length < 2 * 1024 {
339 warnings.push(format!(
340 "Large page ({} bytes) but only {} chars extracted. Content may be incomplete.",
341 page.raw_body_bytes, page.content_length
342 ));
343 }
344
345 if page.raw_body_bytes > 100 * 1024
346 && (page.content_length as f64) < (page.raw_body_bytes as f64 * 0.1)
347 {
348 let pct = ((page.content_length as f64 / page.raw_body_bytes as f64) * 100.0).round();
349 warnings.push(format!(
350 "Significant content may have been lost during extraction ({}% of response retained).",
351 pct as usize
352 ));
353 }
354
355 warnings
356}
357
/// Normalizes whitespace in `text`.
///
/// Each line is trimmed on both sides, runs of blank lines are capped at two,
/// and leading/trailing whitespace of the whole result is removed. Lines are
/// always joined with `\n`.
fn clean_text(text: &str) -> String {
    const MAX_BLANK_RUN: usize = 2;

    let mut out = String::with_capacity(text.len());
    let mut blank_run = 0usize;

    for content in text.lines().map(str::trim) {
        if content.is_empty() {
            blank_run += 1;
            if blank_run > MAX_BLANK_RUN {
                // Already emitted the maximum number of blank lines in a row.
                continue;
            }
        } else {
            blank_run = 0;
            out.push_str(content);
        }
        out.push('\n');
    }

    out.trim().to_owned()
}
379
380fn detect_content_format(content_type: &str) -> ContentFormat {
381 let content_type = content_type.to_ascii_lowercase();
382
383 if content_type.contains("text/markdown") || content_type.contains("text/x-markdown") {
384 ContentFormat::Markdown
385 } else if content_type.contains("text/html") || content_type.contains("application/xhtml+xml") {
386 ContentFormat::Html
387 } else {
388 ContentFormat::PlainText
389 }
390}
391
392#[derive(Debug)]
393pub enum ReadError {
394 InvalidUrl(String),
395 UnsafeUrl(String),
396 Fetch(String),
397 HttpStatus(u16, String),
398 NotHtml(String),
399 Parse(String),
400 NoContent,
401 InsufficientContent,
402 ResponseTooLarge(u64),
403 Youtube(String),
404}
405
406impl std::fmt::Display for ReadError {
407 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
408 match self {
409 Self::InvalidUrl(msg) => write!(f, "Invalid URL: {msg}"),
410 Self::UnsafeUrl(msg) => write!(f, "Unsafe URL: {msg}"),
411 Self::Fetch(msg) => write!(f, "Fetch failed: {msg}"),
412 Self::HttpStatus(code, reason) => write!(f, "HTTP {code} {reason}"),
413 Self::NotHtml(ct) => write!(f, "Not an HTML page (content-type: {ct})"),
414 Self::Parse(msg) => write!(f, "Parse error: {msg}"),
415 Self::NoContent => write!(f, "Could not extract readable content from page"),
416 Self::InsufficientContent => write!(f, "Page returned insufficient content"),
417 Self::ResponseTooLarge(bytes) => write!(
418 f,
419 "Response too large: {bytes} bytes exceeds {} byte limit",
420 MAX_RESPONSE_BYTES
421 ),
422 Self::Youtube(msg) => write!(f, "YouTube extraction failed: {msg}"),
423 }
424 }
425}
426
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an HTML `PageContent` fixture as it would look after a plain,
    /// non-redirected HTTP 200 response: `content_length` matches `text`, the
    /// requested URL equals the final URL, and no diagnostics or quality
    /// reasons are attached yet.
    fn html_page(title: &str, url: &str, text: &str, raw_body_bytes: usize) -> PageContent {
        PageContent {
            title: Some(title.to_string()),
            text: text.to_string(),
            url: url.to_string(),
            content_length: text.len(),
            requested_url: url.to_string(),
            status_code: 200,
            content_type: Some("text/html".to_string()),
            format_received: ContentFormat::Html,
            was_redirected: false,
            raw_body_bytes,
            diagnostics: Vec::new(),
            quality: ExtractionQuality::Good,
            quality_reasons: Vec::new(),
        }
    }

    #[test]
    fn accept_header_prefers_markdown() {
        assert_eq!(
            ACCEPT_HEADER,
            "text/markdown,text/plain;q=0.9,text/html;q=0.8,application/xhtml+xml;q=0.7,*/*;q=0.5"
        );
    }

    #[test]
    fn validate_url_rejects_unsafe_targets() {
        let unsafe_urls = [
            "file:///etc/passwd",
            "http://localhost:3000",
            "https://service.local/path",
            "http://127.0.0.1",
            "http://10.0.0.1",
            "http://169.254.169.254/latest/meta-data",
            "http://[::1]/",
        ];

        for url in unsafe_urls {
            let result = validate_url(url);
            assert!(
                matches!(result, Err(ReadError::UnsafeUrl(_))),
                "expected unsafe URL error for {url}, got {result:?}"
            );
        }
    }

    #[test]
    fn validate_url_allows_public_http_urls() {
        assert!(validate_url("https://example.com/path").is_ok());
        assert!(validate_url("http://93.184.216.34/").is_ok());
    }

    #[test]
    fn quality_marks_low_extraction_ratio_as_poor() {
        let mut page = html_page("Big Page", "https://example.com/big", "short", 150_000);
        page.diagnostics = vec!["warning".to_string()];

        apply_quality(&mut page);

        assert_eq!(page.quality.name(), "poor");
        assert!(page
            .quality_reasons
            .iter()
            .any(|reason| reason == "low_extraction_ratio"));
    }

    #[test]
    fn detect_content_format_treats_markdown_as_markdown() {
        assert_eq!(
            detect_content_format("text/markdown; charset=utf-8"),
            ContentFormat::Markdown
        );
    }

    #[test]
    fn detect_content_format_treats_plain_text_as_plain_text() {
        for content_type in ["text/plain; charset=utf-8", "application/json"] {
            assert_eq!(
                detect_content_format(content_type),
                ContentFormat::PlainText
            );
        }
    }

    #[test]
    fn markdown_and_plain_text_skip_readability_cleaning_path() {
        // Markdown structure (up to two blank lines in a row) is left intact.
        assert_eq!(
            clean_text("# Title\n\n\nParagraph"),
            "# Title\n\n\nParagraph"
        );
        assert_eq!(
            detect_content_format("text/markdown"),
            ContentFormat::Markdown
        );

        // Plain text only loses surrounding whitespace on each line.
        assert_eq!(clean_text("  hello  \n\n\nworld  "), "hello\n\n\nworld");
        assert_eq!(
            detect_content_format("text/plain"),
            ContentFormat::PlainText
        );
    }

    #[test]
    fn clean_text_collapses_blank_lines() {
        let cleaned = clean_text("Hello\n\n\n\n\nWorld\n\nFoo");

        assert!(cleaned.starts_with("Hello\n"));
        assert!(cleaned.contains("World"));
        assert!(!cleaned.contains("\n\n\n\n"));
    }

    #[test]
    fn clean_text_trims_lines() {
        assert_eq!(clean_text("  hello  \n  world  "), "hello\nworld");
    }

    #[test]
    fn extract_readable_from_html() {
        let html = r#"
            <html>
            <head><title>Test Article</title></head>
            <body>
                <nav>Skip this navigation</nav>
                <article>
                    <h1>Test Article Title</h1>
                    <p>This is the main content of the article. It has enough text to be
                    considered readable content by the readability algorithm. We need to make
                    sure there is sufficient content here for the extraction to work properly.
                    The readability algorithm looks for substantial blocks of text content.</p>
                    <p>Here is another paragraph with more substantial content to ensure that
                    the extraction algorithm has enough material to work with. This paragraph
                    adds additional context and information that would be typical in a real
                    web article about some topic.</p>
                </article>
                <footer>Copyright 2024</footer>
            </body>
            </html>"#;

        let page = match extract_readable(html, "https://example.com/test") {
            Ok(page) => page,
            // The extractor deciding there is too little content is tolerated
            // here; any other failure is not.
            Err(ReadError::InsufficientContent | ReadError::NoContent) => return,
            Err(e) => panic!("Unexpected error: {e}"),
        };

        assert!(page.text.contains("main content"));
        assert!(!page.text.contains("Skip this navigation"));
        assert_eq!(page.url, "https://example.com/test");
        assert_eq!(page.requested_url, "https://example.com/test");
        assert_eq!(page.status_code, 200);
        assert!(!page.was_redirected);
        assert_eq!(page.raw_body_bytes, 0);
        assert!(page.content_type.is_none());
        assert!(page.diagnostics.is_empty());
    }

    #[test]
    fn response_metadata_can_be_applied_after_extraction() {
        let html = r#"
            <html>
            <head><title>Redirected Article</title></head>
            <body>
                <article>
                    <p>This article has enough body text to survive readability extraction and
                    prove that metadata can be preserved when the requested URL differs from
                    the final URL after redirects.</p>
                    <p>Additional text keeps the extractor happy and representative of a real page.</p>
                </article>
            </body>
            </html>"#;

        let mut page = extract_readable(html, "https://example.com/final").unwrap();
        page.requested_url = "https://example.com/start".to_string();
        page.status_code = 200;
        page.content_type = Some("text/html; charset=utf-8".to_string());
        page.format_received = ContentFormat::Html;
        page.was_redirected = true;
        page.raw_body_bytes = html.len();

        assert_eq!(page.url, "https://example.com/final");
        assert_eq!(page.requested_url, "https://example.com/start");
        assert_eq!(page.status_code, 200);
        assert_eq!(
            page.content_type.as_deref(),
            Some("text/html; charset=utf-8")
        );
        assert!(page.was_redirected);
        assert_eq!(page.raw_body_bytes, html.len());
    }

    #[test]
    fn diagnose_spa_shell_from_loading_text() {
        let page = html_page(
            "Docs",
            "https://example.com/docs",
            "Loading documentation...",
            2_000,
        );

        let warnings = diagnose(
            &page,
            "<html><body><noscript>Enable JS</noscript></body></html>",
        );

        assert!(warnings.iter().any(|w| w.contains("client-rendered shell")));
    }

    #[test]
    fn diagnose_soft_404_with_http_200() {
        let page = html_page(
            "Missing",
            "https://example.com/missing",
            "Page not found. The page has been moved.",
            1_500,
        );

        let warnings = diagnose(&page, "<html><body>404</body></html>");

        assert!(warnings.iter().any(|w| w.contains("soft 404")));
    }

    #[test]
    fn diagnose_does_not_flag_normal_page() {
        let text = "This is a normal documentation page with enough content to explain installation, configuration, and usage in detail. It includes several paragraphs of useful information for readers and should not be treated as a shell or error page. Extra explanation here keeps it comfortably above the short-content heuristics and avoids false positives.";
        let page = html_page("Guide", "https://example.com/guide", text, 8_000);

        let warnings = diagnose(
            &page,
            "<html><body><article>real docs</article></body></html>",
        );

        assert!(warnings.is_empty());
    }

    #[test]
    fn diagnose_low_extraction_ratio_warning() {
        let page = html_page(
            "Big Page",
            "https://example.com/big",
            "A short extracted summary.",
            150_000,
        );

        let warnings = diagnose(&page, "<html></html>");

        assert!(warnings.iter().any(|w| w.contains("Large page")));
        assert!(warnings
            .iter()
            .any(|w| w.contains("Significant content may have been lost")));
    }
}