1pub mod advanced_extraction;
2pub mod image_processing;
3pub mod markdown;
4pub mod metadata;
5
6use crate::{
7 engines::{validate_scrape_quality, RawScrapeResult},
8 error::{Result, ScrapeError},
9 types::{Document, ScrapeRequest},
10};
11use tracing::{debug, warn};
12
13pub async fn process_scrape_result(
15 raw: RawScrapeResult,
16 request: &ScrapeRequest,
17) -> Result<Document> {
18 let metadata = metadata::extract_metadata(&raw, request)?;
20
21 let mut doc = Document {
22 title: metadata.title.clone(),
23 description: metadata.description.clone(),
24 url: Some(raw.url.clone()),
25 markdown: None,
26 html: None,
27 raw_html: None,
28 links: None,
29 images: None,
30 screenshot: None,
31 metadata,
32 };
33
34 for format in &request.formats {
36 match format.as_str() {
37 "markdown" => {
38 let markdown_content = markdown::html_to_markdown(
39 &raw.html,
40 &raw.url,
41 request.only_main_content,
42 )?;
43
44 if let Err(e) = validate_scrape_quality(&raw, &markdown_content) {
46 match raw.status_code {
48 401 | 403 => {
49 warn!(
50 "Authentication or permission error ({}): {}. Consider using browser engine or different authentication.",
51 raw.status_code, e
52 );
53 return Err(e);
54 }
55 429 => {
56 warn!(
57 "Rate limit hit (429): {}. Consider adding wait_for or retry with backoff.",
58 e
59 );
60 return Err(e);
61 }
62 _ => {
63 debug!("Content quality validation failed: {}", e);
64 return Err(e);
65 }
66 }
67 }
68
69 doc.markdown = Some(markdown_content);
70 }
71 "html" => {
72 doc.html = Some(if request.only_main_content {
73 markdown::extract_main_content_html(&raw.html)?
74 } else {
75 raw.html.clone()
76 });
77 }
78 "rawHtml" => {
79 doc.raw_html = Some(raw.html.clone());
80 }
81 "links" => {
82 doc.links = Some(extract_links(&raw.html, &raw.url)?);
83 }
84 "images" => {
85 doc.images = Some(extract_images(&raw.html, &raw.url)?);
86 }
87 unsupported => {
88 return Err(ScrapeError::UnsupportedFormat(format!(
89 "Format '{}' is not supported in HTTP-only mode. Supported formats: markdown, html, rawHtml, links, images",
90 unsupported
91 )));
92 }
93 }
94 }
95
96 Ok(doc)
97}
98
99fn extract_links(html: &str, base_url: &str) -> Result<Vec<String>> {
101 use scraper::{Html, Selector};
102
103 let document = Html::parse_document(html);
104 let selector = Selector::parse("a[href]")
105 .map_err(|e| ScrapeError::ParseError(format!("Invalid selector: {:?}", e)))?;
106
107 let base = url::Url::parse(base_url)
108 .map_err(|e| ScrapeError::InvalidUrl(format!("Invalid base URL: {}", e)))?;
109
110 let mut links = Vec::new();
111 for element in document.select(&selector) {
112 if let Some(href) = element.value().attr("href") {
113 if let Ok(absolute) = base.join(href) {
115 links.push(absolute.to_string());
116 }
117 }
118 }
119
120 Ok(links)
121}
122
123fn extract_images(html: &str, base_url: &str) -> Result<Vec<String>> {
125 use scraper::{Html, Selector};
126
127 let document = Html::parse_document(html);
128 let selector = Selector::parse("img[src]")
129 .map_err(|e| ScrapeError::ParseError(format!("Invalid selector: {:?}", e)))?;
130
131 let base = url::Url::parse(base_url)
132 .map_err(|e| ScrapeError::InvalidUrl(format!("Invalid base URL: {}", e)))?;
133
134 let mut images = Vec::new();
135 for element in document.select(&selector) {
136 if let Some(src) = element.value().attr("src") {
137 if src.starts_with("data:") {
139 continue;
140 }
141
142 if let Ok(absolute) = base.join(src) {
144 images.push(absolute.to_string());
145 }
146 }
147 }
148
149 Ok(images)
150}