1use crate::{
2 config::Settings,
3 engines::{
4 browser::BrowserEngine, detect_engine_needed, http::HttpEngine, racer::EngineRacer,
5 EngineType, ScrapeEngine,
6 },
7 error::ScrapeError,
8 format,
9 types::{ScrapeRequest, ScrapeResponse},
10 utils::robots,
11 validation,
12};
13use axum::Json;
14use tracing::{error, info, warn};
15
16pub async fn scrape_core_logic(request: &ScrapeRequest) -> Result<ScrapeResponse, ScrapeError> {
18 info!(
19 "Scrape request received for URL: {} with engine: {}",
20 request.url, request.engine
21 );
22
23 validation::validate_scrape_request(request).await?;
25
26 match robots::is_allowed_default(&request.url).await {
28 Ok(allowed) => {
29 if !allowed {
30 warn!("Robots.txt disallows scraping for URL: {}", request.url);
31 }
34 }
35 Err(e) => {
36 warn!("Failed to check robots.txt: {}, continuing anyway", e);
37 }
38 }
39
40 let settings = Settings::new().map_err(|e| {
42 error!("Failed to load settings: {}", e);
43 ScrapeError::Configuration(format!("Failed to load settings: {}", e))
44 })?;
45
46 let use_browser = match request.engine.as_str() {
48 "browser" => true,
49 "http" => false,
50 _ => {
51 if settings.engine.waterfall_enabled {
53 info!(
54 "Using waterfall racing for URL: {} (delay: {}ms)",
55 request.url, settings.engine.waterfall_delay_ms
56 );
57
58 let racer = EngineRacer::with_delay(settings.engine.waterfall_delay_ms)
59 .await
60 .map_err(|e| {
61 error!("Failed to create engine racer: {}", e);
62 e
63 })?;
64
65 let (raw_result, metrics) = racer.race_scrape_with_metrics(request).await.map_err(|e| {
66 error!("Waterfall race failed for URL {}: {}", request.url, e);
67 e
68 })?;
69
70 info!(
71 "Waterfall race completed: winner={}, elapsed={}ms, browser_started={}",
72 metrics.winning_engine, metrics.elapsed_ms, metrics.browser_started
73 );
74
75 let document = format::process_scrape_result(raw_result, request)
77 .await
78 .map_err(|e| {
79 error!("Failed to process scrape result: {}", e);
80 e
81 })?;
82
83 info!("Successfully processed document for URL: {}", request.url);
84 return Ok(ScrapeResponse::success(document));
85 } else {
86 info!("Auto-detecting engine type (waterfall disabled)...");
88
89 let http_engine = HttpEngine::with_options(request.timeout, request.skip_tls_verification)
90 .map_err(|e| {
91 error!("Failed to create HTTP engine: {}", e);
92 e
93 })?;
94
95 let http_result = http_engine.scrape(request).await.map_err(|e| {
96 error!("Failed to scrape URL with HTTP engine {}: {}", request.url, e);
97 e
98 })?;
99
100 let detected_engine = detect_engine_needed(&http_result.url, &http_result.html);
101
102 if detected_engine == EngineType::Browser {
103 info!(
104 "Auto-detection recommends Browser engine for URL: {}",
105 request.url
106 );
107 true
108 } else {
109 info!(
110 "Auto-detection recommends HTTP engine for URL: {}",
111 request.url
112 );
113
114 let document = format::process_scrape_result(http_result, request)
116 .await
117 .map_err(|e| {
118 error!("Failed to process scrape result: {}", e);
119 e
120 })?;
121
122 info!("Successfully processed document for URL: {}", request.url);
123 return Ok(ScrapeResponse::success(document));
124 }
125 }
126 }
127 };
128
129 if use_browser {
131 info!("Using Browser engine for URL: {}", request.url);
132
133 let browser_engine = BrowserEngine::new().await.map_err(|e| {
134 error!("Failed to create browser engine: {}", e);
135 e
136 })?;
137
138 let raw_result = browser_engine.scrape(request).await.map_err(|e| {
139 error!("Failed to scrape URL with browser {}: {}", request.url, e);
140 e
141 })?;
142
143 info!(
144 "Successfully fetched URL with browser: {} (status: {})",
145 raw_result.url, raw_result.status_code
146 );
147
148 let screenshot = if request.screenshot {
150 info!("Capturing screenshot...");
151 None
154 } else {
155 None
156 };
157
158 let mut document = format::process_scrape_result(raw_result, request)
160 .await
161 .map_err(|e| {
162 error!("Failed to process scrape result: {}", e);
163 e
164 })?;
165
166 if let Some(screenshot_data) = screenshot {
168 document.screenshot = Some(screenshot_data);
169 }
170
171 info!("Successfully processed document for URL: {}", request.url);
172 Ok(ScrapeResponse::success(document))
173 } else {
174 info!("Using HTTP engine for URL: {}", request.url);
176
177 let http_engine = HttpEngine::with_options(request.timeout, request.skip_tls_verification)
178 .map_err(|e| {
179 error!("Failed to create HTTP engine: {}", e);
180 e
181 })?;
182
183 let raw_result = http_engine.scrape(request).await.map_err(|e| {
184 error!("Failed to scrape URL {}: {}", request.url, e);
185 e
186 })?;
187
188 info!(
189 "Successfully fetched URL: {} (status: {})",
190 raw_result.url, raw_result.status_code
191 );
192
193 if raw_result.status_code >= 400 {
194 warn!("URL returned error status code: {}", raw_result.status_code);
195 }
196
197 let document = format::process_scrape_result(raw_result, request)
198 .await
199 .map_err(|e| {
200 error!("Failed to process scrape result: {}", e);
201 e
202 })?;
203
204 info!("Successfully processed document for URL: {}", request.url);
205 Ok(ScrapeResponse::success(document))
206 }
207}
208
209pub async fn scrape_handler(
211 Json(request): Json<ScrapeRequest>,
212) -> Result<Json<ScrapeResponse>, ScrapeError> {
213 let response = scrape_core_logic(&request).await?;
214 Ok(Json(response))
215}
216
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that the handler returns an error when the request URL is empty.
    #[tokio::test]
    async fn test_scrape_handler_invalid_url() {
        let empty_url_request = ScrapeRequest {
            url: String::new(),
            formats: vec![String::from("markdown")],
            headers: Default::default(),
            include_tags: Vec::new(),
            exclude_tags: Vec::new(),
            only_main_content: true,
            timeout: 30000,
            wait_for: 0,
            remove_base64_images: true,
            skip_tls_verification: false,
            engine: String::from("auto"),
            wait_for_selector: None,
            actions: Vec::new(),
            screenshot: false,
            screenshot_format: String::from("png"),
        };

        let outcome = scrape_handler(Json(empty_url_request)).await;
        assert!(outcome.is_err());
    }
}