use crate::{
config::Settings,
engines::{
browser::BrowserEngine, detect_engine_needed, http::HttpEngine, racer::EngineRacer,
EngineType, ScrapeEngine,
},
error::ScrapeError,
format,
types::{ScrapeRequest, ScrapeResponse},
utils::robots,
validation,
};
use axum::Json;
use tracing::{error, info, warn};
pub async fn scrape_core_logic(request: &ScrapeRequest) -> Result<ScrapeResponse, ScrapeError> {
info!(
"Scrape request received for URL: {} with engine: {}",
request.url, request.engine
);
validation::validate_scrape_request(request).await?;
match robots::is_allowed_default(&request.url).await {
Ok(allowed) => {
if !allowed {
warn!("Robots.txt disallows scraping for URL: {}", request.url);
}
}
Err(e) => {
warn!("Failed to check robots.txt: {}, continuing anyway", e);
}
}
let settings = Settings::new().map_err(|e| {
error!("Failed to load settings: {}", e);
ScrapeError::Configuration(format!("Failed to load settings: {}", e))
})?;
let use_browser = match request.engine.as_str() {
"browser" => true,
"http" => false,
_ => {
if settings.engine.waterfall_enabled {
info!(
"Using waterfall racing for URL: {} (delay: {}ms)",
request.url, settings.engine.waterfall_delay_ms
);
let racer = EngineRacer::with_delay(settings.engine.waterfall_delay_ms)
.await
.map_err(|e| {
error!("Failed to create engine racer: {}", e);
e
})?;
let (raw_result, metrics) = racer.race_scrape_with_metrics(request).await.map_err(|e| {
error!("Waterfall race failed for URL {}: {}", request.url, e);
e
})?;
info!(
"Waterfall race completed: winner={}, elapsed={}ms, browser_started={}",
metrics.winning_engine, metrics.elapsed_ms, metrics.browser_started
);
let document = format::process_scrape_result(raw_result, request)
.await
.map_err(|e| {
error!("Failed to process scrape result: {}", e);
e
})?;
info!("Successfully processed document for URL: {}", request.url);
return Ok(ScrapeResponse::success(document));
} else {
info!("Auto-detecting engine type (waterfall disabled)...");
let http_engine = HttpEngine::with_options(request.timeout, request.skip_tls_verification)
.map_err(|e| {
error!("Failed to create HTTP engine: {}", e);
e
})?;
let http_result = http_engine.scrape(request).await.map_err(|e| {
error!("Failed to scrape URL with HTTP engine {}: {}", request.url, e);
e
})?;
let detected_engine = detect_engine_needed(&http_result.url, &http_result.html);
if detected_engine == EngineType::Browser {
info!(
"Auto-detection recommends Browser engine for URL: {}",
request.url
);
true
} else {
info!(
"Auto-detection recommends HTTP engine for URL: {}",
request.url
);
let document = format::process_scrape_result(http_result, request)
.await
.map_err(|e| {
error!("Failed to process scrape result: {}", e);
e
})?;
info!("Successfully processed document for URL: {}", request.url);
return Ok(ScrapeResponse::success(document));
}
}
}
};
if use_browser {
info!("Using Browser engine for URL: {}", request.url);
let browser_engine = BrowserEngine::new().await.map_err(|e| {
error!("Failed to create browser engine: {}", e);
e
})?;
let raw_result = browser_engine.scrape(request).await.map_err(|e| {
error!("Failed to scrape URL with browser {}: {}", request.url, e);
e
})?;
info!(
"Successfully fetched URL with browser: {} (status: {})",
raw_result.url, raw_result.status_code
);
let screenshot = if request.screenshot {
info!("Capturing screenshot...");
None
} else {
None
};
let mut document = format::process_scrape_result(raw_result, request)
.await
.map_err(|e| {
error!("Failed to process scrape result: {}", e);
e
})?;
if let Some(screenshot_data) = screenshot {
document.screenshot = Some(screenshot_data);
}
info!("Successfully processed document for URL: {}", request.url);
Ok(ScrapeResponse::success(document))
} else {
info!("Using HTTP engine for URL: {}", request.url);
let http_engine = HttpEngine::with_options(request.timeout, request.skip_tls_verification)
.map_err(|e| {
error!("Failed to create HTTP engine: {}", e);
e
})?;
let raw_result = http_engine.scrape(request).await.map_err(|e| {
error!("Failed to scrape URL {}: {}", request.url, e);
e
})?;
info!(
"Successfully fetched URL: {} (status: {})",
raw_result.url, raw_result.status_code
);
if raw_result.status_code >= 400 {
warn!("URL returned error status code: {}", raw_result.status_code);
}
let document = format::process_scrape_result(raw_result, request)
.await
.map_err(|e| {
error!("Failed to process scrape result: {}", e);
e
})?;
info!("Successfully processed document for URL: {}", request.url);
Ok(ScrapeResponse::success(document))
}
}
pub async fn scrape_handler(
Json(request): Json<ScrapeRequest>,
) -> Result<Json<ScrapeResponse>, ScrapeError> {
let response = scrape_core_logic(&request).await?;
Ok(Json(response))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a request whose URL is the empty string; the remaining fields carry the
    /// fixed values this test has always used.
    fn request_with_empty_url() -> ScrapeRequest {
        ScrapeRequest {
            url: String::new(),
            engine: String::from("auto"),
            formats: vec![String::from("markdown")],
            headers: Default::default(),
            include_tags: Vec::new(),
            exclude_tags: Vec::new(),
            only_main_content: true,
            remove_base64_images: true,
            skip_tls_verification: false,
            timeout: 30000,
            wait_for: 0,
            wait_for_selector: None,
            actions: Vec::new(),
            screenshot: false,
            screenshot_format: String::from("png"),
        }
    }

    /// The handler must report an error when the request URL is empty.
    #[tokio::test]
    async fn test_scrape_handler_invalid_url() {
        let outcome = scrape_handler(Json(request_with_empty_url())).await;
        assert!(outcome.is_err());
    }
}