use axum::Json;
use axum::extract::State;
use axum::extract::rejection::JsonRejection;
use crw_core::Deadline;
use crw_core::error::CrwError;
use crw_core::types::{ApiResponse, OutputFormat, ScrapeData, ScrapeRequest};
use crw_crawl::single::scrape_url;
use crate::error::AppError;
use crate::state::{AppState, validate_renderer_pin};
pub async fn scrape(
State(state): State<AppState>,
body: Result<Json<ScrapeRequest>, JsonRejection>,
) -> Result<Json<ApiResponse<ScrapeData>>, AppError> {
let Json(req) = body.map_err(AppError::from)?;
let parsed_url = url::Url::parse(&req.url)
.map_err(|e| CrwError::InvalidRequest(format!("Invalid URL: {e}")))?;
crw_core::url_safety::validate_safe_url_resolved(&parsed_url)
.await
.map_err(CrwError::InvalidRequest)?;
validate_renderer_pin(req.renderer, req.render_js, &state)?;
let llm_config = state.config.extraction.llm.as_ref();
if req.formats.contains(&OutputFormat::Summary)
&& llm_config.is_none()
&& req.llm_api_key.is_none()
{
return Err(AppError::from(CrwError::InvalidRequest(
"summary format requires LLM config: set CRW_EXTRACTION__LLM__API_KEY \
in server config or pass llm_api_key in the request body"
.into(),
)));
}
if let Some(cfg) = llm_config
&& let Some(header_name) = cfg.require_byok_header.as_deref()
&& (req.formats.contains(&OutputFormat::Summary)
|| req.formats.contains(&OutputFormat::Json))
&& req.llm_api_key.is_none()
{
let _ = header_name;
return Err(AppError::from(CrwError::InvalidRequest(
"LLM features require a per-request llm_api_key (BYOK header guard active)".into(),
)));
}
let user_agent = &state.config.crawler.user_agent;
let default_stealth =
state.config.crawler.stealth.enabled && state.config.crawler.stealth.inject_headers;
let deadline = Deadline::from_request_ms(
state
.config
.effective_deadline_ms(req.deadline_ms, req.wait_for),
);
let data = scrape_url(
&req,
&state.renderer,
llm_config,
&state.config.extraction,
user_agent,
default_stealth,
state.config.renderer.render_js_default,
deadline,
)
.await?;
let status_code = data.metadata.status_code;
if status_code >= 400 {
let body_len = [
data.markdown.as_deref(),
data.plain_text.as_deref(),
data.html.as_deref(),
data.raw_html.as_deref(),
]
.iter()
.filter_map(|opt| opt.map(|t| t.len()))
.max()
.unwrap_or(0);
if body_len < 200 {
let error_msg = data
.warning
.clone()
.unwrap_or_else(|| format!("Target returned HTTP {status_code}"));
return Ok(Json(ApiResponse {
success: false,
data: Some(data),
error: Some(error_msg),
error_code: Some("http_error".into()),
warning: None,
}));
}
}
let md_empty = data
.markdown
.as_deref()
.map(|s| s.trim().len() < 100)
.unwrap_or(true);
if md_empty {
let warning_blocked = data
.warning
.as_deref()
.map(|w| w.starts_with("Blocked by anti-bot"))
.unwrap_or(false);
let typed = if state.config.renderer.antibot.enabled {
let html = data
.raw_html
.as_deref()
.or(data.html.as_deref())
.unwrap_or("");
crw_extract::antibot::classify(Some(status_code), html)
} else {
crw_extract::antibot::AntibotResult::none()
};
if typed.signal.is_blocked() || warning_blocked {
let error_msg = if typed.signal.is_blocked() {
format!(
"Blocked by anti-bot ({}): {}",
typed.signal.class_name(),
typed.reason
)
} else {
data.warning
.clone()
.unwrap_or_else(|| "Blocked by anti-bot protection".into())
};
return Ok(Json(ApiResponse {
success: false,
data: Some(data),
error: Some(error_msg),
error_code: Some("anti_bot".into()),
warning: None,
}));
}
}
let warning = data.warning.clone();
let mut resp = ApiResponse::ok(data);
resp.warning = warning;
Ok(Json(resp))
}