use axum::Json;
use axum::extract::rejection::JsonRejection;
use axum::extract::{Path, Query, State};
use axum::http::HeaderMap;
use serde::Serialize;
use serde_json::Value;
use uuid::Uuid;
use crw_core::error::CrwError;
use super::adapters::{DEFAULT_PAGE_LIMIT, V2CrawlStatus, build_crawl_status};
use super::crawl::{PageQuery, base_url};
use super::scrape::{V2ScrapeRequest, to_internal};
use crate::error::AppError;
use crate::state::AppState;
#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct V2BatchStartResponse {
pub success: bool,
pub id: String,
pub url: String,
#[serde(rename = "invalidURLs")]
pub invalid_urls: Vec<String>,
}
pub async fn start_batch(
State(state): State<AppState>,
headers: HeaderMap,
body: Result<Json<Value>, JsonRejection>,
) -> Result<Json<V2BatchStartResponse>, AppError> {
let Json(mut raw) = body.map_err(AppError::from)?;
let obj = raw
.as_object_mut()
.ok_or_else(|| CrwError::InvalidRequest("request body must be a JSON object".into()))?;
let urls_val = obj
.remove("urls")
.ok_or_else(|| CrwError::InvalidRequest("`urls` is required".into()))?;
let urls: Vec<String> = serde_json::from_value(urls_val)
.map_err(|e| CrwError::InvalidRequest(format!("invalid `urls`: {e}")))?;
if urls.is_empty() {
return Err(AppError::from(CrwError::InvalidRequest(
"`urls` must contain at least one URL".into(),
)));
}
let ignore_invalid = obj
.get("ignoreInvalidURLs")
.and_then(Value::as_bool)
.unwrap_or(true);
obj.insert(
"url".to_string(),
Value::String("https://placeholder.invalid/".into()),
);
let template_v2: V2ScrapeRequest = serde_json::from_value(Value::Object(obj.clone()))
.map_err(|e| CrwError::InvalidRequest(format!("invalid batch scrape options: {e}")))?;
let (mut template, _decomposed, _tier) = to_internal(template_v2)?;
template.url = String::new();
let mut valid = Vec::new();
let mut invalid = Vec::new();
for u in urls {
match url::Url::parse(&u) {
Ok(parsed) => {
if crw_core::url_safety::validate_safe_url_resolved(&parsed)
.await
.is_ok()
{
valid.push(u);
} else {
invalid.push(u);
}
}
Err(_) => invalid.push(u),
}
}
if valid.is_empty() {
return Err(AppError::from(CrwError::InvalidRequest(
"no valid URLs to scrape".into(),
)));
}
if !invalid.is_empty() && !ignore_invalid {
return Err(AppError::from(CrwError::InvalidRequest(format!(
"invalid URLs: {}",
invalid.join(", ")
))));
}
let id = state.start_batch_job(valid, template).await;
let base = base_url(&headers);
Ok(Json(V2BatchStartResponse {
success: true,
id: id.to_string(),
url: format!("{base}/v2/batch/scrape/{id}"),
invalid_urls: invalid,
}))
}
pub async fn get_batch(
State(state): State<AppState>,
headers: HeaderMap,
Path(id): Path<Uuid>,
Query(page): Query<PageQuery>,
) -> Result<Json<V2CrawlStatus>, AppError> {
let (snapshot, created_at) = {
let jobs = state.crawl_jobs.read().await;
let job = jobs
.get(&id)
.ok_or_else(|| CrwError::NotFound(format!("Batch job {id} not found")))?;
(job.rx.borrow().clone(), job.created_at)
};
let skip = page.skip.unwrap_or(0);
let limit = page.limit.unwrap_or(DEFAULT_PAGE_LIMIT);
let base = base_url(&headers);
Ok(Json(build_crawl_status(
&snapshot,
created_at,
state.config.crawler.job_ttl_secs,
skip,
limit,
&base,
"/v2/batch/scrape",
id,
"basic",
)))
}
pub async fn cancel_batch(state: State<AppState>, id: Path<Uuid>) -> Result<Json<Value>, AppError> {
super::crawl::cancel_crawl(state, id).await
}
pub async fn get_errors(state: State<AppState>, id: Path<Uuid>) -> Result<Json<Value>, AppError> {
super::crawl::get_errors(state, id).await
}