crw_server/routes/v2/
map.rs1use std::time::Duration;
6
7use axum::Json;
8use axum::extract::State;
9use axum::extract::rejection::JsonRejection;
10use serde::{Deserialize, Serialize};
11
12use crw_core::error::CrwError;
13use crw_crawl::crawl::{DiscoverOptions, discover_urls};
14
15use super::adapters::V2Link;
16use crate::error::AppError;
17use crate::state::AppState;
18
19#[derive(Debug, Deserialize)]
20#[serde(rename_all = "camelCase")]
21pub struct V2MapRequest {
22 pub url: String,
23 #[serde(default)]
24 pub limit: Option<usize>,
25 #[serde(default)]
26 pub include_paths: Vec<String>,
27 #[serde(default)]
28 pub exclude_paths: Vec<String>,
29 #[serde(default)]
30 pub search: Option<String>,
31 #[serde(default = "default_sitemap")]
33 pub sitemap: String,
34 #[serde(default)]
35 pub max_discovery_depth: Option<u32>,
36 #[serde(default)]
38 pub timeout: Option<u64>,
39}
40
/// Serde default for [`V2MapRequest::sitemap`]: the `"include"` mode.
fn default_sitemap() -> String {
    String::from("include")
}
44
45#[derive(Debug, Serialize)]
46pub struct V2MapResponse {
47 pub success: bool,
48 pub links: Vec<V2Link>,
49}
50
51pub async fn map(
52 State(state): State<AppState>,
53 body: Result<Json<V2MapRequest>, JsonRejection>,
54) -> Result<Json<V2MapResponse>, AppError> {
55 let Json(req) = body.map_err(AppError::from)?;
56 let parsed_url = url::Url::parse(&req.url)
57 .map_err(|e| CrwError::InvalidRequest(format!("Invalid URL: {e}")))?;
58 crw_core::url_safety::validate_safe_url_resolved(&parsed_url)
59 .await
60 .map_err(CrwError::InvalidRequest)?;
61
62 let use_sitemap = !req.sitemap.eq_ignore_ascii_case("skip");
63 let crawl_fallback = !req.sitemap.eq_ignore_ascii_case("only");
64 let max_depth = req
65 .max_discovery_depth
66 .unwrap_or(state.config.crawler.default_max_depth);
67 let timeout_secs = req
68 .timeout
69 .map(|ms| (ms / 1000).max(1))
70 .unwrap_or(120)
71 .min(300);
72
73 let fut = discover_urls(DiscoverOptions {
74 base_url: &req.url,
75 max_depth,
76 use_sitemap,
77 renderer: &state.renderer,
78 max_concurrency: state.config.crawler.max_concurrency,
79 requests_per_second: state.config.crawler.requests_per_second,
80 user_agent: &state.config.crawler.user_agent,
81 proxy: state.config.crawler.proxy.clone(),
82 deadline_ms_per_page: state.config.effective_deadline_ms(None, None),
83 per_host_max_concurrent: state.config.crawler.per_host_max_concurrent,
84 crawl_fallback,
85 url_filter: state.url_filter.clone(),
86 });
87
88 let result = match tokio::time::timeout(Duration::from_secs(timeout_secs), fut).await {
89 Ok(r) => r?,
90 Err(_) => return Err(AppError(CrwError::Timeout(timeout_secs * 1000))),
91 };
92
93 let mut urls = result.urls;
94 if !req.include_paths.is_empty() {
95 urls.retain(|u| req.include_paths.iter().any(|p| u.contains(p.as_str())));
96 }
97 if !req.exclude_paths.is_empty() {
98 urls.retain(|u| !req.exclude_paths.iter().any(|p| u.contains(p.as_str())));
99 }
100 if let Some(s) = req.search.as_ref().filter(|s| !s.is_empty()) {
101 let needle = s.to_lowercase();
102 urls.retain(|u| u.to_lowercase().contains(&needle));
103 }
104 if let Some(limit) = req.limit {
105 urls.truncate(limit);
106 }
107
108 let links = urls
109 .into_iter()
110 .map(|url| V2Link {
111 url,
112 title: None,
113 description: None,
114 })
115 .collect();
116
117 Ok(Json(V2MapResponse {
118 success: true,
119 links,
120 }))
121}