1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4
5#[cfg(feature = "mcp_tool")]
6use schemars::JsonSchema;
7
8use crate::{
9 batch_scrape::Webhook,
10 document::Document,
11 scrape::{ScrapeFormats, ScrapeOptions},
12 FirecrawlApp, FirecrawlError, API_VERSION,
13};
14
/// Output formats that can be requested for each page scraped during a crawl.
///
/// Mirrors [`ScrapeFormats`] (see the `From` impl below) but is a separate
/// type, serialized with the exact string tags the crawl API expects.
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
pub enum CrawlScrapeFormats {
    /// Page content as Markdown (`"markdown"`).
    #[serde(rename = "markdown")]
    Markdown,

    /// Page content as HTML (`"html"`).
    #[serde(rename = "html")]
    HTML,

    /// Raw, unprocessed HTML (`"rawHtml"`).
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Links found on the page (`"links"`).
    #[serde(rename = "links")]
    Links,

    /// Screenshot of the page (`"screenshot"`).
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Full-page screenshot (`"screenshot@fullPage"` — note the `@`, which is
    /// why this cannot use a `rename_all` rule).
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,
}
46
47impl From<CrawlScrapeFormats> for ScrapeFormats {
48 fn from(value: CrawlScrapeFormats) -> Self {
49 match value {
50 CrawlScrapeFormats::Markdown => Self::Markdown,
51 CrawlScrapeFormats::HTML => Self::HTML,
52 CrawlScrapeFormats::RawHTML => Self::RawHTML,
53 CrawlScrapeFormats::Links => Self::Links,
54 CrawlScrapeFormats::Screenshot => Self::Screenshot,
55 CrawlScrapeFormats::ScreenshotFullPage => Self::ScreenshotFullPage,
56 }
57 }
58}
59
/// Options for a crawl job. Every field is optional; `None` fields are
/// omitted from the serialized request body (`skip_serializing_none`), and
/// field names are serialized in camelCase unless explicitly renamed.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlOptions {
    /// Scrape options applied to each page visited by the crawl.
    pub scrape_options: Option<ScrapeOptions>,

    /// URL path patterns to include — presumably matched against discovered
    /// URLs; confirm pattern syntax against the API docs.
    pub include_paths: Option<Vec<String>>,

    /// URL path patterns to exclude from the crawl.
    pub exclude_paths: Option<Vec<String>>,

    /// Maximum crawl depth.
    pub max_depth: Option<u32>,

    /// If true, skip the site's sitemap during discovery — assumption from
    /// the name; confirm against the API docs.
    pub ignore_sitemap: Option<bool>,

    /// Maximum number of pages to crawl.
    pub limit: Option<u32>,

    /// Allow following "backward" links — assumption from the name; confirm.
    pub allow_backward_links: Option<bool>,

    /// Allow following links to external sites.
    pub allow_external_links: Option<bool>,

    /// Deduplicate similar URLs. Explicit rename required: plain camelCase
    /// would serialize as `deduplicateSimilarUrls` (lowercase `rls`).
    #[serde(rename = "deduplicateSimilarURLs")]
    pub deduplicate_similar_urls: Option<bool>,

    /// Ignore query parameters when comparing URLs.
    pub ignore_query_parameters: Option<bool>,
}
96
/// Request body sent to the crawl endpoint: the target URL, the crawl
/// options flattened inline at the top level, and a webhook.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct CrawlRequestBody {
    /// URL to start crawling from.
    pub url: String,

    /// Crawl options, serialized inline (no nested `options` key).
    #[serde(flatten)]
    pub options: CrawlOptions,

    /// Webhook configuration; required in this body (unlike [`CrawlUrlInput`],
    /// where it is optional).
    pub webhook: Webhook,
}
109
/// Response shape pairing a `success` flag with a single scraped [`Document`].
///
/// NOTE(review): the crawl methods in this file deserialize into
/// [`CrawlAsyncResponse`] / `CrawlStatus` instead; nothing here reads this
/// type — confirm which endpoint actually returns it.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct CrawlResponse {
    /// Whether the request succeeded.
    pub success: bool,

    /// The scraped document payload.
    pub data: Document,
}
120
/// Lifecycle states of a crawl job, serialized in camelCase
/// (`"scraping"`, `"completed"`, `"failed"`, `"cancelled"`).
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub enum CrawlStatusTypes {
    /// The crawl is still in progress.
    Scraping,

    /// The crawl finished; results are available.
    Completed,

    /// The crawl failed.
    Failed,

    /// The crawl was cancelled.
    Cancelled,
}
137
138#[serde_with::skip_serializing_none]
139#[derive(Deserialize, Serialize, Debug, Clone)]
140#[serde(rename_all = "camelCase")]
141pub struct CrawlStatus {
142 pub status: CrawlStatusTypes,
144
145 pub total: u32,
147
148 pub completed: u32,
150
151 pub credits_used: u32,
153
154 pub expires_at: String, pub next: Option<String>,
160
161 pub data: Vec<Document>,
163}
164
/// Response returned when a crawl job is started asynchronously.
#[derive(Deserialize, Serialize, Debug, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlAsyncResponse {
    /// Whether the job was accepted. NOTE(review): private, unlike
    /// `CrawlResponse::success` which is `pub` — confirm the asymmetry is
    /// intentional.
    success: bool,

    /// Crawl job id, usable with `check_crawl_status`.
    pub id: String,

    /// Status URL for this crawl job.
    pub url: String,
}
177
/// Input payload for a crawl invocation: the target URL, crawl options
/// flattened inline, plus client-side knobs (poll interval, idempotency key,
/// optional webhook).
#[derive(Deserialize, Serialize, Debug, Default)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlUrlInput {
    /// URL to crawl.
    pub url: String,

    /// Crawl options, serialized inline (no nested `options` key).
    #[serde(flatten)]
    pub options: CrawlOptions,

    /// Polling interval in milliseconds — presumably feeds `crawl_url`'s
    /// `poll_interval`; confirm at the call site.
    pub poll_interval: Option<u64>,

    /// Idempotency key sent as a header, never in the body (`#[serde(skip)]`).
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// Optional webhook configuration.
    pub webhook: Option<Webhook>,
}
197
198impl FirecrawlApp {
199 pub async fn crawl_url_async(
201 &self,
202 url: impl AsRef<str>,
203 options: Option<CrawlOptions>,
204 idempotency_key: Option<String>,
205 webhook: Webhook,
206 ) -> Result<CrawlAsyncResponse, FirecrawlError> {
207 let body = CrawlRequestBody {
208 url: url.as_ref().to_string(),
209 options: options.unwrap_or_default(),
210 webhook,
211 };
212
213 let headers = self.prepare_headers(idempotency_key.as_ref());
214
215 let response = self
216 .client
217 .post(format!("{}/{}/crawl", self.api_url, API_VERSION))
218 .headers(headers.clone())
219 .json(&body)
220 .send()
221 .await
222 .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;
223
224 self.handle_response::<CrawlAsyncResponse>(response, "start crawl job")
225 .await
226 }
227
228 pub async fn crawl_url(
230 &self,
231 url: impl AsRef<str>,
232 options: impl Into<Option<CrawlOptions>>,
233 webhook: Webhook,
234 poll_interval: Option<u64>,
235 idempotency_key: Option<String>,
236 ) -> Result<CrawlStatus, FirecrawlError> {
237 let options = options.into();
238 let poll_interval = poll_interval.unwrap_or(2000);
239
240 let res = self
241 .crawl_url_async(url, options, idempotency_key, webhook)
242 .await?;
243
244 self.monitor_crawl_status(&res.id, poll_interval).await
245 }
246
247 async fn check_crawl_status_next(
248 &self,
249 next: impl AsRef<str>,
250 ) -> Result<CrawlStatus, FirecrawlError> {
251 let response = self
252 .client
253 .get(next.as_ref())
254 .headers(self.prepare_headers(None))
255 .send()
256 .await
257 .map_err(|e| {
258 FirecrawlError::HttpError(
259 format!("Paginating crawl using URL {:?}", next.as_ref()),
260 e,
261 )
262 })?;
263
264 self.handle_response(
265 response,
266 format!("Paginating crawl using URL {:?}", next.as_ref()),
267 )
268 .await
269 }
270
271 pub async fn check_crawl_status(
273 &self,
274 id: impl AsRef<str>,
275 ) -> Result<CrawlStatus, FirecrawlError> {
276 let response = self
277 .client
278 .get(format!(
279 "{}/{}/crawl/{}",
280 self.api_url,
281 API_VERSION,
282 id.as_ref()
283 ))
284 .headers(self.prepare_headers(None))
285 .send()
286 .await
287 .map_err(|e| {
288 FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e)
289 })?;
290
291 let mut status: CrawlStatus = self
292 .handle_response(
293 response,
294 format!("Checking status of crawl {}", id.as_ref()),
295 )
296 .await?;
297
298 if status.status == CrawlStatusTypes::Completed {
299 while let Some(next) = status.next {
300 let new_status = self.check_crawl_status_next(next).await?;
301 status.data.extend_from_slice(&new_status.data);
302 status.next = new_status.next;
303 }
304 }
305
306 Ok(status)
307 }
308
309 async fn monitor_crawl_status(
310 &self,
311 id: &str,
312 poll_interval: u64,
313 ) -> Result<CrawlStatus, FirecrawlError> {
314 let mut all_data = Vec::new();
315 let mut current_cursor: Option<String> = None;
316
317 loop {
318 let mut status_data = if let Some(ref cursor) = current_cursor {
320 self.check_crawl_status_next(cursor).await?
321 } else {
322 self.check_crawl_status(id).await?
323 };
324
325 all_data.append(&mut status_data.data);
327
328 if let Some(next) = status_data.next {
330 current_cursor = Some(next);
331 continue;
332 }
333
334 match status_data.status {
336 CrawlStatusTypes::Completed => {
337 status_data.data = all_data;
339 break Ok(status_data);
340 }
341 CrawlStatusTypes::Scraping => {
342 tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
343 }
345 CrawlStatusTypes::Failed => {
346 status_data.data = all_data;
348 break Err(FirecrawlError::CrawlJobFailed(
349 "Crawl job failed.".to_string(),
350 status_data,
351 ));
352 }
353 CrawlStatusTypes::Cancelled => {
354 status_data.data = all_data;
356 break Err(FirecrawlError::CrawlJobCancelled(status_data));
357 }
358 }
359 }
360 }
361}
362
#[cfg(all(test, feature = "mcp_tool"))]
mod schema_tests {
    use super::*;
    use async_claude;
    // (removed: `use serde_json::json;` — the `json!` macro was never used)

    /// Verifies the JSON schema generated for [`CrawlOptions`]: every field
    /// must surface as a camelCase property with the expected type.
    #[test]
    fn test_crawl_options_schema() {
        let actual_schema = async_claude::tool::parse_input_schema::<CrawlOptions>().unwrap();

        println!(
            "Schema properties: {}",
            serde_json::to_string_pretty(&actual_schema["properties"]).unwrap()
        );

        assert_eq!(actual_schema["type"], "object");

        let properties = &actual_schema["properties"];
        assert!(properties.is_object());

        let property_keys: Vec<String> = properties
            .as_object()
            .unwrap()
            .keys()
            .map(|k| k.to_string())
            .collect();

        println!("Actual property keys: {:?}", property_keys);

        // Every camelCase field of `CrawlOptions` must appear as a property.
        // NOTE(review): the original also asserted a `webhook` property, but
        // `CrawlOptions` declares no webhook field (webhook lives on
        // `CrawlRequestBody` / `CrawlUrlInput`), so that assertion could
        // never pass and has been removed.
        for key in [
            "scrapeOptions",
            "includePaths",
            "excludePaths",
            "maxDepth",
            "ignoreSitemap",
            "limit",
            "allowBackwardLinks",
            "allowExternalLinks",
            "ignoreQueryParameters",
        ] {
            assert!(
                property_keys.contains(&key.to_string()),
                "{} not found",
                key
            );
        }

        // `deduplicateSimilarURLs` has an explicit serde rename whose casing
        // schemars may normalize, so compare case-insensitively.
        assert!(
            property_keys
                .iter()
                .any(|k| k.to_lowercase() == "deduplicatesimilarurls"),
            "deduplicateSimilarURLs not found"
        );

        // Spot-check property types.
        assert_eq!(properties["scrapeOptions"]["type"], "object");

        assert_eq!(properties["includePaths"]["type"], "array");
        assert_eq!(properties["includePaths"]["items"]["type"], "string");
        assert_eq!(properties["excludePaths"]["type"], "array");
        assert_eq!(properties["excludePaths"]["items"]["type"], "string");

        assert_eq!(properties["ignoreSitemap"]["type"], "boolean");
        assert_eq!(properties["allowBackwardLinks"]["type"], "boolean");
        assert_eq!(properties["allowExternalLinks"]["type"], "boolean");

        // Integer fields may legitimately surface as "integer" or "number".
        for key in ["maxDepth", "limit"] {
            assert!(
                properties[key]["type"] == "integer" || properties[key]["type"] == "number",
                "Property {} should be numeric",
                key
            );
        }
    }
}