1use serde::{Deserialize, Serialize};
2
3#[cfg(feature = "mcp-tool")]
4use schemars::JsonSchema;
5
6use crate::{
7 API_VERSION, FirecrawlApp, FirecrawlError,
8 batch_scrape::Webhook,
9 document::Document,
10 scrape::{ScrapeFormats, ScrapeOptions},
11};
12
/// Output formats that can be requested for each page scraped during a crawl.
///
/// Mirrors a subset of [`ScrapeFormats`]; see the `From` impl below for the mapping.
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
pub enum CrawlScrapeFormats {
    /// Page content rendered as Markdown.
    #[serde(rename = "markdown")]
    Markdown,

    /// Page content as HTML.
    #[serde(rename = "html")]
    HTML,

    /// Raw, unprocessed HTML of the page.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Links discovered on the page.
    #[serde(rename = "links")]
    Links,

    /// Screenshot of the page (viewport).
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Full-page screenshot.
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,
}
44
45impl From<CrawlScrapeFormats> for ScrapeFormats {
46 fn from(value: CrawlScrapeFormats) -> Self {
47 match value {
48 CrawlScrapeFormats::Markdown => Self::Markdown,
49 CrawlScrapeFormats::HTML => Self::HTML,
50 CrawlScrapeFormats::RawHTML => Self::RawHTML,
51 CrawlScrapeFormats::Links => Self::Links,
52 CrawlScrapeFormats::Screenshot => Self::Screenshot,
53 CrawlScrapeFormats::ScreenshotFullPage => Self::ScreenshotFullPage,
54 }
55 }
56}
57
/// Options controlling a crawl job.
///
/// `None` fields are omitted from the serialized JSON entirely
/// (via `serde_with::skip_serializing_none`); field names serialize as camelCase.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlOptions {
    /// Options applied when scraping each crawled page.
    pub scrape_options: Option<ScrapeOptions>,

    /// Path patterns to include in the crawl. (presumably URL path globs/regexes — confirm against API docs)
    pub include_paths: Option<Vec<String>>,

    /// Path patterns to exclude from the crawl.
    pub exclude_paths: Option<Vec<String>>,

    /// Maximum crawl depth from the starting URL.
    pub max_depth: Option<u32>,

    /// Whether to skip the site's sitemap when discovering pages.
    pub ignore_sitemap: Option<bool>,

    /// Maximum number of pages to crawl.
    pub limit: Option<u32>,

    /// Whether to follow links that lead "backward" (outside the starting path).
    pub allow_backward_links: Option<bool>,

    /// Whether to follow links to external domains.
    pub allow_external_links: Option<bool>,

    // Explicit rename: camelCase renaming alone would not produce the
    // "URLs" acronym casing the API expects.
    #[serde(rename = "deduplicateSimilarURLs")]
    pub deduplicate_similar_urls: Option<bool>,

    /// Whether to ignore query parameters when comparing URLs.
    pub ignore_query_parameters: Option<bool>,
}
94
/// JSON body POSTed to the crawl endpoint.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct CrawlRequestBody {
    /// URL to start crawling from.
    pub url: String,

    /// Crawl options, flattened into the top level of the JSON body.
    #[serde(flatten)]
    pub options: CrawlOptions,

    /// Webhook registration sent along with the crawl request.
    pub webhook: Webhook,
}
107
/// Response returned by the crawl endpoint.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct CrawlResponse {
    /// Whether the API reported success.
    pub success: bool,

    /// The resulting document.
    pub data: Document,
}
118
/// Lifecycle states of a crawl job as reported by the API.
///
/// Serialized in camelCase (`"scraping"`, `"completed"`, `"failed"`, `"cancelled"`).
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub enum CrawlStatusTypes {
    /// The job is still in progress.
    Scraping,

    /// The job finished successfully.
    Completed,

    /// The job failed.
    Failed,

    /// The job was cancelled.
    Cancelled,
}
135
136#[serde_with::skip_serializing_none]
137#[derive(Deserialize, Serialize, Debug, Clone)]
138#[serde(rename_all = "camelCase")]
139pub struct CrawlStatus {
140 pub status: CrawlStatusTypes,
142
143 pub total: u32,
145
146 pub completed: u32,
148
149 pub credits_used: u32,
151
152 pub expires_at: String, pub next: Option<String>,
158
159 pub data: Vec<Document>,
161}
162
/// Response returned when starting a crawl job asynchronously.
#[derive(Deserialize, Serialize, Debug, Clone)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlAsyncResponse {
    // NOTE(review): private while the other fields are public — confirm this
    // asymmetry is intentional.
    success: bool,

    /// Identifier of the crawl job; used to poll for status.
    pub id: String,

    /// URL for checking the job's status.
    pub url: String,
}
175
/// Input payload for a crawl tool invocation (e.g. via the MCP tool schema).
#[derive(Deserialize, Serialize, Debug, Default)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlUrlInput {
    /// URL to start crawling from.
    pub url: String,

    /// Crawl options, flattened into the top level of the JSON body.
    #[serde(flatten)]
    pub options: CrawlOptions,

    /// Polling interval in milliseconds (see `FirecrawlApp::crawl_url`, which
    /// defaults to 2000 when unset).
    pub poll_interval: Option<u64>,

    // Sent as a request header, never serialized into the body.
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// Optional webhook registration for crawl notifications.
    pub webhook: Option<Webhook>,
}
195
impl FirecrawlApp {
    /// Starts a crawl job for `url` and returns immediately with the job
    /// handle, without waiting for the crawl to finish.
    ///
    /// POSTs the request body to `{api_url}/{API_VERSION}/crawl`.
    ///
    /// # Errors
    /// Returns `FirecrawlError::HttpError` if the request fails to send, or an
    /// error from `handle_response` if the API reports a failure.
    pub async fn crawl_url_async(
        &self,
        url: impl AsRef<str>,
        options: Option<CrawlOptions>,
        idempotency_key: Option<String>,
        webhook: Webhook,
    ) -> Result<CrawlAsyncResponse, FirecrawlError> {
        let body = CrawlRequestBody {
            url: url.as_ref().to_string(),
            options: options.unwrap_or_default(),
            webhook,
        };

        // The idempotency key (if any) travels as a request header, not in the body.
        let headers = self.prepare_headers(idempotency_key.as_ref());

        let response = self
            .client
            .post(format!("{}/{}/crawl", self.api_url, API_VERSION))
            .headers(headers.clone())
            .json(&body)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;

        self.handle_response::<CrawlAsyncResponse>(response, "start crawl job")
            .await
    }

    /// Starts a crawl job and polls until it reaches a terminal state,
    /// returning the final status with all scraped documents.
    ///
    /// `poll_interval` is in milliseconds and defaults to 2000 when `None`.
    ///
    /// # Errors
    /// Propagates errors from `crawl_url_async` and `monitor_crawl_status`,
    /// including `CrawlJobFailed` / `CrawlJobCancelled` terminal outcomes.
    pub async fn crawl_url(
        &self,
        url: impl AsRef<str>,
        options: impl Into<Option<CrawlOptions>>,
        webhook: Webhook,
        poll_interval: Option<u64>,
        idempotency_key: Option<String>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let options = options.into();
        let poll_interval = poll_interval.unwrap_or(2000);

        let res = self
            .crawl_url_async(url, options, idempotency_key, webhook)
            .await?;

        self.monitor_crawl_status(&res.id, poll_interval).await
    }

    /// Fetches one page of paginated crawl results from a `next` URL
    /// returned in a previous `CrawlStatus`.
    async fn check_crawl_status_next(
        &self,
        next: impl AsRef<str>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let response = self
            .client
            .get(next.as_ref())
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| {
                FirecrawlError::HttpError(
                    format!("Paginating crawl using URL {:?}", next.as_ref()),
                    e,
                )
            })?;

        self.handle_response(
            response,
            format!("Paginating crawl using URL {:?}", next.as_ref()),
        )
        .await
    }

    /// Checks the status of a crawl job by id.
    ///
    /// If the job is completed, this eagerly follows every `next` pagination
    /// link and concatenates all pages of documents into the returned
    /// `CrawlStatus::data`, leaving `next` as `None`.
    ///
    /// # Errors
    /// Returns `FirecrawlError::HttpError` on request failure, or errors from
    /// `handle_response` / pagination requests.
    pub async fn check_crawl_status(
        &self,
        id: impl AsRef<str>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let response = self
            .client
            .get(format!(
                "{}/{}/crawl/{}",
                self.api_url,
                API_VERSION,
                id.as_ref()
            ))
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| {
                FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e)
            })?;

        let mut status: CrawlStatus = self
            .handle_response(
                response,
                format!("Checking status of crawl {}", id.as_ref()),
            )
            .await?;

        // Only a completed job's result set is stable enough to paginate through.
        if status.status == CrawlStatusTypes::Completed {
            while let Some(next) = status.next {
                let new_status = self.check_crawl_status_next(next).await?;
                status.data.extend_from_slice(&new_status.data);
                status.next = new_status.next;
            }
        }

        Ok(status)
    }

    /// Polls a crawl job every `poll_interval` milliseconds until it reaches a
    /// terminal state, accumulating documents across pagination pages.
    ///
    /// On `Failed` / `Cancelled`, the accumulated documents are attached to
    /// the error's status payload.
    async fn monitor_crawl_status(
        &self,
        id: &str,
        poll_interval: u64,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let mut all_data = Vec::new();
        let mut current_cursor: Option<String> = None;

        loop {
            // Follow a pagination cursor when we have one; otherwise poll by job id.
            let mut status_data = if let Some(ref cursor) = current_cursor {
                self.check_crawl_status_next(cursor).await?
            } else {
                self.check_crawl_status(id).await?
            };

            // Move this page's documents into the accumulator.
            all_data.append(&mut status_data.data);

            // More pages pending: chase the cursor before inspecting the status.
            if let Some(next) = status_data.next {
                current_cursor = Some(next);
                continue;
            }

            match status_data.status {
                CrawlStatusTypes::Completed => {
                    status_data.data = all_data;
                    break Ok(status_data);
                }
                CrawlStatusTypes::Scraping => {
                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
                }
                CrawlStatusTypes::Failed => {
                    status_data.data = all_data;
                    break Err(FirecrawlError::CrawlJobFailed(
                        "Crawl job failed.".to_string(),
                        status_data,
                    ));
                }
                CrawlStatusTypes::Cancelled => {
                    status_data.data = all_data;
                    break Err(FirecrawlError::CrawlJobCancelled(status_data));
                }
            }
        }
    }
}
360
#[cfg(all(test, feature = "mcp-tool"))]
mod schema_tests {
    use super::*;
    use async_claude;

    /// Checks that the generated JSON schema for `CrawlOptions` exposes the
    /// expected camelCase properties with the expected JSON types.
    #[test]
    fn test_crawl_options_schema() {
        let actual_schema = async_claude::tool::parse_input_schema::<CrawlOptions>().unwrap();

        println!(
            "Schema properties: {}",
            serde_json::to_string_pretty(&actual_schema["properties"]).unwrap()
        );

        assert_eq!(actual_schema["type"], "object");

        let properties = &actual_schema["properties"];
        assert!(properties.is_object());

        let property_keys: Vec<String> = properties
            .as_object()
            .unwrap()
            .keys()
            .map(|k| k.to_string())
            .collect();

        println!("Actual property keys: {:?}", property_keys);

        // Every camelCase property expected in the schema, checked exactly.
        let expected_keys = [
            "scrapeOptions",
            "includePaths",
            "excludePaths",
            "maxDepth",
            "ignoreSitemap",
            "limit",
            "allowBackwardLinks",
            "allowExternalLinks",
            "webhook",
        ];
        for key in expected_keys {
            assert!(
                property_keys.contains(&key.to_string()),
                "{} not found",
                key
            );
        }

        // The acronym in deduplicateSimilarURLs may be cased differently by the
        // schema generator, so compare case-insensitively.
        assert!(
            property_keys
                .iter()
                .any(|k| k.to_lowercase() == "deduplicatesimilarurls"),
            "deduplicateSimilarURLs not found"
        );

        assert!(
            property_keys.contains(&"ignoreQueryParameters".to_string()),
            "ignoreQueryParameters not found"
        );

        // Type checks, grouped by expected JSON type.
        assert_eq!(properties["scrapeOptions"]["type"], "object");

        for key in ["includePaths", "excludePaths"] {
            assert_eq!(properties[key]["type"], "array");
            assert_eq!(properties[key]["items"]["type"], "string");
        }

        for key in ["ignoreSitemap", "allowBackwardLinks", "allowExternalLinks"] {
            assert_eq!(properties[key]["type"], "boolean");
        }

        // Schema generators may emit either "integer" or "number" here.
        for key in ["maxDepth", "limit"] {
            assert!(
                properties[key]["type"] == "integer" || properties[key]["type"] == "number",
                "Property {} should be numeric",
                key
            );
        }
    }
}