1use serde::{Deserialize, Serialize};
2
3#[cfg(feature = "mcp_tool")]
4use schemars::JsonSchema;
5
6use crate::{
7 batch_scrape::Webhook,
8 document::Document,
9 scrape::{ScrapeFormats, ScrapeOptions},
10 FirecrawlApp, FirecrawlError, API_VERSION,
11};
12
/// Output formats that a crawl job can request for each scraped page.
///
/// Mirrors the crawl-compatible subset of [`ScrapeFormats`]; see the
/// `From` impl below for the one-to-one mapping.
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
pub enum CrawlScrapeFormats {
    /// Page content rendered as Markdown.
    #[serde(rename = "markdown")]
    Markdown,

    /// Page content as HTML.
    #[serde(rename = "html")]
    HTML,

    /// Raw, unprocessed page HTML.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// The links found on the page.
    #[serde(rename = "links")]
    Links,

    /// A screenshot of the page.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// A full-page screenshot (note the non-camelCase wire name, which is
    /// why every variant carries an explicit rename).
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,
}
44
45impl From<CrawlScrapeFormats> for ScrapeFormats {
46 fn from(value: CrawlScrapeFormats) -> Self {
47 match value {
48 CrawlScrapeFormats::Markdown => Self::Markdown,
49 CrawlScrapeFormats::HTML => Self::HTML,
50 CrawlScrapeFormats::RawHTML => Self::RawHTML,
51 CrawlScrapeFormats::Links => Self::Links,
52 CrawlScrapeFormats::Screenshot => Self::Screenshot,
53 CrawlScrapeFormats::ScreenshotFullPage => Self::ScreenshotFullPage,
54 }
55 }
56}
57
/// Options for a crawl job. All fields are optional; `None` fields are
/// omitted from the serialized request body (`skip_serializing_none`), so
/// the API applies its own defaults for anything unset.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlOptions {
    /// Scrape options applied to each page the crawl visits.
    pub scrape_options: Option<ScrapeOptions>,

    /// URL path patterns to include in the crawl.
    pub include_paths: Option<Vec<String>>,

    /// URL path patterns to exclude from the crawl.
    pub exclude_paths: Option<Vec<String>>,

    /// Maximum link depth to follow from the starting URL.
    pub max_depth: Option<u32>,

    /// Skip sitemap discovery when true.
    pub ignore_sitemap: Option<bool>,

    /// Maximum number of pages to crawl.
    pub limit: Option<u32>,

    /// Allow following links outside the starting path.
    /// NOTE(review): exact semantics are defined by the API — confirm.
    pub allow_backward_links: Option<bool>,

    /// Allow following links to external domains.
    pub allow_external_links: Option<bool>,

    /// Deduplicate URLs that the API considers similar. The explicit rename
    /// is required because `rename_all = "camelCase"` would produce
    /// `deduplicateSimilarUrls` (lowercase `rls`) instead of the wire name.
    #[serde(rename = "deduplicateSimilarURLs")]
    pub deduplicate_similar_urls: Option<bool>,

    /// Ignore query parameters when comparing URLs.
    pub ignore_query_parameters: Option<bool>,
}
94
/// JSON body sent to the `/crawl` endpoint.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct CrawlRequestBody {
    /// The URL to start crawling from.
    pub url: String,

    /// Crawl options, flattened into the top-level JSON object.
    #[serde(flatten)]
    pub options: CrawlOptions,

    /// Webhook configuration for crawl event notifications.
    pub webhook: Webhook,
}
107
/// Response body for a crawl request that returns a document directly.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct CrawlResponse {
    /// Whether the request succeeded.
    pub success: bool,

    /// The resulting document.
    pub data: Document,
}
118
/// Lifecycle states of a crawl job as reported by the status endpoint.
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub enum CrawlStatusTypes {
    /// The crawl is still in progress.
    Scraping,

    /// The crawl finished; all data is available.
    Completed,

    /// The crawl failed.
    Failed,

    /// The crawl was cancelled.
    Cancelled,
}
135
/// Status report for a crawl job, including one page of scraped results
/// and an optional pagination cursor for fetching the rest.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlStatus {
    /// Current lifecycle state of the job.
    pub status: CrawlStatusTypes,

    /// Total number of pages the crawl will attempt.
    pub total: u32,

    /// Number of pages completed so far.
    pub completed: u32,

    /// Credits consumed by the job so far.
    pub credits_used: u32,

    /// When the stored crawl data expires (timestamp string as sent by the
    /// API; not parsed here).
    pub expires_at: String,

    /// Pagination cursor: URL of the next page of results, if any.
    pub next: Option<String>,

    /// Documents scraped in this page of results.
    pub data: Vec<Document>,
}
162
163#[derive(Deserialize, Serialize, Debug, Clone)]
164#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
165#[serde(rename_all = "camelCase")]
166pub struct CrawlAsyncResponse {
167 success: bool,
168
169 pub id: String,
171
172 pub url: String,
174}
175
/// Input payload for the crawl tool: starting URL plus crawl options and
/// client-side polling/idempotency settings.
#[derive(Deserialize, Serialize, Debug, Default)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlUrlInput {
    /// The URL to start crawling from.
    pub url: String,

    /// Crawl options, flattened into the top-level JSON object.
    #[serde(flatten)]
    pub options: CrawlOptions,

    /// Polling interval in milliseconds used by the client while waiting
    /// for the crawl to finish.
    pub poll_interval: Option<u64>,

    /// Idempotency key; never serialized into the body.
    /// NOTE(review): presumably sent as a header via `prepare_headers` —
    /// confirm against `FirecrawlApp`.
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// Optional webhook configuration.
    pub webhook: Option<Webhook>,
}
195
196impl FirecrawlApp {
197 pub async fn crawl_url_async(
199 &self,
200 url: impl AsRef<str>,
201 options: Option<CrawlOptions>,
202 idempotency_key: Option<String>,
203 webhook: Webhook,
204 ) -> Result<CrawlAsyncResponse, FirecrawlError> {
205 let body = CrawlRequestBody {
206 url: url.as_ref().to_string(),
207 options: options.unwrap_or_default(),
208 webhook,
209 };
210
211 let headers = self.prepare_headers(idempotency_key.as_ref());
212
213 let response = self
214 .client
215 .post(format!("{}/{}/crawl", self.api_url, API_VERSION))
216 .headers(headers.clone())
217 .json(&body)
218 .send()
219 .await
220 .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;
221
222 self.handle_response::<CrawlAsyncResponse>(response, "start crawl job")
223 .await
224 }
225
226 pub async fn crawl_url(
228 &self,
229 url: impl AsRef<str>,
230 options: impl Into<Option<CrawlOptions>>,
231 webhook: Webhook,
232 poll_interval: Option<u64>,
233 idempotency_key: Option<String>,
234 ) -> Result<CrawlStatus, FirecrawlError> {
235 let options = options.into();
236 let poll_interval = poll_interval.unwrap_or(2000);
237
238 let res = self
239 .crawl_url_async(url, options, idempotency_key, webhook)
240 .await?;
241
242 self.monitor_crawl_status(&res.id, poll_interval).await
243 }
244
245 async fn check_crawl_status_next(
246 &self,
247 next: impl AsRef<str>,
248 ) -> Result<CrawlStatus, FirecrawlError> {
249 let response = self
250 .client
251 .get(next.as_ref())
252 .headers(self.prepare_headers(None))
253 .send()
254 .await
255 .map_err(|e| {
256 FirecrawlError::HttpError(
257 format!("Paginating crawl using URL {:?}", next.as_ref()),
258 e,
259 )
260 })?;
261
262 self.handle_response(
263 response,
264 format!("Paginating crawl using URL {:?}", next.as_ref()),
265 )
266 .await
267 }
268
269 pub async fn check_crawl_status(
271 &self,
272 id: impl AsRef<str>,
273 ) -> Result<CrawlStatus, FirecrawlError> {
274 let response = self
275 .client
276 .get(format!(
277 "{}/{}/crawl/{}",
278 self.api_url,
279 API_VERSION,
280 id.as_ref()
281 ))
282 .headers(self.prepare_headers(None))
283 .send()
284 .await
285 .map_err(|e| {
286 FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e)
287 })?;
288
289 let mut status: CrawlStatus = self
290 .handle_response(
291 response,
292 format!("Checking status of crawl {}", id.as_ref()),
293 )
294 .await?;
295
296 if status.status == CrawlStatusTypes::Completed {
297 while let Some(next) = status.next {
298 let new_status = self.check_crawl_status_next(next).await?;
299 status.data.extend_from_slice(&new_status.data);
300 status.next = new_status.next;
301 }
302 }
303
304 Ok(status)
305 }
306
307 async fn monitor_crawl_status(
308 &self,
309 id: &str,
310 poll_interval: u64,
311 ) -> Result<CrawlStatus, FirecrawlError> {
312 let mut all_data = Vec::new();
313 let mut current_cursor: Option<String> = None;
314
315 loop {
316 let mut status_data = if let Some(ref cursor) = current_cursor {
318 self.check_crawl_status_next(cursor).await?
319 } else {
320 self.check_crawl_status(id).await?
321 };
322
323 all_data.append(&mut status_data.data);
325
326 if let Some(next) = status_data.next {
328 current_cursor = Some(next);
329 continue;
330 }
331
332 match status_data.status {
334 CrawlStatusTypes::Completed => {
335 status_data.data = all_data;
337 break Ok(status_data);
338 }
339 CrawlStatusTypes::Scraping => {
340 tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
341 }
343 CrawlStatusTypes::Failed => {
344 status_data.data = all_data;
346 break Err(FirecrawlError::CrawlJobFailed(
347 "Crawl job failed.".to_string(),
348 status_data,
349 ));
350 }
351 CrawlStatusTypes::Cancelled => {
352 status_data.data = all_data;
354 break Err(FirecrawlError::CrawlJobCancelled(status_data));
355 }
356 }
357 }
358 }
359}
360
#[cfg(all(test, feature = "mcp_tool"))]
mod schema_tests {
    use super::*;
    use async_claude;

    /// Verifies that the JSON schema generated for `CrawlOptions` exposes
    /// the expected camelCase properties with the expected types.
    #[test]
    fn test_crawl_options_schema() {
        let actual_schema = async_claude::tool::parse_input_schema::<CrawlOptions>().unwrap();

        println!(
            "Schema properties: {}",
            serde_json::to_string_pretty(&actual_schema["properties"]).unwrap()
        );

        assert_eq!(actual_schema["type"], "object");

        let properties = &actual_schema["properties"];
        assert!(properties.is_object());

        let property_keys: Vec<String> = properties
            .as_object()
            .unwrap()
            .keys()
            .map(|k| k.to_string())
            .collect();

        println!("Actual property keys: {:?}", property_keys);

        // Every camelCase field of `CrawlOptions` must appear in the schema.
        // NOTE(review): a `webhook` assertion was removed here — `CrawlOptions`
        // has no `webhook` field (it lives on `CrawlRequestBody` /
        // `CrawlUrlInput`), so that assertion could not pass as written.
        let expected_keys = [
            "scrapeOptions",
            "includePaths",
            "excludePaths",
            "maxDepth",
            "ignoreSitemap",
            "limit",
            "allowBackwardLinks",
            "allowExternalLinks",
            "ignoreQueryParameters",
        ];
        for key in expected_keys.iter() {
            assert!(
                property_keys.contains(&key.to_string()),
                "{} not found",
                key
            );
        }

        // `deduplicate_similar_urls` carries an explicit serde rename
        // ("deduplicateSimilarURLs"); match it case-insensitively to stay
        // robust against how schemars applies the rename.
        assert!(
            property_keys
                .iter()
                .any(|k| k.eq_ignore_ascii_case("deduplicateSimilarURLs")),
            "deduplicateSimilarURLs not found"
        );

        assert_eq!(properties["scrapeOptions"]["type"], "object");

        assert_eq!(properties["includePaths"]["type"], "array");
        assert_eq!(properties["includePaths"]["items"]["type"], "string");
        assert_eq!(properties["excludePaths"]["type"], "array");
        assert_eq!(properties["excludePaths"]["items"]["type"], "string");

        assert_eq!(properties["ignoreSitemap"]["type"], "boolean");
        assert_eq!(properties["allowBackwardLinks"]["type"], "boolean");
        assert_eq!(properties["allowExternalLinks"]["type"], "boolean");

        // "integer" vs. "number" depends on the schemars version; accept both.
        assert!(
            properties["maxDepth"]["type"] == "integer"
                || properties["maxDepth"]["type"] == "number",
            "Property maxDepth should be numeric"
        );
        assert!(
            properties["limit"]["type"] == "integer" || properties["limit"]["type"] == "number",
            "Property limit should be numeric"
        );
    }
}