1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4
5#[cfg(feature = "mcp_tool")]
6use schemars::JsonSchema;
7
8use crate::{
9 document::Document,
10 scrape::{ScrapeFormats, ScrapeOptions},
11 FirecrawlApp, FirecrawlError, API_VERSION,
12};
13
/// Output formats that can be requested for each page scraped during a crawl.
///
/// Each variant maps to the wire-format string in its `#[serde(rename)]`.
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
pub enum CrawlScrapeFormats {
    /// Page content as Markdown (`"markdown"`).
    #[serde(rename = "markdown")]
    Markdown,

    /// Page content as HTML (`"html"`).
    #[serde(rename = "html")]
    HTML,

    /// Raw, unprocessed HTML (`"rawHtml"`).
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Links found on the page (`"links"`).
    #[serde(rename = "links")]
    Links,

    /// Screenshot of the page (`"screenshot"`).
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Full-page screenshot (`"screenshot@fullPage"`).
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,
}
45
46impl From<CrawlScrapeFormats> for ScrapeFormats {
47 fn from(value: CrawlScrapeFormats) -> Self {
48 match value {
49 CrawlScrapeFormats::Markdown => Self::Markdown,
50 CrawlScrapeFormats::HTML => Self::HTML,
51 CrawlScrapeFormats::RawHTML => Self::RawHTML,
52 CrawlScrapeFormats::Links => Self::Links,
53 CrawlScrapeFormats::Screenshot => Self::Screenshot,
54 CrawlScrapeFormats::ScreenshotFullPage => Self::ScreenshotFullPage,
55 }
56 }
57}
58
/// Per-page scraping options applied to every page visited during a crawl.
///
/// Mirrors a subset of [`ScrapeOptions`]; the `From` impl in this module
/// performs the conversion, leaving unmapped fields at their defaults.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlScrapeOptions {
    /// Output formats to request for each page.
    pub formats: Option<Vec<CrawlScrapeFormats>>,

    /// Whether to return only the page's main content.
    pub only_main_content: Option<bool>,

    /// HTML tags to include in the output.
    pub include_tags: Option<Vec<String>>,

    /// HTML tags to exclude from the output.
    pub exclude_tags: Option<Vec<String>>,

    /// Extra HTTP headers to send when fetching pages.
    pub headers: Option<HashMap<String, String>>,

    /// Delay before capturing the page — presumably milliseconds; confirm against API docs.
    pub wait_for: Option<u32>,

    /// Request timeout — presumably milliseconds; confirm against API docs.
    pub timeout: Option<u32>,
}
89
90impl From<CrawlScrapeOptions> for ScrapeOptions {
91 fn from(value: CrawlScrapeOptions) -> Self {
92 ScrapeOptions {
93 formats: value
94 .formats
95 .map(|formats| formats.into_iter().map(|x| x.into()).collect()),
96 only_main_content: value.only_main_content,
97 include_tags: value.include_tags,
98 exclude_tags: value.exclude_tags,
99 headers: value.headers,
100 wait_for: value.wait_for,
101 timeout: value.timeout,
102 ..Default::default()
103 }
104 }
105}
106
/// Options for a crawl job.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlOptions {
    /// Scrape options applied to every crawled page.
    pub scrape_options: Option<CrawlScrapeOptions>,

    /// URL path patterns to include in the crawl.
    pub include_paths: Option<Vec<String>>,

    /// URL path patterns to exclude from the crawl.
    pub exclude_paths: Option<Vec<String>>,

    /// Maximum link depth to follow from the starting URL.
    pub max_depth: Option<u32>,

    /// Whether to skip the site's sitemap when discovering pages.
    pub ignore_sitemap: Option<bool>,

    /// Maximum number of pages to crawl.
    pub limit: Option<u32>,

    /// Whether "backward" links may be followed.
    /// NOTE(review): semantics inferred from the name — confirm against API docs.
    pub allow_backward_links: Option<bool>,

    /// Whether links to external domains may be followed.
    pub allow_external_links: Option<bool>,

    /// Webhook URL to notify about crawl progress.
    pub webhook: Option<String>,

    /// Whether similar URLs should be de-duplicated.
    /// Renamed explicitly: `rename_all = "camelCase"` would emit
    /// `deduplicateSimilarUrls`, but the API capitalizes the `URLs` acronym.
    #[serde(rename = "deduplicateSimilarURLs")]
    pub deduplicate_similar_urls: Option<bool>,

    /// Whether query parameters should be ignored when comparing URLs.
    #[serde(rename = "ignoreQueryParameters")]
    pub ignore_query_parameters: Option<bool>,

    /// Idempotency key, sent as an HTTP header rather than in the JSON body
    /// (hence `#[serde(skip)]`; see `crawl_url_async`).
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// Client-side polling interval in milliseconds used by `crawl_url`
    /// (defaults to 2000 there); never serialized.
    #[serde(skip)]
    pub poll_interval: Option<u64>,
}
155
/// JSON request body for starting a crawl job: the target URL plus the
/// crawl options flattened into the same top-level object.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct CrawlRequestBody {
    /// URL to start crawling from.
    url: String,

    /// Crawl options, merged into the same JSON object as `url`.
    #[serde(flatten)]
    options: CrawlOptions,
}
164
/// Response shape for a crawl request.
///
/// NOTE(review): not referenced by any method visible in this file, and a
/// single `Document` payload looks unusual for a crawl — confirm against the
/// live API before relying on it.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct CrawlResponse {
    /// Whether the request succeeded.
    success: bool,

    /// The returned document.
    data: Document,
}
175
/// Lifecycle states reported for a crawl job.
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub enum CrawlStatusTypes {
    /// The crawl is in progress; pages are still being scraped.
    Scraping,

    /// The crawl completed successfully.
    Completed,

    /// The crawl failed.
    Failed,

    /// The crawl was cancelled.
    Cancelled,
}
192
193#[serde_with::skip_serializing_none]
194#[derive(Deserialize, Serialize, Debug, Clone)]
195#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
196#[serde(rename_all = "camelCase")]
197pub struct CrawlStatus {
198 pub status: CrawlStatusTypes,
200
201 pub total: u32,
203
204 pub completed: u32,
206
207 pub credits_used: u32,
209
210 pub expires_at: String, pub next: Option<String>,
216
217 pub data: Vec<Document>,
219}
220
/// Response returned when a crawl job is started asynchronously.
#[derive(Deserialize, Serialize, Debug, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlAsyncResponse {
    // Whether the request succeeded. Private, unlike the other fields —
    // NOTE(review): possibly deliberate (callers only need `id`/`url`); confirm.
    success: bool,

    /// ID of the started crawl job; pass to `check_crawl_status`.
    pub id: String,

    /// URL for checking the status of the crawl job.
    pub url: String,
}
233
234impl FirecrawlApp {
235 pub async fn crawl_url_async(
237 &self,
238 url: impl AsRef<str>,
239 options: Option<CrawlOptions>,
240 ) -> Result<CrawlAsyncResponse, FirecrawlError> {
241 let body = CrawlRequestBody {
242 url: url.as_ref().to_string(),
243 options: options.unwrap_or_default(),
244 };
245
246 let headers = self.prepare_headers(body.options.idempotency_key.as_ref());
247
248 let response = self
249 .client
250 .post(&format!("{}/{}/crawl", self.api_url, API_VERSION))
251 .headers(headers.clone())
252 .json(&body)
253 .send()
254 .await
255 .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;
256
257 self.handle_response::<CrawlAsyncResponse>(response, "start crawl job")
258 .await
259 }
260
261 pub async fn crawl_url(
263 &self,
264 url: impl AsRef<str>,
265 options: impl Into<Option<CrawlOptions>>,
266 ) -> Result<CrawlStatus, FirecrawlError> {
267 let options = options.into();
268 let poll_interval = options
269 .as_ref()
270 .and_then(|x| x.poll_interval)
271 .unwrap_or(2000);
272 let res = self.crawl_url_async(url, options).await?;
273
274 self.monitor_job_status(&res.id, poll_interval).await
275 }
276
277 async fn check_crawl_status_next(
278 &self,
279 next: impl AsRef<str>,
280 ) -> Result<CrawlStatus, FirecrawlError> {
281 let response = self
282 .client
283 .get(next.as_ref())
284 .headers(self.prepare_headers(None))
285 .send()
286 .await
287 .map_err(|e| {
288 FirecrawlError::HttpError(
289 format!("Paginating crawl using URL {:?}", next.as_ref()),
290 e,
291 )
292 })?;
293
294 self.handle_response(
295 response,
296 format!("Paginating crawl using URL {:?}", next.as_ref()),
297 )
298 .await
299 }
300
301 pub async fn check_crawl_status(
303 &self,
304 id: impl AsRef<str>,
305 ) -> Result<CrawlStatus, FirecrawlError> {
306 let response = self
307 .client
308 .get(&format!(
309 "{}/{}/crawl/{}",
310 self.api_url,
311 API_VERSION,
312 id.as_ref()
313 ))
314 .headers(self.prepare_headers(None))
315 .send()
316 .await
317 .map_err(|e| {
318 FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e)
319 })?;
320
321 let mut status: CrawlStatus = self
322 .handle_response(
323 response,
324 format!("Checking status of crawl {}", id.as_ref()),
325 )
326 .await?;
327
328 if status.status == CrawlStatusTypes::Completed {
329 while let Some(next) = status.next {
330 let new_status = self.check_crawl_status_next(next).await?;
331 status.data.extend_from_slice(&new_status.data);
332 status.next = new_status.next;
333 }
334 }
335
336 Ok(status)
337 }
338
339 async fn monitor_job_status(
340 &self,
341 id: &str,
342 poll_interval: u64,
343 ) -> Result<CrawlStatus, FirecrawlError> {
344 loop {
345 let status_data = self.check_crawl_status(id).await?;
346 match status_data.status {
347 CrawlStatusTypes::Completed => {
348 break Ok(status_data);
349 }
350 CrawlStatusTypes::Scraping => {
351 tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
352 }
353 CrawlStatusTypes::Failed => {
354 break Err(FirecrawlError::CrawlJobFailed(
355 format!("Crawl job failed."),
356 status_data,
357 ));
358 }
359 CrawlStatusTypes::Cancelled => {
360 break Err(FirecrawlError::CrawlJobFailed(
361 format!("Crawl job was cancelled."),
362 status_data,
363 ));
364 }
365 }
366 }
367 }
368}
369
#[cfg(all(test, feature = "mcp_tool"))]
mod schema_tests {
    use super::*;
    use async_claude;
    // Removed `use serde_json::json;` — the `json!` macro was never used
    // (unused_imports warning); `serde_json` is referenced by full path below.

    /// Verifies that the JSON schema generated for `CrawlOptions` exposes the
    /// camelCase property names and JSON types expected by the MCP tool API.
    #[test]
    fn test_crawl_options_schema() {
        let actual_schema = async_claude::tool::parse_input_schema::<CrawlOptions>().unwrap();

        println!(
            "Schema properties: {}",
            serde_json::to_string_pretty(&actual_schema["properties"]).unwrap()
        );

        assert_eq!(actual_schema["type"], "object");

        let properties = &actual_schema["properties"];
        assert!(properties.is_object());

        let property_keys: Vec<String> = properties
            .as_object()
            .unwrap()
            .keys()
            .map(|k| k.to_string())
            .collect();

        println!("Actual property keys: {:?}", property_keys);

        // Every serialized (non-`skip`) field of `CrawlOptions` must appear as
        // a camelCase property.
        for key in [
            "scrapeOptions",
            "includePaths",
            "excludePaths",
            "maxDepth",
            "ignoreSitemap",
            "limit",
            "allowBackwardLinks",
            "allowExternalLinks",
            "webhook",
            "ignoreQueryParameters",
        ] {
            assert!(
                property_keys.contains(&key.to_string()),
                "{} not found",
                key
            );
        }

        // The explicitly renamed `deduplicateSimilarURLs` may be re-cased by
        // the schema generator, so match it case-insensitively.
        assert!(
            property_keys
                .iter()
                .any(|k| k.to_lowercase() == "deduplicatesimilarurls"),
            "deduplicateSimilarURLs not found"
        );

        assert_eq!(properties["scrapeOptions"]["type"], "object");

        assert_eq!(properties["includePaths"]["type"], "array");
        assert_eq!(properties["includePaths"]["items"]["type"], "string");
        assert_eq!(properties["excludePaths"]["type"], "array");
        assert_eq!(properties["excludePaths"]["items"]["type"], "string");

        assert_eq!(properties["ignoreSitemap"]["type"], "boolean");
        assert_eq!(properties["allowBackwardLinks"]["type"], "boolean");
        assert_eq!(properties["allowExternalLinks"]["type"], "boolean");

        // Integer fields may be emitted as either "integer" or "number".
        assert!(
            properties["maxDepth"]["type"] == "integer"
                || properties["maxDepth"]["type"] == "number",
            "Property maxDepth should be numeric"
        );
        assert!(
            properties["limit"]["type"] == "integer" || properties["limit"]["type"] == "number",
            "Property limit should be numeric"
        );
    }
}