//! firecrawl_sdk/crawl.rs
//!
//! Types and `FirecrawlApp` methods for the Firecrawl crawl API.

1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4
5#[cfg(feature = "mcp_tool")]
6use schemars::JsonSchema;
7
8use crate::{
9    batch_scrape::Webhook,
10    document::Document,
11    scrape::{ScrapeFormats, ScrapeOptions},
12    FirecrawlApp, FirecrawlError, API_VERSION,
13};
14
/// Output formats available for pages scraped as part of a crawl job.
///
/// Mirrors [`ScrapeFormats`] variant-for-variant; use the `From`
/// conversion below to obtain the general scrape-API equivalent.
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
pub enum CrawlScrapeFormats {
    /// Will result in a copy of the Markdown content of the page.
    #[serde(rename = "markdown")]
    Markdown,

    /// Will result in a copy of the filtered, content-only HTML of the page.
    #[serde(rename = "html")]
    HTML,

    /// Will result in a copy of the raw HTML of the page.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Will result in a Vec of URLs found on the page.
    #[serde(rename = "links")]
    Links,

    /// Will result in a URL to a screenshot of the page.
    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::ScreenshotFullPage`.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Will result in a URL to a full-page screenshot of the page.
    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::Screenshot`.
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,
}
46
47impl From<CrawlScrapeFormats> for ScrapeFormats {
48    fn from(value: CrawlScrapeFormats) -> Self {
49        match value {
50            CrawlScrapeFormats::Markdown => Self::Markdown,
51            CrawlScrapeFormats::HTML => Self::HTML,
52            CrawlScrapeFormats::RawHTML => Self::RawHTML,
53            CrawlScrapeFormats::Links => Self::Links,
54            CrawlScrapeFormats::Screenshot => Self::Screenshot,
55            CrawlScrapeFormats::ScreenshotFullPage => Self::ScreenshotFullPage,
56        }
57    }
58}
59
/// Options controlling crawl breadth, depth, and per-page scraping.
///
/// All fields are optional; `None` fields are omitted from the serialized
/// request (via `skip_serializing_none`) so the API's server-side defaults
/// apply.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlOptions {
    /// Options for scraping each page
    pub scrape_options: Option<ScrapeOptions>,

    /// Only crawl these URL paths
    pub include_paths: Option<Vec<String>>,

    /// URL paths to exclude from crawling
    pub exclude_paths: Option<Vec<String>>,

    /// Maximum link depth to crawl. (default: `2`)
    pub max_depth: Option<u32>,

    /// Skip sitemap.xml discovery. (default: `true`)
    pub ignore_sitemap: Option<bool>,

    /// Maximum number of pages to crawl. (default: `10`)
    pub limit: Option<u32>,

    /// Allow crawling links that point to parent directories. (default: `false`)
    pub allow_backward_links: Option<bool>,

    /// Allow crawling links to external domains. (default: `false`)
    pub allow_external_links: Option<bool>,

    /// Remove similar URLs during crawl
    // Explicit rename: `rename_all = "camelCase"` would emit
    // `deduplicateSimilarUrls`, but the API expects `deduplicateSimilarURLs`.
    #[serde(rename = "deduplicateSimilarURLs")]
    pub deduplicate_similar_urls: Option<bool>,

    /// Ignore query parameters when comparing URLs
    pub ignore_query_parameters: Option<bool>,
}
96
/// Request body sent to the crawl endpoint by [`FirecrawlApp::crawl_url_async`].
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct CrawlRequestBody {
    /// Starting URL for the crawl
    pub url: String,

    /// Crawl options, flattened into the top level of the JSON body
    /// (no nested `options` key on the wire).
    #[serde(flatten)]
    pub options: CrawlOptions,

    /// Webhook URL to notify when crawl is complete
    pub webhook: Webhook,
}
109
/// Response shape for a crawl request that returns a single document.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct CrawlResponse {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
    /// No need to expose.
    pub success: bool,

    /// The resulting document.
    pub data: Document,
}
120
/// Lifecycle states reported by the API for a crawl job.
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub enum CrawlStatusTypes {
    /// The crawl job is in progress.
    Scraping,

    /// The crawl job has been completed successfully.
    Completed,

    /// The crawl job has failed.
    Failed,

    /// The crawl job has been cancelled.
    Cancelled,
}
137
/// Status snapshot of a crawl job, possibly paginated via `next`.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlStatus {
    /// The status of the crawl.
    pub status: CrawlStatusTypes,

    /// Number of pages that will be scraped in total. This number may grow as the crawler discovers new pages.
    pub total: u32,

    /// Number of pages that have been successfully scraped.
    pub completed: u32,

    /// Amount of credits used by the crawl job.
    pub credits_used: u32,

    /// Expiry time of crawl data. After this date, the crawl data will be unavailable from the API.
    pub expires_at: String, // TODO: parse into date

    /// URL to call to get the next batch of documents.
    /// Unless you are sidestepping the SDK, you do not need to deal with this.
    pub next: Option<String>,

    /// List of documents returned by the crawl
    pub data: Vec<Document>,
}
164
/// Response to a successful asynchronous crawl kickoff
/// (see [`FirecrawlApp::crawl_url_async`]).
#[derive(Deserialize, Serialize, Debug, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlAsyncResponse {
    // Always `true` for responses that reach the caller; kept private.
    success: bool,

    /// Crawl ID
    pub id: String,

    /// URL to get the status of the crawl job
    pub url: String,
}
177
/// Input bundle for a crawl invocation: target URL, crawl options, polling
/// cadence, and optional webhook.
// NOTE(review): the `JsonSchema` derive suggests this is the input shape for
// the `mcp_tool` integration — confirm against the tool registration code.
#[derive(Deserialize, Serialize, Debug, Default)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlUrlInput {
    /// Starting URL for the crawl
    pub url: String,

    /// Crawl options, flattened into the top level of the JSON object.
    #[serde(flatten)]
    pub options: CrawlOptions,

    /// How often the status of the job should be checked, in milliseconds. (default: `2000`)
    pub poll_interval: Option<u64>,

    // Not serialized; sent as a request header instead of the body.
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// Webhook URL to notify when crawl is complete, default to a dummy url
    pub webhook: Option<Webhook>,
}
197
impl FirecrawlApp {
    /// Initiates a crawl job for a URL using the Firecrawl API.
    ///
    /// Returns immediately with the job's ID and status URL; use
    /// [`FirecrawlApp::check_crawl_status`] to poll for results, or
    /// [`FirecrawlApp::crawl_url`] for a blocking variant.
    ///
    /// # Errors
    ///
    /// `FirecrawlError::HttpError` if the request cannot be sent; other
    /// variants may be produced by `handle_response` for API-level failures.
    pub async fn crawl_url_async(
        &self,
        url: impl AsRef<str>,
        options: Option<CrawlOptions>,
        idempotency_key: Option<String>,
        webhook: Webhook,
    ) -> Result<CrawlAsyncResponse, FirecrawlError> {
        let body = CrawlRequestBody {
            url: url.as_ref().to_string(),
            options: options.unwrap_or_default(),
            webhook,
        };

        // The idempotency key (if any) travels as a header, not in the body.
        let headers = self.prepare_headers(idempotency_key.as_ref());

        let response = self
            .client
            .post(format!("{}/{}/crawl", self.api_url, API_VERSION))
            // NOTE(review): `headers` is not used again after this call, so
            // the `.clone()` looks redundant — confirm and drop if so.
            .headers(headers.clone())
            .json(&body)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;

        self.handle_response::<CrawlAsyncResponse>(response, "start crawl job")
            .await
    }

    /// Performs a crawl job for a URL using the Firecrawl API, waiting for the end result. This may take a long time depending on the size of the target page and your options (namely `CrawlOptions.limit`).
    ///
    /// `poll_interval` is in milliseconds and defaults to `2000` when `None`.
    pub async fn crawl_url(
        &self,
        url: impl AsRef<str>,
        options: impl Into<Option<CrawlOptions>>,
        webhook: Webhook,
        poll_interval: Option<u64>,
        idempotency_key: Option<String>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let options = options.into();
        let poll_interval = poll_interval.unwrap_or(2000);

        // Kick off the job, then block on polling until a terminal state.
        let res = self
            .crawl_url_async(url, options, idempotency_key, webhook)
            .await?;

        self.monitor_crawl_status(&res.id, poll_interval).await
    }

    /// Fetches one page of crawl results from a pagination URL previously
    /// returned in [`CrawlStatus::next`].
    async fn check_crawl_status_next(
        &self,
        next: impl AsRef<str>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let response = self
            .client
            .get(next.as_ref())
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| {
                FirecrawlError::HttpError(
                    format!("Paginating crawl using URL {:?}", next.as_ref()),
                    e,
                )
            })?;

        self.handle_response(
            response,
            format!("Paginating crawl using URL {:?}", next.as_ref()),
        )
        .await
    }

    /// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`.
    ///
    /// For completed crawls, follows every `next` pagination link and
    /// accumulates all documents into the returned [`CrawlStatus::data`].
    pub async fn check_crawl_status(
        &self,
        id: impl AsRef<str>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let response = self
            .client
            .get(format!(
                "{}/{}/crawl/{}",
                self.api_url,
                API_VERSION,
                id.as_ref()
            ))
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| {
                FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e)
            })?;

        let mut status: CrawlStatus = self
            .handle_response(
                response,
                format!("Checking status of crawl {}", id.as_ref()),
            )
            .await?;

        // Only drain pagination for finished crawls; in-progress crawls are
        // returned as-is so callers see a single page plus `next`.
        if status.status == CrawlStatusTypes::Completed {
            while let Some(next) = status.next {
                let new_status = self.check_crawl_status_next(next).await?;
                status.data.extend_from_slice(&new_status.data);
                status.next = new_status.next;
            }
        }

        Ok(status)
    }

    /// Polls a crawl job until it reaches a terminal state, accumulating
    /// documents across pagination pages along the way.
    ///
    /// Returns `Ok` with the full document set on completion, or a
    /// `CrawlJobFailed`/`CrawlJobCancelled` error (still carrying the data
    /// collected so far) on failure or cancellation.
    async fn monitor_crawl_status(
        &self,
        id: &str,
        poll_interval: u64,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let mut all_data = Vec::new();
        let mut current_cursor: Option<String> = None;

        loop {
            // Get status data, either from the base endpoint or using the next cursor
            let mut status_data = if let Some(ref cursor) = current_cursor {
                self.check_crawl_status_next(cursor).await?
            } else {
                self.check_crawl_status(id).await?
            };

            // Collect data from this page
            all_data.append(&mut status_data.data);

            // Check if we need to paginate
            if let Some(next) = status_data.next {
                current_cursor = Some(next);
                continue;
            }

            // Check job status
            match status_data.status {
                CrawlStatusTypes::Completed => {
                    // Put all collected data back into the status
                    status_data.data = all_data;
                    break Ok(status_data);
                }
                CrawlStatusTypes::Scraping => {
                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
                    // Keep the cursor as is, to continue from where we left off
                    // NOTE(review): if the server re-serves pages under a kept
                    // cursor, `all_data` could accumulate duplicates — confirm
                    // the API's cursor semantics.
                }
                CrawlStatusTypes::Failed => {
                    // Put all collected data back into the status for error context
                    status_data.data = all_data;
                    break Err(FirecrawlError::CrawlJobFailed(
                        "Crawl job failed.".to_string(),
                        status_data,
                    ));
                }
                CrawlStatusTypes::Cancelled => {
                    // Put all collected data back into the status for error context
                    status_data.data = all_data;
                    break Err(FirecrawlError::CrawlJobCancelled(status_data));
                }
            }
        }
    }
}
362
#[cfg(all(test, feature = "mcp_tool"))]
mod schema_tests {
    use super::*;
    use async_claude;

    /// Verifies that the JSON schema derived from [`CrawlOptions`] exposes
    /// every declared field under its camelCase (or explicitly renamed) JSON
    /// name, with sensible types.
    #[test]
    fn test_crawl_options_schema() {
        let actual_schema = async_claude::tool::parse_input_schema::<CrawlOptions>().unwrap();

        // For debugging
        println!(
            "Schema properties: {}",
            serde_json::to_string_pretty(&actual_schema["properties"]).unwrap()
        );

        // Check basic structure
        assert_eq!(actual_schema["type"], "object");

        // Get properties object
        let properties = &actual_schema["properties"];
        assert!(properties.is_object());

        // Get the actual property keys from the schema
        let property_keys: Vec<String> = properties
            .as_object()
            .unwrap()
            .keys()
            .map(|k| k.to_string())
            .collect();

        println!("Actual property keys: {:?}", property_keys);

        // Every field declared on `CrawlOptions` must appear.
        // Bug fix: the previous version also asserted a `webhook` property,
        // but `webhook` is a field of `CrawlRequestBody`/`CrawlUrlInput`, not
        // `CrawlOptions`, so the schema derived from `CrawlOptions` cannot
        // contain it.
        for expected in [
            "scrapeOptions",
            "includePaths",
            "excludePaths",
            "maxDepth",
            "ignoreSitemap",
            "limit",
            "allowBackwardLinks",
            "allowExternalLinks",
            "ignoreQueryParameters",
        ] {
            assert!(
                property_keys.contains(&expected.to_string()),
                "{} not found",
                expected
            );
        }

        // `deduplicate_similar_urls` carries an explicit serde rename to
        // `deduplicateSimilarURLs`; compare case-insensitively to stay robust
        // against how schemars handles the rename.
        assert!(
            property_keys
                .iter()
                .any(|k| k.to_lowercase() == "deduplicatesimilarurls"),
            "deduplicateSimilarURLs not found"
        );

        // Check expected property types for properties that certainly exist
        assert_eq!(properties["scrapeOptions"]["type"], "object");

        // Check array properties
        assert_eq!(properties["includePaths"]["type"], "array");
        assert_eq!(properties["includePaths"]["items"]["type"], "string");
        assert_eq!(properties["excludePaths"]["type"], "array");
        assert_eq!(properties["excludePaths"]["items"]["type"], "string");

        // Check boolean properties
        assert_eq!(properties["ignoreSitemap"]["type"], "boolean");
        assert_eq!(properties["allowBackwardLinks"]["type"], "boolean");
        assert_eq!(properties["allowExternalLinks"]["type"], "boolean");

        // Numeric properties may be emitted as either "integer" or "number".
        for numeric in ["maxDepth", "limit"] {
            assert!(
                properties[numeric]["type"] == "integer"
                    || properties[numeric]["type"] == "number",
                "Property {} should be numeric",
                numeric
            );
        }
    }
}
473}