// firecrawl_sdk/crawl.rs

1use serde::{Deserialize, Serialize};
2
3#[cfg(feature = "mcp_tool")]
4use schemars::JsonSchema;
5
6use crate::{
7    batch_scrape::Webhook,
8    document::Document,
9    scrape::{ScrapeFormats, ScrapeOptions},
10    FirecrawlApp, FirecrawlError, API_VERSION,
11};
12
/// Output formats that can be requested for each page scraped during a crawl.
///
/// This is the crawl-side subset of [`ScrapeFormats`]; the `From` impl below
/// maps each variant onto its scrape-API counterpart.
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
pub enum CrawlScrapeFormats {
    /// Will result in a copy of the Markdown content of the page.
    #[serde(rename = "markdown")]
    Markdown,

    /// Will result in a copy of the filtered, content-only HTML of the page.
    #[serde(rename = "html")]
    HTML,

    /// Will result in a copy of the raw HTML of the page.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Will result in a Vec of URLs found on the page.
    #[serde(rename = "links")]
    Links,

    /// Will result in a URL to a screenshot of the page.
    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::ScreenshotFullPage`.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Will result in a URL to a full-page screenshot of the page.
    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::Screenshot`.
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,
}
44
45impl From<CrawlScrapeFormats> for ScrapeFormats {
46    fn from(value: CrawlScrapeFormats) -> Self {
47        match value {
48            CrawlScrapeFormats::Markdown => Self::Markdown,
49            CrawlScrapeFormats::HTML => Self::HTML,
50            CrawlScrapeFormats::RawHTML => Self::RawHTML,
51            CrawlScrapeFormats::Links => Self::Links,
52            CrawlScrapeFormats::Screenshot => Self::Screenshot,
53            CrawlScrapeFormats::ScreenshotFullPage => Self::ScreenshotFullPage,
54        }
55    }
56}
57
/// Options for a crawl job.
///
/// All fields are optional; `None` fields are omitted from the serialized
/// JSON entirely (via `skip_serializing_none`), letting the API apply its
/// own defaults. Field names are camelCased on the wire.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlOptions {
    /// Options for scraping each page
    pub scrape_options: Option<ScrapeOptions>,

    /// Only crawl these URL paths
    pub include_paths: Option<Vec<String>>,

    /// URL paths to exclude from crawling
    pub exclude_paths: Option<Vec<String>>,

    /// Maximum link depth to crawl. (default: `2`)
    pub max_depth: Option<u32>,

    /// Skip sitemap.xml discovery. (default: `true`)
    pub ignore_sitemap: Option<bool>,

    /// Maximum number of pages to crawl. (default: `10`)
    pub limit: Option<u32>,

    /// Allow crawling links that point to parent directories. (default: `false`)
    pub allow_backward_links: Option<bool>,

    /// Allow crawling links to external domains. (default: `false`)
    pub allow_external_links: Option<bool>,

    /// Remove similar URLs during crawl
    // Explicit rename: camelCase derivation would not produce the
    // all-caps "URLs" suffix the API expects.
    #[serde(rename = "deduplicateSimilarURLs")]
    pub deduplicate_similar_urls: Option<bool>,

    /// Ignore query parameters when comparing URLs
    pub ignore_query_parameters: Option<bool>,
}
94
/// JSON request body sent to the crawl endpoint by
/// `FirecrawlApp::crawl_url_async`.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct CrawlRequestBody {
    /// Starting URL for the crawl
    pub url: String,

    /// Crawl options, flattened into the top level of the JSON body
    /// rather than nested under an "options" key.
    #[serde(flatten)]
    pub options: CrawlOptions,

    /// Webhook URL to notify when crawl is complete
    pub webhook: Webhook,
}
107
/// Response payload for a completed crawl request.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct CrawlResponse {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
    /// No need to expose.
    pub success: bool,

    /// The resulting document.
    pub data: Document,
}
118
/// Lifecycle state of a crawl job, as reported by the status endpoint.
/// Serialized in camelCase (e.g. "scraping", "completed").
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub enum CrawlStatusTypes {
    /// The crawl job is in progress.
    Scraping,

    /// The crawl job has been completed successfully.
    Completed,

    /// The crawl job has failed.
    Failed,

    /// The crawl job has been cancelled.
    Cancelled,
}
135
/// Status report for a crawl job, including any documents scraped so far.
/// Results may be paginated; `next` points at the following page when set.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlStatus {
    /// The status of the crawl.
    pub status: CrawlStatusTypes,

    /// Number of pages that will be scraped in total. This number may grow as the crawler discovers new pages.
    pub total: u32,

    /// Number of pages that have been successfully scraped.
    pub completed: u32,

    /// Amount of credits used by the crawl job.
    pub credits_used: u32,

    /// Expiry time of crawl data. After this date, the crawl data will be unavailable from the API.
    pub expires_at: String, // TODO: parse into date

    /// URL to call to get the next batch of documents.
    /// Unless you are sidestepping the SDK, you do not need to deal with this.
    pub next: Option<String>,

    /// List of documents returned by the crawl
    pub data: Vec<Document>,
}
162
/// Response returned when a crawl job is started asynchronously.
#[derive(Deserialize, Serialize, Debug, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlAsyncResponse {
    // Always `true` on success; kept private since `handle_response`
    // already surfaces failures as errors.
    success: bool,

    /// Crawl ID
    pub id: String,

    /// URL to get the status of the crawl job
    pub url: String,
}
175
/// Combined input for starting a crawl (e.g. as an MCP tool argument):
/// the target URL plus crawl options, polling, and webhook settings.
#[derive(Deserialize, Serialize, Debug, Default)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlUrlInput {
    /// Starting URL for the crawl
    pub url: String,

    /// Crawl options, flattened into the top level of the JSON object.
    #[serde(flatten)]
    pub options: CrawlOptions,

    /// How often the status of the job should be checked, in milliseconds. (default: `2000`)
    pub poll_interval: Option<u64>,

    // Excluded from (de)serialization; supplied programmatically and
    // forwarded as a request header, not part of the JSON body.
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// Webhook URL to notify when crawl is complete, default to a dummy url
    pub webhook: Option<Webhook>,
}
195
196impl FirecrawlApp {
197    /// Initiates a crawl job for a URL using the Firecrawl API.
198    pub async fn crawl_url_async(
199        &self,
200        url: impl AsRef<str>,
201        options: Option<CrawlOptions>,
202        idempotency_key: Option<String>,
203        webhook: Webhook,
204    ) -> Result<CrawlAsyncResponse, FirecrawlError> {
205        let body = CrawlRequestBody {
206            url: url.as_ref().to_string(),
207            options: options.unwrap_or_default(),
208            webhook,
209        };
210
211        let headers = self.prepare_headers(idempotency_key.as_ref());
212
213        let response = self
214            .client
215            .post(format!("{}/{}/crawl", self.api_url, API_VERSION))
216            .headers(headers.clone())
217            .json(&body)
218            .send()
219            .await
220            .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;
221
222        self.handle_response::<CrawlAsyncResponse>(response, "start crawl job")
223            .await
224    }
225
226    /// Performs a crawl job for a URL using the Firecrawl API, waiting for the end result. This may take a long time depending on the size of the target page and your options (namely `CrawlOptions.limit`).
227    pub async fn crawl_url(
228        &self,
229        url: impl AsRef<str>,
230        options: impl Into<Option<CrawlOptions>>,
231        webhook: Webhook,
232        poll_interval: Option<u64>,
233        idempotency_key: Option<String>,
234    ) -> Result<CrawlStatus, FirecrawlError> {
235        let options = options.into();
236        let poll_interval = poll_interval.unwrap_or(2000);
237
238        let res = self
239            .crawl_url_async(url, options, idempotency_key, webhook)
240            .await?;
241
242        self.monitor_crawl_status(&res.id, poll_interval).await
243    }
244
245    async fn check_crawl_status_next(
246        &self,
247        next: impl AsRef<str>,
248    ) -> Result<CrawlStatus, FirecrawlError> {
249        let response = self
250            .client
251            .get(next.as_ref())
252            .headers(self.prepare_headers(None))
253            .send()
254            .await
255            .map_err(|e| {
256                FirecrawlError::HttpError(
257                    format!("Paginating crawl using URL {:?}", next.as_ref()),
258                    e,
259                )
260            })?;
261
262        self.handle_response(
263            response,
264            format!("Paginating crawl using URL {:?}", next.as_ref()),
265        )
266        .await
267    }
268
269    /// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`.
270    pub async fn check_crawl_status(
271        &self,
272        id: impl AsRef<str>,
273    ) -> Result<CrawlStatus, FirecrawlError> {
274        let response = self
275            .client
276            .get(format!(
277                "{}/{}/crawl/{}",
278                self.api_url,
279                API_VERSION,
280                id.as_ref()
281            ))
282            .headers(self.prepare_headers(None))
283            .send()
284            .await
285            .map_err(|e| {
286                FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e)
287            })?;
288
289        let mut status: CrawlStatus = self
290            .handle_response(
291                response,
292                format!("Checking status of crawl {}", id.as_ref()),
293            )
294            .await?;
295
296        if status.status == CrawlStatusTypes::Completed {
297            while let Some(next) = status.next {
298                let new_status = self.check_crawl_status_next(next).await?;
299                status.data.extend_from_slice(&new_status.data);
300                status.next = new_status.next;
301            }
302        }
303
304        Ok(status)
305    }
306
307    async fn monitor_crawl_status(
308        &self,
309        id: &str,
310        poll_interval: u64,
311    ) -> Result<CrawlStatus, FirecrawlError> {
312        let mut all_data = Vec::new();
313        let mut current_cursor: Option<String> = None;
314
315        loop {
316            // Get status data, either from the base endpoint or using the next cursor
317            let mut status_data = if let Some(ref cursor) = current_cursor {
318                self.check_crawl_status_next(cursor).await?
319            } else {
320                self.check_crawl_status(id).await?
321            };
322
323            // Collect data from this page
324            all_data.append(&mut status_data.data);
325
326            // Check if we need to paginate
327            if let Some(next) = status_data.next {
328                current_cursor = Some(next);
329                continue;
330            }
331
332            // Check job status
333            match status_data.status {
334                CrawlStatusTypes::Completed => {
335                    // Put all collected data back into the status
336                    status_data.data = all_data;
337                    break Ok(status_data);
338                }
339                CrawlStatusTypes::Scraping => {
340                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
341                    // Keep the cursor as is, to continue from where we left off
342                }
343                CrawlStatusTypes::Failed => {
344                    // Put all collected data back into the status for error context
345                    status_data.data = all_data;
346                    break Err(FirecrawlError::CrawlJobFailed(
347                        "Crawl job failed.".to_string(),
348                        status_data,
349                    ));
350                }
351                CrawlStatusTypes::Cancelled => {
352                    // Put all collected data back into the status for error context
353                    status_data.data = all_data;
354                    break Err(FirecrawlError::CrawlJobCancelled(status_data));
355                }
356            }
357        }
358    }
359}
360
#[cfg(all(test, feature = "mcp_tool"))]
mod schema_tests {
    use super::*;
    use async_claude;

    /// Verifies that the JSON schema generated for `CrawlOptions` exposes the
    /// expected camelCased properties with the expected types.
    #[test]
    fn test_crawl_options_schema() {
        let actual_schema = async_claude::tool::parse_input_schema::<CrawlOptions>().unwrap();

        // For debugging
        println!(
            "Schema properties: {}",
            serde_json::to_string_pretty(&actual_schema["properties"]).unwrap()
        );

        // Check basic structure
        assert_eq!(actual_schema["type"], "object");

        // Get properties object
        let properties = &actual_schema["properties"];
        assert!(properties.is_object());

        // Get the actual property keys from the schema
        let property_keys: Vec<String> = properties
            .as_object()
            .unwrap()
            .keys()
            .map(|k| k.to_string())
            .collect();

        println!("Actual property keys: {:?}", property_keys);

        // Check that important properties exist, being flexible with URL vs Url
        assert!(
            property_keys.contains(&"scrapeOptions".to_string()),
            "scrapeOptions not found"
        );
        assert!(
            property_keys.contains(&"includePaths".to_string()),
            "includePaths not found"
        );
        assert!(
            property_keys.contains(&"excludePaths".to_string()),
            "excludePaths not found"
        );
        assert!(
            property_keys.contains(&"maxDepth".to_string()),
            "maxDepth not found"
        );
        assert!(
            property_keys.contains(&"ignoreSitemap".to_string()),
            "ignoreSitemap not found"
        );
        assert!(
            property_keys.contains(&"limit".to_string()),
            "limit not found"
        );
        assert!(
            property_keys.contains(&"allowBackwardLinks".to_string()),
            "allowBackwardLinks not found"
        );
        assert!(
            property_keys.contains(&"allowExternalLinks".to_string()),
            "allowExternalLinks not found"
        );
        // NOTE(review): the previous `webhook` assertion was removed —
        // `CrawlOptions` has no `webhook` field (it lives on
        // `CrawlRequestBody`/`CrawlUrlInput`), so the generated schema
        // cannot contain it and the assertion always failed.

        // Check for deduplicateSimilarURLs or deduplicateSimilarUrls
        assert!(
            property_keys
                .iter()
                .any(|k| k.to_lowercase() == "deduplicatesimilarurls"),
            "deduplicateSimilarURLs not found"
        );

        // Check for ignoreQueryParameters
        assert!(
            property_keys.contains(&"ignoreQueryParameters".to_string()),
            "ignoreQueryParameters not found"
        );

        // Check expected property types and descriptions for properties that certainly exist
        assert_eq!(properties["scrapeOptions"]["type"], "object");

        // Check array properties
        assert_eq!(properties["includePaths"]["type"], "array");
        assert_eq!(properties["includePaths"]["items"]["type"], "string");
        assert_eq!(properties["excludePaths"]["type"], "array");
        assert_eq!(properties["excludePaths"]["items"]["type"], "string");

        // Check boolean properties
        assert_eq!(properties["ignoreSitemap"]["type"], "boolean");
        assert_eq!(properties["allowBackwardLinks"]["type"], "boolean");
        assert_eq!(properties["allowExternalLinks"]["type"], "boolean");

        // Check numeric properties
        assert!(
            properties["maxDepth"]["type"] == "integer"
                || properties["maxDepth"]["type"] == "number",
            "Property maxDepth should be numeric"
        );
        assert!(
            properties["limit"]["type"] == "integer" || properties["limit"]["type"] == "number",
            "Property limit should be numeric"
        );
    }
}
471}