// firecrawl_sdk/crawl.rs

1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4
5#[cfg(feature = "mcp_tool")]
6use schemars::JsonSchema;
7
8use crate::{
9    document::Document,
10    scrape::{ScrapeFormats, ScrapeOptions},
11    FirecrawlApp, FirecrawlError, API_VERSION,
12};
13
/// Output formats that can be requested for each page scraped during a crawl.
///
/// Each variant serializes to the exact string the Firecrawl API expects
/// (see the `serde(rename)` attributes).
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
pub enum CrawlScrapeFormats {
    /// Will result in a copy of the Markdown content of the page.
    #[serde(rename = "markdown")]
    Markdown,

    /// Will result in a copy of the filtered, content-only HTML of the page.
    #[serde(rename = "html")]
    HTML,

    /// Will result in a copy of the raw HTML of the page.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Will result in a Vec of URLs found on the page.
    #[serde(rename = "links")]
    Links,

    /// Will result in a URL to a screenshot of the page.
    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::ScreenshotFullPage`.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Will result in a URL to a full-page screenshot of the page.
    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::Screenshot`.
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,
}
45
46impl From<CrawlScrapeFormats> for ScrapeFormats {
47    fn from(value: CrawlScrapeFormats) -> Self {
48        match value {
49            CrawlScrapeFormats::Markdown => Self::Markdown,
50            CrawlScrapeFormats::HTML => Self::HTML,
51            CrawlScrapeFormats::RawHTML => Self::RawHTML,
52            CrawlScrapeFormats::Links => Self::Links,
53            CrawlScrapeFormats::Screenshot => Self::Screenshot,
54            CrawlScrapeFormats::ScreenshotFullPage => Self::ScreenshotFullPage,
55        }
56    }
57}
58
/// Per-page scraping options applied to every page visited during a crawl.
///
/// Convertible into [`ScrapeOptions`] via `From`. Fields left as `None` are
/// omitted from the serialized request (`skip_serializing_none`), letting the
/// API apply its own defaults.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlScrapeOptions {
    /// Formats to extract from the page. (default: `[ Markdown ]`)
    pub formats: Option<Vec<CrawlScrapeFormats>>,

    /// Only extract the main content of the page, excluding navigation and other miscellaneous content. (default: `true`)
    pub only_main_content: Option<bool>,

    /// HTML tags to exclusively include.
    ///
    /// For example, if you pass `div`, you will only get content from `<div>`s and their children.
    pub include_tags: Option<Vec<String>>,

    /// HTML tags to exclude.
    ///
    /// For example, if you pass `img`, you will never get image URLs in your results.
    pub exclude_tags: Option<Vec<String>>,

    /// Additional HTTP headers to use when loading the page.
    pub headers: Option<HashMap<String, String>>,

    /// Amount of time to wait after loading the page, and before grabbing the content, in milliseconds. (default: `0`)
    pub wait_for: Option<u32>,

    /// Timeout before returning an error, in milliseconds. (default: `60000`)
    pub timeout: Option<u32>,
}
89
90impl From<CrawlScrapeOptions> for ScrapeOptions {
91    fn from(value: CrawlScrapeOptions) -> Self {
92        ScrapeOptions {
93            formats: value
94                .formats
95                .map(|formats| formats.into_iter().map(|x| x.into()).collect()),
96            only_main_content: value.only_main_content,
97            include_tags: value.include_tags,
98            exclude_tags: value.exclude_tags,
99            headers: value.headers,
100            wait_for: value.wait_for,
101            timeout: value.timeout,
102            ..Default::default()
103        }
104    }
105}
106
/// Options controlling a crawl job submitted to the Firecrawl API.
///
/// `None` fields are omitted from the request (`skip_serializing_none`) so the
/// API's documented defaults apply. `idempotency_key` and `poll_interval` are
/// SDK-side only and never serialized (`serde(skip)`).
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlOptions {
    /// Options for scraping each page
    pub scrape_options: Option<CrawlScrapeOptions>,

    /// Only crawl these URL paths
    pub include_paths: Option<Vec<String>>,

    /// URL paths to exclude from crawling
    pub exclude_paths: Option<Vec<String>>,

    /// Maximum link depth to crawl (default: `2`)
    pub max_depth: Option<u32>,

    /// Skip sitemap.xml discovery (default: `true`)
    pub ignore_sitemap: Option<bool>,

    /// Maximum number of pages to crawl (default: `10`)
    pub limit: Option<u32>,

    /// Allow crawling links that point to parent directories (default: `false`)
    pub allow_backward_links: Option<bool>,

    /// Allow crawling links to external domains (default: `false`)
    pub allow_external_links: Option<bool>,

    /// Webhook URL to notify when crawl is complete
    pub webhook: Option<String>,

    /// Remove similar URLs during crawl
    // Explicit rename: `rename_all = "camelCase"` would produce
    // `deduplicateSimilarUrls`, but the API expects the acronym uppercased.
    #[serde(rename = "deduplicateSimilarURLs")]
    pub deduplicate_similar_urls: Option<bool>,

    /// Ignore query parameters when comparing URLs
    #[serde(rename = "ignoreQueryParameters")]
    pub ignore_query_parameters: Option<bool>,

    /// Idempotency key to send to the crawl endpoint.
    // Sent as an HTTP header (see `crawl_url_async`), not in the JSON body.
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// When using `FirecrawlApp::crawl_url`, this is how often the status of the job should be checked, in milliseconds. (default: `2000`)
    #[serde(skip)]
    pub poll_interval: Option<u64>,
}
155
/// JSON body for the crawl-start request: the target URL plus every crawl
/// option flattened to the same level (`serde(flatten)`).
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct CrawlRequestBody {
    // The URL to start crawling from.
    url: String,

    #[serde(flatten)]
    options: CrawlOptions,
}
164
/// Response envelope pairing a success flag with a single [`Document`].
// NOTE(review): not referenced by the crawl methods in this file —
// presumably kept for API-shape parity with other endpoints; verify usage
// elsewhere before removing.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct CrawlResponse {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
    /// No need to expose.
    success: bool,

    /// The resulting document.
    data: Document,
}
175
/// Lifecycle state of a crawl job as reported by the status endpoint.
///
/// `Scraping` is the only non-terminal state; `monitor_job_status` keeps
/// polling while the job reports it.
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub enum CrawlStatusTypes {
    /// The crawl job is in progress.
    Scraping,

    /// The crawl job has been completed successfully.
    Completed,

    /// The crawl job has failed.
    Failed,

    /// The crawl job has been cancelled.
    Cancelled,
}
192
/// Snapshot of a crawl job's progress and (paginated) results.
///
/// Returned by `FirecrawlApp::check_crawl_status`, which follows `next` links
/// for completed crawls so `data` holds every document.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlStatus {
    /// The status of the crawl.
    pub status: CrawlStatusTypes,

    /// Number of pages that will be scraped in total. This number may grow as the crawler discovers new pages.
    pub total: u32,

    /// Number of pages that have been successfully scraped.
    pub completed: u32,

    /// Amount of credits used by the crawl job.
    pub credits_used: u32,

    /// Expiry time of crawl data. After this date, the crawl data will be unavailable from the API.
    pub expires_at: String, // TODO: parse into date

    /// URL to call to get the next batch of documents.
    /// Unless you are sidestepping the SDK, you do not need to deal with this.
    pub next: Option<String>,

    /// List of documents returned by the crawl
    pub data: Vec<Document>,
}
220
/// Response to an asynchronous crawl-start request: the handle needed to
/// poll for status and results later.
#[derive(Deserialize, Serialize, Debug, Clone)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlAsyncResponse {
    // Kept private; presumably always `true` after `handle_response`
    // filters out error payloads — confirm against the API contract.
    success: bool,

    /// Crawl ID
    pub id: String,

    /// URL to get the status of the crawl job
    pub url: String,
}
233
234impl FirecrawlApp {
235    /// Initiates a crawl job for a URL using the Firecrawl API.
236    pub async fn crawl_url_async(
237        &self,
238        url: impl AsRef<str>,
239        options: Option<CrawlOptions>,
240    ) -> Result<CrawlAsyncResponse, FirecrawlError> {
241        let body = CrawlRequestBody {
242            url: url.as_ref().to_string(),
243            options: options.unwrap_or_default(),
244        };
245
246        let headers = self.prepare_headers(body.options.idempotency_key.as_ref());
247
248        let response = self
249            .client
250            .post(&format!("{}/{}/crawl", self.api_url, API_VERSION))
251            .headers(headers.clone())
252            .json(&body)
253            .send()
254            .await
255            .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;
256
257        self.handle_response::<CrawlAsyncResponse>(response, "start crawl job")
258            .await
259    }
260
261    /// Performs a crawl job for a URL using the Firecrawl API, waiting for the end result. This may take a long time depending on the size of the target page and your options (namely `CrawlOptions.limit`).
262    pub async fn crawl_url(
263        &self,
264        url: impl AsRef<str>,
265        options: impl Into<Option<CrawlOptions>>,
266    ) -> Result<CrawlStatus, FirecrawlError> {
267        let options = options.into();
268        let poll_interval = options
269            .as_ref()
270            .and_then(|x| x.poll_interval)
271            .unwrap_or(2000);
272        let res = self.crawl_url_async(url, options).await?;
273
274        self.monitor_job_status(&res.id, poll_interval).await
275    }
276
277    async fn check_crawl_status_next(
278        &self,
279        next: impl AsRef<str>,
280    ) -> Result<CrawlStatus, FirecrawlError> {
281        let response = self
282            .client
283            .get(next.as_ref())
284            .headers(self.prepare_headers(None))
285            .send()
286            .await
287            .map_err(|e| {
288                FirecrawlError::HttpError(
289                    format!("Paginating crawl using URL {:?}", next.as_ref()),
290                    e,
291                )
292            })?;
293
294        self.handle_response(
295            response,
296            format!("Paginating crawl using URL {:?}", next.as_ref()),
297        )
298        .await
299    }
300
301    /// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`.
302    pub async fn check_crawl_status(
303        &self,
304        id: impl AsRef<str>,
305    ) -> Result<CrawlStatus, FirecrawlError> {
306        let response = self
307            .client
308            .get(&format!(
309                "{}/{}/crawl/{}",
310                self.api_url,
311                API_VERSION,
312                id.as_ref()
313            ))
314            .headers(self.prepare_headers(None))
315            .send()
316            .await
317            .map_err(|e| {
318                FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e)
319            })?;
320
321        let mut status: CrawlStatus = self
322            .handle_response(
323                response,
324                format!("Checking status of crawl {}", id.as_ref()),
325            )
326            .await?;
327
328        if status.status == CrawlStatusTypes::Completed {
329            while let Some(next) = status.next {
330                let new_status = self.check_crawl_status_next(next).await?;
331                status.data.extend_from_slice(&new_status.data);
332                status.next = new_status.next;
333            }
334        }
335
336        Ok(status)
337    }
338
339    async fn monitor_job_status(
340        &self,
341        id: &str,
342        poll_interval: u64,
343    ) -> Result<CrawlStatus, FirecrawlError> {
344        loop {
345            let status_data = self.check_crawl_status(id).await?;
346            match status_data.status {
347                CrawlStatusTypes::Completed => {
348                    break Ok(status_data);
349                }
350                CrawlStatusTypes::Scraping => {
351                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
352                }
353                CrawlStatusTypes::Failed => {
354                    break Err(FirecrawlError::CrawlJobFailed(
355                        format!("Crawl job failed."),
356                        status_data,
357                    ));
358                }
359                CrawlStatusTypes::Cancelled => {
360                    break Err(FirecrawlError::CrawlJobFailed(
361                        format!("Crawl job was cancelled."),
362                        status_data,
363                    ));
364                }
365            }
366        }
367    }
368}
369
#[cfg(all(test, feature = "mcp_tool"))]
mod schema_tests {
    use super::*;
    use async_claude;

    /// Verifies that the JSON schema generated for `CrawlOptions` exposes the
    /// camelCase property names and value types the Firecrawl API expects.
    #[test]
    fn test_crawl_options_schema() {
        let actual_schema = async_claude::tool::parse_input_schema::<CrawlOptions>().unwrap();

        // For debugging
        println!(
            "Schema properties: {}",
            serde_json::to_string_pretty(&actual_schema["properties"]).unwrap()
        );

        // Check basic structure
        assert_eq!(actual_schema["type"], "object");

        // Get properties object
        let properties = &actual_schema["properties"];
        assert!(properties.is_object());

        // Get the actual property keys from the schema
        let property_keys: Vec<String> = properties
            .as_object()
            .unwrap()
            .keys()
            .map(|k| k.to_string())
            .collect();

        println!("Actual property keys: {:?}", property_keys);

        // Every property the schema must expose, exactly as serde renames them.
        for key in [
            "scrapeOptions",
            "includePaths",
            "excludePaths",
            "maxDepth",
            "ignoreSitemap",
            "limit",
            "allowBackwardLinks",
            "allowExternalLinks",
            "webhook",
            "ignoreQueryParameters",
        ] {
            assert!(
                property_keys.contains(&key.to_string()),
                "{} not found",
                key
            );
        }

        // Accept either deduplicateSimilarURLs or deduplicateSimilarUrls,
        // since schema generators differ on acronym casing.
        assert!(
            property_keys
                .iter()
                .any(|k| k.to_lowercase() == "deduplicatesimilarurls"),
            "deduplicateSimilarURLs not found"
        );

        // scrapeOptions is a nested object.
        assert_eq!(properties["scrapeOptions"]["type"], "object");

        // String-array properties.
        for key in ["includePaths", "excludePaths"] {
            assert_eq!(properties[key]["type"], "array");
            assert_eq!(properties[key]["items"]["type"], "string");
        }

        // Boolean properties.
        for key in ["ignoreSitemap", "allowBackwardLinks", "allowExternalLinks"] {
            assert_eq!(properties[key]["type"], "boolean");
        }

        // Numeric properties (generators may emit "integer" or "number").
        for key in ["maxDepth", "limit"] {
            assert!(
                properties[key]["type"] == "integer" || properties[key]["type"] == "number",
                "Property {} should be numeric",
                key
            );
        }
    }
}