firecrawl_sdk/
crawl.rs

1use serde::{Deserialize, Serialize};
2
3#[cfg(feature = "mcp-tool")]
4use schemars::JsonSchema;
5
6use crate::{
7    API_VERSION, FirecrawlApp, FirecrawlError,
8    batch_scrape::Webhook,
9    document::Document,
10    scrape::{ScrapeFormats, ScrapeOptions},
11};
12
/// Output formats that may be requested for every page scraped during a
/// crawl. This is the crawl-specific subset of [`ScrapeFormats`]; convert
/// with `ScrapeFormats::from(...)`.
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
pub enum CrawlScrapeFormats {
    /// Will result in a copy of the Markdown content of the page.
    #[serde(rename = "markdown")]
    Markdown,

    /// Will result in a copy of the filtered, content-only HTML of the page.
    #[serde(rename = "html")]
    HTML,

    /// Will result in a copy of the raw HTML of the page.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Will result in a Vec of URLs found on the page.
    #[serde(rename = "links")]
    Links,

    /// Will result in a URL to a screenshot of the page.
    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::ScreenshotFullPage`.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Will result in a URL to a full-page screenshot of the page.
    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::Screenshot`.
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,
}
44
45impl From<CrawlScrapeFormats> for ScrapeFormats {
46    fn from(value: CrawlScrapeFormats) -> Self {
47        match value {
48            CrawlScrapeFormats::Markdown => Self::Markdown,
49            CrawlScrapeFormats::HTML => Self::HTML,
50            CrawlScrapeFormats::RawHTML => Self::RawHTML,
51            CrawlScrapeFormats::Links => Self::Links,
52            CrawlScrapeFormats::Screenshot => Self::Screenshot,
53            CrawlScrapeFormats::ScreenshotFullPage => Self::ScreenshotFullPage,
54        }
55    }
56}
57
/// Options controlling a crawl job. Serialized as camelCase; any field left
/// as `None` is omitted from the request body (`skip_serializing_none`).
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlOptions {
    /// Options for scraping each page
    pub scrape_options: Option<ScrapeOptions>,

    /// Only crawl these URL paths
    pub include_paths: Option<Vec<String>>,

    /// URL paths to exclude from crawling
    pub exclude_paths: Option<Vec<String>>,

    /// Maximum link depth to crawl. (default: `2`)
    pub max_depth: Option<u32>,

    /// Skip sitemap.xml discovery. (default: `true`)
    pub ignore_sitemap: Option<bool>,

    /// Maximum number of pages to crawl. (default: `10`)
    pub limit: Option<u32>,

    /// Allow crawling links that point to parent directories. (default: `false`)
    pub allow_backward_links: Option<bool>,

    /// Allow crawling links to external domains. (default: `false`)
    pub allow_external_links: Option<bool>,

    /// Remove similar URLs during crawl
    // Explicit rename: `rename_all = "camelCase"` would produce
    // "deduplicateSimilarUrls", but the API expects "deduplicateSimilarURLs".
    #[serde(rename = "deduplicateSimilarURLs")]
    pub deduplicate_similar_urls: Option<bool>,

    /// Ignore query parameters when comparing URLs
    pub ignore_query_parameters: Option<bool>,
}
94
/// Request body sent when starting a crawl job. The `options` fields are
/// flattened into the top level of the JSON object.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct CrawlRequestBody {
    /// Starting URL for the crawl
    pub url: String,

    /// Crawl options, serialized inline alongside `url`.
    #[serde(flatten)]
    pub options: CrawlOptions,

    /// Webhook URL to notify when crawl is complete
    pub webhook: Webhook,
}
107
/// Response payload for a single-document crawl result.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct CrawlResponse {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
    /// No need to expose.
    pub success: bool,

    /// The resulting document.
    pub data: Document,
}
118
/// Lifecycle states of a crawl job as reported by the status endpoint.
/// `Scraping` is the only non-terminal state.
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub enum CrawlStatusTypes {
    /// The crawl job is in progress.
    Scraping,

    /// The crawl job has been completed successfully.
    Completed,

    /// The crawl job has failed.
    Failed,

    /// The crawl job has been cancelled.
    Cancelled,
}
135
/// Snapshot of a crawl job's progress and (possibly paginated) results,
/// as returned by the crawl status endpoint.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlStatus {
    /// The status of the crawl.
    pub status: CrawlStatusTypes,

    /// Number of pages that will be scraped in total. This number may grow as the crawler discovers new pages.
    pub total: u32,

    /// Number of pages that have been successfully scraped.
    pub completed: u32,

    /// Amount of credits used by the crawl job.
    pub credits_used: u32,

    /// Expiry time of crawl data. After this date, the crawl data will be unavailable from the API.
    pub expires_at: String, // TODO: parse into date

    /// URL to call to get the next batch of documents.
    /// Unless you are sidestepping the SDK, you do not need to deal with this.
    pub next: Option<String>,

    /// List of documents returned by the crawl
    pub data: Vec<Document>,
}
162
/// Response returned when a crawl job is started asynchronously.
#[derive(Deserialize, Serialize, Debug, Clone)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlAsyncResponse {
    // Always `true` on a successful response; kept private since callers
    // never need it (errors surface as `FirecrawlError` instead).
    success: bool,

    /// Crawl ID
    pub id: String,

    /// URL to get the status of the crawl job
    pub url: String,
}
175
/// Combined input for starting a crawl: the start URL, crawl options
/// (flattened into the top level), polling interval, and optional webhook.
// NOTE(review): derives `JsonSchema` under the "mcp-tool" feature, so this
// presumably serves as the MCP tool's input schema — confirm at the call site.
#[derive(Deserialize, Serialize, Debug, Default)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct CrawlUrlInput {
    /// Starting URL for the crawl
    pub url: String,

    /// Crawl options, serialized inline alongside `url`.
    #[serde(flatten)]
    pub options: CrawlOptions,

    /// How often the status of the job should be checked, in milliseconds. (default: `2000`)
    pub poll_interval: Option<u64>,

    // Sent as an HTTP header, not in the JSON body, hence `skip`.
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// Webhook URL to notify when crawl is complete, default to a dummy url
    pub webhook: Option<Webhook>,
}
195
196impl FirecrawlApp {
197    /// Initiates a crawl job for a URL using the Firecrawl API.
198    pub async fn crawl_url_async(
199        &self,
200        url: impl AsRef<str>,
201        options: Option<CrawlOptions>,
202        idempotency_key: Option<String>,
203        webhook: Webhook,
204    ) -> Result<CrawlAsyncResponse, FirecrawlError> {
205        let body = CrawlRequestBody {
206            url: url.as_ref().to_string(),
207            options: options.unwrap_or_default(),
208            webhook,
209        };
210
211        let headers = self.prepare_headers(idempotency_key.as_ref());
212
213        let response = self
214            .client
215            .post(format!("{}/{}/crawl", self.api_url, API_VERSION))
216            .headers(headers.clone())
217            .json(&body)
218            .send()
219            .await
220            .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;
221
222        self.handle_response::<CrawlAsyncResponse>(response, "start crawl job")
223            .await
224    }
225
226    /// Performs a crawl job for a URL using the Firecrawl API, waiting for the end result. This may take a long time depending on the size of the target page and your options (namely `CrawlOptions.limit`).
227    pub async fn crawl_url(
228        &self,
229        url: impl AsRef<str>,
230        options: impl Into<Option<CrawlOptions>>,
231        webhook: Webhook,
232        poll_interval: Option<u64>,
233        idempotency_key: Option<String>,
234    ) -> Result<CrawlStatus, FirecrawlError> {
235        let options = options.into();
236        let poll_interval = poll_interval.unwrap_or(2000);
237
238        let res = self
239            .crawl_url_async(url, options, idempotency_key, webhook)
240            .await?;
241
242        self.monitor_crawl_status(&res.id, poll_interval).await
243    }
244
245    async fn check_crawl_status_next(
246        &self,
247        next: impl AsRef<str>,
248    ) -> Result<CrawlStatus, FirecrawlError> {
249        let response = self
250            .client
251            .get(next.as_ref())
252            .headers(self.prepare_headers(None))
253            .send()
254            .await
255            .map_err(|e| {
256                FirecrawlError::HttpError(
257                    format!("Paginating crawl using URL {:?}", next.as_ref()),
258                    e,
259                )
260            })?;
261
262        self.handle_response(
263            response,
264            format!("Paginating crawl using URL {:?}", next.as_ref()),
265        )
266        .await
267    }
268
269    /// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`.
270    pub async fn check_crawl_status(
271        &self,
272        id: impl AsRef<str>,
273    ) -> Result<CrawlStatus, FirecrawlError> {
274        let response = self
275            .client
276            .get(format!(
277                "{}/{}/crawl/{}",
278                self.api_url,
279                API_VERSION,
280                id.as_ref()
281            ))
282            .headers(self.prepare_headers(None))
283            .send()
284            .await
285            .map_err(|e| {
286                FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e)
287            })?;
288
289        let mut status: CrawlStatus = self
290            .handle_response(
291                response,
292                format!("Checking status of crawl {}", id.as_ref()),
293            )
294            .await?;
295
296        if status.status == CrawlStatusTypes::Completed {
297            while let Some(next) = status.next {
298                let new_status = self.check_crawl_status_next(next).await?;
299                status.data.extend_from_slice(&new_status.data);
300                status.next = new_status.next;
301            }
302        }
303
304        Ok(status)
305    }
306
307    async fn monitor_crawl_status(
308        &self,
309        id: &str,
310        poll_interval: u64,
311    ) -> Result<CrawlStatus, FirecrawlError> {
312        let mut all_data = Vec::new();
313        let mut current_cursor: Option<String> = None;
314
315        loop {
316            // Get status data, either from the base endpoint or using the next cursor
317            let mut status_data = if let Some(ref cursor) = current_cursor {
318                self.check_crawl_status_next(cursor).await?
319            } else {
320                self.check_crawl_status(id).await?
321            };
322
323            // Collect data from this page
324            all_data.append(&mut status_data.data);
325
326            // Check if we need to paginate
327            if let Some(next) = status_data.next {
328                current_cursor = Some(next);
329                continue;
330            }
331
332            // Check job status
333            match status_data.status {
334                CrawlStatusTypes::Completed => {
335                    // Put all collected data back into the status
336                    status_data.data = all_data;
337                    break Ok(status_data);
338                }
339                CrawlStatusTypes::Scraping => {
340                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
341                    // Keep the cursor as is, to continue from where we left off
342                }
343                CrawlStatusTypes::Failed => {
344                    // Put all collected data back into the status for error context
345                    status_data.data = all_data;
346                    break Err(FirecrawlError::CrawlJobFailed(
347                        "Crawl job failed.".to_string(),
348                        status_data,
349                    ));
350                }
351                CrawlStatusTypes::Cancelled => {
352                    // Put all collected data back into the status for error context
353                    status_data.data = all_data;
354                    break Err(FirecrawlError::CrawlJobCancelled(status_data));
355                }
356            }
357        }
358    }
359}
360
#[cfg(all(test, feature = "mcp-tool"))]
mod schema_tests {
    use super::*;
    use async_claude;

    /// Verifies the generated JSON schema for `CrawlOptions` exposes the
    /// expected camelCase properties with the expected types.
    #[test]
    fn test_crawl_options_schema() {
        let actual_schema = async_claude::tool::parse_input_schema::<CrawlOptions>().unwrap();

        // For debugging
        println!(
            "Schema properties: {}",
            serde_json::to_string_pretty(&actual_schema["properties"]).unwrap()
        );

        // Check basic structure
        assert_eq!(actual_schema["type"], "object");

        // Get properties object
        let properties = &actual_schema["properties"];
        assert!(properties.is_object());

        // Get the actual property keys from the schema
        let property_keys: Vec<String> = properties
            .as_object()
            .unwrap()
            .keys()
            .map(|k| k.to_string())
            .collect();

        println!("Actual property keys: {:?}", property_keys);

        // Check that important properties exist, being flexible with URL vs Url
        assert!(
            property_keys.contains(&"scrapeOptions".to_string()),
            "scrapeOptions not found"
        );
        assert!(
            property_keys.contains(&"includePaths".to_string()),
            "includePaths not found"
        );
        assert!(
            property_keys.contains(&"excludePaths".to_string()),
            "excludePaths not found"
        );
        assert!(
            property_keys.contains(&"maxDepth".to_string()),
            "maxDepth not found"
        );
        assert!(
            property_keys.contains(&"ignoreSitemap".to_string()),
            "ignoreSitemap not found"
        );
        assert!(
            property_keys.contains(&"limit".to_string()),
            "limit not found"
        );
        assert!(
            property_keys.contains(&"allowBackwardLinks".to_string()),
            "allowBackwardLinks not found"
        );
        assert!(
            property_keys.contains(&"allowExternalLinks".to_string()),
            "allowExternalLinks not found"
        );
        // Fixed: the previous version also asserted a "webhook" property,
        // but `CrawlOptions` has no `webhook` field (it lives on
        // `CrawlRequestBody` / `CrawlUrlInput`), so the derived schema can
        // never contain it and that assertion always failed.

        // Check for deduplicateSimilarURLs or deduplicateSimilarUrls
        assert!(
            property_keys
                .iter()
                .any(|k| k.to_lowercase() == "deduplicatesimilarurls"),
            "deduplicateSimilarURLs not found"
        );

        // Check for ignoreQueryParameters
        assert!(
            property_keys.contains(&"ignoreQueryParameters".to_string()),
            "ignoreQueryParameters not found"
        );

        // Check expected property types and descriptions for properties that certainly exist
        assert_eq!(properties["scrapeOptions"]["type"], "object");

        // Check array properties
        assert_eq!(properties["includePaths"]["type"], "array");
        assert_eq!(properties["includePaths"]["items"]["type"], "string");
        assert_eq!(properties["excludePaths"]["type"], "array");
        assert_eq!(properties["excludePaths"]["items"]["type"], "string");

        // Check boolean properties
        assert_eq!(properties["ignoreSitemap"]["type"], "boolean");
        assert_eq!(properties["allowBackwardLinks"]["type"], "boolean");
        assert_eq!(properties["allowExternalLinks"]["type"], "boolean");

        // Check numeric properties
        assert!(
            properties["maxDepth"]["type"] == "integer"
                || properties["maxDepth"]["type"] == "number",
            "Property maxDepth should be numeric"
        );
        assert!(
            properties["limit"]["type"] == "integer" || properties["limit"]["type"] == "number",
            "Property limit should be numeric"
        );
    }
}