firecrawl_sdk/
batch_scrape.rs

1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6#[cfg(feature = "mcp-tool")]
7use schemars::JsonSchema;
8
9use crate::{API_VERSION, FirecrawlApp, FirecrawlError, document::Document, scrape::ScrapeOptions};
10
/// Webhook configuration attached to a batch scrape request.
///
/// `None` fields are omitted entirely from the serialized JSON
/// (via `serde_with::skip_serializing_none`).
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct Webhook {
    /// Webhook URL to notify when scraping is complete
    pub url: String,

    /// Custom headers to send with webhook
    pub headers: Option<HashMap<String, String>>,

    /// Custom metadata to include in webhook payload
    pub metadata: Option<HashMap<String, Value>>,

    /// Events that trigger the webhook (e.g. "completed", as used in the
    /// tests below).
    /// NOTE(review): the full set of valid event names is defined by the
    /// Firecrawl API — confirm against its documentation.
    pub events: Option<Vec<String>>,
}
28
29impl Webhook {
30    pub fn dummy() -> Self {
31        Webhook {
32            url: "https://webhook.example.com".to_string(),
33            headers: None,
34            metadata: None,
35            events: None,
36        }
37    }
38}
39
/// JSON body sent to the batch scrape endpoint
/// (`POST /{API_VERSION}/batch/scrape`).
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeRequestBody {
    /// List of URLs to scrape
    pub urls: Vec<String>,

    /// Webhook configuration for notifications
    pub webhook: Webhook,

    /// Whether to ignore invalid URLs
    // Explicit rename: the API spells this with all-caps "URLs", which
    // `rename_all = "camelCase"` (-> "ignoreInvalidUrls") would not produce.
    #[serde(rename = "ignoreInvalidURLs")]
    pub ignore_invalid_urls: Option<bool>,

    /// Scraping options
    // Flattened so the scrape options appear as top-level keys of the
    // request body rather than nested under an "options" key.
    #[serde(flatten)]
    pub options: ScrapeOptions,
}
57
58#[derive(Deserialize, Serialize, Debug, Default)]
59#[serde(rename_all = "camelCase")]
60struct BatchScrapeResponse {
61    /// This will always be `true` due to `FirecrawlApp::handle_response`.
62    success: bool,
63
64    /// The ID of the batch scrape job
65    id: String,
66
67    /// The URL to check the status of the batch scrape job
68    url: String,
69
70    /// If ignoreInvalidURLs is true, this is an array containing the invalid URLs
71    /// that were specified in the request. If there were no invalid URLs, this will
72    /// be an empty array. If ignoreInvalidURLs is false, this field will be undefined.
73    #[serde(skip_serializing_if = "Option::is_none")]
74    invalid_urls: Option<Vec<String>>,
75}
76
/// High-level input for batch scraping, also exposed as an MCP tool schema
/// when the `mcp-tool` feature is enabled.
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeUrlsInput {
    /// List of URLs to scrape
    pub urls: Vec<String>,

    /// Webhook configuration for notifications. When `None`, a dummy
    /// webhook URL is used as the default (see `Webhook::dummy`).
    pub webhook: Option<Webhook>,

    /// Whether to ignore invalid URLs
    // `skip`: not part of the serialized input / MCP tool schema.
    #[serde(skip)]
    pub ignore_invalid_urls: Option<bool>,

    /// Poll interval in milliseconds. (default: 2000)
    pub poll_interval: Option<u64>,

    // Optional idempotency key forwarded as a request header;
    // `skip`: not part of the serialized input / MCP tool schema.
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// Scraping options
    // Flattened so scrape options appear as top-level keys of the input.
    #[serde(flatten)]
    pub options: Option<ScrapeOptions>,
}
101
102#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
103#[serde(rename_all = "camelCase")]
104pub enum BatchScrapeStatusTypes {
105    Scraping,
106    Completed,
107    Failed,
108}
109
110impl Default for BatchScrapeStatusTypes {
111    fn default() -> Self {
112        Self::Scraping
113    }
114}
115
/// Status payload returned by the batch scrape status endpoint.
///
/// Counter fields use `#[serde(default)]` so a response that omits them
/// still deserializes (falling back to `0` / an empty `Vec`).
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeStatus {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
    pub success: bool,

    /// The status of the batch scrape job
    pub status: BatchScrapeStatusTypes,

    /// The total number of URLs in the batch
    #[serde(default)]
    pub total: usize,

    /// The number of completed URLs in the batch
    #[serde(default)]
    pub completed: usize,

    /// The number of credits used for this batch scrape
    #[serde(default)]
    pub credits_used: usize,

    /// When the batch scrape results expire
    // NOTE(review): presumably an ISO-8601 timestamp string — confirm
    // against the API before parsing.
    pub expires_at: Option<String>,

    /// Cursor for the next page of results, if any
    pub next: Option<String>,

    /// The resulting documents if the status is Completed
    #[serde(default)]
    pub data: Vec<Document>,
}
147
148impl FirecrawlApp {
149    /// Scrapes multiple URLs in a single request using the Firecrawl API.
150    pub async fn batch_scrape_urls(
151        &self,
152        urls: Vec<String>,
153        options: impl Into<Option<ScrapeOptions>>,
154        poll_interval: Option<u64>,
155        idempotency_key: Option<String>,
156        webhook: Webhook,
157        ignore_invalid_urls: Option<bool>,
158    ) -> Result<BatchScrapeStatus, FirecrawlError> {
159        let request_body = BatchScrapeRequestBody {
160            urls,
161            webhook,
162            ignore_invalid_urls,
163            options: options.into().unwrap_or_default(),
164        };
165
166        let headers = self.prepare_headers(idempotency_key.as_ref());
167
168        let response = self
169            .client
170            .post(format!("{}/{}/batch/scrape", self.api_url, API_VERSION))
171            .headers(headers)
172            .json(&request_body)
173            .send()
174            .await
175            .map_err(|e| FirecrawlError::HttpError("Batch scraping URLs".to_string(), e))?;
176
177        let response = self
178            .handle_response::<BatchScrapeResponse>(response, "batch scrape URLs")
179            .await?;
180
181        let poll_interval = poll_interval.unwrap_or(2000);
182        self.monitor_batch_scrape_status(&response.id, poll_interval)
183            .await
184    }
185
186    /// Checks the status of a batch scrape job.
187    pub async fn check_batch_scrape_status(
188        &self,
189        id: &str,
190    ) -> Result<BatchScrapeStatus, FirecrawlError> {
191        let headers = self.prepare_headers(None);
192
193        println!("Checking batch scrape status for job: {}", id);
194
195        let response = self
196            .client
197            .get(format!(
198                "{}/{}/batch/scrape/{}",
199                self.api_url, API_VERSION, id
200            ))
201            .headers(headers)
202            .send()
203            .await
204            .map_err(|e| {
205                FirecrawlError::HttpError("Checking batch scrape status".to_string(), e)
206            })?;
207
208        self.handle_response::<BatchScrapeStatus>(response, "check batch scrape status")
209            .await
210    }
211
212    /// Monitors a batch scrape job until it completes, fails, or is cancelled.
213    pub async fn monitor_batch_scrape_status(
214        &self,
215        id: &str,
216        poll_interval: u64,
217    ) -> Result<BatchScrapeStatus, FirecrawlError> {
218        let mut all_data = Vec::new();
219        let mut current_cursor: Option<String> = None;
220
221        loop {
222            let mut status_data = if let Some(ref cursor) = current_cursor {
223                self.check_batch_scrape_status_with_cursor(id, cursor)
224                    .await?
225            } else {
226                self.check_batch_scrape_status(id).await?
227            };
228
229            // Collect data from this page
230            all_data.append(&mut status_data.data);
231
232            // Check if we need to paginate
233            if let Some(next) = status_data.next {
234                current_cursor = Some(next);
235                continue;
236            }
237
238            // Check job status
239            match status_data.status {
240                BatchScrapeStatusTypes::Completed => {
241                    // Put all collected data back into the status
242                    status_data.data = all_data;
243                    break Ok(status_data);
244                }
245                BatchScrapeStatusTypes::Scraping => {
246                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
247                    // Keep the cursor as is, to continue from where we left off
248                }
249                BatchScrapeStatusTypes::Failed => {
250                    break Err(FirecrawlError::BatchScrapeJobFailed(
251                        "Batch scrape job failed.".to_string(),
252                    ));
253                }
254            }
255        }
256    }
257
258    /// Checks the status of a batch scrape job with a cursor for pagination.
259    pub async fn check_batch_scrape_status_with_cursor(
260        &self,
261        id: &str,
262        cursor: &str,
263    ) -> Result<BatchScrapeStatus, FirecrawlError> {
264        let headers = self.prepare_headers(None);
265
266        let response = self
267            .client
268            .get(format!(
269                "{}/{}/batch/scrape/{}?cursor={}",
270                self.api_url, API_VERSION, id, cursor
271            ))
272            .headers(headers)
273            .send()
274            .await
275            .map_err(|e| {
276                FirecrawlError::HttpError("Checking batch scrape status".to_string(), e)
277            })?;
278
279        self.handle_response::<BatchScrapeStatus>(response, "check batch scrape status")
280            .await
281    }
282}
283
#[cfg(test)]
mod tests {
    use super::*;
    use crate::scrape::{Action, ActionType, JsonOptions, ScrapeFormats};
    use serde_json::json;

    /// The documented API example JSON must deserialize into exactly the
    /// expected `BatchScrapeRequestBody`, field for field.
    #[test]
    fn test_batch_scrape_request_serialization() {
        // API example JSON
        let payload = json!({
            "urls": ["https://example.com"],
            "webhook": {
                "url": "https://webhook.example.com",
                "headers": {},
                "metadata": {},
                "events": ["completed"]
            },
            "formats": ["markdown"],
            "onlyMainContent": true,
            "includeTags": ["div"],
            "excludeTags": ["img"],
            "headers": {},
            "waitFor": 0,
            "mobile": false,
            "skipTlsVerification": false,
            "timeout": 30000,
            "jsonOptions": {
                "schema": { "type": "object" },
                "systemPrompt": "Extract data",
                "prompt": "Extract title"
            },
            "actions": [
                {
                    "type": "wait",
                    "milliseconds": 2000,
                    "selector": "#my-element"
                }
            ],
            "location": {
                "country": "US",
                "languages": ["en-US"]
            },
            "removeBase64Images": true,
            "blockAds": true,
            "proxy": "basic"
        });

        let parsed: BatchScrapeRequestBody =
            serde_json::from_value(payload).expect("Failed to deserialize JSON");

        // Build the expectation piecewise so each part is easy to scan.
        let expected_webhook = Webhook {
            url: String::from("https://webhook.example.com"),
            headers: Some(HashMap::new()),
            metadata: Some(HashMap::new()),
            events: Some(vec![String::from("completed")]),
        };

        let expected_options = ScrapeOptions {
            formats: Some(vec![ScrapeFormats::Markdown]),
            only_main_content: Some(true),
            include_tags: Some(vec![String::from("div")]),
            exclude_tags: Some(vec![String::from("img")]),
            headers: Some(HashMap::new()),
            wait_for: Some(0),
            mobile: Some(false),
            skip_tls_verification: Some(false),
            timeout: Some(30000),
            json_options: Some(JsonOptions {
                schema: Some(json!({"type": "object"})),
                system_prompt: Some(String::from("Extract data")),
                prompt: Some(String::from("Extract title")),
            }),
            actions: Some(vec![Action {
                action_type: ActionType::Wait,
                milliseconds: Some(2000),
                selector: Some(String::from("#my-element")),
                text: None,
                key: None,
                direction: None,
                script: None,
                full_page: None,
            }]),
            location: Some(crate::scrape::LocationOptions {
                country: String::from("US"),
                languages: vec![String::from("en-US")],
            }),
            remove_base64_images: Some(true),
            block_ads: Some(true),
            proxy: Some(String::from("basic")),
        };

        let expected = BatchScrapeRequestBody {
            urls: vec![String::from("https://example.com")],
            webhook: expected_webhook,
            // Absent from the JSON, so deserialization leaves it as `None`.
            ignore_invalid_urls: None,
            options: expected_options,
        };

        assert_eq!(parsed, expected);
    }

    /// Sanity-check that a fully populated `ScrapeOptions` keeps every
    /// field readable with exactly the value it was constructed with.
    #[test]
    fn test_batch_scrape_options_to_scrape_options() {
        let options = ScrapeOptions {
            formats: Some(vec![ScrapeFormats::Markdown]),
            only_main_content: Some(true),
            include_tags: Some(vec![String::from("div")]),
            exclude_tags: Some(vec![String::from("img")]),
            headers: Some(HashMap::new()),
            wait_for: Some(1000),
            mobile: Some(true),
            skip_tls_verification: Some(false),
            timeout: Some(2000),
            json_options: Some(crate::scrape::JsonOptions::default()),
            actions: Some(vec![]),
            location: Some(crate::scrape::LocationOptions::default()),
            remove_base64_images: Some(true),
            block_ads: Some(true),
            proxy: Some(String::from("basic")),
        };

        // Value-carrying fields.
        let formats = options.formats.as_ref().unwrap();
        assert_eq!(formats.len(), 1);
        assert!(matches!(formats[0], ScrapeFormats::Markdown));
        assert_eq!(options.include_tags.as_ref().unwrap()[0], "div");
        assert_eq!(options.exclude_tags.as_ref().unwrap()[0], "img");
        assert_eq!(options.wait_for.unwrap(), 1000);
        assert_eq!(options.timeout.unwrap(), 2000);
        assert_eq!(options.proxy.as_ref().unwrap(), "basic");

        // Boolean flags.
        assert!(options.only_main_content.unwrap());
        assert!(options.mobile.unwrap());
        assert!(!options.skip_tls_verification.unwrap());
        assert!(options.remove_base64_images.unwrap());
        assert!(options.block_ads.unwrap());

        // Presence-only checks.
        assert!(options.headers.is_some());
        assert!(options.json_options.is_some());
        assert!(options.actions.is_some());
        assert!(options.location.is_some());
    }
}