firecrawl_sdk/
batch_scrape.rs

1use std::collections::HashMap;
2
3use log::info;
4use serde::{Deserialize, Serialize};
5use serde_json::Value;
6
7#[cfg(feature = "mcp_tool")]
8use schemars::JsonSchema;
9
10use crate::{document::Document, scrape::ScrapeOptions, FirecrawlApp, FirecrawlError, API_VERSION};
11
/// Configuration for a webhook that Firecrawl calls with batch-job notifications.
///
/// Serialized as camelCase; `None` fields are omitted from the JSON payload
/// (via `skip_serializing_none`).
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct Webhook {
    /// Webhook URL to notify when scraping is complete
    pub url: String,

    /// Custom headers to send with webhook requests
    pub headers: Option<HashMap<String, String>>,

    /// Custom metadata to include in the webhook payload
    pub metadata: Option<HashMap<String, Value>>,

    /// Events that trigger the webhook (e.g. "completed")
    pub events: Option<Vec<String>>,
}
29
30impl Webhook {
31    pub fn dummy() -> Self {
32        Webhook {
33            url: "https://webhook.example.com".to_string(),
34            headers: None,
35            metadata: None,
36            events: None,
37        }
38    }
39}
40
/// Request body for `POST /batch/scrape`.
///
/// Note that `webhook` is required here (callers without one pass
/// `Webhook::dummy()`), and the scrape options are flattened into the
/// top level of the JSON object rather than nested under a key.
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeRequestBody {
    /// List of URLs to scrape
    pub urls: Vec<String>,

    /// Webhook configuration for notifications
    pub webhook: Webhook,

    /// Whether to ignore invalid URLs
    // Explicit rename: camelCase would produce "ignoreInvalidUrls", but the
    // API expects the acronym fully capitalized ("ignoreInvalidURLs").
    #[serde(rename = "ignoreInvalidURLs")]
    pub ignore_invalid_urls: Option<bool>,

    /// Scraping options, flattened into the top-level JSON object
    #[serde(flatten)]
    pub options: ScrapeOptions,
}
58
/// Response body returned by `POST /batch/scrape` when the job is accepted.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct BatchScrapeResponse {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
    success: bool,

    /// The ID of the batch scrape job
    id: String,

    /// The URL to check the status of the batch scrape job
    url: String,

    /// If ignoreInvalidURLs is true, this is an array containing the invalid URLs
    /// that were specified in the request. If there were no invalid URLs, this will
    /// be an empty array. If ignoreInvalidURLs is false, this field will be undefined.
    #[serde(skip_serializing_if = "Option::is_none")]
    invalid_urls: Option<Vec<String>>,
}
77
/// High-level input for a batch scrape call (used e.g. as the MCP tool schema).
///
/// Fields marked `#[serde(skip)]` are client-side controls only and never
/// appear on the wire.
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeUrlsInput {
    /// List of URLs to scrape
    pub urls: Vec<String>,

    /// Webhook configuration for notifications; the url defaults to a dummy url
    pub webhook: Option<Webhook>,

    /// Whether to ignore invalid URLs (client-side only; not serialized)
    #[serde(skip)]
    pub ignore_invalid_urls: Option<bool>,

    /// Poll interval in milliseconds. (default: 2000)
    pub poll_interval: Option<u64>,

    /// Idempotency key forwarded as a request header (not serialized)
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// Scraping options, flattened into the top-level JSON object
    #[serde(flatten)]
    pub options: Option<ScrapeOptions>,
}
102
/// Lifecycle states of a batch scrape job, as reported by the status endpoint.
/// Serialized in camelCase ("scraping" / "completed" / "failed").
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[serde(rename_all = "camelCase")]
pub enum BatchScrapeStatusTypes {
    Scraping,
    Completed,
    Failed,
}
110
111impl Default for BatchScrapeStatusTypes {
112    fn default() -> Self {
113        Self::Scraping
114    }
115}
116
/// Status payload returned by `GET /batch/scrape/{id}`.
///
/// Counters and `data` use `#[serde(default)]` so a response that omits them
/// still deserializes (e.g. while the job is in progress).
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeStatus {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
    pub success: bool,

    /// The status of the batch scrape job
    pub status: BatchScrapeStatusTypes,

    /// The total number of URLs in the batch
    #[serde(default)]
    pub total: usize,

    /// The number of completed URLs in the batch
    #[serde(default)]
    pub completed: usize,

    /// The number of credits used for this batch scrape
    #[serde(default)]
    pub credits_used: usize,

    /// When the batch scrape results expire
    pub expires_at: Option<String>,

    /// Cursor for the next page of results, if any
    pub next: Option<String>,

    /// The resulting documents if the status is Completed
    #[serde(default)]
    pub data: Vec<Document>,
}
148
149impl FirecrawlApp {
150    /// Scrapes multiple URLs in a single request using the Firecrawl API.
151    pub async fn batch_scrape_urls(
152        &self,
153        urls: Vec<String>,
154        options: impl Into<Option<ScrapeOptions>>,
155        poll_interval: Option<u64>,
156        idempotency_key: Option<String>,
157        webhook: Webhook,
158        ignore_invalid_urls: Option<bool>,
159    ) -> Result<BatchScrapeStatus, FirecrawlError> {
160        let request_body = BatchScrapeRequestBody {
161            urls,
162            webhook,
163            ignore_invalid_urls,
164            options: options.into().unwrap_or_default(),
165        };
166
167        let headers = self.prepare_headers(idempotency_key.as_ref());
168
169        let response = self
170            .client
171            .post(format!("{}/{}/batch/scrape", self.api_url, API_VERSION))
172            .headers(headers)
173            .json(&request_body)
174            .send()
175            .await
176            .map_err(|e| FirecrawlError::HttpError("Batch scraping URLs".to_string(), e))?;
177
178        let response = self
179            .handle_response::<BatchScrapeResponse>(response, "batch scrape URLs")
180            .await?;
181
182        let poll_interval = poll_interval.unwrap_or(2000);
183        self.monitor_batch_scrape_status(&response.id, poll_interval)
184            .await
185    }
186
187    /// Checks the status of a batch scrape job.
188    pub async fn check_batch_scrape_status(
189        &self,
190        id: &str,
191    ) -> Result<BatchScrapeStatus, FirecrawlError> {
192        let headers = self.prepare_headers(None);
193
194        println!("Checking batch scrape status for job: {}", id);
195
196        let response = self
197            .client
198            .get(format!(
199                "{}/{}/batch/scrape/{}",
200                self.api_url, API_VERSION, id
201            ))
202            .headers(headers)
203            .send()
204            .await
205            .map_err(|e| {
206                FirecrawlError::HttpError("Checking batch scrape status".to_string(), e)
207            })?;
208
209        self.handle_response::<BatchScrapeStatus>(response, "check batch scrape status")
210            .await
211    }
212
213    /// Monitors a batch scrape job until it completes, fails, or is cancelled.
214    pub async fn monitor_batch_scrape_status(
215        &self,
216        id: &str,
217        poll_interval: u64,
218    ) -> Result<BatchScrapeStatus, FirecrawlError> {
219        let mut all_data = Vec::new();
220        let mut current_cursor: Option<String> = None;
221
222        loop {
223            let mut status_data = if let Some(ref cursor) = current_cursor {
224                self.check_batch_scrape_status_with_cursor(id, cursor)
225                    .await?
226            } else {
227                self.check_batch_scrape_status(id).await?
228            };
229
230            // Collect data from this page
231            all_data.append(&mut status_data.data);
232
233            // Check if we need to paginate
234            if let Some(next) = status_data.next {
235                current_cursor = Some(next);
236                continue;
237            }
238
239            // Check job status
240            match status_data.status {
241                BatchScrapeStatusTypes::Completed => {
242                    // Put all collected data back into the status
243                    status_data.data = all_data;
244                    break Ok(status_data);
245                }
246                BatchScrapeStatusTypes::Scraping => {
247                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
248                    // Keep the cursor as is, to continue from where we left off
249                }
250                BatchScrapeStatusTypes::Failed => {
251                    break Err(FirecrawlError::BatchScrapeJobFailed(
252                        "Batch scrape job failed.".to_string(),
253                    ));
254                }
255            }
256        }
257    }
258
259    /// Checks the status of a batch scrape job with a cursor for pagination.
260    pub async fn check_batch_scrape_status_with_cursor(
261        &self,
262        id: &str,
263        cursor: &str,
264    ) -> Result<BatchScrapeStatus, FirecrawlError> {
265        let headers = self.prepare_headers(None);
266
267        let response = self
268            .client
269            .get(format!(
270                "{}/{}/batch/scrape/{}?cursor={}",
271                self.api_url, API_VERSION, id, cursor
272            ))
273            .headers(headers)
274            .send()
275            .await
276            .map_err(|e| {
277                FirecrawlError::HttpError("Checking batch scrape status".to_string(), e)
278            })?;
279
280        self.handle_response::<BatchScrapeStatus>(response, "check batch scrape status")
281            .await
282    }
283}
284
#[cfg(test)]
mod tests {
    use super::*;
    use crate::scrape::{Action, ActionType, JsonOptions, ScrapeFormats};
    use serde_json::json;

    /// Deserializes a representative API request JSON into
    /// `BatchScrapeRequestBody` and checks every field — including the
    /// flattened `ScrapeOptions` — round-trips to the expected struct.
    #[test]
    fn test_batch_scrape_request_serialization() {
        // API example JSON
        let json_data = json!({
            "urls": ["https://example.com"],
            "webhook": {
                "url": "https://webhook.example.com",
                "headers": {},
                "metadata": {},
                "events": ["completed"]
            },
            // Everything below belongs to the flattened ScrapeOptions.
            "formats": ["markdown"],
            "onlyMainContent": true,
            "includeTags": ["div"],
            "excludeTags": ["img"],
            "headers": {},
            "waitFor": 0,
            "mobile": false,
            "skipTlsVerification": false,
            "timeout": 30000,
            "jsonOptions": {
                "schema": { "type": "object" },
                "systemPrompt": "Extract data",
                "prompt": "Extract title"
            },
            "actions": [
                {
                    "type": "wait",
                    "milliseconds": 2000,
                    "selector": "#my-element"
                }
            ],
            "location": {
                "country": "US",
                "languages": ["en-US"]
            },
            "removeBase64Images": true,
            "blockAds": true,
            "proxy": "basic"
        });

        // Deserialize the JSON to our request body struct
        let req_body: BatchScrapeRequestBody =
            serde_json::from_value(json_data).expect("Failed to deserialize JSON");

        // Create the expected complete request body struct
        let expected_req_body = BatchScrapeRequestBody {
            urls: vec!["https://example.com".to_string()],
            webhook: Webhook {
                url: "https://webhook.example.com".to_string(),
                headers: Some(HashMap::new()),
                metadata: Some(HashMap::new()),
                events: Some(vec!["completed".to_string()]),
            },
            ignore_invalid_urls: None, // This field wasn't in the JSON, so it should be None
            options: ScrapeOptions {
                formats: Some(vec![ScrapeFormats::Markdown]),
                only_main_content: Some(true),
                include_tags: Some(vec!["div".to_string()]),
                exclude_tags: Some(vec!["img".to_string()]),
                headers: Some(HashMap::new()),
                wait_for: Some(0),
                mobile: Some(false),
                skip_tls_verification: Some(false),
                timeout: Some(30000),
                json_options: Some(JsonOptions {
                    schema: Some(json!({"type": "object"})),
                    system_prompt: Some("Extract data".to_string()),
                    prompt: Some("Extract title".to_string()),
                }),
                actions: Some(vec![Action {
                    action_type: ActionType::Wait,
                    milliseconds: Some(2000),
                    selector: Some("#my-element".to_string()),
                    text: None,
                    key: None,
                    direction: None,
                    script: None,
                    full_page: None,
                }]),
                location: Some(crate::scrape::LocationOptions {
                    country: "US".to_string(),
                    languages: vec!["en-US".to_string()],
                }),
                remove_base64_images: Some(true),
                block_ads: Some(true),
                proxy: Some("basic".to_string()),
            },
        };

        // Compare the entire structs
        assert_eq!(req_body, expected_req_body);
    }

    /// Sanity-checks that a fully populated `ScrapeOptions` (as used in a
    /// batch scrape) exposes each field with the value it was built with.
    #[test]
    fn test_batch_scrape_options_to_scrape_options() {
        let scrape_options = ScrapeOptions {
            formats: Some(vec![ScrapeFormats::Markdown]),
            only_main_content: Some(true),
            include_tags: Some(vec!["div".to_string()]),
            exclude_tags: Some(vec!["img".to_string()]),
            headers: Some(HashMap::new()),
            wait_for: Some(1000),
            mobile: Some(true),
            skip_tls_verification: Some(false),
            timeout: Some(2000),
            json_options: Some(crate::scrape::JsonOptions::default()),
            actions: Some(vec![]),
            location: Some(crate::scrape::LocationOptions::default()),
            remove_base64_images: Some(true),
            block_ads: Some(true),
            proxy: Some("basic".to_string()),
        };

        assert_eq!(scrape_options.formats.as_ref().unwrap().len(), 1);
        assert!(matches!(
            scrape_options.formats.as_ref().unwrap()[0],
            ScrapeFormats::Markdown
        ));
        assert!(scrape_options.only_main_content.unwrap());
        assert_eq!(scrape_options.include_tags.as_ref().unwrap()[0], "div");
        assert_eq!(scrape_options.exclude_tags.as_ref().unwrap()[0], "img");
        assert_eq!(scrape_options.wait_for.unwrap(), 1000);
        assert!(scrape_options.headers.is_some());
        assert!(scrape_options.mobile.unwrap());
        assert!(!scrape_options.skip_tls_verification.unwrap());
        assert_eq!(scrape_options.timeout.unwrap(), 2000);
        assert!(scrape_options.json_options.is_some());
        assert!(scrape_options.actions.is_some());
        assert!(scrape_options.location.is_some());
        assert!(scrape_options.remove_base64_images.unwrap());
        assert!(scrape_options.block_ads.unwrap());
        assert_eq!(scrape_options.proxy.as_ref().unwrap(), "basic");
    }
}