firecrawl_sdk/
batch_scrape.rs

1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6#[cfg(feature = "mcp_tool")]
7use schemars::JsonSchema;
8
9use crate::{
10    document::Document,
11    scrape::{Action, JsonOptions, LocationOptions, ScrapeFormats, ScrapeOptions},
12    FirecrawlApp, FirecrawlError, API_VERSION,
13};
14
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
/// Webhook configuration attached to a batch-scrape request; `None` fields
/// are omitted from the serialized payload via `skip_serializing_none`.
pub struct BatchScrapeWebhook {
    /// Webhook URL to notify when scraping is complete
    pub url: String,

    /// Custom headers to send with webhook
    pub headers: Option<HashMap<String, String>>,

    /// Custom metadata to include in webhook payload
    pub metadata: Option<HashMap<String, Value>>,

    /// Events that trigger the webhook (e.g. "completed")
    pub events: Option<Vec<String>>,
}
32
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
/// Per-page scraping options for a batch request. Mirrors `ScrapeOptions`
/// minus its `extract`/`language`/`parse_pdf` fields (see the `From` impl
/// below). Fields marked `schemars(skip)` are hidden from the MCP tool
/// schema but still serialized to the API.
pub struct BatchScrapeOptions {
    /// Content formats to extract
    pub formats: Option<Vec<ScrapeFormats>>,

    /// Extract only the main content, filtering out navigation, footers, etc.
    pub only_main_content: Option<bool>,

    /// HTML tags to specifically include in extraction
    pub include_tags: Option<Vec<String>>,

    /// HTML tags to exclude from extraction
    pub exclude_tags: Option<Vec<String>>,

    /// Additional HTTP headers to use when loading the page.
    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
    pub headers: Option<HashMap<String, String>>,

    /// Time in milliseconds to wait for dynamic content to load
    pub wait_for: Option<u32>,

    /// Use mobile viewport
    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
    pub mobile: Option<bool>,

    /// Skip TLS certificate verification
    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
    pub skip_tls_verification: Option<bool>,

    /// Maximum time in milliseconds to wait for the page to load
    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
    pub timeout: Option<u32>,

    /// JSON options for structured data extraction
    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
    // Explicit rename: camelCase of `json_options` would already be
    // "jsonOptions", so this is redundant but harmless.
    #[serde(rename = "jsonOptions")]
    pub json_options: Option<JsonOptions>,

    /// List of actions to perform before scraping
    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
    pub actions: Option<Vec<Action>>,

    /// Location settings for scraping
    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
    pub location: Option<LocationOptions>,

    /// Remove base64 encoded images from output
    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
    pub remove_base64_images: Option<bool>,

    /// Block ads during page loading
    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
    pub block_ads: Option<bool>,

    /// Proxy configuration to use (values: "none", "basic", "residential")
    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
    pub proxy: Option<String>,
}
94
95impl Default for BatchScrapeOptions {
96    fn default() -> Self {
97        Self {
98            formats: None,
99            only_main_content: None,
100            include_tags: None,
101            exclude_tags: None,
102            headers: None,
103            json_options: None,
104            actions: None,
105            location: None,
106            wait_for: None,
107            mobile: None,
108            skip_tls_verification: None,
109            timeout: None,
110            remove_base64_images: None,
111            block_ads: None,
112            proxy: None,
113        }
114    }
115}
116
117impl From<BatchScrapeOptions> for ScrapeOptions {
118    fn from(options: BatchScrapeOptions) -> Self {
119        ScrapeOptions {
120            formats: options.formats,
121            only_main_content: options.only_main_content,
122            include_tags: options.include_tags,
123            exclude_tags: options.exclude_tags,
124            headers: options.headers,
125            json_options: options.json_options,
126            actions: options.actions,
127            location: options.location,
128            wait_for: options.wait_for,
129            mobile: options.mobile,
130            skip_tls_verification: options.skip_tls_verification,
131            timeout: options.timeout,
132            remove_base64_images: options.remove_base64_images,
133            block_ads: options.block_ads,
134            proxy: options.proxy,
135            extract: None,
136            language: None,
137            parse_pdf: None,
138        }
139    }
140}
141
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
/// Request body for the `/batch-scrape` endpoint. Because `options` is
/// flattened, its fields serialize at the top level of the JSON object
/// alongside `urls` and `webhook`.
pub struct BatchScrapeRequestBody {
    /// List of URLs to scrape
    pub urls: Vec<String>,

    /// Webhook configuration for notifications
    pub webhook: Option<BatchScrapeWebhook>,

    /// Whether to ignore invalid URLs
    pub ignore_invalid_urls: Option<bool>,

    /// Scraping options
    #[serde(flatten)]
    pub options: BatchScrapeOptions,
}
160
#[derive(Deserialize, Serialize, Debug, Default)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
/// Deserialized response from the batch-scrape endpoint; internal only.
struct BatchScrapeResponse {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
    success: bool,

    /// The resulting documents.
    data: Vec<Document>,
}
171
172impl FirecrawlApp {
173    /// Scrapes multiple URLs in a single request using the Firecrawl API.
174    pub async fn batch_scrape_urls(
175        &self,
176        urls: Vec<String>,
177        webhook: Option<BatchScrapeWebhook>,
178        ignore_invalid_urls: Option<bool>,
179        options: impl Into<Option<BatchScrapeOptions>>,
180    ) -> Result<Vec<Document>, FirecrawlError> {
181        let request_body = BatchScrapeRequestBody {
182            urls,
183            webhook,
184            ignore_invalid_urls,
185            options: options.into().unwrap_or_default(),
186        };
187
188        let headers = self.prepare_headers(None);
189
190        let response = self
191            .client
192            .post(format!("{}/{}/batch-scrape", self.api_url, API_VERSION))
193            .headers(headers)
194            .json(&request_body)
195            .send()
196            .await
197            .map_err(|e| FirecrawlError::HttpError("Batch scraping URLs".to_string(), e))?;
198
199        let response = self
200            .handle_response::<BatchScrapeResponse>(response, "batch scrape URLs")
201            .await?;
202
203        Ok(response.data)
204    }
205}
206
#[cfg(all(test, feature = "mcp_tool"))]
mod schema_tests {
    use super::*;
    use async_claude;
    use serde_json::json;

    /// Verifies the MCP tool JSON schema generated for the batch-scrape
    /// request body, including the option fields flattened to the top level.
    #[test]
    fn test_batch_scrape_request_schema() {
        let schema = async_claude::tool::parse_input_schema::<BatchScrapeRequestBody>().unwrap();
        println!("Schema: {:#?}", schema);

        // Top-level shape: an object with a properties map.
        assert_eq!(schema["type"], "object");
        let props = &schema["properties"];
        assert!(props.is_object());

        // `urls` must be among the required fields.
        let required = &schema["required"];
        assert!(required.is_array());
        assert!(required.as_array().unwrap().contains(&json!("urls")));

        // `urls`: an array of strings, described by its doc comment.
        assert_eq!(props["urls"]["type"], "array");
        assert_eq!(props["urls"]["items"]["type"], "string");
        assert_eq!(props["urls"]["description"], "List of URLs to scrape");

        // Options are #[serde(flatten)]ed, so their properties appear
        // directly at the top level rather than under an "options" key.
        assert_eq!(props["formats"]["type"], "array");
        assert_eq!(props["onlyMainContent"]["type"], "boolean");

        assert_eq!(props["includeTags"]["type"], "array");
        assert_eq!(props["includeTags"]["items"]["type"], "string");
        assert_eq!(props["excludeTags"]["type"], "array");
        assert_eq!(props["excludeTags"]["items"]["type"], "string");

        // Schema generators may emit either "integer" or "number" for u32.
        assert!(
            props["waitFor"]["type"] == "integer" || props["waitFor"]["type"] == "number"
        );
    }
}
255
#[cfg(test)]
mod tests {
    use super::*;
    // NOTE(review): `DocumentMetadata` does not appear to be referenced in
    // this module — confirm and drop the import if so.
    use crate::document::DocumentMetadata;
    use crate::scrape::ActionType;
    use serde_json::json;

    /// Round-trip check: a full API-shaped JSON payload deserializes into
    /// exactly the expected `BatchScrapeRequestBody`, including the
    /// flattened option fields.
    #[test]
    fn test_batch_scrape_request_serialization() {
        // API example JSON
        let json_data = json!({
            "urls": ["https://example.com"],
            "webhook": {
                "url": "https://webhook.example.com",
                "headers": {},
                "metadata": {},
                "events": ["completed"]
            },
            "formats": ["markdown"],
            "onlyMainContent": true,
            "includeTags": ["div"],
            "excludeTags": ["img"],
            "headers": {},
            "waitFor": 0,
            "mobile": false,
            "skipTlsVerification": false,
            "timeout": 30000,
            "jsonOptions": {
                "schema": { "type": "object" },
                "systemPrompt": "Extract data",
                "prompt": "Extract title"
            },
            "actions": [
                {
                    "type": "wait",
                    "milliseconds": 2000,
                    "selector": "#my-element"
                }
            ],
            "location": {
                "country": "US",
                "languages": ["en-US"]
            },
            "removeBase64Images": true,
            "blockAds": true,
            "proxy": "basic"
        });

        // Deserialize the JSON to our request body struct
        let req_body: BatchScrapeRequestBody =
            serde_json::from_value(json_data).expect("Failed to deserialize JSON");

        // Create the expected complete request body struct
        let expected_req_body = BatchScrapeRequestBody {
            urls: vec!["https://example.com".to_string()],
            webhook: Some(BatchScrapeWebhook {
                url: "https://webhook.example.com".to_string(),
                headers: Some(HashMap::new()),
                metadata: Some(HashMap::new()),
                events: Some(vec!["completed".to_string()]),
            }),
            ignore_invalid_urls: None, // This field wasn't in the JSON, so it should be None
            options: BatchScrapeOptions {
                formats: Some(vec![ScrapeFormats::Markdown]),
                only_main_content: Some(true),
                include_tags: Some(vec!["div".to_string()]),
                exclude_tags: Some(vec!["img".to_string()]),
                headers: Some(HashMap::new()),
                wait_for: Some(0),
                mobile: Some(false),
                skip_tls_verification: Some(false),
                timeout: Some(30000),
                json_options: Some(JsonOptions {
                    schema: Some(json!({"type": "object"})),
                    system_prompt: Some("Extract data".to_string()),
                    prompt: Some("Extract title".to_string()),
                }),
                actions: Some(vec![Action {
                    action_type: ActionType::Wait,
                    milliseconds: Some(2000),
                    selector: Some("#my-element".to_string()),
                    text: None,
                    key: None,
                    direction: None,
                    script: None,
                    full_page: None,
                }]),
                location: Some(crate::scrape::LocationOptions {
                    country: "US".to_string(),
                    languages: vec!["en-US".to_string()],
                }),
                remove_base64_images: Some(true),
                block_ads: Some(true),
                proxy: Some("basic".to_string()),
            },
        };

        // Compare the entire structs
        assert_eq!(req_body, expected_req_body);
    }

    /// The `From<BatchScrapeOptions> for ScrapeOptions` conversion must
    /// carry every shared field through unchanged.
    #[test]
    fn test_batch_scrape_options_to_scrape_options() {
        let batch_options = BatchScrapeOptions {
            formats: Some(vec![ScrapeFormats::Markdown]),
            only_main_content: Some(true),
            include_tags: Some(vec!["div".to_string()]),
            exclude_tags: Some(vec!["img".to_string()]),
            headers: Some(HashMap::new()),
            wait_for: Some(1000),
            mobile: Some(true),
            skip_tls_verification: Some(false),
            timeout: Some(2000),
            json_options: Some(crate::scrape::JsonOptions::default()),
            actions: Some(vec![]),
            location: Some(crate::scrape::LocationOptions::default()),
            remove_base64_images: Some(true),
            block_ads: Some(true),
            proxy: Some("basic".to_string()),
        };

        let scrape_options: ScrapeOptions = batch_options.into();

        // Field-by-field checks on the converted struct.
        assert_eq!(scrape_options.formats.as_ref().unwrap().len(), 1);
        assert!(matches!(
            scrape_options.formats.as_ref().unwrap()[0],
            ScrapeFormats::Markdown
        ));
        assert!(scrape_options.only_main_content.unwrap());
        assert_eq!(scrape_options.include_tags.as_ref().unwrap()[0], "div");
        assert_eq!(scrape_options.exclude_tags.as_ref().unwrap()[0], "img");
        assert_eq!(scrape_options.wait_for.unwrap(), 1000);
        assert!(scrape_options.headers.is_some());
        assert!(scrape_options.mobile.unwrap());
        assert!(!scrape_options.skip_tls_verification.unwrap());
        assert_eq!(scrape_options.timeout.unwrap(), 2000);
        assert!(scrape_options.json_options.is_some());
        assert!(scrape_options.actions.is_some());
        assert!(scrape_options.location.is_some());
        assert!(scrape_options.remove_base64_images.unwrap());
        assert!(scrape_options.block_ads.unwrap());
        assert_eq!(scrape_options.proxy.as_ref().unwrap(), "basic");
    }
}