1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6#[cfg(feature = "mcp_tool")]
7use schemars::JsonSchema;
8
9use crate::{
10 document::Document,
11 scrape::{Action, JsonOptions, LocationOptions, ScrapeFormats, ScrapeOptions},
12 FirecrawlApp, FirecrawlError, API_VERSION,
13};
14
/// Webhook configuration that can be attached to a batch scrape request.
///
/// Keys are serialized in camelCase and `None` fields are omitted from the
/// serialized output (`skip_serializing_none`).
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeWebhook {
    /// URL of the webhook endpoint.
    pub url: String,

    /// Optional string-to-string header map for the webhook.
    // NOTE(review): presumably sent as HTTP headers with each webhook call —
    // confirm against the API documentation.
    pub headers: Option<HashMap<String, String>>,

    /// Optional free-form JSON metadata, keyed by string.
    pub metadata: Option<HashMap<String, Value>>,

    /// Optional list of event names.
    // NOTE(review): the tests in this file use "completed"; the full set of
    // accepted values is not visible here.
    pub events: Option<Vec<String>>,
}
32
33#[serde_with::skip_serializing_none]
34#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
35#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
36#[serde(rename_all = "camelCase")]
37pub struct BatchScrapeOptions {
38 pub formats: Option<Vec<ScrapeFormats>>,
40
41 pub only_main_content: Option<bool>,
43
44 pub include_tags: Option<Vec<String>>,
46
47 pub exclude_tags: Option<Vec<String>>,
49
50 #[cfg_attr(feature = "mcp_tool", schemars(skip))]
52 pub headers: Option<HashMap<String, String>>,
53
54 pub wait_for: Option<u32>,
56
57 #[cfg_attr(feature = "mcp_tool", schemars(skip))]
59 pub mobile: Option<bool>,
60
61 #[cfg_attr(feature = "mcp_tool", schemars(skip))]
63 pub skip_tls_verification: Option<bool>,
64
65 #[cfg_attr(feature = "mcp_tool", schemars(skip))]
67 pub timeout: Option<u32>,
68
69 #[cfg_attr(feature = "mcp_tool", schemars(skip))]
71 #[serde(rename = "jsonOptions")]
72 pub json_options: Option<JsonOptions>,
73
74 #[cfg_attr(feature = "mcp_tool", schemars(skip))]
76 pub actions: Option<Vec<Action>>,
77
78 #[cfg_attr(feature = "mcp_tool", schemars(skip))]
80 pub location: Option<LocationOptions>,
81
82 #[cfg_attr(feature = "mcp_tool", schemars(skip))]
84 pub remove_base64_images: Option<bool>,
85
86 #[cfg_attr(feature = "mcp_tool", schemars(skip))]
88 pub block_ads: Option<bool>,
89
90 #[cfg_attr(feature = "mcp_tool", schemars(skip))]
92 pub proxy: Option<String>,
93}
94
95impl Default for BatchScrapeOptions {
96 fn default() -> Self {
97 Self {
98 formats: None,
99 only_main_content: None,
100 include_tags: None,
101 exclude_tags: None,
102 headers: None,
103 json_options: None,
104 actions: None,
105 location: None,
106 wait_for: None,
107 mobile: None,
108 skip_tls_verification: None,
109 timeout: None,
110 remove_base64_images: None,
111 block_ads: None,
112 proxy: None,
113 }
114 }
115}
116
117impl From<BatchScrapeOptions> for ScrapeOptions {
118 fn from(options: BatchScrapeOptions) -> Self {
119 ScrapeOptions {
120 formats: options.formats,
121 only_main_content: options.only_main_content,
122 include_tags: options.include_tags,
123 exclude_tags: options.exclude_tags,
124 headers: options.headers,
125 json_options: options.json_options,
126 actions: options.actions,
127 location: options.location,
128 wait_for: options.wait_for,
129 mobile: options.mobile,
130 skip_tls_verification: options.skip_tls_verification,
131 timeout: options.timeout,
132 remove_base64_images: options.remove_base64_images,
133 block_ads: options.block_ads,
134 proxy: options.proxy,
135 extract: None,
136 language: None,
137 parse_pdf: None,
138 }
139 }
140}
141
142#[serde_with::skip_serializing_none]
143#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
144#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
145#[serde(rename_all = "camelCase")]
146pub struct BatchScrapeRequestBody {
147 pub urls: Vec<String>,
149
150 pub webhook: Option<BatchScrapeWebhook>,
152
153 pub ignore_invalid_urls: Option<bool>,
155
156 #[serde(flatten)]
158 pub options: BatchScrapeOptions,
159}
160
/// Response body decoded by `batch_scrape_urls`.
#[derive(Deserialize, Serialize, Debug, Default)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
struct BatchScrapeResponse {
    /// Success flag carried in the response body.
    success: bool,

    /// Documents carried in the response; `batch_scrape_urls` returns this
    /// list to its caller.
    data: Vec<Document>,
}
171
172impl FirecrawlApp {
173 pub async fn batch_scrape_urls(
175 &self,
176 urls: Vec<String>,
177 webhook: Option<BatchScrapeWebhook>,
178 ignore_invalid_urls: Option<bool>,
179 options: impl Into<Option<BatchScrapeOptions>>,
180 ) -> Result<Vec<Document>, FirecrawlError> {
181 let request_body = BatchScrapeRequestBody {
182 urls,
183 webhook,
184 ignore_invalid_urls,
185 options: options.into().unwrap_or_default(),
186 };
187
188 let headers = self.prepare_headers(None);
189
190 let response = self
191 .client
192 .post(format!("{}/{}/batch-scrape", self.api_url, API_VERSION))
193 .headers(headers)
194 .json(&request_body)
195 .send()
196 .await
197 .map_err(|e| FirecrawlError::HttpError("Batch scraping URLs".to_string(), e))?;
198
199 let response = self
200 .handle_response::<BatchScrapeResponse>(response, "batch scrape URLs")
201 .await?;
202
203 Ok(response.data)
204 }
205}
206
#[cfg(all(test, feature = "mcp_tool"))]
mod schema_tests {
    use super::*;
    use async_claude;
    use serde_json::json;

    /// Checks the JSON schema generated for `BatchScrapeRequestBody`: it is an
    /// object, `urls` is required, and the exposed properties have the
    /// expected types.
    #[test]
    fn test_batch_scrape_request_schema() {
        let schema = async_claude::tool::parse_input_schema::<BatchScrapeRequestBody>().unwrap();
        println!("Schema: {:#?}", schema);

        assert_eq!(schema["type"], "object");

        let properties = &schema["properties"];
        assert!(properties.is_object());

        let required = &schema["required"];
        assert!(required.is_array());
        assert!(required.as_array().unwrap().contains(&json!("urls")));

        // `urls`: array of strings with a fixed description.
        let urls = &properties["urls"];
        assert_eq!(urls["type"], "array");
        assert_eq!(urls["items"]["type"], "string");
        assert_eq!(urls["description"], "List of URLs to scrape");

        assert_eq!(properties["formats"]["type"], "array");

        assert_eq!(properties["onlyMainContent"]["type"], "boolean");

        // Both tag lists are arrays of strings.
        for tag_field in ["includeTags", "excludeTags"] {
            assert_eq!(properties[tag_field]["type"], "array");
            assert_eq!(properties[tag_field]["items"]["type"], "string");
        }

        // Either numeric JSON-schema type is accepted for `waitFor`.
        assert!(matches!(
            properties["waitFor"]["type"].as_str(),
            Some("integer") | Some("number")
        ));
    }
}
255
#[cfg(test)]
mod tests {
    use super::*;
    use crate::scrape::ActionType;
    use serde_json::json;

    /// Deserializes a fully populated request body from JSON and compares it
    /// with the equivalent hand-built `BatchScrapeRequestBody`.
    #[test]
    fn test_batch_scrape_request_serialization() {
        let json_data = json!({
            "urls": ["https://example.com"],
            "webhook": {
                "url": "https://webhook.example.com",
                "headers": {},
                "metadata": {},
                "events": ["completed"]
            },
            "formats": ["markdown"],
            "onlyMainContent": true,
            "includeTags": ["div"],
            "excludeTags": ["img"],
            "headers": {},
            "waitFor": 0,
            "mobile": false,
            "skipTlsVerification": false,
            "timeout": 30000,
            "jsonOptions": {
                "schema": { "type": "object" },
                "systemPrompt": "Extract data",
                "prompt": "Extract title"
            },
            "actions": [
                {
                    "type": "wait",
                    "milliseconds": 2000,
                    "selector": "#my-element"
                }
            ],
            "location": {
                "country": "US",
                "languages": ["en-US"]
            },
            "removeBase64Images": true,
            "blockAds": true,
            "proxy": "basic"
        });

        let req_body: BatchScrapeRequestBody =
            serde_json::from_value(json_data).expect("Failed to deserialize JSON");

        let expected_req_body = BatchScrapeRequestBody {
            urls: vec!["https://example.com".to_string()],
            webhook: Some(BatchScrapeWebhook {
                url: "https://webhook.example.com".to_string(),
                headers: Some(HashMap::new()),
                metadata: Some(HashMap::new()),
                events: Some(vec!["completed".to_string()]),
            }),
            // The key is absent from the JSON above, so it deserializes to `None`.
            ignore_invalid_urls: None,
            options: BatchScrapeOptions {
                formats: Some(vec![ScrapeFormats::Markdown]),
                only_main_content: Some(true),
                include_tags: Some(vec!["div".to_string()]),
                exclude_tags: Some(vec!["img".to_string()]),
                headers: Some(HashMap::new()),
                wait_for: Some(0),
                mobile: Some(false),
                skip_tls_verification: Some(false),
                timeout: Some(30000),
                json_options: Some(JsonOptions {
                    schema: Some(json!({"type": "object"})),
                    system_prompt: Some("Extract data".to_string()),
                    prompt: Some("Extract title".to_string()),
                }),
                actions: Some(vec![Action {
                    action_type: ActionType::Wait,
                    milliseconds: Some(2000),
                    selector: Some("#my-element".to_string()),
                    text: None,
                    key: None,
                    direction: None,
                    script: None,
                    full_page: None,
                }]),
                location: Some(LocationOptions {
                    country: "US".to_string(),
                    languages: vec!["en-US".to_string()],
                }),
                remove_base64_images: Some(true),
                block_ads: Some(true),
                proxy: Some("basic".to_string()),
            },
        };

        assert_eq!(req_body, expected_req_body);
    }

    /// Converts populated `BatchScrapeOptions` into `ScrapeOptions` and checks
    /// that every shared field carried over.
    #[test]
    fn test_batch_scrape_options_to_scrape_options() {
        let batch_options = BatchScrapeOptions {
            formats: Some(vec![ScrapeFormats::Markdown]),
            only_main_content: Some(true),
            include_tags: Some(vec!["div".to_string()]),
            exclude_tags: Some(vec!["img".to_string()]),
            headers: Some(HashMap::new()),
            wait_for: Some(1000),
            mobile: Some(true),
            skip_tls_verification: Some(false),
            timeout: Some(2000),
            json_options: Some(JsonOptions::default()),
            actions: Some(vec![]),
            location: Some(LocationOptions::default()),
            remove_base64_images: Some(true),
            block_ads: Some(true),
            proxy: Some("basic".to_string()),
        };

        let scrape_options: ScrapeOptions = batch_options.into();

        assert_eq!(scrape_options.formats.as_ref().unwrap().len(), 1);
        assert!(matches!(
            scrape_options.formats.as_ref().unwrap()[0],
            ScrapeFormats::Markdown
        ));
        assert!(scrape_options.only_main_content.unwrap());
        assert_eq!(scrape_options.include_tags.as_ref().unwrap()[0], "div");
        assert_eq!(scrape_options.exclude_tags.as_ref().unwrap()[0], "img");
        assert_eq!(scrape_options.wait_for.unwrap(), 1000);
        assert!(scrape_options.headers.is_some());
        assert!(scrape_options.mobile.unwrap());
        assert!(!scrape_options.skip_tls_verification.unwrap());
        assert_eq!(scrape_options.timeout.unwrap(), 2000);
        assert!(scrape_options.json_options.is_some());
        assert!(scrape_options.actions.is_some());
        assert!(scrape_options.location.is_some());
        assert!(scrape_options.remove_base64_images.unwrap());
        assert!(scrape_options.block_ads.unwrap());
        assert_eq!(scrape_options.proxy.as_ref().unwrap(), "basic");
    }
}