1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6#[cfg(feature = "mcp-tool")]
7use schemars::JsonSchema;
8
9use crate::{API_VERSION, FirecrawlApp, FirecrawlError, document::Document, scrape::ScrapeOptions};
10
/// Webhook configuration attached to a batch scrape request; the API delivers
/// job notifications to `url`. `None` fields are omitted from the serialized
/// JSON via `skip_serializing_none`.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct Webhook {
    /// Destination URL the server calls with webhook notifications.
    pub url: String,

    /// Extra HTTP headers to include on each webhook delivery.
    pub headers: Option<HashMap<String, String>>,

    /// Arbitrary JSON metadata attached to the webhook — presumably echoed
    /// back in delivery payloads; confirm against the Firecrawl API docs.
    pub metadata: Option<HashMap<String, Value>>,

    /// Event names to subscribe to (e.g. "completed") — presumably filters
    /// which notifications are delivered; confirm against the API docs.
    pub events: Option<Vec<String>>,
}
28
29impl Webhook {
30 pub fn dummy() -> Self {
31 Webhook {
32 url: "https://webhook.example.com".to_string(),
33 headers: None,
34 metadata: None,
35 events: None,
36 }
37 }
38}
39
/// JSON request body for `POST /batch/scrape`.
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeRequestBody {
    /// URLs to scrape in this batch.
    pub urls: Vec<String>,

    /// Webhook notification configuration for the job.
    pub webhook: Webhook,

    // Explicit rename: camelCase would produce "ignoreInvalidUrls", but the
    // API expects "ignoreInvalidURLs".
    #[serde(rename = "ignoreInvalidURLs")]
    pub ignore_invalid_urls: Option<bool>,

    /// Per-URL scrape options, flattened into the top-level request object.
    #[serde(flatten)]
    pub options: ScrapeOptions,
}
57
/// Response returned when a batch scrape job is created.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct BatchScrapeResponse {
    /// Whether the job was accepted by the API.
    success: bool,

    /// Job identifier, used for subsequent status polling.
    id: String,

    /// URL for checking the job's status.
    url: String,

    /// URLs rejected as invalid — only present when the request asked to
    /// ignore invalid URLs; presumably absent otherwise (TODO confirm).
    #[serde(skip_serializing_if = "Option::is_none")]
    invalid_urls: Option<Vec<String>>,
}
76
/// Aggregated input for a batch scrape call (e.g. as an MCP tool argument
/// when the `mcp-tool` feature is enabled).
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeUrlsInput {
    /// URLs to scrape.
    pub urls: Vec<String>,

    /// Optional webhook notification configuration.
    pub webhook: Option<Webhook>,

    // Skipped during (de)serialization: consumed client-side, not part of
    // the wire format of this input type.
    #[serde(skip)]
    pub ignore_invalid_urls: Option<bool>,

    /// Polling interval in milliseconds while waiting for job completion.
    pub poll_interval: Option<u64>,

    // Skipped: forwarded as an HTTP header, never serialized in the body.
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// Scrape options, flattened into the top-level object when present.
    #[serde(flatten)]
    pub options: Option<ScrapeOptions>,
}
101
/// Lifecycle states reported by the batch scrape status endpoint.
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[serde(rename_all = "camelCase")]
pub enum BatchScrapeStatusTypes {
    /// Job is still in progress.
    Scraping,
    /// Job finished; all results are available.
    Completed,
    /// Job ended in failure.
    Failed,
}
109
110impl Default for BatchScrapeStatusTypes {
111 fn default() -> Self {
112 Self::Scraping
113 }
114}
115
/// Status payload returned by `GET /batch/scrape/{id}`, including one page of
/// scraped documents and an optional pagination cursor.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeStatus {
    /// Whether the status request itself succeeded.
    pub success: bool,

    /// Current job state.
    pub status: BatchScrapeStatusTypes,

    /// Total number of URLs in the job (0 if the field is absent).
    #[serde(default)]
    pub total: usize,

    /// Number of URLs scraped so far (0 if absent).
    #[serde(default)]
    pub completed: usize,

    /// Credits consumed by the job so far (0 if absent).
    #[serde(default)]
    pub credits_used: usize,

    /// When the results expire — presumably an ISO-8601 timestamp string;
    /// confirm against the API docs.
    pub expires_at: Option<String>,

    /// Pagination cursor for fetching the next page of `data`, if any.
    pub next: Option<String>,

    /// One page of scraped documents (empty if absent).
    #[serde(default)]
    pub data: Vec<Document>,
}
147
148impl FirecrawlApp {
149 pub async fn batch_scrape_urls(
151 &self,
152 urls: Vec<String>,
153 options: impl Into<Option<ScrapeOptions>>,
154 poll_interval: Option<u64>,
155 idempotency_key: Option<String>,
156 webhook: Webhook,
157 ignore_invalid_urls: Option<bool>,
158 ) -> Result<BatchScrapeStatus, FirecrawlError> {
159 let request_body = BatchScrapeRequestBody {
160 urls,
161 webhook,
162 ignore_invalid_urls,
163 options: options.into().unwrap_or_default(),
164 };
165
166 let headers = self.prepare_headers(idempotency_key.as_ref());
167
168 let response = self
169 .client
170 .post(format!("{}/{}/batch/scrape", self.api_url, API_VERSION))
171 .headers(headers)
172 .json(&request_body)
173 .send()
174 .await
175 .map_err(|e| FirecrawlError::HttpError("Batch scraping URLs".to_string(), e))?;
176
177 let response = self
178 .handle_response::<BatchScrapeResponse>(response, "batch scrape URLs")
179 .await?;
180
181 let poll_interval = poll_interval.unwrap_or(2000);
182 self.monitor_batch_scrape_status(&response.id, poll_interval)
183 .await
184 }
185
186 pub async fn check_batch_scrape_status(
188 &self,
189 id: &str,
190 ) -> Result<BatchScrapeStatus, FirecrawlError> {
191 let headers = self.prepare_headers(None);
192
193 println!("Checking batch scrape status for job: {}", id);
194
195 let response = self
196 .client
197 .get(format!(
198 "{}/{}/batch/scrape/{}",
199 self.api_url, API_VERSION, id
200 ))
201 .headers(headers)
202 .send()
203 .await
204 .map_err(|e| {
205 FirecrawlError::HttpError("Checking batch scrape status".to_string(), e)
206 })?;
207
208 self.handle_response::<BatchScrapeStatus>(response, "check batch scrape status")
209 .await
210 }
211
212 pub async fn monitor_batch_scrape_status(
214 &self,
215 id: &str,
216 poll_interval: u64,
217 ) -> Result<BatchScrapeStatus, FirecrawlError> {
218 let mut all_data = Vec::new();
219 let mut current_cursor: Option<String> = None;
220
221 loop {
222 let mut status_data = if let Some(ref cursor) = current_cursor {
223 self.check_batch_scrape_status_with_cursor(id, cursor)
224 .await?
225 } else {
226 self.check_batch_scrape_status(id).await?
227 };
228
229 all_data.append(&mut status_data.data);
231
232 if let Some(next) = status_data.next {
234 current_cursor = Some(next);
235 continue;
236 }
237
238 match status_data.status {
240 BatchScrapeStatusTypes::Completed => {
241 status_data.data = all_data;
243 break Ok(status_data);
244 }
245 BatchScrapeStatusTypes::Scraping => {
246 tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
247 }
249 BatchScrapeStatusTypes::Failed => {
250 break Err(FirecrawlError::BatchScrapeJobFailed(
251 "Batch scrape job failed.".to_string(),
252 ));
253 }
254 }
255 }
256 }
257
258 pub async fn check_batch_scrape_status_with_cursor(
260 &self,
261 id: &str,
262 cursor: &str,
263 ) -> Result<BatchScrapeStatus, FirecrawlError> {
264 let headers = self.prepare_headers(None);
265
266 let response = self
267 .client
268 .get(format!(
269 "{}/{}/batch/scrape/{}?cursor={}",
270 self.api_url, API_VERSION, id, cursor
271 ))
272 .headers(headers)
273 .send()
274 .await
275 .map_err(|e| {
276 FirecrawlError::HttpError("Checking batch scrape status".to_string(), e)
277 })?;
278
279 self.handle_response::<BatchScrapeStatus>(response, "check batch scrape status")
280 .await
281 }
282}
283
#[cfg(test)]
mod tests {
    use super::*;
    use crate::scrape::{Action, ActionType, JsonOptions, ScrapeFormats};
    use serde_json::json;

    // Verifies that a full camelCase API request body — webhook, flattened
    // scrape options, and the explicitly renamed "ignoreInvalidURLs" field —
    // deserializes into the expected BatchScrapeRequestBody value.
    #[test]
    fn test_batch_scrape_request_serialization() {
        let json_data = json!({
            "urls": ["https://example.com"],
            "webhook": {
                "url": "https://webhook.example.com",
                "headers": {},
                "metadata": {},
                "events": ["completed"]
            },
            // Everything below is flattened into `options: ScrapeOptions`.
            "formats": ["markdown"],
            "onlyMainContent": true,
            "includeTags": ["div"],
            "excludeTags": ["img"],
            "headers": {},
            "waitFor": 0,
            "mobile": false,
            "skipTlsVerification": false,
            "timeout": 30000,
            "jsonOptions": {
                "schema": { "type": "object" },
                "systemPrompt": "Extract data",
                "prompt": "Extract title"
            },
            "actions": [
                {
                    "type": "wait",
                    "milliseconds": 2000,
                    "selector": "#my-element"
                }
            ],
            "location": {
                "country": "US",
                "languages": ["en-US"]
            },
            "removeBase64Images": true,
            "blockAds": true,
            "proxy": "basic"
        });

        let req_body: BatchScrapeRequestBody =
            serde_json::from_value(json_data).expect("Failed to deserialize JSON");

        let expected_req_body = BatchScrapeRequestBody {
            urls: vec!["https://example.com".to_string()],
            webhook: Webhook {
                url: "https://webhook.example.com".to_string(),
                headers: Some(HashMap::new()),
                metadata: Some(HashMap::new()),
                events: Some(vec!["completed".to_string()]),
            },
            // "ignoreInvalidURLs" was absent from the JSON, so this is None.
            ignore_invalid_urls: None, options: ScrapeOptions {
                formats: Some(vec![ScrapeFormats::Markdown]),
                only_main_content: Some(true),
                include_tags: Some(vec!["div".to_string()]),
                exclude_tags: Some(vec!["img".to_string()]),
                headers: Some(HashMap::new()),
                wait_for: Some(0),
                mobile: Some(false),
                skip_tls_verification: Some(false),
                timeout: Some(30000),
                json_options: Some(JsonOptions {
                    schema: Some(json!({"type": "object"})),
                    system_prompt: Some("Extract data".to_string()),
                    prompt: Some("Extract title".to_string()),
                }),
                actions: Some(vec![Action {
                    action_type: ActionType::Wait,
                    milliseconds: Some(2000),
                    selector: Some("#my-element".to_string()),
                    text: None,
                    key: None,
                    direction: None,
                    script: None,
                    full_page: None,
                }]),
                location: Some(crate::scrape::LocationOptions {
                    country: "US".to_string(),
                    languages: vec!["en-US".to_string()],
                }),
                remove_base64_images: Some(true),
                block_ads: Some(true),
                proxy: Some("basic".to_string()),
            },
        };

        assert_eq!(req_body, expected_req_body);
    }

    // Sanity-checks that a fully populated ScrapeOptions value exposes each
    // field as set; exercises the option accessors used by batch scraping.
    #[test]
    fn test_batch_scrape_options_to_scrape_options() {
        let scrape_options = ScrapeOptions {
            formats: Some(vec![ScrapeFormats::Markdown]),
            only_main_content: Some(true),
            include_tags: Some(vec!["div".to_string()]),
            exclude_tags: Some(vec!["img".to_string()]),
            headers: Some(HashMap::new()),
            wait_for: Some(1000),
            mobile: Some(true),
            skip_tls_verification: Some(false),
            timeout: Some(2000),
            json_options: Some(crate::scrape::JsonOptions::default()),
            actions: Some(vec![]),
            location: Some(crate::scrape::LocationOptions::default()),
            remove_base64_images: Some(true),
            block_ads: Some(true),
            proxy: Some("basic".to_string()),
        };

        assert_eq!(scrape_options.formats.as_ref().unwrap().len(), 1);
        assert!(matches!(
            scrape_options.formats.as_ref().unwrap()[0],
            ScrapeFormats::Markdown
        ));
        assert!(scrape_options.only_main_content.unwrap());
        assert_eq!(scrape_options.include_tags.as_ref().unwrap()[0], "div");
        assert_eq!(scrape_options.exclude_tags.as_ref().unwrap()[0], "img");
        assert_eq!(scrape_options.wait_for.unwrap(), 1000);
        assert!(scrape_options.headers.is_some());
        assert!(scrape_options.mobile.unwrap());
        assert!(!scrape_options.skip_tls_verification.unwrap());
        assert_eq!(scrape_options.timeout.unwrap(), 2000);
        assert!(scrape_options.json_options.is_some());
        assert!(scrape_options.actions.is_some());
        assert!(scrape_options.location.is_some());
        assert!(scrape_options.remove_base64_images.unwrap());
        assert!(scrape_options.block_ads.unwrap());
        assert_eq!(scrape_options.proxy.as_ref().unwrap(), "basic");
    }
}