1use std::collections::HashMap;
2
3use log::info;
4use serde::{Deserialize, Serialize};
5use serde_json::Value;
6
7#[cfg(feature = "mcp_tool")]
8use schemars::JsonSchema;
9
10use crate::{document::Document, scrape::ScrapeOptions, FirecrawlApp, FirecrawlError, API_VERSION};
11
/// Configuration for webhook notifications delivered while a batch scrape job runs.
///
/// `None` fields are omitted from the serialized JSON entirely
/// (via `serde_with::skip_serializing_none`).
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct Webhook {
    /// Destination URL that the API will POST event payloads to.
    pub url: String,

    /// Optional HTTP headers to attach to each webhook request (e.g. auth tokens).
    pub headers: Option<HashMap<String, String>>,

    /// Optional arbitrary metadata associated with the webhook.
    pub metadata: Option<HashMap<String, Value>>,

    /// Optional list of event names to subscribe to.
    /// NOTE(review): presumably `None` means "all events" — confirm against the API docs.
    pub events: Option<Vec<String>>,
}
29
30impl Webhook {
31 pub fn dummy() -> Self {
32 Webhook {
33 url: "https://webhook.example.com".to_string(),
34 headers: None,
35 metadata: None,
36 events: None,
37 }
38 }
39}
40
/// JSON body sent to the `POST /batch/scrape` endpoint.
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeRequestBody {
    /// URLs to scrape in this batch.
    pub urls: Vec<String>,

    /// Webhook configuration for job progress notifications.
    pub webhook: Webhook,

    /// When `Some(true)`, asks the API to skip invalid URLs instead of failing.
    /// Explicit rename: the API expects `ignoreInvalidURLs`, whereas the
    /// `camelCase` container rename would produce `ignoreInvalidUrls`.
    #[serde(rename = "ignoreInvalidURLs")]
    pub ignore_invalid_urls: Option<bool>,

    /// Scrape options applied to every URL; flattened into the top-level JSON object.
    #[serde(flatten)]
    pub options: ScrapeOptions,
}
58
/// Response returned by the API when a batch scrape job is created.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct BatchScrapeResponse {
    /// Whether the job was accepted by the API.
    success: bool,

    /// Server-assigned job id, used for subsequent status polling.
    id: String,

    /// URL at which the job status can be checked.
    url: String,

    /// URLs that were rejected as invalid.
    /// NOTE(review): presumably only populated when `ignoreInvalidURLs` was set — confirm.
    #[serde(skip_serializing_if = "Option::is_none")]
    invalid_urls: Option<Vec<String>>,
}
77
/// High-level input for a batch scrape call (e.g. from an MCP tool invocation).
///
/// Fields marked `#[serde(skip)]` are client-side controls only and are never
/// sent over the wire.
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeUrlsInput {
    /// URLs to scrape in this batch.
    pub urls: Vec<String>,

    /// Optional webhook configuration for job progress notifications.
    pub webhook: Option<Webhook>,

    /// Skip invalid URLs instead of failing; client-side only (not serialized).
    #[serde(skip)]
    pub ignore_invalid_urls: Option<bool>,

    /// Polling interval in milliseconds while waiting for the job to finish.
    pub poll_interval: Option<u64>,

    /// Idempotency key forwarded as a request header; client-side only (not serialized).
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// Scrape options applied to every URL; flattened into the top-level JSON object.
    #[serde(flatten)]
    pub options: Option<ScrapeOptions>,
}
102
/// Lifecycle states of a batch scrape job, as reported by the status endpoint.
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[serde(rename_all = "camelCase")]
pub enum BatchScrapeStatusTypes {
    /// The job is still running.
    Scraping,
    /// The job finished successfully.
    Completed,
    /// The job failed.
    Failed,
}
110
111impl Default for BatchScrapeStatusTypes {
112 fn default() -> Self {
113 Self::Scraping
114 }
115}
116
/// Status (and, when paginated/completed, results) of a batch scrape job.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeStatus {
    /// Whether the status request itself succeeded.
    pub success: bool,

    /// Current lifecycle state of the job.
    pub status: BatchScrapeStatusTypes,

    /// Total number of URLs in the job (0 if the field is absent).
    #[serde(default)]
    pub total: usize,

    /// Number of URLs scraped so far (0 if the field is absent).
    #[serde(default)]
    pub completed: usize,

    /// Credits consumed by the job so far (0 if the field is absent).
    #[serde(default)]
    pub credits_used: usize,

    /// Timestamp after which the results expire.
    /// NOTE(review): format looks like a string timestamp from the API — confirm (likely ISO 8601).
    pub expires_at: Option<String>,

    /// Pagination cursor/URL for the next page of results, if any.
    pub next: Option<String>,

    /// Documents scraped so far (empty if the field is absent).
    #[serde(default)]
    pub data: Vec<Document>,
}
148
149impl FirecrawlApp {
150 pub async fn batch_scrape_urls(
152 &self,
153 urls: Vec<String>,
154 options: impl Into<Option<ScrapeOptions>>,
155 poll_interval: Option<u64>,
156 idempotency_key: Option<String>,
157 webhook: Webhook,
158 ignore_invalid_urls: Option<bool>,
159 ) -> Result<BatchScrapeStatus, FirecrawlError> {
160 let request_body = BatchScrapeRequestBody {
161 urls,
162 webhook,
163 ignore_invalid_urls,
164 options: options.into().unwrap_or_default(),
165 };
166
167 let headers = self.prepare_headers(idempotency_key.as_ref());
168
169 let response = self
170 .client
171 .post(format!("{}/{}/batch/scrape", self.api_url, API_VERSION))
172 .headers(headers)
173 .json(&request_body)
174 .send()
175 .await
176 .map_err(|e| FirecrawlError::HttpError("Batch scraping URLs".to_string(), e))?;
177
178 let response = self
179 .handle_response::<BatchScrapeResponse>(response, "batch scrape URLs")
180 .await?;
181
182 let poll_interval = poll_interval.unwrap_or(2000);
183 self.monitor_batch_scrape_status(&response.id, poll_interval)
184 .await
185 }
186
187 pub async fn check_batch_scrape_status(
189 &self,
190 id: &str,
191 ) -> Result<BatchScrapeStatus, FirecrawlError> {
192 let headers = self.prepare_headers(None);
193
194 println!("Checking batch scrape status for job: {}", id);
195
196 let response = self
197 .client
198 .get(format!(
199 "{}/{}/batch/scrape/{}",
200 self.api_url, API_VERSION, id
201 ))
202 .headers(headers)
203 .send()
204 .await
205 .map_err(|e| {
206 FirecrawlError::HttpError("Checking batch scrape status".to_string(), e)
207 })?;
208
209 self.handle_response::<BatchScrapeStatus>(response, "check batch scrape status")
210 .await
211 }
212
213 pub async fn monitor_batch_scrape_status(
215 &self,
216 id: &str,
217 poll_interval: u64,
218 ) -> Result<BatchScrapeStatus, FirecrawlError> {
219 let mut all_data = Vec::new();
220 let mut current_cursor: Option<String> = None;
221
222 loop {
223 let mut status_data = if let Some(ref cursor) = current_cursor {
224 self.check_batch_scrape_status_with_cursor(id, cursor)
225 .await?
226 } else {
227 self.check_batch_scrape_status(id).await?
228 };
229
230 all_data.append(&mut status_data.data);
232
233 if let Some(next) = status_data.next {
235 current_cursor = Some(next);
236 continue;
237 }
238
239 match status_data.status {
241 BatchScrapeStatusTypes::Completed => {
242 status_data.data = all_data;
244 break Ok(status_data);
245 }
246 BatchScrapeStatusTypes::Scraping => {
247 tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
248 }
250 BatchScrapeStatusTypes::Failed => {
251 break Err(FirecrawlError::BatchScrapeJobFailed(
252 "Batch scrape job failed.".to_string(),
253 ));
254 }
255 }
256 }
257 }
258
259 pub async fn check_batch_scrape_status_with_cursor(
261 &self,
262 id: &str,
263 cursor: &str,
264 ) -> Result<BatchScrapeStatus, FirecrawlError> {
265 let headers = self.prepare_headers(None);
266
267 let response = self
268 .client
269 .get(format!(
270 "{}/{}/batch/scrape/{}?cursor={}",
271 self.api_url, API_VERSION, id, cursor
272 ))
273 .headers(headers)
274 .send()
275 .await
276 .map_err(|e| {
277 FirecrawlError::HttpError("Checking batch scrape status".to_string(), e)
278 })?;
279
280 self.handle_response::<BatchScrapeStatus>(response, "check batch scrape status")
281 .await
282 }
283}
284
#[cfg(test)]
mod tests {
    use super::*;
    use crate::scrape::{Action, ActionType, JsonOptions, ScrapeFormats};
    use serde_json::json;

    /// A full camelCase wire payload should deserialize into `BatchScrapeRequestBody`,
    /// with scrape options flattened at the top level of the JSON object.
    #[test]
    fn test_batch_scrape_request_serialization() {
        let payload = json!({
            "urls": ["https://example.com"],
            "webhook": {
                "url": "https://webhook.example.com",
                "headers": {},
                "metadata": {},
                "events": ["completed"]
            },
            "formats": ["markdown"],
            "onlyMainContent": true,
            "includeTags": ["div"],
            "excludeTags": ["img"],
            "headers": {},
            "waitFor": 0,
            "mobile": false,
            "skipTlsVerification": false,
            "timeout": 30000,
            "jsonOptions": {
                "schema": { "type": "object" },
                "systemPrompt": "Extract data",
                "prompt": "Extract title"
            },
            "actions": [
                {
                    "type": "wait",
                    "milliseconds": 2000,
                    "selector": "#my-element"
                }
            ],
            "location": {
                "country": "US",
                "languages": ["en-US"]
            },
            "removeBase64Images": true,
            "blockAds": true,
            "proxy": "basic"
        });

        let actual: BatchScrapeRequestBody =
            serde_json::from_value(payload).expect("Failed to deserialize JSON");

        let expected_webhook = Webhook {
            url: "https://webhook.example.com".to_string(),
            headers: Some(HashMap::new()),
            metadata: Some(HashMap::new()),
            events: Some(vec!["completed".to_string()]),
        };

        let expected_options = ScrapeOptions {
            formats: Some(vec![ScrapeFormats::Markdown]),
            only_main_content: Some(true),
            include_tags: Some(vec!["div".to_string()]),
            exclude_tags: Some(vec!["img".to_string()]),
            headers: Some(HashMap::new()),
            wait_for: Some(0),
            mobile: Some(false),
            skip_tls_verification: Some(false),
            timeout: Some(30000),
            json_options: Some(JsonOptions {
                schema: Some(json!({"type": "object"})),
                system_prompt: Some("Extract data".to_string()),
                prompt: Some("Extract title".to_string()),
            }),
            actions: Some(vec![Action {
                action_type: ActionType::Wait,
                milliseconds: Some(2000),
                selector: Some("#my-element".to_string()),
                text: None,
                key: None,
                direction: None,
                script: None,
                full_page: None,
            }]),
            location: Some(crate::scrape::LocationOptions {
                country: "US".to_string(),
                languages: vec!["en-US".to_string()],
            }),
            remove_base64_images: Some(true),
            block_ads: Some(true),
            proxy: Some("basic".to_string()),
        };

        let expected = BatchScrapeRequestBody {
            urls: vec!["https://example.com".to_string()],
            webhook: expected_webhook,
            ignore_invalid_urls: None,
            options: expected_options,
        };

        assert_eq!(actual, expected);
    }

    /// Sanity-check that a fully populated `ScrapeOptions` exposes every field
    /// the batch scrape path relies on.
    #[test]
    fn test_batch_scrape_options_to_scrape_options() {
        let opts = ScrapeOptions {
            formats: Some(vec![ScrapeFormats::Markdown]),
            only_main_content: Some(true),
            include_tags: Some(vec!["div".to_string()]),
            exclude_tags: Some(vec!["img".to_string()]),
            headers: Some(HashMap::new()),
            wait_for: Some(1000),
            mobile: Some(true),
            skip_tls_verification: Some(false),
            timeout: Some(2000),
            json_options: Some(crate::scrape::JsonOptions::default()),
            actions: Some(vec![]),
            location: Some(crate::scrape::LocationOptions::default()),
            remove_base64_images: Some(true),
            block_ads: Some(true),
            proxy: Some("basic".to_string()),
        };

        assert_eq!(opts.formats.as_ref().unwrap().len(), 1);
        assert!(matches!(
            opts.formats.as_ref().unwrap()[0],
            ScrapeFormats::Markdown
        ));
        assert!(opts.only_main_content.unwrap());
        assert_eq!(opts.include_tags.as_ref().unwrap()[0], "div");
        assert_eq!(opts.exclude_tags.as_ref().unwrap()[0], "img");
        assert_eq!(opts.wait_for.unwrap(), 1000);
        assert!(opts.headers.is_some());
        assert!(opts.mobile.unwrap());
        assert!(!opts.skip_tls_verification.unwrap());
        assert_eq!(opts.timeout.unwrap(), 2000);
        assert!(opts.json_options.is_some());
        assert!(opts.actions.is_some());
        assert!(opts.location.is_some());
        assert!(opts.remove_base64_images.unwrap());
        assert!(opts.block_ads.unwrap());
        assert_eq!(opts.proxy.as_ref().unwrap(), "basic");
    }
}