1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6#[cfg(feature = "mcp_tool")]
7use schemars::JsonSchema;
8
9use crate::{document::Document, FirecrawlApp, FirecrawlError, API_VERSION};
10
11#[derive(Deserialize, Serialize, Clone, Copy, Debug, PartialEq, Eq)]
12#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
13pub enum ScrapeFormats {
14 #[serde(rename = "markdown")]
16 Markdown,
17
18 #[serde(rename = "html")]
20 HTML,
21
22 #[serde(rename = "rawHtml")]
24 RawHTML,
25
26 #[serde(rename = "links")]
28 Links,
29
30 #[serde(rename = "screenshot")]
34 Screenshot,
35
36 #[serde(rename = "screenshot@fullPage")]
40 ScreenshotFullPage,
41
42 #[serde(rename = "extract")]
46 Extract,
47
48 #[serde(rename = "json")]
52 JSON,
53}
54
55#[serde_with::skip_serializing_none]
56#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
57#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
58#[serde(rename_all = "camelCase")]
59pub struct ExtractOptions {
60 pub schema: Option<Value>,
62
63 pub system_prompt: Option<String>,
64
65 pub prompt: Option<String>,
67}
68
69#[serde_with::skip_serializing_none]
70#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
71#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
72#[serde(rename_all = "camelCase")]
73pub struct JsonOptions {
74 pub schema: Option<Value>,
76
77 pub system_prompt: Option<String>,
79
80 pub prompt: Option<String>,
82}
83
84#[derive(Deserialize, Serialize, Clone, Debug, Default, PartialEq, Eq)]
85#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
86#[serde(rename_all = "camelCase")]
87pub enum ActionType {
88 #[default]
89 #[serde(rename = "click")]
90 Click,
91
92 #[serde(rename = "type")]
93 Type,
94
95 #[serde(rename = "wait")]
96 Wait,
97
98 #[serde(rename = "screenshot")]
99 Screenshot,
100
101 #[serde(rename = "write")]
102 Write,
103
104 #[serde(rename = "press")]
105 Press,
106
107 #[serde(rename = "scroll")]
108 Scroll,
109
110 #[serde(rename = "scrape")]
111 Scrape,
112
113 #[serde(rename = "executeJavascript")]
114 ExecuteJavascript,
115}
116
117#[serde_with::skip_serializing_none]
118#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
119#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
120#[serde(rename_all = "camelCase")]
121pub struct Action {
122 #[serde(rename = "type")]
124 pub action_type: ActionType,
125
126 pub selector: Option<String>,
128
129 pub text: Option<String>,
131
132 pub milliseconds: Option<u32>,
134
135 pub key: Option<String>,
137
138 pub direction: Option<String>,
140
141 pub script: Option<String>,
143
144 pub full_page: Option<bool>,
146}
147
148#[serde_with::skip_serializing_none]
149#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
150#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
151#[serde(rename_all = "camelCase")]
152pub struct LocationOptions {
153 pub country: String,
155
156 pub languages: Vec<String>,
158}
159
160#[serde_with::skip_serializing_none]
161#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
162#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
163#[serde(rename_all = "camelCase")]
164pub struct ScrapeOptions {
165 pub formats: Option<Vec<ScrapeFormats>>,
167
168 pub only_main_content: Option<bool>,
170
171 pub include_tags: Option<Vec<String>>,
173
174 pub exclude_tags: Option<Vec<String>>,
176
177 pub headers: Option<HashMap<String, String>>,
179
180 pub wait_for: Option<u32>,
182
183 pub timeout: Option<u32>,
185
186 pub extract: Option<ExtractOptions>,
188
189 #[serde(rename = "jsonOptions")]
191 pub json_options: Option<JsonOptions>,
192
193 #[serde(rename = "parsePDF")]
195 pub parse_pdf: Option<bool>,
196
197 pub location: Option<LocationOptions>,
199
200 pub language: Option<String>,
202
203 pub actions: Option<Vec<Action>>,
205
206 pub mobile: Option<bool>,
208
209 pub skip_tls_verification: Option<bool>,
211
212 pub remove_base64_images: Option<bool>,
214
215 pub block_ads: Option<bool>,
217
218 pub proxy: Option<String>,
220}
221
222#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
223#[serde(rename_all = "camelCase")]
224struct ScrapeRequestBody {
225 url: String,
226
227 #[serde(flatten)]
228 options: ScrapeOptions,
229}
230
231#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
232#[serde(rename_all = "camelCase")]
233struct ScrapeResponse {
234 success: bool,
237
238 data: Document,
240}
241
242impl FirecrawlApp {
243 pub async fn scrape_url(
245 &self,
246 url: impl AsRef<str>,
247 options: impl Into<Option<ScrapeOptions>>,
248 ) -> Result<Document, FirecrawlError> {
249 let body = ScrapeRequestBody {
250 url: url.as_ref().to_string(),
251 options: options.into().unwrap_or_default(),
252 };
253
254 let headers = self.prepare_headers(None);
255
256 let response = self
257 .client
258 .post(format!("{}/{}/scrape", self.api_url, API_VERSION))
259 .headers(headers)
260 .json(&body)
261 .send()
262 .await
263 .map_err(|e| FirecrawlError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;
264
265 let response = self
266 .handle_response::<ScrapeResponse>(response, "scrape URL")
267 .await?;
268
269 Ok(response.data)
270 }
271}
272
#[cfg(all(test, feature = "mcp_tool"))]
mod schema_tests {
    use super::*;

    /// Checks the JSON schema generated for [`ScrapeOptions`]: the set of
    /// properties, their JSON types, the nested `actions`, `extract` and `location`
    /// objects, and the exact `description` strings of selected properties.
    #[test]
    fn test_scrape_options_schema() {
        let actual_schema = async_claude::tool::parse_input_schema::<ScrapeOptions>().unwrap();

        // The root must be an object schema with a `properties` map.
        assert_eq!(actual_schema["type"], "object");

        let properties = &actual_schema["properties"];
        assert!(properties.is_object());

        // Every option must be exposed under its wire (camelCase) name.
        let expected_properties = [
            "formats",
            "onlyMainContent",
            "includeTags",
            "excludeTags",
            "headers",
            "waitFor",
            "timeout",
            "extract",
            "jsonOptions",
            "parsePDF",
            "location",
            "language",
            "actions",
            "mobile",
            "skipTlsVerification",
            "removeBase64Images",
            "blockAds",
            "proxy",
        ];

        for prop in &expected_properties {
            assert!(
                properties.get(*prop).is_some(),
                "Property {} not found",
                prop
            );
        }

        // `formats` is an array of format values and carries its description.
        assert_eq!(properties["formats"]["type"], "array");
        assert!(properties["formats"]["items"].is_object());
        assert_eq!(
            properties["formats"]["description"],
            "Content formats to extract (default: `[ Markdown ]`)"
        );

        // Plain boolean switches.
        let boolean_properties = [
            "onlyMainContent",
            "mobile",
            "skipTlsVerification",
            "removeBase64Images",
            "blockAds",
        ];
        for prop in &boolean_properties {
            assert_eq!(
                properties[*prop]["type"], "boolean",
                "Property {} should be boolean",
                prop
            );
        }

        // Arrays of strings.
        let string_array_properties = ["includeTags", "excludeTags"];
        for prop in &string_array_properties {
            assert_eq!(properties[*prop]["type"], "array");
            assert_eq!(properties[*prop]["items"]["type"], "string");
        }

        // Numeric options may be reported as either `integer` or `number`.
        let numeric_properties = ["waitFor", "timeout"];
        for prop in &numeric_properties {
            assert!(
                properties[*prop]["type"] == "integer" || properties[*prop]["type"] == "number",
                "Property {} should be numeric",
                prop
            );
        }

        // `actions` is an array of objects with the `Action` fields.
        assert_eq!(properties["actions"]["type"], "array");
        assert!(properties["actions"]["items"].is_object());
        assert_eq!(properties["actions"]["items"]["type"], "object");

        let action_props = &properties["actions"]["items"]["properties"];
        let expected_action_props = [
            "type",
            "selector",
            "text",
            "milliseconds",
            "key",
            "direction",
            "script",
            "fullPage",
        ];
        for prop in &expected_action_props {
            assert!(
                action_props.get(*prop).is_some(),
                "Action property {} not found",
                prop
            );
        }

        // The action `type` must be a flat enum listing every `ActionType` wire name.
        let action_type_prop = &action_props["type"];
        assert!(action_type_prop["enum"].is_array());
        let expected_action_types = [
            "click",
            "type",
            "wait",
            "screenshot",
            "write",
            "press",
            "scroll",
            "scrape",
            "executeJavascript",
        ];

        let action_type_values = action_type_prop["enum"].as_array().unwrap();
        for action_type in &expected_action_types {
            assert!(
                action_type_values
                    .iter()
                    .any(|v| v.as_str() == Some(*action_type)),
                "Action type {} not found in enum",
                action_type
            );
        }

        // `extract` is an object with the `ExtractOptions` fields.
        assert_eq!(properties["extract"]["type"], "object");
        let extract_props = &properties["extract"]["properties"];
        assert!(extract_props.get("schema").is_some());
        assert!(extract_props.get("systemPrompt").is_some());
        assert!(extract_props.get("prompt").is_some());

        // `location` is an object with the `LocationOptions` fields.
        assert_eq!(properties["location"]["type"], "object");
        let location_props = &properties["location"]["properties"];
        assert!(location_props.get("country").is_some());
        assert!(location_props.get("languages").is_some());
        assert_eq!(location_props["languages"]["type"], "array");
        assert_eq!(location_props["languages"]["items"]["type"], "string");

        // Descriptions are taken from the field doc comments and must match verbatim.
        assert_eq!(
            properties["onlyMainContent"]["description"],
            "Extract only the main content, filtering out navigation, footers, etc. (default: `true`)"
        );

        assert_eq!(
            properties["actions"]["description"],
            "List of actions to perform before scraping"
        );

        assert_eq!(
            properties["mobile"]["description"],
            "Use mobile viewport (default: `false`)"
        );

        assert_eq!(
            properties["skipTlsVerification"]["description"],
            "Skip TLS certificate verification (default: `false`)"
        );

        assert_eq!(
            properties["removeBase64Images"]["description"],
            "Remove base64 encoded images from output (default: `false`)"
        );
    }
}
455
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Renders an optional schema value as JSON text so two schemas can be
    /// compared textually.
    fn schema_text(schema: &Option<serde_json::Value>) -> String {
        serde_json::to_string(schema).unwrap()
    }

    /// A full request body (URL plus flattened options) deserializes into the
    /// expected `ScrapeRequestBody`.
    #[test]
    fn test_scrape_request_body_deserialization() {
        let payload = json!({
            "url": "https://example.com",
            "formats": ["markdown"],
            "onlyMainContent": true,
            "includeTags": ["div"],
            "excludeTags": ["img"],
            "headers": {
                "User-Agent": "Custom User Agent"
            },
            "waitFor": 1000,
            "mobile": false,
            "skipTlsVerification": false,
            "timeout": 30000,
            "jsonOptions": {
                "schema": {
                    "type": "object",
                    "properties": {
                        "title": {
                            "type": "string"
                        }
                    }
                },
                "systemPrompt": "Extract data from the page",
                "prompt": "Pull out the title"
            },
            "actions": [
                {
                    "type": "wait",
                    "milliseconds": 2000,
                    "selector": "#my-element"
                }
            ],
            "location": {
                "country": "US",
                "languages": ["en-US"]
            },
            "removeBase64Images": true,
            "blockAds": true,
            "proxy": "basic"
        });

        let mut parsed: ScrapeRequestBody =
            serde_json::from_value(payload).expect("Failed to deserialize ScrapeRequestBody");

        let wanted_headers: HashMap<String, String> = std::iter::once((
            "User-Agent".to_string(),
            "Custom User Agent".to_string(),
        ))
        .collect();

        // Fields not listed here (`language`, `extract`, `parse_pdf`) are expected
        // to be `None`, which is what `Default` supplies.
        let mut wanted = ScrapeRequestBody {
            url: "https://example.com".to_string(),
            options: ScrapeOptions {
                formats: Some(vec![ScrapeFormats::Markdown]),
                only_main_content: Some(true),
                include_tags: Some(vec!["div".to_string()]),
                exclude_tags: Some(vec!["img".to_string()]),
                headers: Some(wanted_headers),
                wait_for: Some(1000),
                mobile: Some(false),
                skip_tls_verification: Some(false),
                timeout: Some(30000),
                json_options: Some(JsonOptions {
                    schema: Some(json!({
                        "type": "object",
                        "properties": {
                            "title": {
                                "type": "string"
                            }
                        }
                    })),
                    system_prompt: Some("Extract data from the page".to_string()),
                    prompt: Some("Pull out the title".to_string()),
                }),
                actions: Some(vec![Action {
                    action_type: ActionType::Wait,
                    milliseconds: Some(2000),
                    selector: Some("#my-element".to_string()),
                    ..Default::default()
                }]),
                location: Some(LocationOptions {
                    country: "US".to_string(),
                    languages: vec!["en-US".to_string()],
                }),
                remove_base64_images: Some(true),
                block_ads: Some(true),
                proxy: Some("basic".to_string()),
                ..Default::default()
            },
        };

        // Pull the JSON options out of both sides; they are compared separately below.
        let parsed_json = parsed.options.json_options.take();
        let wanted_json = wanted.options.json_options.take();

        // Everything except the JSON options.
        assert_eq!(parsed, wanted);

        // JSON options: prompts by value, schema by its serialized text.
        let parsed_json = parsed_json.unwrap();
        let wanted_json = wanted_json.unwrap();

        assert_eq!(parsed_json.system_prompt, wanted_json.system_prompt);
        assert_eq!(parsed_json.prompt, wanted_json.prompt);
        assert_eq!(
            schema_text(&parsed_json.schema),
            schema_text(&wanted_json.schema)
        );
    }

    /// A standalone `JsonOptions` object deserializes with camelCase keys.
    #[test]
    fn test_json_options_deserialization() {
        let payload = json!({
            "schema": {
                "type": "object",
                "properties": {
                    "title": { "type": "string" }
                }
            },
            "systemPrompt": "Custom system prompt for extraction",
            "prompt": "Extract the title from the page"
        });

        let parsed: JsonOptions =
            serde_json::from_value(payload).expect("Failed to deserialize JsonOptions");

        let wanted = JsonOptions {
            schema: Some(json!({
                "type": "object",
                "properties": {
                    "title": { "type": "string" }
                }
            })),
            system_prompt: Some("Custom system prompt for extraction".to_string()),
            prompt: Some("Extract the title from the page".to_string()),
        };

        assert_eq!(parsed.system_prompt, wanted.system_prompt);
        assert_eq!(parsed.prompt, wanted.prompt);
        assert_eq!(schema_text(&parsed.schema), schema_text(&wanted.schema));
    }

    /// `wait`, `click` and `type` actions deserialize with the given fields set
    /// and every other optional field left as `None`.
    #[test]
    fn test_action_deserialization() {
        // (input JSON, expected value, message used if deserialization fails)
        let cases = vec![
            (
                json!({
                    "type": "wait",
                    "milliseconds": 3000,
                    "selector": "#loading"
                }),
                Action {
                    action_type: ActionType::Wait,
                    milliseconds: Some(3000),
                    selector: Some("#loading".to_string()),
                    ..Default::default()
                },
                "Failed to deserialize wait Action",
            ),
            (
                json!({
                    "type": "click",
                    "selector": "#submit-button"
                }),
                Action {
                    action_type: ActionType::Click,
                    selector: Some("#submit-button".to_string()),
                    ..Default::default()
                },
                "Failed to deserialize click Action",
            ),
            (
                json!({
                    "type": "type",
                    "selector": "#search-input",
                    "text": "search query"
                }),
                Action {
                    action_type: ActionType::Type,
                    selector: Some("#search-input".to_string()),
                    text: Some("search query".to_string()),
                    ..Default::default()
                },
                "Failed to deserialize type Action",
            ),
        ];

        for (input, wanted, failure_message) in cases {
            let parsed: Action = serde_json::from_value(input).expect(failure_message);
            assert_eq!(parsed, wanted);
        }
    }
}