1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6#[cfg(feature = "mcp-tool")]
7use schemars::JsonSchema;
8
9use crate::{API_VERSION, FirecrawlApp, FirecrawlError, document::Document};
10
// Output formats that can be listed in `ScrapeOptions::formats`.
// Each variant is (de)serialized under the wire name given by its `#[serde(rename)]`.
#[derive(Deserialize, Serialize, Clone, Copy, Debug, PartialEq, Eq)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
pub enum ScrapeFormats {
    /// Markdown version of the page; wire name `markdown`.
    #[serde(rename = "markdown")]
    Markdown,

    /// HTML of the page; wire name `html`.
    #[serde(rename = "html")]
    HTML,

    /// Raw HTML of the page; wire name `rawHtml`.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Links found on the page; wire name `links`.
    #[serde(rename = "links")]
    Links,

    /// Screenshot of the page; wire name `screenshot`.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Full-page screenshot; wire name `screenshot@fullPage`.
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,

    /// Structured JSON output; wire name `json`.
    ///
    /// NOTE(review): presumably configured through `ScrapeOptions::json_options` — confirm.
    #[serde(rename = "json")]
    JSON,
}
48
// Extraction settings with the same three fields as `JsonOptions`.
// Not referenced elsewhere in this file. Fields left as `None` are omitted when
// serializing (`skip_serializing_none`); keys are camelCase on the wire.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct ExtractOptions {
    /// Schema the output should adhere to, provided in JSON Schema format.
    pub schema: Option<Value>,

    /// System prompt to send to the LLM agent for schema extraction
    pub system_prompt: Option<String>,

    /// Extraction prompt to send to the LLM agent
    pub prompt: Option<String>,
}
63
// Settings for JSON extraction, carried in `ScrapeOptions::json_options` (`jsonOptions` on the
// wire). Fields left as `None` are omitted when serializing (`skip_serializing_none`).
//
// This note is a plain comment on purpose: with the `mcp-tool` feature, `///` doc comments
// become schema descriptions, and `test_scrape_options_schema` pins the exact field
// descriptions below.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct JsonOptions {
    /// Schema the output should adhere to, provided in JSON Schema format.
    pub schema: Option<Value>,

    /// System prompt to send to the LLM agent for schema extraction
    pub system_prompt: Option<String>,

    /// Extraction prompt to send to the LLM agent
    pub prompt: Option<String>,
}
78
// Kind of browser action described by an `Action`; serialized as the `type` key of the action.
//
// Every variant carries an explicit `#[serde(rename)]`, so `rename_all` never has to derive a
// name on its own. `Click` is the `Default` variant.
//
// Plain `//` comments are used here (and none on the variants) on purpose: with the `mcp-tool`
// feature, doc comments would change the generated schema, which `test_scrape_options_schema`
// expects to be a flat string `enum` list.
#[derive(Deserialize, Serialize, Clone, Debug, Default, PartialEq, Eq)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub enum ActionType {
    #[default]
    #[serde(rename = "click")]
    Click,

    #[serde(rename = "type")]
    Type,

    #[serde(rename = "wait")]
    Wait,

    #[serde(rename = "screenshot")]
    Screenshot,

    #[serde(rename = "write")]
    Write,

    #[serde(rename = "press")]
    Press,

    #[serde(rename = "scroll")]
    Scroll,

    #[serde(rename = "scrape")]
    Scrape,

    #[serde(rename = "executeJavascript")]
    ExecuteJavascript,
}
111
// One entry of `ScrapeOptions::actions`: an action type plus the optional parameters that the
// different action types use. Only `type` is mandatory when deserializing; fields left as
// `None` are omitted when serializing (`skip_serializing_none`).
//
// This note is a plain comment on purpose: with the `mcp-tool` feature a struct-level doc
// comment would add a description to the generated `items` schema, which
// `test_scrape_options_schema` does not expect. The field docs below are the descriptions
// that test pins.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct Action {
    /// Type of action to perform
    #[serde(rename = "type")]
    pub action_type: ActionType,

    /// CSS selector for the target element
    pub selector: Option<String>,

    /// Text to write (for write action)
    pub text: Option<String>,

    /// Time to wait in milliseconds (for wait action)
    pub milliseconds: Option<u32>,

    /// Key to press (for press action)
    pub key: Option<String>,

    /// Scroll direction (up or down)
    pub direction: Option<String>,

    /// JavaScript code to execute (for executeJavascript action)
    pub script: Option<String>,

    /// Take full page screenshot (for screenshot action)
    pub full_page: Option<bool>,
}
142
// Location settings carried in `ScrapeOptions::location`. Both fields are mandatory when
// deserializing. The struct has no `Option` fields, so `skip_serializing_none` presumably has
// nothing to act on here.
//
// This note is a plain comment on purpose: with the `mcp-tool` feature, doc comments become
// schema descriptions, and `test_scrape_options_schema` pins the field descriptions below.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct LocationOptions {
    /// Country code for location emulation
    pub country: String,

    /// Language preferences
    pub languages: Vec<String>,
}
154
155#[serde_with::skip_serializing_none]
156#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
157#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
158#[serde(rename_all = "camelCase")]
159pub struct ScrapeOptions {
160 #[cfg_attr(feature = "mcp-tool", schemars(skip))]
162 pub formats: Option<Vec<ScrapeFormats>>,
163
164 pub only_main_content: Option<bool>,
166
167 pub include_tags: Option<Vec<String>>,
169
170 pub exclude_tags: Option<Vec<String>>,
172
173 pub headers: Option<HashMap<String, String>>,
175
176 pub wait_for: Option<u32>,
178
179 pub timeout: Option<u32>,
181
182 #[serde(rename = "jsonOptions")]
184 pub json_options: Option<JsonOptions>,
185
186 #[cfg_attr(feature = "self-host", schemars(skip))]
188 pub location: Option<LocationOptions>,
189
190 #[cfg_attr(feature = "self-host", schemars(skip))]
192 pub actions: Option<Vec<Action>>,
193
194 pub mobile: Option<bool>,
196
197 pub skip_tls_verification: Option<bool>,
199
200 pub remove_base64_images: Option<bool>,
202
203 #[cfg_attr(feature = "mcp-tool", schemars(skip))]
205 pub block_ads: Option<bool>,
206
207 #[cfg_attr(feature = "mcp-tool", schemars(skip))]
209 pub proxy: Option<String>,
210}
211
/// JSON body that `FirecrawlApp::scrape_url` posts to the scrape endpoint.
///
/// The options are flattened, so their keys sit next to `url` in the same JSON object rather
/// than under a nested `options` key.
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ScrapeRequestBody {
    /// URL of the page to scrape.
    pub url: String,

    /// Scrape settings, merged into the top level of the body.
    #[serde(flatten)]
    pub options: ScrapeOptions,
}
221
/// Response envelope of the scrape endpoint as decoded by `FirecrawlApp::scrape_url`.
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
struct ScrapeResponse {
    /// Success flag reported in the response. `scrape_url` does not read it itself.
    success: bool,

    /// The scraped document; this is what `scrape_url` hands back to the caller.
    data: Document,
}
232
// Input type with the same shape as `ScrapeRequestBody` (a `url` plus flattened
// `ScrapeOptions`) that additionally derives `JsonSchema` under the `mcp-tool` feature.
// NOTE(review): presumably the input schema of an MCP tool — confirm against its caller.
// Kept as a plain comment so it does not become a schema description.
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct ScrapeUrlInput {
    /// The URL to scrape
    pub url: String,

    // Flattened: the option keys sit next to `url` in the same JSON object.
    #[serde(flatten)]
    pub options: ScrapeOptions,
}
243
244impl FirecrawlApp {
245 pub async fn scrape_url(
247 &self,
248 url: impl AsRef<str>,
249 options: impl Into<Option<ScrapeOptions>>,
250 ) -> Result<Document, FirecrawlError> {
251 let body = ScrapeRequestBody {
252 url: url.as_ref().to_string(),
253 options: options.into().unwrap_or_default(),
254 };
255
256 let headers = self.prepare_headers(None);
257
258 let response = self
259 .client
260 .post(format!("{}/{}/scrape", self.api_url, API_VERSION))
261 .headers(headers)
262 .json(&body)
263 .send()
264 .await
265 .map_err(|e| FirecrawlError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;
266
267 let response = self
268 .handle_response::<ScrapeResponse>(response, "scrape URL")
269 .await?;
270
271 Ok(response.data)
272 }
273}
274
// Schema regression test; only built for test runs with the `mcp-tool` feature enabled.
#[cfg(all(test, feature = "mcp-tool"))]
mod schema_tests {
    use super::*;
    use async_claude;
    use serde_json::json;

    /// Pins the `properties` section of the schema generated for `ScrapeOptions`.
    ///
    /// The `description` values are expected to come from the doc comments on the
    /// corresponding struct fields, so a wording change there has to be mirrored here.
    /// `formats`, `blockAds` and `proxy` are absent because those fields are marked
    /// `schemars(skip)`. `actions` and `location` are expected to be present, i.e. the test
    /// assumes the `self-host` feature is not enabled.
    #[test]
    fn test_scrape_options_schema() {
        let actual_schema = async_claude::tool::parse_input_schema::<ScrapeOptions>().unwrap();

        // Expected contents of the schema's `properties` object, keys in alphabetical order.
        let expected_schema = json!({
            "actions": {
                "description": "List of actions to perform before scraping",
                "items": {
                    "properties": {
                        "direction": {
                            "description": "Scroll direction (up or down)",
                            "type": "string"
                        },
                        "fullPage": {
                            "description": "Take full page screenshot (for screenshot action)",
                            "type": "boolean"
                        },
                        "key": {
                            "description": "Key to press (for press action)",
                            "type": "string"
                        },
                        "milliseconds": {
                            "description": "Time to wait in milliseconds (for wait action)",
                            "format": "uint32",
                            "minimum": 0.0,
                            "type": "integer"
                        },
                        "script": {
                            "description": "JavaScript code to execute (for executeJavascript action)",
                            "type": "string"
                        },
                        "selector": {
                            "description": "CSS selector for the target element",
                            "type": "string"
                        },
                        "text": {
                            "description": "Text to write (for write action)",
                            "type": "string"
                        },
                        "type": {
                            "description": "Type of action to perform",
                            "enum": [
                                "click",
                                "type",
                                "wait",
                                "screenshot",
                                "write",
                                "press",
                                "scroll",
                                "scrape",
                                "executeJavascript"
                            ],
                            "type": "string"
                        }
                    },
                    "required": [
                        "type"
                    ],
                    "type": "object"
                },
                "type": "array"
            },
            "excludeTags": {
                "description": "HTML tags to exclude from extraction",
                "items": {
                    "type": "string"
                },
                "type": "array"
            },
            "headers": {
                "additionalProperties": {
                    "type": "string"
                },
                "description": "Additional HTTP headers to use when loading the page.",
                "type": "object"
            },
            "includeTags": {
                "description": "HTML tags to specifically include in extraction",
                "items": {
                    "type": "string"
                },
                "type": "array"
            },
            "jsonOptions": {
                "description": "The JSON options to use for the final extract.",
                "properties": {
                    "prompt": {
                        "description": "Extraction prompt to send to the LLM agent",
                        "type": "string"
                    },
                    "schema": {
                        "description": "Schema the output should adhere to, provided in JSON Schema format."
                    },
                    "systemPrompt": {
                        "description": "System prompt to send to the LLM agent for schema extraction",
                        "type": "string"
                    }
                },
                "type": "object"
            },
            "location": {
                "description": "Location settings for scraping",
                "properties": {
                    "country": {
                        "description": "Country code for location emulation",
                        "type": "string"
                    },
                    "languages": {
                        "description": "Language preferences",
                        "items": {
                            "type": "string"
                        },
                        "type": "array"
                    }
                },
                "required": [
                    "country",
                    "languages"
                ],
                "type": "object"
            },
            "mobile": {
                "description": "Use mobile viewport. (default: `false`)",
                "type": "boolean"
            },
            "onlyMainContent": {
                "description": "Extract only the main content, filtering out navigation, footers, etc. (default: `true`)",
                "type": "boolean"
            },
            "removeBase64Images": {
                "description": "Remove base64 encoded images from output. (default: `false`)",
                "type": "boolean"
            },
            "skipTlsVerification": {
                "description": "Skip TLS certificate verification. (default: `false`)",
                "type": "boolean"
            },
            "timeout": {
                "description": "Maximum time in milliseconds to wait for the page to load. (default: `60000`)",
                "format": "uint32",
                "minimum": 0.0,
                "type": "integer"
            },
            "waitFor": {
                "description": "Time in milliseconds to wait for dynamic content to load. (default: `0`)",
                "format": "uint32",
                "minimum": 0.0,
                "type": "integer"
            }
        });

        // Only the `properties` member of the generated schema is compared. Both sides are
        // pretty-printed and compared as strings.
        let actual_json_str = serde_json::to_string_pretty(&actual_schema["properties"]).unwrap();
        let expected_json_str = serde_json::to_string_pretty(&expected_schema).unwrap();

        assert_eq!(
            actual_json_str, expected_json_str,
            "Schema properties don't match"
        );
    }
}
444
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// JSON Schema fixture used as the expected `JsonOptions::schema` value.
    fn title_schema() -> Value {
        json!({
            "type": "object",
            "properties": {
                "title": { "type": "string" }
            }
        })
    }

    /// Serializes the optional schema of `opts` so two schemas can be compared as text.
    fn schema_text(opts: &JsonOptions) -> String {
        serde_json::to_string(&opts.schema).unwrap()
    }

    /// A request body with every option set deserializes into the matching struct values.
    #[test]
    fn test_scrape_request_body_deserialization() {
        let payload = json!({
            "url": "https://example.com",
            "formats": [
                "markdown"
            ],
            "onlyMainContent": true,
            "includeTags": [
                "div"
            ],
            "excludeTags": [
                "img"
            ],
            "headers": {
                "User-Agent": "Custom User Agent"
            },
            "waitFor": 1000,
            "mobile": false,
            "skipTlsVerification": false,
            "timeout": 30000,
            "jsonOptions": {
                "schema": {
                    "type": "object",
                    "properties": {
                        "title": {
                            "type": "string"
                        }
                    }
                },
                "systemPrompt": "Extract data from the page",
                "prompt": "Pull out the title"
            },
            "actions": [
                {
                    "type": "wait",
                    "milliseconds": 2000,
                    "selector": "#my-element"
                }
            ],
            "location": {
                "country": "US",
                "languages": [
                    "en-US"
                ]
            },
            "removeBase64Images": true,
            "blockAds": true,
            "proxy": "basic"
        });

        let mut parsed: ScrapeRequestBody =
            serde_json::from_value(payload).expect("Failed to deserialize ScrapeRequestBody");

        let mut wanted = ScrapeRequestBody {
            url: "https://example.com".to_string(),
            options: ScrapeOptions {
                formats: Some(vec![ScrapeFormats::Markdown]),
                only_main_content: Some(true),
                include_tags: Some(vec!["div".to_string()]),
                exclude_tags: Some(vec!["img".to_string()]),
                headers: Some(HashMap::from([(
                    "User-Agent".to_string(),
                    "Custom User Agent".to_string(),
                )])),
                wait_for: Some(1000),
                timeout: Some(30000),
                json_options: Some(JsonOptions {
                    schema: Some(title_schema()),
                    system_prompt: Some("Extract data from the page".to_string()),
                    prompt: Some("Pull out the title".to_string()),
                }),
                location: Some(LocationOptions {
                    country: "US".to_string(),
                    languages: vec!["en-US".to_string()],
                }),
                actions: Some(vec![Action {
                    action_type: ActionType::Wait,
                    selector: Some("#my-element".to_string()),
                    milliseconds: Some(2000),
                    ..Default::default()
                }]),
                mobile: Some(false),
                skip_tls_verification: Some(false),
                remove_base64_images: Some(true),
                block_ads: Some(true),
                proxy: Some("basic".to_string()),
            },
        };

        // The JSON options are checked field by field further down, so pull them out of both
        // sides and compare everything else in one go.
        let parsed_json = parsed.options.json_options.take();
        let wanted_json = wanted.options.json_options.take();

        assert_eq!(parsed, wanted);

        let parsed_json = parsed_json.unwrap();
        let wanted_json = wanted_json.unwrap();

        assert_eq!(parsed_json.system_prompt, wanted_json.system_prompt);
        assert_eq!(parsed_json.prompt, wanted_json.prompt);
        assert_eq!(schema_text(&parsed_json), schema_text(&wanted_json));
    }

    /// `JsonOptions` on its own deserializes from camelCase keys.
    #[test]
    fn test_json_options_deserialization() {
        let payload = json!({
            "schema": {
                "type": "object",
                "properties": {
                    "title": { "type": "string" }
                }
            },
            "systemPrompt": "Custom system prompt for extraction",
            "prompt": "Extract the title from the page"
        });

        let parsed: JsonOptions =
            serde_json::from_value(payload).expect("Failed to deserialize JsonOptions");

        let wanted = JsonOptions {
            schema: Some(title_schema()),
            system_prompt: Some("Custom system prompt for extraction".to_string()),
            prompt: Some("Extract the title from the page".to_string()),
        };

        assert_eq!(parsed.system_prompt, wanted.system_prompt);
        assert_eq!(parsed.prompt, wanted.prompt);
        assert_eq!(schema_text(&parsed), schema_text(&wanted));
    }

    /// Wait, click and type actions deserialize with unspecified parameters left as `None`.
    #[test]
    fn test_action_deserialization() {
        // (input JSON, message used if deserialization fails, expected value)
        let cases = vec![
            (
                json!({
                    "type": "wait",
                    "milliseconds": 3000,
                    "selector": "#loading"
                }),
                "Failed to deserialize wait Action",
                Action {
                    action_type: ActionType::Wait,
                    selector: Some("#loading".to_string()),
                    milliseconds: Some(3000),
                    ..Default::default()
                },
            ),
            (
                json!({
                    "type": "click",
                    "selector": "#submit-button"
                }),
                "Failed to deserialize click Action",
                Action {
                    action_type: ActionType::Click,
                    selector: Some("#submit-button".to_string()),
                    ..Default::default()
                },
            ),
            (
                json!({
                    "type": "type",
                    "selector": "#search-input",
                    "text": "search query"
                }),
                "Failed to deserialize type Action",
                Action {
                    action_type: ActionType::Type,
                    selector: Some("#search-input".to_string()),
                    text: Some("search query".to_string()),
                    ..Default::default()
                },
            ),
        ];

        for (raw, failure_message, wanted) in cases {
            let parsed: Action = serde_json::from_value(raw).expect(failure_message);
            assert_eq!(parsed, wanted);
        }
    }
}