1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6#[cfg(feature = "mcp_tool")]
7use schemars::JsonSchema;
8
9use crate::{document::Document, FirecrawlApp, FirecrawlError, API_VERSION};
10
/// Output formats that can be listed in [`ScrapeOptions::formats`].
///
/// Every variant carries an explicit `#[serde(rename = ...)]`, so the value on
/// the wire is the string in that attribute, not the Rust variant name.
#[derive(Deserialize, Serialize, Clone, Copy, Debug, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
pub enum ScrapeFormats {
    /// Markdown output; serialized as `"markdown"`.
    #[serde(rename = "markdown")]
    Markdown,

    /// HTML output; serialized as `"html"`.
    #[serde(rename = "html")]
    HTML,

    /// Raw HTML output; serialized as `"rawHtml"`.
    // NOTE(review): presumably the unfiltered page source, as opposed to `HTML` — confirm against the API docs.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Links output; serialized as `"links"`.
    #[serde(rename = "links")]
    Links,

    /// Screenshot output; serialized as `"screenshot"`.
    // NOTE(review): presumably cannot be combined with `ScreenshotFullPage` — confirm against the API docs.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Full-page screenshot output; serialized as `"screenshot@fullPage"`.
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,

    /// JSON output; serialized as `"json"`.
    // NOTE(review): presumably configured through `ScrapeOptions::json_options` — confirm.
    #[serde(rename = "json")]
    JSON,
}
48
/// Optional schema and prompt settings, serialized with camelCase keys.
///
/// All three fields are optional; `skip_serializing_none` leaves `None` fields
/// out of the serialized output instead of writing `null`.
// This struct is field-for-field identical to `JsonOptions` and is not
// referenced anywhere else in this file.
// NOTE(review): presumably kept for callers of an older "extract" option — confirm before removing.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct ExtractOptions {
    /// Optional schema, held as an arbitrary JSON value.
    pub schema: Option<Value>,

    /// Optional system prompt; serialized as `systemPrompt`.
    pub system_prompt: Option<String>,

    /// Optional prompt text; serialized as `prompt`.
    pub prompt: Option<String>,
}
63
// Settings carried in the `jsonOptions` field of `ScrapeOptions`. All fields
// are optional and `skip_serializing_none` omits the ones that are `None`.
//
// The field docs below are `///` on purpose: under the `mcp_tool` feature they
// become the JSON Schema `description` strings that
// `schema_tests::test_scrape_options_schema` compares verbatim. This type-level
// note is a plain `//` comment so it does not add a description of its own.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct JsonOptions {
    /// Schema the output should adhere to, provided in JSON Schema format.
    pub schema: Option<Value>,

    /// System prompt to send to the LLM agent for schema extraction
    pub system_prompt: Option<String>,

    /// Extraction prompt to send to the LLM agent
    pub prompt: Option<String>,
}
78
79#[derive(Deserialize, Serialize, Clone, Debug, Default, PartialEq, Eq)]
80#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
81#[serde(rename_all = "camelCase")]
82pub enum ActionType {
83 #[default]
84 #[serde(rename = "click")]
85 Click,
86
87 #[serde(rename = "type")]
88 Type,
89
90 #[serde(rename = "wait")]
91 Wait,
92
93 #[serde(rename = "screenshot")]
94 Screenshot,
95
96 #[serde(rename = "write")]
97 Write,
98
99 #[serde(rename = "press")]
100 Press,
101
102 #[serde(rename = "scroll")]
103 Scroll,
104
105 #[serde(rename = "scrape")]
106 Scrape,
107
108 #[serde(rename = "executeJavascript")]
109 ExecuteJavascript,
110}
111
// One entry of `ScrapeOptions::actions`: an action kind plus the optional
// parameters that individual kinds use. Only `action_type` is required;
// `skip_serializing_none` omits every parameter that is `None`.
//
// The field docs below are `///` on purpose: under the `mcp_tool` feature they
// become the JSON Schema `description` strings that
// `schema_tests::test_scrape_options_schema` compares verbatim. This type-level
// note stays a plain `//` comment because that test expects the array item
// schema to carry no description of its own.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct Action {
    /// Type of action to perform
    // Explicit rename: `rename_all` alone would produce `actionType`, not `type`.
    #[serde(rename = "type")]
    pub action_type: ActionType,

    /// CSS selector for the target element
    pub selector: Option<String>,

    /// Text to write (for write action)
    pub text: Option<String>,

    /// Time to wait in milliseconds (for wait action)
    pub milliseconds: Option<u32>,

    /// Key to press (for press action)
    pub key: Option<String>,

    /// Scroll direction (up or down)
    pub direction: Option<String>,

    /// JavaScript code to execute (for executeJavascript action)
    pub script: Option<String>,

    /// Take full page screenshot (for screenshot action)
    pub full_page: Option<bool>,
}
142
// Value of `ScrapeOptions::location`. Both fields are required (neither is an
// `Option`), which matches the `required` list the schema test expects.
// NOTE(review): with no `Option` fields, `skip_serializing_none` appears to
// have nothing to act on here — confirm it is only kept for uniformity.
//
// The field docs below are `///` on purpose: under the `mcp_tool` feature they
// become the JSON Schema `description` strings that
// `schema_tests::test_scrape_options_schema` compares verbatim.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct LocationOptions {
    /// Country code for location emulation
    pub country: String,

    /// Language preferences
    pub languages: Vec<String>,
}
154
// Optional settings for a scrape request. `FirecrawlApp::scrape_url` flattens
// this struct into the request body next to `url`. Every field is optional,
// keys are camelCase on the wire, and `skip_serializing_none` omits fields
// that are `None`.
//
// The field docs below are `///` on purpose: under the `mcp_tool` feature they
// become the JSON Schema `description` strings that
// `schema_tests::test_scrape_options_schema` compares verbatim, so their text
// must stay in sync with that test. Fields marked `schemars(skip)` are left
// out of the generated schema entirely.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct ScrapeOptions {
    /// Output formats to request. Excluded from the generated JSON Schema.
    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
    pub formats: Option<Vec<ScrapeFormats>>,

    /// Extract only the main content, filtering out navigation, footers, etc. (default: `true`)
    pub only_main_content: Option<bool>,

    /// HTML tags to specifically include in extraction
    pub include_tags: Option<Vec<String>>,

    /// HTML tags to exclude from extraction
    pub exclude_tags: Option<Vec<String>>,

    /// Additional HTTP headers to use when loading the page.
    pub headers: Option<HashMap<String, String>>,

    /// Time in milliseconds to wait for dynamic content to load. (default: `0`)
    pub wait_for: Option<u32>,

    /// Maximum time in milliseconds to wait for the page to load. (default: `60000`)
    pub timeout: Option<u32>,

    /// The JSON options to use for the final extract.
    // The explicit rename yields the same key (`jsonOptions`) that
    // `rename_all = "camelCase"` would produce for this field.
    #[serde(rename = "jsonOptions")]
    pub json_options: Option<JsonOptions>,

    /// Location settings for scraping
    pub location: Option<LocationOptions>,

    /// List of actions to perform before scraping
    pub actions: Option<Vec<Action>>,

    /// Use mobile viewport. (default: `false`)
    pub mobile: Option<bool>,

    /// Skip TLS certificate verification. (default: `false`)
    pub skip_tls_verification: Option<bool>,

    /// Remove base64 encoded images from output. (default: `false`)
    pub remove_base64_images: Option<bool>,

    /// Ad-blocking toggle, sent as `blockAds`. Excluded from the generated JSON Schema.
    // NOTE(review): the server-side default is not visible in this file — confirm before documenting one.
    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
    pub block_ads: Option<bool>,

    /// Proxy selection, passed through as a free-form string (the unit tests use `"basic"`).
    /// Excluded from the generated JSON Schema.
    // NOTE(review): the set of accepted values is not visible in this file — confirm against the API docs.
    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
    pub proxy: Option<String>,
}
209
/// JSON body that [`FirecrawlApp::scrape_url`] posts to the scrape endpoint.
///
/// Because `options` is flattened, the serialized object holds `url` and every
/// set [`ScrapeOptions`] field side by side at the top level.
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ScrapeRequestBody {
    /// URL to scrape.
    pub url: String,

    /// Scrape settings, merged into the top level of the body via `#[serde(flatten)]`.
    #[serde(flatten)]
    pub options: ScrapeOptions,
}
219
/// Response envelope that [`FirecrawlApp::scrape_url`] asks `handle_response`
/// to deserialize; only `data` is handed back to the caller.
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
struct ScrapeResponse {
    /// Success flag from the response; `scrape_url` itself never reads it.
    // NOTE(review): presumably inspected (or made redundant) by `handle_response` — confirm.
    success: bool,

    /// The scraped document returned by `scrape_url`.
    data: Document,
}
230
// Public input type pairing a URL with flattened `ScrapeOptions`. It has the
// same two fields as `ScrapeRequestBody` but additionally derives `JsonSchema`
// under the `mcp_tool` feature. Nothing else in this file references it.
// NOTE(review): presumably the input schema for an MCP "scrape URL" tool — confirm against its caller.
//
// Plain `//` comments are used so the derived JSON Schema (which would pick up
// rustdoc text as descriptions) is left exactly as it was.
#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
#[serde(rename_all = "camelCase")]
pub struct ScrapeUrlInput {
    // Required URL string; the only non-optional part of the input.
    pub url: String,

    // Scrape settings, merged into the same JSON object as `url` via `#[serde(flatten)]`.
    #[serde(flatten)]
    pub options: ScrapeOptions,
}
241
242impl FirecrawlApp {
243 pub async fn scrape_url(
245 &self,
246 url: impl AsRef<str>,
247 options: impl Into<Option<ScrapeOptions>>,
248 ) -> Result<Document, FirecrawlError> {
249 let body = ScrapeRequestBody {
250 url: url.as_ref().to_string(),
251 options: options.into().unwrap_or_default(),
252 };
253
254 let headers = self.prepare_headers(None);
255
256 let response = self
257 .client
258 .post(format!("{}/{}/scrape", self.api_url, API_VERSION))
259 .headers(headers)
260 .json(&body)
261 .send()
262 .await
263 .map_err(|e| FirecrawlError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;
264
265 let response = self
266 .handle_response::<ScrapeResponse>(response, "scrape URL")
267 .await?;
268
269 Ok(response.data)
270 }
271}
272
#[cfg(all(test, feature = "mcp_tool"))]
mod schema_tests {
    use super::*;
    use async_claude;
    use serde_json::json;

    /// Pins the `properties` section of the JSON Schema generated for
    /// `ScrapeOptions`, including every field description.
    ///
    /// Fields marked `schemars(skip)` (`formats`, `blockAds`, `proxy`) are
    /// intentionally absent from the expected value.
    #[test]
    fn test_scrape_options_schema() {
        let generated = async_claude::tool::parse_input_schema::<ScrapeOptions>().unwrap();

        let expected_properties = json!({
            "actions": {
                "description": "List of actions to perform before scraping",
                "items": {
                    "properties": {
                        "direction": { "description": "Scroll direction (up or down)", "type": "string" },
                        "fullPage": { "description": "Take full page screenshot (for screenshot action)", "type": "boolean" },
                        "key": { "description": "Key to press (for press action)", "type": "string" },
                        "milliseconds": {
                            "description": "Time to wait in milliseconds (for wait action)",
                            "format": "uint32",
                            "minimum": 0.0,
                            "type": "integer"
                        },
                        "script": { "description": "JavaScript code to execute (for executeJavascript action)", "type": "string" },
                        "selector": { "description": "CSS selector for the target element", "type": "string" },
                        "text": { "description": "Text to write (for write action)", "type": "string" },
                        "type": {
                            "description": "Type of action to perform",
                            "enum": [
                                "click", "type", "wait", "screenshot", "write",
                                "press", "scroll", "scrape", "executeJavascript"
                            ],
                            "type": "string"
                        }
                    },
                    "required": ["type"],
                    "type": "object"
                },
                "type": "array"
            },
            "excludeTags": {
                "description": "HTML tags to exclude from extraction",
                "items": { "type": "string" },
                "type": "array"
            },
            "headers": {
                "additionalProperties": { "type": "string" },
                "description": "Additional HTTP headers to use when loading the page.",
                "type": "object"
            },
            "includeTags": {
                "description": "HTML tags to specifically include in extraction",
                "items": { "type": "string" },
                "type": "array"
            },
            "jsonOptions": {
                "description": "The JSON options to use for the final extract.",
                "properties": {
                    "prompt": { "description": "Extraction prompt to send to the LLM agent", "type": "string" },
                    "schema": { "description": "Schema the output should adhere to, provided in JSON Schema format." },
                    "systemPrompt": { "description": "System prompt to send to the LLM agent for schema extraction", "type": "string" }
                },
                "type": "object"
            },
            "location": {
                "description": "Location settings for scraping",
                "properties": {
                    "country": { "description": "Country code for location emulation", "type": "string" },
                    "languages": {
                        "description": "Language preferences",
                        "items": { "type": "string" },
                        "type": "array"
                    }
                },
                "required": ["country", "languages"],
                "type": "object"
            },
            "mobile": { "description": "Use mobile viewport. (default: `false`)", "type": "boolean" },
            "onlyMainContent": {
                "description": "Extract only the main content, filtering out navigation, footers, etc. (default: `true`)",
                "type": "boolean"
            },
            "removeBase64Images": { "description": "Remove base64 encoded images from output. (default: `false`)", "type": "boolean" },
            "skipTlsVerification": { "description": "Skip TLS certificate verification. (default: `false`)", "type": "boolean" },
            "timeout": {
                "description": "Maximum time in milliseconds to wait for the page to load. (default: `60000`)",
                "format": "uint32",
                "minimum": 0.0,
                "type": "integer"
            },
            "waitFor": {
                "description": "Time in milliseconds to wait for dynamic content to load. (default: `0`)",
                "format": "uint32",
                "minimum": 0.0,
                "type": "integer"
            }
        });

        // Both sides are compared as pretty-printed text, so a failure prints a
        // line-oriented rendering of the two schemas.
        let generated_text = serde_json::to_string_pretty(&generated["properties"]).unwrap();
        let expected_text = serde_json::to_string_pretty(&expected_properties).unwrap();

        assert_eq!(
            generated_text, expected_text,
            "Schema properties don't match"
        );
    }
}
442
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Expected-side schema fixture: an object with a single string `title` property.
    fn title_schema() -> serde_json::Value {
        json!({
            "type": "object",
            "properties": {
                "title": { "type": "string" }
            }
        })
    }

    /// Renders an optional schema as compact JSON so two schemas can be compared as text.
    fn schema_text(schema: &Option<serde_json::Value>) -> String {
        serde_json::to_string(schema).unwrap()
    }

    /// Deserializes an [`Action`] from `raw`, panicking with `failure` if that is not possible.
    fn parse_action(raw: serde_json::Value, failure: &str) -> Action {
        serde_json::from_value(raw).expect(failure)
    }

    /// A fully populated request body deserializes into the matching
    /// `ScrapeRequestBody`, with the flattened options landing in `options`.
    #[test]
    fn test_scrape_request_body_deserialization() {
        let payload = json!({
            "url": "https://example.com",
            "formats": ["markdown"],
            "onlyMainContent": true,
            "includeTags": ["div"],
            "excludeTags": ["img"],
            "headers": {
                "User-Agent": "Custom User Agent"
            },
            "waitFor": 1000,
            "mobile": false,
            "skipTlsVerification": false,
            "timeout": 30000,
            "jsonOptions": {
                "schema": {
                    "type": "object",
                    "properties": {
                        "title": { "type": "string" }
                    }
                },
                "systemPrompt": "Extract data from the page",
                "prompt": "Pull out the title"
            },
            "actions": [
                {
                    "type": "wait",
                    "milliseconds": 2000,
                    "selector": "#my-element"
                }
            ],
            "location": {
                "country": "US",
                "languages": ["en-US"]
            },
            "removeBase64Images": true,
            "blockAds": true,
            "proxy": "basic"
        });

        let mut parsed: ScrapeRequestBody =
            serde_json::from_value(payload).expect("Failed to deserialize ScrapeRequestBody");

        let mut expected = ScrapeRequestBody {
            url: "https://example.com".to_string(),
            options: ScrapeOptions {
                formats: Some(vec![ScrapeFormats::Markdown]),
                only_main_content: Some(true),
                include_tags: Some(vec!["div".to_string()]),
                exclude_tags: Some(vec!["img".to_string()]),
                headers: Some(HashMap::from([(
                    "User-Agent".to_string(),
                    "Custom User Agent".to_string(),
                )])),
                wait_for: Some(1000),
                timeout: Some(30000),
                json_options: Some(JsonOptions {
                    schema: Some(title_schema()),
                    system_prompt: Some("Extract data from the page".to_string()),
                    prompt: Some("Pull out the title".to_string()),
                }),
                location: Some(LocationOptions {
                    country: "US".to_string(),
                    languages: vec!["en-US".to_string()],
                }),
                actions: Some(vec![Action {
                    action_type: ActionType::Wait,
                    selector: Some("#my-element".to_string()),
                    milliseconds: Some(2000),
                    ..Action::default()
                }]),
                mobile: Some(false),
                skip_tls_verification: Some(false),
                remove_base64_images: Some(true),
                block_ads: Some(true),
                proxy: Some("basic".to_string()),
            },
        };

        // `json_options` is checked on its own below, so move it out of both
        // sides and compare everything else first.
        let parsed_json = parsed.options.json_options.take();
        let expected_json = expected.options.json_options.take();

        assert_eq!(parsed, expected);

        let parsed_json = parsed_json.unwrap();
        let expected_json = expected_json.unwrap();

        assert_eq!(parsed_json.system_prompt, expected_json.system_prompt);
        assert_eq!(parsed_json.prompt, expected_json.prompt);
        assert_eq!(
            schema_text(&parsed_json.schema),
            schema_text(&expected_json.schema)
        );
    }

    /// `JsonOptions` reads `schema`, `systemPrompt` and `prompt` from camelCase JSON.
    #[test]
    fn test_json_options_deserialization() {
        let payload = json!({
            "schema": {
                "type": "object",
                "properties": {
                    "title": { "type": "string" }
                }
            },
            "systemPrompt": "Custom system prompt for extraction",
            "prompt": "Extract the title from the page"
        });

        let parsed: JsonOptions =
            serde_json::from_value(payload).expect("Failed to deserialize JsonOptions");

        let expected = JsonOptions {
            schema: Some(title_schema()),
            system_prompt: Some("Custom system prompt for extraction".to_string()),
            prompt: Some("Extract the title from the page".to_string()),
        };

        assert_eq!(parsed.system_prompt, expected.system_prompt);
        assert_eq!(parsed.prompt, expected.prompt);
        assert_eq!(schema_text(&parsed.schema), schema_text(&expected.schema));
    }

    /// `wait`, `click` and `type` actions deserialize with only the supplied
    /// parameters set; every other optional field stays `None`.
    #[test]
    fn test_action_deserialization() {
        let wait = parse_action(
            json!({
                "type": "wait",
                "milliseconds": 3000,
                "selector": "#loading"
            }),
            "Failed to deserialize wait Action",
        );
        assert_eq!(
            wait,
            Action {
                action_type: ActionType::Wait,
                selector: Some("#loading".to_string()),
                milliseconds: Some(3000),
                ..Action::default()
            }
        );

        let click = parse_action(
            json!({
                "type": "click",
                "selector": "#submit-button"
            }),
            "Failed to deserialize click Action",
        );
        assert_eq!(
            click,
            Action {
                action_type: ActionType::Click,
                selector: Some("#submit-button".to_string()),
                ..Action::default()
            }
        );

        let typing = parse_action(
            json!({
                "type": "type",
                "selector": "#search-input",
                "text": "search query"
            }),
            "Failed to deserialize type Action",
        );
        assert_eq!(
            typing,
            Action {
                action_type: ActionType::Type,
                selector: Some("#search-input".to_string()),
                text: Some("search query".to_string()),
                ..Action::default()
            }
        );
    }
}