firecrawl_sdk/
scrape.rs

1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6#[cfg(feature = "mcp-tool")]
7use schemars::JsonSchema;
8
9use crate::{API_VERSION, FirecrawlApp, FirecrawlError, document::Document};
10
11#[derive(Deserialize, Serialize, Clone, Copy, Debug, PartialEq, Eq)]
12#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
13pub enum ScrapeFormats {
14    /// Will result in a copy of the Markdown content of the page.
15    #[serde(rename = "markdown")]
16    Markdown,
17
18    /// Will result in a copy of the filtered, content-only HTML of the page.
19    #[serde(rename = "html")]
20    HTML,
21
22    /// Will result in a copy of the raw HTML of the page.
23    #[serde(rename = "rawHtml")]
24    RawHTML,
25
26    /// Will result in a Vec of URLs found on the page.
27    #[serde(rename = "links")]
28    Links,
29
30    /// Will result in a URL to a screenshot of the page.
31    ///
32    /// Can not be used in conjunction with `ScrapeFormats::ScreenshotFullPage`.
33    #[serde(rename = "screenshot")]
34    Screenshot,
35
36    /// Will result in a URL to a full-page screenshot of the page.
37    ///
38    /// Can not be used in conjunction with `ScrapeFormats::Screenshot`.
39    #[serde(rename = "screenshot@fullPage")]
40    ScreenshotFullPage,
41
42    /// Will result in structured JSON data based on the schema provided in `jsonOptions`.
43    ///
44    /// See `ScrapeOptions.json_options` for more options.
45    #[serde(rename = "json")]
46    JSON,
47}
48
49#[serde_with::skip_serializing_none]
50#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
51#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
52#[serde(rename_all = "camelCase")]
53pub struct ExtractOptions {
54    /// Schema for structured data extraction
55    pub schema: Option<Value>,
56
57    /// System prompt for LLM extraction
58    pub system_prompt: Option<String>,
59
60    /// User prompt for LLM extraction
61    pub prompt: Option<String>,
62}
63
64#[serde_with::skip_serializing_none]
65#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
66#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
67#[serde(rename_all = "camelCase")]
68pub struct JsonOptions {
69    /// Schema the output should adhere to, provided in JSON Schema format.
70    pub schema: Option<Value>,
71
72    /// System prompt to send to the LLM agent for schema extraction
73    pub system_prompt: Option<String>,
74
75    /// Extraction prompt to send to the LLM agent
76    pub prompt: Option<String>,
77}
78
79#[derive(Deserialize, Serialize, Clone, Debug, Default, PartialEq, Eq)]
80#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
81#[serde(rename_all = "camelCase")]
82pub enum ActionType {
83    #[default]
84    #[serde(rename = "click")]
85    Click,
86
87    #[serde(rename = "type")]
88    Type,
89
90    #[serde(rename = "wait")]
91    Wait,
92
93    #[serde(rename = "screenshot")]
94    Screenshot,
95
96    #[serde(rename = "write")]
97    Write,
98
99    #[serde(rename = "press")]
100    Press,
101
102    #[serde(rename = "scroll")]
103    Scroll,
104
105    #[serde(rename = "scrape")]
106    Scrape,
107
108    #[serde(rename = "executeJavascript")]
109    ExecuteJavascript,
110}
111
112#[serde_with::skip_serializing_none]
113#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
114#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
115#[serde(rename_all = "camelCase")]
116pub struct Action {
117    /// Type of action to perform
118    #[serde(rename = "type")]
119    pub action_type: ActionType,
120
121    /// CSS selector for the target element
122    pub selector: Option<String>,
123
124    /// Text to write (for write action)
125    pub text: Option<String>,
126
127    /// Time to wait in milliseconds (for wait action)
128    pub milliseconds: Option<u32>,
129
130    /// Key to press (for press action)
131    pub key: Option<String>,
132
133    /// Scroll direction (up or down)
134    pub direction: Option<String>,
135
136    /// JavaScript code to execute (for executeJavascript action)
137    pub script: Option<String>,
138
139    /// Take full page screenshot (for screenshot action)
140    pub full_page: Option<bool>,
141}
142
143#[serde_with::skip_serializing_none]
144#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
145#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
146#[serde(rename_all = "camelCase")]
147pub struct LocationOptions {
148    /// Country code for location emulation
149    pub country: String,
150
151    /// Language preferences
152    pub languages: Vec<String>,
153}
154
155#[serde_with::skip_serializing_none]
156#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
157#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
158#[serde(rename_all = "camelCase")]
159pub struct ScrapeOptions {
160    /// Content formats to extract (default: ['markdown'])
161    #[cfg_attr(feature = "mcp-tool", schemars(skip))]
162    pub formats: Option<Vec<ScrapeFormats>>,
163
164    /// Extract only the main content, filtering out navigation, footers, etc. (default: `true`)
165    pub only_main_content: Option<bool>,
166
167    /// HTML tags to specifically include in extraction
168    pub include_tags: Option<Vec<String>>,
169
170    /// HTML tags to exclude from extraction
171    pub exclude_tags: Option<Vec<String>>,
172
173    /// Additional HTTP headers to use when loading the page.
174    pub headers: Option<HashMap<String, String>>,
175
176    /// Time in milliseconds to wait for dynamic content to load. (default: `0`)
177    pub wait_for: Option<u32>,
178
179    /// Maximum time in milliseconds to wait for the page to load. (default: `60000`)
180    pub timeout: Option<u32>,
181
182    /// The JSON options to use for the final extract.
183    #[serde(rename = "jsonOptions")]
184    pub json_options: Option<JsonOptions>,
185
186    /// Location settings for scraping
187    #[cfg_attr(feature = "self-host", schemars(skip))]
188    pub location: Option<LocationOptions>,
189
190    /// List of actions to perform before scraping
191    #[cfg_attr(feature = "self-host", schemars(skip))]
192    pub actions: Option<Vec<Action>>,
193
194    /// Use mobile viewport. (default: `false`)
195    pub mobile: Option<bool>,
196
197    /// Skip TLS certificate verification. (default: `false`)
198    pub skip_tls_verification: Option<bool>,
199
200    /// Remove base64 encoded images from output. (default: `false`)
201    pub remove_base64_images: Option<bool>,
202
203    /// Block ads during page loading (default: `true`)
204    #[cfg_attr(feature = "mcp-tool", schemars(skip))]
205    pub block_ads: Option<bool>,
206
207    /// Proxy configuration to use (values: "none", "basic", "residential") (default: `"none"`)
208    #[cfg_attr(feature = "mcp-tool", schemars(skip))]
209    pub proxy: Option<String>,
210}
211
212#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
213#[serde(rename_all = "camelCase")]
214pub struct ScrapeRequestBody {
215    /// The URL to scrape
216    pub url: String,
217
218    #[serde(flatten)]
219    pub options: ScrapeOptions,
220}
221
222#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
223#[serde(rename_all = "camelCase")]
224struct ScrapeResponse {
225    /// This will always be `true` due to `FirecrawlApp::handle_response`.
226    /// No need to expose.
227    success: bool,
228
229    /// The resulting document.
230    data: Document,
231}
232
233#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
234#[cfg_attr(feature = "mcp-tool", derive(JsonSchema))]
235#[serde(rename_all = "camelCase")]
236pub struct ScrapeUrlInput {
237    /// The URL to scrape
238    pub url: String,
239
240    #[serde(flatten)]
241    pub options: ScrapeOptions,
242}
243
244impl FirecrawlApp {
245    /// Scrapes a URL using the Firecrawl API.
246    pub async fn scrape_url(
247        &self,
248        url: impl AsRef<str>,
249        options: impl Into<Option<ScrapeOptions>>,
250    ) -> Result<Document, FirecrawlError> {
251        let body = ScrapeRequestBody {
252            url: url.as_ref().to_string(),
253            options: options.into().unwrap_or_default(),
254        };
255
256        let headers = self.prepare_headers(None);
257
258        let response = self
259            .client
260            .post(format!("{}/{}/scrape", self.api_url, API_VERSION))
261            .headers(headers)
262            .json(&body)
263            .send()
264            .await
265            .map_err(|e| FirecrawlError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;
266
267        let response = self
268            .handle_response::<ScrapeResponse>(response, "scrape URL")
269            .await?;
270
271        Ok(response.data)
272    }
273}
274
#[cfg(all(test, feature = "mcp-tool"))]
mod schema_tests {
    use super::*;
    use serde_json::json;

    /// The `properties` object the generated [`ScrapeOptions`] schema is expected
    /// to contain. Fields marked `schemars(skip)` (`formats`, `blockAds`, `proxy`)
    /// are deliberately absent.
    fn expected_scrape_options_properties() -> serde_json::Value {
        json!({
            "actions": {
                "description": "List of actions to perform before scraping",
                "items": {
                    "properties": {
                        "direction": {
                            "description": "Scroll direction (up or down)",
                            "type": "string"
                        },
                        "fullPage": {
                            "description": "Take full page screenshot (for screenshot action)",
                            "type": "boolean"
                        },
                        "key": {
                            "description": "Key to press (for press action)",
                            "type": "string"
                        },
                        "milliseconds": {
                            "description": "Time to wait in milliseconds (for wait action)",
                            "format": "uint32",
                            "minimum": 0.0,
                            "type": "integer"
                        },
                        "script": {
                            "description": "JavaScript code to execute (for executeJavascript action)",
                            "type": "string"
                        },
                        "selector": {
                            "description": "CSS selector for the target element",
                            "type": "string"
                        },
                        "text": {
                            "description": "Text to write (for write action)",
                            "type": "string"
                        },
                        "type": {
                            "description": "Type of action to perform",
                            "enum": [
                                "click",
                                "type",
                                "wait",
                                "screenshot",
                                "write",
                                "press",
                                "scroll",
                                "scrape",
                                "executeJavascript"
                            ],
                            "type": "string"
                        }
                    },
                    "required": [
                        "type"
                    ],
                    "type": "object"
                },
                "type": "array"
            },
            "excludeTags": {
                "description": "HTML tags to exclude from extraction",
                "items": {
                    "type": "string"
                },
                "type": "array"
            },
            "headers": {
                "additionalProperties": {
                    "type": "string"
                },
                "description": "Additional HTTP headers to use when loading the page.",
                "type": "object"
            },
            "includeTags": {
                "description": "HTML tags to specifically include in extraction",
                "items": {
                    "type": "string"
                },
                "type": "array"
            },
            "jsonOptions": {
                "description": "The JSON options to use for the final extract.",
                "properties": {
                    "prompt": {
                        "description": "Extraction prompt to send to the LLM agent",
                        "type": "string"
                    },
                    "schema": {
                        "description": "Schema the output should adhere to, provided in JSON Schema format."
                    },
                    "systemPrompt": {
                        "description": "System prompt to send to the LLM agent for schema extraction",
                        "type": "string"
                    }
                },
                "type": "object"
            },
            "location": {
                "description": "Location settings for scraping",
                "properties": {
                    "country": {
                        "description": "Country code for location emulation",
                        "type": "string"
                    },
                    "languages": {
                        "description": "Language preferences",
                        "items": {
                            "type": "string"
                        },
                        "type": "array"
                    }
                },
                "required": [
                    "country",
                    "languages"
                ],
                "type": "object"
            },
            "mobile": {
                "description": "Use mobile viewport. (default: `false`)",
                "type": "boolean"
            },
            "onlyMainContent": {
                "description": "Extract only the main content, filtering out navigation, footers, etc. (default: `true`)",
                "type": "boolean"
            },
            "removeBase64Images": {
                "description": "Remove base64 encoded images from output. (default: `false`)",
                "type": "boolean"
            },
            "skipTlsVerification": {
                "description": "Skip TLS certificate verification. (default: `false`)",
                "type": "boolean"
            },
            "timeout": {
                "description": "Maximum time in milliseconds to wait for the page to load. (default: `60000`)",
                "format": "uint32",
                "minimum": 0.0,
                "type": "integer"
            },
            "waitFor": {
                "description": "Time in milliseconds to wait for dynamic content to load. (default: `0`)",
                "format": "uint32",
                "minimum": 0.0,
                "type": "integer"
            }
        })
    }

    /// Pins the `properties` section of the schema generated for
    /// [`ScrapeOptions`], including the description text taken from the field
    /// doc comments.
    #[test]
    fn test_scrape_options_schema() {
        let generated = async_claude::tool::parse_input_schema::<ScrapeOptions>().unwrap();
        let expected = expected_scrape_options_properties();

        // Both sides are rendered to pretty-printed JSON text and compared as strings.
        let generated_text = serde_json::to_string_pretty(&generated["properties"]).unwrap();
        let expected_text = serde_json::to_string_pretty(&expected).unwrap();

        assert_eq!(
            generated_text, expected_text,
            "Schema properties don't match"
        );
    }
}
444
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// A fully populated camelCase request object must deserialize into the
    /// equivalent [`ScrapeRequestBody`], with the flattened option keys landing
    /// in `options`.
    #[test]
    fn test_scrape_request_body_deserialization() {
        let json_data = json!({
            "url": "https://example.com",
            "formats": [
                "markdown"
            ],
            "onlyMainContent": true,
            "includeTags": [
                "div"
            ],
            "excludeTags": [
                "img"
            ],
            "headers": {
                "User-Agent": "Custom User Agent"
            },
            "waitFor": 1000,
            "mobile": false,
            "skipTlsVerification": false,
            "timeout": 30000,
            "jsonOptions": {
                "schema": {
                    "type": "object",
                    "properties": {
                        "title": {
                            "type": "string"
                        }
                    }
                },
                "systemPrompt": "Extract data from the page",
                "prompt": "Pull out the title"
            },
            "actions": [
                {
                    "type": "wait",
                    "milliseconds": 2000,
                    "selector": "#my-element"
                }
            ],
            "location": {
                "country": "US",
                "languages": [
                    "en-US"
                ]
            },
            "removeBase64Images": true,
            "blockAds": true,
            "proxy": "basic"
        });

        let req_body: ScrapeRequestBody =
            serde_json::from_value(json_data).expect("Failed to deserialize ScrapeRequestBody");

        let expected_req_body = ScrapeRequestBody {
            url: "https://example.com".to_string(),
            options: ScrapeOptions {
                formats: Some(vec![ScrapeFormats::Markdown]),
                include_tags: Some(vec!["div".to_string()]),
                exclude_tags: Some(vec!["img".to_string()]),
                only_main_content: Some(true),
                headers: Some(HashMap::from([(
                    "User-Agent".to_string(),
                    "Custom User Agent".to_string(),
                )])),
                wait_for: Some(1000),
                mobile: Some(false),
                skip_tls_verification: Some(false),
                timeout: Some(30000),
                json_options: Some(JsonOptions {
                    schema: Some(json!({
                        "type": "object",
                        "properties": {
                            "title": { "type": "string" }
                        }
                    })),
                    system_prompt: Some("Extract data from the page".to_string()),
                    prompt: Some("Pull out the title".to_string()),
                }),
                actions: Some(vec![Action {
                    action_type: ActionType::Wait,
                    milliseconds: Some(2000),
                    selector: Some("#my-element".to_string()),
                    // All remaining optional fields are expected to be `None`.
                    ..Action::default()
                }]),
                location: Some(LocationOptions {
                    country: "US".to_string(),
                    languages: vec!["en-US".to_string()],
                }),
                remove_base64_images: Some(true),
                block_ads: Some(true),
                proxy: Some("basic".to_string()),
            },
        };

        // `serde_json::Value` implements `PartialEq`, so the derived equality on
        // the whole struct also covers `json_options.schema`; no field-by-field
        // or string-based comparison is needed.
        assert_eq!(req_body, expected_req_body);
    }

    /// `JsonOptions` must accept camelCase keys and keep the schema value intact.
    #[test]
    fn test_json_options_deserialization() {
        let json_data = json!({
            "schema": {
                "type": "object",
                "properties": {
                    "title": { "type": "string" }
                }
            },
            "systemPrompt": "Custom system prompt for extraction",
            "prompt": "Extract the title from the page"
        });

        let json_options: JsonOptions =
            serde_json::from_value(json_data).expect("Failed to deserialize JsonOptions");

        let expected_json_options = JsonOptions {
            schema: Some(json!({
                "type": "object",
                "properties": {
                    "title": { "type": "string" }
                }
            })),
            system_prompt: Some("Custom system prompt for extraction".to_string()),
            prompt: Some("Extract the title from the page".to_string()),
        };

        // Derived equality compares the `Value` schema structurally.
        assert_eq!(json_options, expected_json_options);
    }

    /// Each action kind must map its `type` string to the right [`ActionType`]
    /// and leave every parameter that was not supplied as `None`.
    #[test]
    fn test_action_deserialization() {
        // Wait action
        let wait_action: Action = serde_json::from_value(json!({
            "type": "wait",
            "milliseconds": 3000,
            "selector": "#loading"
        }))
        .expect("Failed to deserialize wait Action");

        assert_eq!(
            wait_action,
            Action {
                action_type: ActionType::Wait,
                milliseconds: Some(3000),
                selector: Some("#loading".to_string()),
                ..Action::default()
            }
        );

        // Click action
        let click_action: Action = serde_json::from_value(json!({
            "type": "click",
            "selector": "#submit-button"
        }))
        .expect("Failed to deserialize click Action");

        assert_eq!(
            click_action,
            Action {
                action_type: ActionType::Click,
                selector: Some("#submit-button".to_string()),
                ..Action::default()
            }
        );

        // Type action
        let type_action: Action = serde_json::from_value(json!({
            "type": "type",
            "selector": "#search-input",
            "text": "search query"
        }))
        .expect("Failed to deserialize type Action");

        assert_eq!(
            type_action,
            Action {
                action_type: ActionType::Type,
                selector: Some("#search-input".to_string()),
                text: Some("search query".to_string()),
                ..Action::default()
            }
        );
    }
}