firecrawl_sdk/
scrape.rs

1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6#[cfg(feature = "mcp_tool")]
7use schemars::JsonSchema;
8
9use crate::{document::Document, FirecrawlApp, FirecrawlError, API_VERSION};
10
11#[derive(Deserialize, Serialize, Clone, Copy, Debug, PartialEq, Eq)]
12#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
13pub enum ScrapeFormats {
14    /// Will result in a copy of the Markdown content of the page.
15    #[serde(rename = "markdown")]
16    Markdown,
17
18    /// Will result in a copy of the filtered, content-only HTML of the page.
19    #[serde(rename = "html")]
20    HTML,
21
22    /// Will result in a copy of the raw HTML of the page.
23    #[serde(rename = "rawHtml")]
24    RawHTML,
25
26    /// Will result in a Vec of URLs found on the page.
27    #[serde(rename = "links")]
28    Links,
29
30    /// Will result in a URL to a screenshot of the page.
31    ///
32    /// Can not be used in conjunction with `ScrapeFormats::ScreenshotFullPage`.
33    #[serde(rename = "screenshot")]
34    Screenshot,
35
36    /// Will result in a URL to a full-page screenshot of the page.
37    ///
38    /// Can not be used in conjunction with `ScrapeFormats::Screenshot`.
39    #[serde(rename = "screenshot@fullPage")]
40    ScreenshotFullPage,
41
42    /// Will result in structured JSON data based on the schema provided in `jsonOptions`.
43    ///
44    /// See `ScrapeOptions.json_options` for more options.
45    #[serde(rename = "json")]
46    JSON,
47}
48
49#[serde_with::skip_serializing_none]
50#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
51#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
52#[serde(rename_all = "camelCase")]
53pub struct ExtractOptions {
54    /// Schema for structured data extraction
55    pub schema: Option<Value>,
56
57    /// System prompt for LLM extraction
58    pub system_prompt: Option<String>,
59
60    /// User prompt for LLM extraction
61    pub prompt: Option<String>,
62}
63
64#[serde_with::skip_serializing_none]
65#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
66#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
67#[serde(rename_all = "camelCase")]
68pub struct JsonOptions {
69    /// Schema the output should adhere to, provided in JSON Schema format.
70    pub schema: Option<Value>,
71
72    /// System prompt to send to the LLM agent for schema extraction
73    pub system_prompt: Option<String>,
74
75    /// Extraction prompt to send to the LLM agent
76    pub prompt: Option<String>,
77}
78
79#[derive(Deserialize, Serialize, Clone, Debug, Default, PartialEq, Eq)]
80#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
81#[serde(rename_all = "camelCase")]
82pub enum ActionType {
83    #[default]
84    #[serde(rename = "click")]
85    Click,
86
87    #[serde(rename = "type")]
88    Type,
89
90    #[serde(rename = "wait")]
91    Wait,
92
93    #[serde(rename = "screenshot")]
94    Screenshot,
95
96    #[serde(rename = "write")]
97    Write,
98
99    #[serde(rename = "press")]
100    Press,
101
102    #[serde(rename = "scroll")]
103    Scroll,
104
105    #[serde(rename = "scrape")]
106    Scrape,
107
108    #[serde(rename = "executeJavascript")]
109    ExecuteJavascript,
110}
111
112#[serde_with::skip_serializing_none]
113#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
114#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
115#[serde(rename_all = "camelCase")]
116pub struct Action {
117    /// Type of action to perform
118    #[serde(rename = "type")]
119    pub action_type: ActionType,
120
121    /// CSS selector for the target element
122    pub selector: Option<String>,
123
124    /// Text to write (for write action)
125    pub text: Option<String>,
126
127    /// Time to wait in milliseconds (for wait action)
128    pub milliseconds: Option<u32>,
129
130    /// Key to press (for press action)
131    pub key: Option<String>,
132
133    /// Scroll direction (up or down)
134    pub direction: Option<String>,
135
136    /// JavaScript code to execute (for executeJavascript action)
137    pub script: Option<String>,
138
139    /// Take full page screenshot (for screenshot action)
140    pub full_page: Option<bool>,
141}
142
143#[serde_with::skip_serializing_none]
144#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
145#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
146#[serde(rename_all = "camelCase")]
147pub struct LocationOptions {
148    /// Country code for location emulation
149    pub country: String,
150
151    /// Language preferences
152    pub languages: Vec<String>,
153}
154
155#[serde_with::skip_serializing_none]
156#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
157#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
158#[serde(rename_all = "camelCase")]
159pub struct ScrapeOptions {
160    /// Content formats to extract (default: ['markdown'])
161    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
162    pub formats: Option<Vec<ScrapeFormats>>,
163
164    /// Extract only the main content, filtering out navigation, footers, etc. (default: `true`)
165    pub only_main_content: Option<bool>,
166
167    /// HTML tags to specifically include in extraction
168    pub include_tags: Option<Vec<String>>,
169
170    /// HTML tags to exclude from extraction
171    pub exclude_tags: Option<Vec<String>>,
172
173    /// Additional HTTP headers to use when loading the page.
174    pub headers: Option<HashMap<String, String>>,
175
176    /// Time in milliseconds to wait for dynamic content to load. (default: `0`)
177    pub wait_for: Option<u32>,
178
179    /// Maximum time in milliseconds to wait for the page to load. (default: `60000`)
180    pub timeout: Option<u32>,
181
182    /// The JSON options to use for the final extract.
183    #[serde(rename = "jsonOptions")]
184    pub json_options: Option<JsonOptions>,
185
186    /// Location settings for scraping
187    pub location: Option<LocationOptions>,
188
189    /// List of actions to perform before scraping
190    pub actions: Option<Vec<Action>>,
191
192    /// Use mobile viewport. (default: `false`)
193    pub mobile: Option<bool>,
194
195    /// Skip TLS certificate verification. (default: `false`)
196    pub skip_tls_verification: Option<bool>,
197
198    /// Remove base64 encoded images from output. (default: `false`)
199    pub remove_base64_images: Option<bool>,
200
201    /// Block ads during page loading (default: `true`)
202    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
203    pub block_ads: Option<bool>,
204
205    /// Proxy configuration to use (values: "none", "basic", "residential") (default: `"none"`)
206    #[cfg_attr(feature = "mcp_tool", schemars(skip))]
207    pub proxy: Option<String>,
208}
209
210#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
211#[serde(rename_all = "camelCase")]
212pub struct ScrapeRequestBody {
213    /// The URL to scrape
214    pub url: String,
215
216    #[serde(flatten)]
217    pub options: ScrapeOptions,
218}
219
220#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
221#[serde(rename_all = "camelCase")]
222struct ScrapeResponse {
223    /// This will always be `true` due to `FirecrawlApp::handle_response`.
224    /// No need to expose.
225    success: bool,
226
227    /// The resulting document.
228    data: Document,
229}
230
231#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
232#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
233#[serde(rename_all = "camelCase")]
234pub struct ScrapeUrlInput {
235    /// The URL to scrape
236    pub url: String,
237
238    #[serde(flatten)]
239    pub options: ScrapeOptions,
240}
241
242impl FirecrawlApp {
243    /// Scrapes a URL using the Firecrawl API.
244    pub async fn scrape_url(
245        &self,
246        url: impl AsRef<str>,
247        options: impl Into<Option<ScrapeOptions>>,
248    ) -> Result<Document, FirecrawlError> {
249        let body = ScrapeRequestBody {
250            url: url.as_ref().to_string(),
251            options: options.into().unwrap_or_default(),
252        };
253
254        let headers = self.prepare_headers(None);
255
256        let response = self
257            .client
258            .post(format!("{}/{}/scrape", self.api_url, API_VERSION))
259            .headers(headers)
260            .json(&body)
261            .send()
262            .await
263            .map_err(|e| FirecrawlError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;
264
265        let response = self
266            .handle_response::<ScrapeResponse>(response, "scrape URL")
267            .await?;
268
269        Ok(response.data)
270    }
271}
272
#[cfg(all(test, feature = "mcp_tool"))]
mod schema_tests {
    use super::*;
    use async_claude::tool::parse_input_schema;
    use serde_json::json;

    /// Pins the `properties` section of the schema generated for
    /// `ScrapeOptions`: property names, types and descriptions. `formats`,
    /// `blockAds` and `proxy` are absent because they are `schemars(skip)`.
    #[test]
    fn test_scrape_options_schema() {
        let generated = parse_input_schema::<ScrapeOptions>().unwrap();

        // Expected `properties` object, keys in alphabetical order.
        let expected_properties = json!({
            "actions": {
                "description": "List of actions to perform before scraping",
                "items": {
                    "properties": {
                        "direction": {
                            "description": "Scroll direction (up or down)",
                            "type": "string"
                        },
                        "fullPage": {
                            "description": "Take full page screenshot (for screenshot action)",
                            "type": "boolean"
                        },
                        "key": {
                            "description": "Key to press (for press action)",
                            "type": "string"
                        },
                        "milliseconds": {
                            "description": "Time to wait in milliseconds (for wait action)",
                            "format": "uint32",
                            "minimum": 0.0,
                            "type": "integer"
                        },
                        "script": {
                            "description": "JavaScript code to execute (for executeJavascript action)",
                            "type": "string"
                        },
                        "selector": {
                            "description": "CSS selector for the target element",
                            "type": "string"
                        },
                        "text": {
                            "description": "Text to write (for write action)",
                            "type": "string"
                        },
                        "type": {
                            "description": "Type of action to perform",
                            "enum": [
                                "click", "type", "wait", "screenshot", "write",
                                "press", "scroll", "scrape", "executeJavascript"
                            ],
                            "type": "string"
                        }
                    },
                    "required": ["type"],
                    "type": "object"
                },
                "type": "array"
            },
            "excludeTags": {
                "description": "HTML tags to exclude from extraction",
                "items": { "type": "string" },
                "type": "array"
            },
            "headers": {
                "additionalProperties": { "type": "string" },
                "description": "Additional HTTP headers to use when loading the page.",
                "type": "object"
            },
            "includeTags": {
                "description": "HTML tags to specifically include in extraction",
                "items": { "type": "string" },
                "type": "array"
            },
            "jsonOptions": {
                "description": "The JSON options to use for the final extract.",
                "properties": {
                    "prompt": {
                        "description": "Extraction prompt to send to the LLM agent",
                        "type": "string"
                    },
                    "schema": {
                        "description": "Schema the output should adhere to, provided in JSON Schema format."
                    },
                    "systemPrompt": {
                        "description": "System prompt to send to the LLM agent for schema extraction",
                        "type": "string"
                    }
                },
                "type": "object"
            },
            "location": {
                "description": "Location settings for scraping",
                "properties": {
                    "country": {
                        "description": "Country code for location emulation",
                        "type": "string"
                    },
                    "languages": {
                        "description": "Language preferences",
                        "items": { "type": "string" },
                        "type": "array"
                    }
                },
                "required": ["country", "languages"],
                "type": "object"
            },
            "mobile": {
                "description": "Use mobile viewport. (default: `false`)",
                "type": "boolean"
            },
            "onlyMainContent": {
                "description": "Extract only the main content, filtering out navigation, footers, etc. (default: `true`)",
                "type": "boolean"
            },
            "removeBase64Images": {
                "description": "Remove base64 encoded images from output. (default: `false`)",
                "type": "boolean"
            },
            "skipTlsVerification": {
                "description": "Skip TLS certificate verification. (default: `false`)",
                "type": "boolean"
            },
            "timeout": {
                "description": "Maximum time in milliseconds to wait for the page to load. (default: `60000`)",
                "format": "uint32",
                "minimum": 0.0,
                "type": "integer"
            },
            "waitFor": {
                "description": "Time in milliseconds to wait for dynamic content to load. (default: `0`)",
                "format": "uint32",
                "minimum": 0.0,
                "type": "integer"
            }
        });

        // Compare pretty-printed text so a mismatch shows up as a readable diff.
        let rendered_actual = serde_json::to_string_pretty(&generated["properties"]).unwrap();
        let rendered_expected = serde_json::to_string_pretty(&expected_properties).unwrap();

        assert_eq!(
            rendered_actual, rendered_expected,
            "Schema properties don't match"
        );
    }
}
442
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // All types under test derive `PartialEq`/`Eq`, including those holding a
    // `serde_json::Value` (which implements both), so each case compares the
    // deserialized value against the expected one with a single `assert_eq!`.
    // That checks every field and does not depend on serialized key order.

    /// A request body with every option set deserializes into the matching
    /// `ScrapeRequestBody`, with the flattened options read from the top level.
    #[test]
    fn test_scrape_request_body_deserialization() {
        let json_data = json!({
            "url": "https://example.com",
            "formats": ["markdown"],
            "onlyMainContent": true,
            "includeTags": ["div"],
            "excludeTags": ["img"],
            "headers": {
                "User-Agent": "Custom User Agent"
            },
            "waitFor": 1000,
            "mobile": false,
            "skipTlsVerification": false,
            "timeout": 30000,
            "jsonOptions": {
                "schema": {
                    "type": "object",
                    "properties": {
                        "title": { "type": "string" }
                    }
                },
                "systemPrompt": "Extract data from the page",
                "prompt": "Pull out the title"
            },
            "actions": [
                {
                    "type": "wait",
                    "milliseconds": 2000,
                    "selector": "#my-element"
                }
            ],
            "location": {
                "country": "US",
                "languages": ["en-US"]
            },
            "removeBase64Images": true,
            "blockAds": true,
            "proxy": "basic"
        });

        let req_body: ScrapeRequestBody =
            serde_json::from_value(json_data).expect("Failed to deserialize ScrapeRequestBody");

        let expected_headers = HashMap::from([(
            "User-Agent".to_string(),
            "Custom User Agent".to_string(),
        )]);

        let expected_req_body = ScrapeRequestBody {
            url: "https://example.com".to_string(),
            options: ScrapeOptions {
                formats: Some(vec![ScrapeFormats::Markdown]),
                only_main_content: Some(true),
                include_tags: Some(vec!["div".to_string()]),
                exclude_tags: Some(vec!["img".to_string()]),
                headers: Some(expected_headers),
                wait_for: Some(1000),
                timeout: Some(30000),
                json_options: Some(JsonOptions {
                    schema: Some(json!({
                        "type": "object",
                        "properties": {
                            "title": { "type": "string" }
                        }
                    })),
                    system_prompt: Some("Extract data from the page".to_string()),
                    prompt: Some("Pull out the title".to_string()),
                }),
                location: Some(LocationOptions {
                    country: "US".to_string(),
                    languages: vec!["en-US".to_string()],
                }),
                actions: Some(vec![Action {
                    action_type: ActionType::Wait,
                    milliseconds: Some(2000),
                    selector: Some("#my-element".to_string()),
                    ..Action::default()
                }]),
                mobile: Some(false),
                skip_tls_verification: Some(false),
                remove_base64_images: Some(true),
                block_ads: Some(true),
                proxy: Some("basic".to_string()),
            },
        };

        assert_eq!(req_body, expected_req_body);
    }

    /// `JsonOptions` reads its camelCase keys and keeps the schema as raw JSON.
    #[test]
    fn test_json_options_deserialization() {
        let json_data = json!({
            "schema": {
                "type": "object",
                "properties": {
                    "title": { "type": "string" }
                }
            },
            "systemPrompt": "Custom system prompt for extraction",
            "prompt": "Extract the title from the page"
        });

        let json_options: JsonOptions =
            serde_json::from_value(json_data).expect("Failed to deserialize JsonOptions");

        let expected_json_options = JsonOptions {
            schema: Some(json!({
                "type": "object",
                "properties": {
                    "title": { "type": "string" }
                }
            })),
            system_prompt: Some("Custom system prompt for extraction".to_string()),
            prompt: Some("Extract the title from the page".to_string()),
        };

        assert_eq!(json_options, expected_json_options);
    }

    /// `Action` deserializes the `type` tag into `ActionType` and leaves
    /// fields missing from the JSON as `None`.
    #[test]
    fn test_action_deserialization() {
        // Wait action
        let wait_action: Action = serde_json::from_value(json!({
            "type": "wait",
            "milliseconds": 3000,
            "selector": "#loading"
        }))
        .expect("Failed to deserialize wait Action");

        assert_eq!(
            wait_action,
            Action {
                action_type: ActionType::Wait,
                milliseconds: Some(3000),
                selector: Some("#loading".to_string()),
                ..Action::default()
            }
        );

        // Click action
        let click_action: Action = serde_json::from_value(json!({
            "type": "click",
            "selector": "#submit-button"
        }))
        .expect("Failed to deserialize click Action");

        assert_eq!(
            click_action,
            Action {
                action_type: ActionType::Click,
                selector: Some("#submit-button".to_string()),
                ..Action::default()
            }
        );

        // Type action
        let type_action: Action = serde_json::from_value(json!({
            "type": "type",
            "selector": "#search-input",
            "text": "search query"
        }))
        .expect("Failed to deserialize type Action");

        assert_eq!(
            type_action,
            Action {
                action_type: ActionType::Type,
                selector: Some("#search-input".to_string()),
                text: Some("search query".to_string()),
                ..Action::default()
            }
        );
    }
}
693        assert_eq!(type_action, expected_type_action);
694    }
695}