firecrawl_sdk/
scrape.rs

1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6#[cfg(feature = "mcp_tool")]
7use schemars::JsonSchema;
8
9use crate::{document::Document, FirecrawlApp, FirecrawlError, API_VERSION};
10
11#[derive(Deserialize, Serialize, Clone, Copy, Debug, PartialEq, Eq)]
12#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
13pub enum ScrapeFormats {
14    /// Will result in a copy of the Markdown content of the page.
15    #[serde(rename = "markdown")]
16    Markdown,
17
18    /// Will result in a copy of the filtered, content-only HTML of the page.
19    #[serde(rename = "html")]
20    HTML,
21
22    /// Will result in a copy of the raw HTML of the page.
23    #[serde(rename = "rawHtml")]
24    RawHTML,
25
26    /// Will result in a Vec of URLs found on the page.
27    #[serde(rename = "links")]
28    Links,
29
30    /// Will result in a URL to a screenshot of the page.
31    ///
32    /// Can not be used in conjunction with `ScrapeFormats::ScreenshotFullPage`.
33    #[serde(rename = "screenshot")]
34    Screenshot,
35
36    /// Will result in a URL to a full-page screenshot of the page.
37    ///
38    /// Can not be used in conjunction with `ScrapeFormats::Screenshot`.
39    #[serde(rename = "screenshot@fullPage")]
40    ScreenshotFullPage,
41
42    /// Will result in the results of an LLM extraction.
43    ///
44    /// See `ScrapeOptions.extract` for more options.
45    #[serde(rename = "extract")]
46    Extract,
47
48    /// Will result in structured JSON data based on the schema provided in `jsonOptions`.
49    ///
50    /// See `ScrapeOptions.json_options` for more options.
51    #[serde(rename = "json")]
52    JSON,
53}
54
55#[serde_with::skip_serializing_none]
56#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
57#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
58#[serde(rename_all = "camelCase")]
59pub struct ExtractOptions {
60    /// Schema the output should adhere to, provided in JSON Schema format.
61    pub schema: Option<Value>,
62
63    pub system_prompt: Option<String>,
64
65    /// Extraction prompt to send to the LLM agent along with the page content.
66    pub prompt: Option<String>,
67}
68
69#[serde_with::skip_serializing_none]
70#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
71#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
72#[serde(rename_all = "camelCase")]
73pub struct JsonOptions {
74    /// Schema the output should adhere to, provided in JSON Schema format.
75    pub schema: Option<Value>,
76
77    /// System prompt to send to the LLM agent for schema extraction
78    pub system_prompt: Option<String>,
79
80    /// Extraction prompt to send to the LLM agent
81    pub prompt: Option<String>,
82}
83
84#[derive(Deserialize, Serialize, Clone, Debug, Default, PartialEq, Eq)]
85#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
86#[serde(rename_all = "camelCase")]
87pub enum ActionType {
88    #[default]
89    #[serde(rename = "click")]
90    Click,
91
92    #[serde(rename = "type")]
93    Type,
94
95    #[serde(rename = "wait")]
96    Wait,
97
98    #[serde(rename = "screenshot")]
99    Screenshot,
100
101    #[serde(rename = "write")]
102    Write,
103
104    #[serde(rename = "press")]
105    Press,
106
107    #[serde(rename = "scroll")]
108    Scroll,
109
110    #[serde(rename = "scrape")]
111    Scrape,
112
113    #[serde(rename = "executeJavascript")]
114    ExecuteJavascript,
115}
116
117#[serde_with::skip_serializing_none]
118#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
119#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
120#[serde(rename_all = "camelCase")]
121pub struct Action {
122    /// Type of action to perform
123    #[serde(rename = "type")]
124    pub action_type: ActionType,
125
126    /// CSS selector for the target element
127    pub selector: Option<String>,
128
129    /// Text to write (for write action)
130    pub text: Option<String>,
131
132    /// Time to wait in milliseconds (for wait action)
133    pub milliseconds: Option<u32>,
134
135    /// Key to press (for press action)
136    pub key: Option<String>,
137
138    /// Scroll direction (up or down)
139    pub direction: Option<String>,
140
141    /// JavaScript code to execute (for executeJavascript action)
142    pub script: Option<String>,
143
144    /// Take full page screenshot (for screenshot action)
145    pub full_page: Option<bool>,
146}
147
148#[serde_with::skip_serializing_none]
149#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
150#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
151#[serde(rename_all = "camelCase")]
152pub struct LocationOptions {
153    /// Country code for location emulation
154    pub country: String,
155
156    /// Language preferences
157    pub languages: Vec<String>,
158}
159
160#[serde_with::skip_serializing_none]
161#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
162#[cfg_attr(feature = "mcp_tool", derive(JsonSchema))]
163#[serde(rename_all = "camelCase")]
164pub struct ScrapeOptions {
165    /// Content formats to extract (default: `[ Markdown ]`)
166    pub formats: Option<Vec<ScrapeFormats>>,
167
168    /// Extract only the main content, filtering out navigation, footers, etc. (default: `true`)
169    pub only_main_content: Option<bool>,
170
171    /// HTML tags to specifically include in extraction
172    pub include_tags: Option<Vec<String>>,
173
174    /// HTML tags to exclude from extraction
175    pub exclude_tags: Option<Vec<String>>,
176
177    /// Additional HTTP headers to use when loading the page.
178    pub headers: Option<HashMap<String, String>>,
179
180    /// Time in milliseconds to wait for dynamic content to load (default: `0`)
181    pub wait_for: Option<u32>,
182
183    /// Maximum time in milliseconds to wait for the page to load (default: `60000`)
184    pub timeout: Option<u32>,
185
186    /// Configuration for structured data extraction
187    pub extract: Option<ExtractOptions>,
188
189    /// JSON options, to be used in conjunction with `ScrapeFormats::JSON`.
190    #[serde(rename = "jsonOptions")]
191    pub json_options: Option<JsonOptions>,
192
193    /// Enable or disable PDF parsing capability (default: `true`)
194    #[serde(rename = "parsePDF")]
195    pub parse_pdf: Option<bool>,
196
197    /// Location settings for scraping
198    pub location: Option<LocationOptions>,
199
200    /// Language preference header to use
201    pub language: Option<String>,
202
203    /// List of actions to perform before scraping
204    pub actions: Option<Vec<Action>>,
205
206    /// Use mobile viewport (default: `false`)
207    pub mobile: Option<bool>,
208
209    /// Skip TLS certificate verification (default: `false`)
210    pub skip_tls_verification: Option<bool>,
211
212    /// Remove base64 encoded images from output (default: `false`)
213    pub remove_base64_images: Option<bool>,
214
215    /// Block ads during page loading (default: `false`)
216    pub block_ads: Option<bool>,
217
218    /// Proxy configuration to use (values: "none", "basic", "residential") (default: `"none"`)
219    pub proxy: Option<String>,
220}
221
222#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
223#[serde(rename_all = "camelCase")]
224struct ScrapeRequestBody {
225    url: String,
226
227    #[serde(flatten)]
228    options: ScrapeOptions,
229}
230
231#[derive(Deserialize, Serialize, Debug, Default, Clone, PartialEq, Eq)]
232#[serde(rename_all = "camelCase")]
233struct ScrapeResponse {
234    /// This will always be `true` due to `FirecrawlApp::handle_response`.
235    /// No need to expose.
236    success: bool,
237
238    /// The resulting document.
239    data: Document,
240}
241
242impl FirecrawlApp {
243    /// Scrapes a URL using the Firecrawl API.
244    pub async fn scrape_url(
245        &self,
246        url: impl AsRef<str>,
247        options: impl Into<Option<ScrapeOptions>>,
248    ) -> Result<Document, FirecrawlError> {
249        let body = ScrapeRequestBody {
250            url: url.as_ref().to_string(),
251            options: options.into().unwrap_or_default(),
252        };
253
254        let headers = self.prepare_headers(None);
255
256        let response = self
257            .client
258            .post(format!("{}/{}/scrape", self.api_url, API_VERSION))
259            .headers(headers)
260            .json(&body)
261            .send()
262            .await
263            .map_err(|e| FirecrawlError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;
264
265        let response = self
266            .handle_response::<ScrapeResponse>(response, "scrape URL")
267            .await?;
268
269        Ok(response.data)
270    }
271}
272
#[cfg(all(test, feature = "mcp_tool"))]
mod schema_tests {
    use super::*;

    /// Checks the JSON schema generated for `ScrapeOptions`: property names,
    /// property types, nested `Action` / `ExtractOptions` / `LocationOptions`
    /// shapes, and the descriptions derived from the field doc comments.
    #[test]
    fn test_scrape_options_schema() {
        let actual_schema = async_claude::tool::parse_input_schema::<ScrapeOptions>()
            .expect("schema generation for ScrapeOptions should succeed");

        // Check basic structure
        assert_eq!(actual_schema["type"], "object");

        // Get properties object
        let properties = &actual_schema["properties"];
        assert!(properties.is_object());

        // Check all expected properties exist
        let expected_properties = [
            "formats",
            "onlyMainContent",
            "includeTags",
            "excludeTags",
            "headers",
            "waitFor",
            "timeout",
            "extract",
            "jsonOptions",
            "parsePDF",
            "location",
            "language",
            "actions",
            "mobile",
            "skipTlsVerification",
            "removeBase64Images",
            "blockAds",
            "proxy",
        ];

        for &prop in &expected_properties {
            assert!(
                properties.get(prop).is_some(),
                "Property {} not found",
                prop
            );
        }

        // Check formats property
        assert_eq!(properties["formats"]["type"], "array");
        assert!(properties["formats"]["items"].is_object());
        // Check formats description
        assert_eq!(
            properties["formats"]["description"],
            "Content formats to extract (default: `[ Markdown ]`)"
        );

        // Check boolean properties
        let boolean_properties = [
            "onlyMainContent",
            "mobile",
            "skipTlsVerification",
            "removeBase64Images",
            "blockAds",
        ];
        for &prop in &boolean_properties {
            assert_eq!(
                properties[prop]["type"], "boolean",
                "Property {} should be boolean",
                prop
            );
        }

        // Check array properties with string items
        let string_array_properties = ["includeTags", "excludeTags"];
        for &prop in &string_array_properties {
            assert_eq!(properties[prop]["type"], "array");
            assert_eq!(properties[prop]["items"]["type"], "string");
        }

        // Check numeric properties
        let numeric_properties = ["waitFor", "timeout"];
        for &prop in &numeric_properties {
            let ty = &properties[prop]["type"];
            assert!(
                *ty == "integer" || *ty == "number",
                "Property {} should be numeric",
                prop
            );
        }

        // Check actions property
        assert_eq!(properties["actions"]["type"], "array");
        assert!(properties["actions"]["items"].is_object());
        assert_eq!(properties["actions"]["items"]["type"], "object");

        // Check action properties
        let action_props = &properties["actions"]["items"]["properties"];
        let expected_action_props = [
            "type",
            "selector",
            "text",
            "milliseconds",
            "key",
            "direction",
            "script",
            "fullPage",
        ];
        for &prop in &expected_action_props {
            assert!(
                action_props.get(prop).is_some(),
                "Action property {} not found",
                prop
            );
        }

        // Check action type enum values; the array is looked up once and
        // every `ActionType` variant is expected, including "type".
        let action_type_values = action_props["type"]["enum"]
            .as_array()
            .expect("Action `type` should expose an `enum` array");
        let expected_action_types = [
            "click",
            "type",
            "wait",
            "screenshot",
            "write",
            "press",
            "scroll",
            "scrape",
            "executeJavascript",
        ];

        for &action_type in &expected_action_types {
            assert!(
                action_type_values
                    .iter()
                    .any(|v| v.as_str() == Some(action_type)),
                "Action type {} not found in enum",
                action_type
            );
        }

        // Check extract property
        assert_eq!(properties["extract"]["type"], "object");
        let extract_props = &properties["extract"]["properties"];
        assert!(extract_props.get("schema").is_some());
        assert!(extract_props.get("systemPrompt").is_some());
        assert!(extract_props.get("prompt").is_some());

        // Check location property
        assert_eq!(properties["location"]["type"], "object");
        let location_props = &properties["location"]["properties"];
        assert!(location_props.get("country").is_some());
        assert!(location_props.get("languages").is_some());
        assert_eq!(location_props["languages"]["type"], "array");
        assert_eq!(location_props["languages"]["items"]["type"], "string");

        // Check descriptions to ensure they match our expected values
        assert_eq!(
            properties["onlyMainContent"]["description"],
            "Extract only the main content, filtering out navigation, footers, etc. (default: `true`)"
        );

        assert_eq!(
            properties["actions"]["description"],
            "List of actions to perform before scraping"
        );

        assert_eq!(
            properties["mobile"]["description"],
            "Use mobile viewport (default: `false`)"
        );

        assert_eq!(
            properties["skipTlsVerification"]["description"],
            "Skip TLS certificate verification (default: `false`)"
        );

        assert_eq!(
            properties["removeBase64Images"]["description"],
            "Remove base64 encoded images from output (default: `false`)"
        );
    }
}
455
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// A fully populated request payload deserializes into the matching
    /// `ScrapeRequestBody`, with the flattened options landing in `options`.
    #[test]
    fn test_scrape_request_body_deserialization() {
        let json_data = json!({
            "url": "https://example.com",
            "formats": [
                "markdown"
            ],
            "onlyMainContent": true,
            "includeTags": [
                "div"
            ],
            "excludeTags": [
                "img"
            ],
            "headers": {
                "User-Agent": "Custom User Agent"
            },
            "waitFor": 1000,
            "mobile": false,
            "skipTlsVerification": false,
            "timeout": 30000,
            "jsonOptions": {
                "schema": {
                    "type": "object",
                    "properties": {
                        "title": {
                            "type": "string"
                        }
                    }
                },
                "systemPrompt": "Extract data from the page",
                "prompt": "Pull out the title"
            },
            "actions": [
                {
                    "type": "wait",
                    "milliseconds": 2000,
                    "selector": "#my-element"
                }
            ],
            "location": {
                "country": "US",
                "languages": [
                    "en-US"
                ]
            },
            "removeBase64Images": true,
            "blockAds": true,
            "proxy": "basic"
        });

        let req_body: ScrapeRequestBody =
            serde_json::from_value(json_data).expect("Failed to deserialize ScrapeRequestBody");

        let mut expected_headers = HashMap::new();
        expected_headers.insert("User-Agent".to_string(), "Custom User Agent".to_string());

        let expected_req_body = ScrapeRequestBody {
            url: "https://example.com".to_string(),
            options: ScrapeOptions {
                formats: Some(vec![ScrapeFormats::Markdown]),
                only_main_content: Some(true),
                include_tags: Some(vec!["div".to_string()]),
                exclude_tags: Some(vec!["img".to_string()]),
                headers: Some(expected_headers),
                wait_for: Some(1000),
                mobile: Some(false),
                skip_tls_verification: Some(false),
                timeout: Some(30000),
                json_options: Some(JsonOptions {
                    schema: Some(json!({
                        "type": "object",
                        "properties": {
                            "title": {
                                "type": "string"
                            }
                        }
                    })),
                    system_prompt: Some("Extract data from the page".to_string()),
                    prompt: Some("Pull out the title".to_string()),
                }),
                actions: Some(vec![Action {
                    action_type: ActionType::Wait,
                    milliseconds: Some(2000),
                    selector: Some("#my-element".to_string()),
                    ..Action::default()
                }]),
                location: Some(LocationOptions {
                    country: "US".to_string(),
                    languages: vec!["en-US".to_string()],
                }),
                remove_base64_images: Some(true),
                block_ads: Some(true),
                proxy: Some("basic".to_string()),
                // Absent from the payload, so they must stay `None`:
                // `language`, `extract`, `parse_pdf`.
                ..ScrapeOptions::default()
            },
        };

        // The structs derive `PartialEq` (including the `Value` schema field),
        // so the whole request body can be compared in one assertion.
        assert_eq!(req_body, expected_req_body);
    }

    /// `JsonOptions` reads its camelCase keys and keeps the schema as given.
    #[test]
    fn test_json_options_deserialization() {
        let json_data = json!({
            "schema": {
                "type": "object",
                "properties": {
                    "title": { "type": "string" }
                }
            },
            "systemPrompt": "Custom system prompt for extraction",
            "prompt": "Extract the title from the page"
        });

        let json_options: JsonOptions =
            serde_json::from_value(json_data).expect("Failed to deserialize JsonOptions");

        let expected_json_options = JsonOptions {
            schema: Some(json!({
                "type": "object",
                "properties": {
                    "title": { "type": "string" }
                }
            })),
            system_prompt: Some("Custom system prompt for extraction".to_string()),
            prompt: Some("Extract the title from the page".to_string()),
        };

        // Direct comparison covers the `Value` schema as well as the prompts.
        assert_eq!(json_options, expected_json_options);
    }

    /// Each action payload deserializes to an `Action` whose unspecified
    /// optional fields are `None`.
    #[test]
    fn test_action_deserialization() {
        let cases = vec![
            (
                "wait",
                json!({
                    "type": "wait",
                    "milliseconds": 3000,
                    "selector": "#loading"
                }),
                Action {
                    action_type: ActionType::Wait,
                    milliseconds: Some(3000),
                    selector: Some("#loading".to_string()),
                    ..Action::default()
                },
            ),
            (
                "click",
                json!({
                    "type": "click",
                    "selector": "#submit-button"
                }),
                Action {
                    action_type: ActionType::Click,
                    selector: Some("#submit-button".to_string()),
                    ..Action::default()
                },
            ),
            (
                "type",
                json!({
                    "type": "type",
                    "selector": "#search-input",
                    "text": "search query"
                }),
                Action {
                    action_type: ActionType::Type,
                    selector: Some("#search-input".to_string()),
                    text: Some("search query".to_string()),
                    ..Action::default()
                },
            ),
        ];

        for (label, payload, expected) in cases {
            let actual: Action = serde_json::from_value(payload)
                .unwrap_or_else(|err| panic!("Failed to deserialize {} Action: {}", label, err));
            assert_eq!(actual, expected, "{} action mismatch", label);
        }
    }
}