// essence/types.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
4/// Main scrape request matching Firecrawl v1 schema
5#[derive(Debug, Clone, Deserialize)]
6#[serde(rename_all = "camelCase")]
7pub struct ScrapeRequest {
8    /// Required: URL to scrape
9    pub url: String,
10
11    /// Output formats (default: ["markdown"])
12    #[serde(default = "default_formats")]
13    pub formats: Vec<String>,
14
15    /// Headers to send with request
16    #[serde(default)]
17    pub headers: HashMap<String, String>,
18
19    /// CSS selectors to include
20    #[serde(default)]
21    pub include_tags: Vec<String>,
22
23    /// CSS selectors to exclude
24    #[serde(default)]
25    pub exclude_tags: Vec<String>,
26
27    /// Extract only main content (default: true)
28    #[serde(default = "default_true")]
29    pub only_main_content: bool,
30
31    /// Request timeout in milliseconds (default: 30000)
32    #[serde(default = "default_timeout")]
33    pub timeout: u64,
34
35    /// Wait time before scraping in milliseconds (default: 0)
36    #[serde(default)]
37    pub wait_for: u64,
38
39    /// Remove base64 images (default: true)
40    #[serde(default = "default_true")]
41    pub remove_base64_images: bool,
42
43    /// Skip TLS verification
44    #[serde(default)]
45    pub skip_tls_verification: bool,
46
47    /// Engine to use: "auto" | "http" | "browser" (default: "auto")
48    #[serde(default = "default_engine")]
49    pub engine: String,
50
51    /// CSS selector to wait for before scraping (browser only)
52    #[serde(default)]
53    pub wait_for_selector: Option<String>,
54
55    /// Browser actions to perform before scraping
56    #[serde(default)]
57    pub actions: Vec<BrowserAction>,
58
59    /// Capture screenshot (browser only)
60    #[serde(default)]
61    pub screenshot: bool,
62
63    /// Screenshot format: "png" | "jpeg" (default: "png")
64    #[serde(default = "default_screenshot_format")]
65    pub screenshot_format: String,
66}
67
68/// Browser actions to perform
69#[derive(Debug, Clone, Deserialize, Serialize)]
70#[serde(tag = "type", rename_all = "camelCase")]
71pub enum BrowserAction {
72    Click { selector: String },
73    Type { selector: String, text: String },
74    Scroll { direction: String },
75    Wait { milliseconds: u64 },
76    WaitForSelector { selector: String },
77}
78
// Serde fallback helpers for request fields.

/// Fallback for a missing `formats` key: a single `"markdown"` entry.
fn default_formats() -> Vec<String> {
    vec![String::from("markdown")]
}
83
/// Fallback for boolean fields that are on unless the request turns them off.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
87
/// Fallback for a missing scrape `timeout`, in milliseconds.
fn default_timeout() -> u64 {
    const DEFAULT_TIMEOUT_MS: u64 = 30_000;
    DEFAULT_TIMEOUT_MS
}
91
/// Fallback for a missing `engine` key: `"auto"`.
fn default_engine() -> String {
    String::from("auto")
}
95
/// Fallback for a missing `screenshotFormat` key: `"png"`.
fn default_screenshot_format() -> String {
    String::from("png")
}
99
100impl Default for ScrapeRequest {
101    fn default() -> Self {
102        Self {
103            url: String::new(),
104            formats: default_formats(),
105            headers: HashMap::new(),
106            include_tags: Vec::new(),
107            exclude_tags: Vec::new(),
108            only_main_content: default_true(),
109            timeout: default_timeout(),
110            wait_for: 0,
111            remove_base64_images: default_true(),
112            skip_tls_verification: false,
113            engine: default_engine(),
114            wait_for_selector: None,
115            actions: Vec::new(),
116            screenshot: false,
117            screenshot_format: default_screenshot_format(),
118        }
119    }
120}
121
122/// Scrape response matching Firecrawl v1 schema
123#[derive(Debug, Clone, Serialize)]
124pub struct ScrapeResponse {
125    pub success: bool,
126    #[serde(skip_serializing_if = "Option::is_none")]
127    pub warning: Option<String>,
128    #[serde(skip_serializing_if = "Option::is_none")]
129    pub data: Option<Document>,
130    #[serde(skip_serializing_if = "Option::is_none")]
131    pub error: Option<String>,
132    #[serde(skip_serializing_if = "Option::is_none")]
133    pub scrape_id: Option<String>,
134}
135
136/// Document structure containing scraped data
137#[derive(Debug, Clone, Default, Serialize, Deserialize)]
138#[serde(rename_all = "camelCase")]
139pub struct Document {
140    /// Page title
141    #[serde(skip_serializing_if = "Option::is_none")]
142    pub title: Option<String>,
143
144    /// Page description
145    #[serde(skip_serializing_if = "Option::is_none")]
146    pub description: Option<String>,
147
148    /// Page URL
149    #[serde(skip_serializing_if = "Option::is_none")]
150    pub url: Option<String>,
151
152    /// Markdown content
153    #[serde(skip_serializing_if = "Option::is_none")]
154    pub markdown: Option<String>,
155
156    /// HTML content
157    #[serde(skip_serializing_if = "Option::is_none")]
158    pub html: Option<String>,
159
160    /// Raw HTML
161    #[serde(skip_serializing_if = "Option::is_none")]
162    pub raw_html: Option<String>,
163
164    /// Links found on page
165    #[serde(skip_serializing_if = "Option::is_none")]
166    pub links: Option<Vec<String>>,
167
168    /// Images found on page
169    #[serde(skip_serializing_if = "Option::is_none")]
170    pub images: Option<Vec<String>>,
171
172    /// Screenshot (base64 encoded)
173    #[serde(skip_serializing_if = "Option::is_none")]
174    pub screenshot: Option<String>,
175
176    /// Metadata
177    pub metadata: Metadata,
178}
179
180/// Metadata structure
181#[derive(Debug, Clone, Serialize, Deserialize)]
182#[serde(rename_all = "camelCase")]
183pub struct Metadata {
184    #[serde(skip_serializing_if = "Option::is_none")]
185    pub title: Option<String>,
186
187    #[serde(skip_serializing_if = "Option::is_none")]
188    pub description: Option<String>,
189
190    #[serde(skip_serializing_if = "Option::is_none")]
191    pub language: Option<String>,
192
193    #[serde(skip_serializing_if = "Option::is_none")]
194    pub keywords: Option<String>,
195
196    #[serde(skip_serializing_if = "Option::is_none")]
197    pub robots: Option<String>,
198
199    #[serde(skip_serializing_if = "Option::is_none")]
200    pub og_title: Option<String>,
201
202    #[serde(skip_serializing_if = "Option::is_none")]
203    pub og_description: Option<String>,
204
205    #[serde(skip_serializing_if = "Option::is_none")]
206    pub og_url: Option<String>,
207
208    #[serde(skip_serializing_if = "Option::is_none")]
209    pub og_image: Option<String>,
210
211    #[serde(skip_serializing_if = "Option::is_none")]
212    pub url: Option<String>,
213
214    #[serde(skip_serializing_if = "Option::is_none")]
215    pub source_url: Option<String>,
216
217    pub status_code: u16,
218
219    #[serde(skip_serializing_if = "Option::is_none")]
220    pub content_type: Option<String>,
221
222    #[serde(skip_serializing_if = "Option::is_none")]
223    pub canonical_url: Option<String>,
224
225    // Advanced extraction metadata
226    #[serde(skip_serializing_if = "Option::is_none")]
227    pub word_count: Option<usize>,
228
229    #[serde(skip_serializing_if = "Option::is_none")]
230    pub reading_time: Option<usize>,
231
232    #[serde(skip_serializing_if = "Option::is_none")]
233    pub excerpt: Option<String>,
234
235    // Engine detection metadata
236    #[serde(skip_serializing_if = "Option::is_none")]
237    pub detected_frameworks: Option<Vec<String>>,
238
239    #[serde(skip_serializing_if = "Option::is_none")]
240    pub detection_reason: Option<String>,
241
242    #[serde(skip_serializing_if = "Option::is_none")]
243    pub content_script_ratio: Option<f64>,
244}
245
246// Default implementations
247impl Default for Metadata {
248    fn default() -> Self {
249        Self {
250            title: None,
251            description: None,
252            language: None,
253            keywords: None,
254            robots: None,
255            og_title: None,
256            og_description: None,
257            og_url: None,
258            og_image: None,
259            url: None,
260            source_url: None,
261            status_code: 200,
262            content_type: None,
263            canonical_url: None,
264            word_count: None,
265            reading_time: None,
266            excerpt: None,
267            detected_frameworks: None,
268            detection_reason: None,
269            content_script_ratio: None,
270        }
271    }
272}
273
274
/// Fallback for optional boolean fields that are enabled when the key is absent.
fn default_true_option() -> Option<bool> {
    Option::from(true)
}
279
280impl ScrapeResponse {
281    pub fn success(data: Document) -> Self {
282        Self {
283            success: true,
284            warning: None,
285            data: Some(data),
286            error: None,
287            scrape_id: None,
288        }
289    }
290
291    pub fn error(error: String) -> Self {
292        Self {
293            success: false,
294            warning: None,
295            data: None,
296            error: Some(error),
297            scrape_id: None,
298        }
299    }
300}
301
302/// Map request matching Firecrawl v1 schema
303#[derive(Debug, Clone, Deserialize)]
304#[serde(rename_all = "camelCase")]
305pub struct MapRequest {
306    /// Required: URL to map
307    pub url: String,
308
309    /// Search query to filter URLs
310    #[serde(default)]
311    pub search: Option<String>,
312
313    /// Skip sitemap.xml (default: false)
314    #[serde(default)]
315    pub ignore_sitemap: Option<bool>,
316
317    /// Include subdomains (default: true)
318    #[serde(default = "default_include_subdomains")]
319    pub include_subdomains: Option<bool>,
320
321    /// Max URLs to return (default: 5000, max: 100000)
322    #[serde(default = "default_map_limit")]
323    pub limit: Option<u32>,
324}
325
326/// Map response matching Firecrawl v1 schema
327#[derive(Debug, Clone, Serialize)]
328pub struct MapResponse {
329    pub success: bool,
330    #[serde(skip_serializing_if = "Option::is_none")]
331    pub links: Option<Vec<String>>,
332    #[serde(skip_serializing_if = "Option::is_none")]
333    pub error: Option<String>,
334    #[serde(skip_serializing_if = "Option::is_none")]
335    pub scrape_id: Option<String>,
336}
337
/// Fallback for a missing `includeSubdomains` key: subdomains are included.
fn default_include_subdomains() -> Option<bool> {
    Option::from(true)
}
341
/// Fallback for a missing map `limit` key.
fn default_map_limit() -> Option<u32> {
    const DEFAULT_MAP_LIMIT: u32 = 5_000;
    Some(DEFAULT_MAP_LIMIT)
}
345
346impl MapResponse {
347    pub fn success(links: Vec<String>) -> Self {
348        Self {
349            success: true,
350            links: Some(links),
351            error: None,
352            scrape_id: None,
353        }
354    }
355
356    pub fn error(error: String) -> Self {
357        Self {
358            success: false,
359            links: None,
360            error: Some(error),
361            scrape_id: None,
362        }
363    }
364}
365
366/// Crawl request matching Firecrawl v1 crawl schema
367#[derive(Debug, Clone, Deserialize, Serialize)]
368#[serde(rename_all = "camelCase")]
369pub struct CrawlRequest {
370    /// Required: Starting URL
371    pub url: String,
372
373    /// Patterns to exclude (glob patterns)
374    #[serde(default)]
375    pub exclude_paths: Option<Vec<String>>,
376
377    /// Patterns to include (glob patterns)
378    #[serde(default)]
379    pub include_paths: Option<Vec<String>>,
380
381    /// Max crawl depth (default: 2)
382    #[serde(default = "default_max_depth")]
383    pub max_depth: u32,
384
385    /// Max pages to crawl (default: 100)
386    #[serde(default = "default_limit")]
387    pub limit: u32,
388
389    /// Allow backward links (crawl entire domain)
390    #[serde(default)]
391    pub allow_backward_links: Option<bool>,
392
393    /// Allow external links
394    #[serde(default)]
395    pub allow_external_links: Option<bool>,
396
397    /// Ignore sitemap
398    #[serde(default)]
399    pub ignore_sitemap: Option<bool>,
400
401    /// Enable pagination detection (default: true)
402    #[serde(default = "default_true_option")]
403    pub detect_pagination: Option<bool>,
404
405    /// Maximum pagination pages to follow (default: 50)
406    #[serde(default = "default_max_pagination_pages")]
407    pub max_pagination_pages: Option<u32>,
408
409    /// Use parallel crawler for better performance (default: false)
410    #[serde(default)]
411    pub use_parallel: Option<bool>,
412}
413
414/// Crawl response
415#[derive(Debug, Clone, Serialize, Deserialize)]
416pub struct CrawlResponse {
417    pub success: bool,
418    #[serde(skip_serializing_if = "Option::is_none")]
419    pub data: Option<Vec<Document>>,
420    #[serde(skip_serializing_if = "Option::is_none")]
421    pub error: Option<String>,
422    /// Crawl ID for three-phase crawls (poll for status)
423    #[serde(skip_serializing_if = "Option::is_none")]
424    pub crawl_id: Option<String>,
425    /// Status message
426    #[serde(skip_serializing_if = "Option::is_none")]
427    pub message: Option<String>,
428}
429
/// Fallback for a missing crawl `maxDepth` key.
fn default_max_depth() -> u32 {
    const DEFAULT_MAX_DEPTH: u32 = 2;
    DEFAULT_MAX_DEPTH
}
433
/// Fallback for a missing crawl `limit` key (maximum pages to crawl).
fn default_limit() -> u32 {
    const DEFAULT_PAGE_LIMIT: u32 = 100;
    DEFAULT_PAGE_LIMIT
}
437
/// Fallback for a missing `maxPaginationPages` key.
fn default_max_pagination_pages() -> Option<u32> {
    const DEFAULT_MAX_PAGINATION_PAGES: u32 = 50;
    Some(DEFAULT_MAX_PAGINATION_PAGES)
}
441
442impl CrawlResponse {
443    pub fn success(data: Vec<Document>) -> Self {
444        Self {
445            success: true,
446            data: Some(data),
447            error: None,
448            crawl_id: None,
449            message: None,
450        }
451    }
452
453    pub fn error(error: String) -> Self {
454        Self {
455            success: false,
456            data: None,
457            error: Some(error),
458            crawl_id: None,
459            message: None,
460        }
461    }
462
463    pub fn started(crawl_id: String) -> Self {
464        Self {
465            success: true,
466            data: None,
467            error: None,
468            crawl_id: Some(crawl_id.clone()),
469            message: Some(format!("Crawl started with ID: {}", crawl_id)),
470        }
471    }
472}
473
474// ===== Search Types =====
475
476/// Search request
477#[derive(Debug, Clone, Deserialize)]
478#[serde(rename_all = "camelCase")]
479pub struct SearchRequest {
480    /// Search query
481    pub query: String,
482
483    /// Max results to return (default: 10)
484    #[serde(default = "default_search_limit")]
485    pub limit: u32,
486
487    /// Whether to scrape each result URL (default: false)
488    #[serde(default)]
489    pub scrape_results: bool,
490
491    /// Scrape options to apply if scraping results
492    #[serde(default)]
493    pub scrape_options: Option<ScrapeOptions>,
494}
495
496/// Scrape options for search results
497#[derive(Debug, Clone, Deserialize)]
498#[serde(rename_all = "camelCase")]
499pub struct ScrapeOptions {
500    /// Formats to return (default: ["markdown"])
501    #[serde(default = "default_formats")]
502    pub formats: Vec<String>,
503
504    /// Extract only main content (default: true)
505    #[serde(default = "default_true")]
506    pub only_main_content: bool,
507
508    /// Timeout in milliseconds (default: 10000)
509    #[serde(default = "default_scrape_timeout")]
510    pub timeout: u64,
511}
512
513/// Search response
514#[derive(Debug, Clone, Serialize)]
515pub struct SearchResponse {
516    pub success: bool,
517    #[serde(skip_serializing_if = "Option::is_none")]
518    pub data: Option<Vec<SearchResult>>,
519    #[serde(skip_serializing_if = "Option::is_none")]
520    pub error: Option<String>,
521}
522
523/// Individual search result
524#[derive(Debug, Clone, Serialize)]
525pub struct SearchResult {
526    /// Title of the search result
527    pub title: String,
528    /// URL of the search result
529    pub url: String,
530    /// Snippet/description from search engine
531    pub snippet: String,
532    /// Scraped content (if scrape_results was true)
533    #[serde(skip_serializing_if = "Option::is_none")]
534    pub content: Option<Document>,
535}
536
/// Fallback for a missing search `limit` key.
fn default_search_limit() -> u32 {
    const DEFAULT_SEARCH_LIMIT: u32 = 10;
    DEFAULT_SEARCH_LIMIT
}
540
/// Fallback for a missing `timeout` in search scrape options, in milliseconds.
fn default_scrape_timeout() -> u64 {
    const DEFAULT_SCRAPE_TIMEOUT_MS: u64 = 10_000;
    DEFAULT_SCRAPE_TIMEOUT_MS
}
544
545impl SearchResponse {
546    pub fn success(data: Vec<SearchResult>) -> Self {
547        Self {
548            success: true,
549            data: Some(data),
550            error: None,
551        }
552    }
553
554    pub fn error(error: String) -> Self {
555        Self {
556            success: false,
557            data: None,
558            error: Some(error),
559        }
560    }
561}
562
563// ===== Streaming Crawl Types =====
564
565/// Crawl event types for SSE streaming
566#[derive(Debug, Clone, Serialize)]
567#[serde(tag = "type", rename_all = "lowercase")]
568pub enum CrawlEvent {
569    /// Crawl started event
570    Status {
571        pages_crawled: usize,
572        queue_size: usize,
573        current_url: Option<String>,
574    },
575    /// Document completed event
576    Document {
577        url: String,
578        title: Option<String>,
579        markdown: Option<String>,
580        metadata: Box<Metadata>,
581    },
582    /// Error event for individual URL
583    Error {
584        url: String,
585        error: String,
586    },
587    /// Crawl completion event
588    Complete {
589        total_pages: usize,
590        success: usize,
591        errors: usize,
592    },
593}
// essence/types.rs