spider_client/shapes/
request.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
4/// Structure representing the Chunking algorithm dictionary.
5#[derive(Debug, Deserialize, Serialize, Clone)]
6pub struct ChunkingAlgDict {
7    /// The chunking algorithm to use, defined as a specific type.
8    pub r#type: ChunkingType,
9    /// The amount to chunk by.
10    pub value: i32,
11}
12
13// The nested structures
14#[derive(Serialize, Deserialize, Debug, Clone, Default)]
15pub struct Timeout {
16    /// The seconds up to 60.
17    pub secs: u64,
18    /// The nanoseconds.
19    pub nanos: u32,
20}
21
22#[derive(Serialize, Deserialize, Debug, Clone)]
23pub struct IdleNetwork {
24    /// The timeout to wait until.
25    pub timeout: Timeout,
26}
27
28
29/// Represents various web automation actions.
30#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
31pub enum WebAutomation {
32    /// Runs custom JavaScript code.
33    Evaluate(String),
34    /// Clicks on an element.
35    Click(String),
36    /// Clicks on all elements.
37    ClickAll(String),
38    /// Clicks on all elements.
39    ClickAllClickable(),
40    /// Clicks at the position x and y coordinates.
41    ClickPoint {
42        /// The horizontal (X) coordinate.
43        x: f64,
44        /// The vertical (Y) coordinate.
45        y: f64,
46    },
47    Type { 
48        /// The value to type.
49        value: String,
50        /// The click modifier.
51        modifier: Option<i64> 
52    },
53    /// Waits for a fixed duration in milliseconds.
54    Wait(u64),
55    /// Waits for the next navigation event.
56    WaitForNavigation,
57    /// Wait for dom updates to stop.
58    WaitForDom {
59        /// The selector of the element to wait for updates.
60        selector: Option<String>,
61        ///  The timeout to wait for in ms.
62        timeout: u32,
63    },
64    /// Waits for an element to appear.
65    WaitFor(String),
66    /// Waits for an element to appear with a timeout.
67    WaitForWithTimeout {
68        /// The selector of the element to wait for updates.
69        selector: String,
70        ///  The timeout to wait for in ms.
71        timeout: u64,
72    },
73    /// Waits for an element to appear and then clicks on it.
74    WaitForAndClick(String),
75    /// Scrolls the screen in the horizontal axis by a specified amount in pixels.
76    ScrollX(i32),
77    /// Scrolls the screen in the vertical axis by a specified amount in pixels.
78    ScrollY(i32),
79    /// Fills an input element with a specified value.
80    Fill {
81        /// The selector of the input element to fill.
82        selector: String,
83        ///  The value to fill the input element with.
84        value: String,
85    },
86    /// Scrolls the page until the end.
87    InfiniteScroll(u32),
88    /// Perform a screenshot on the page - fullscreen and omit background for params.
89    Screenshot {
90        /// Take a full page screenshot.
91        full_page: bool,
92        /// Omit the background.
93        omit_background: bool,
94        /// The output file to store the screenshot.
95        output: String,
96    },
97    /// Only continue to the next automation if the prior step was valid. Use this intermediate after a step to break out of the chain.
98    ValidateChain,
99}
100
101#[derive(Default, Serialize, Deserialize, Debug, Clone)]
102#[serde(tag = "type", rename_all = "PascalCase")]
103pub enum RedirectPolicy {
104    Loose,
105    #[default]
106    Strict,
107}
108
109pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
110pub type ExecutionScriptsMap = std::collections::HashMap<String, String>;
111
112#[derive(Serialize, Deserialize, Debug, Clone)]
113pub struct Selector {
114    /// The timeout to wait until.
115    pub timeout: Timeout,
116    /// The selector to wait for.
117    pub selector: String,
118}
119
120#[derive(Serialize, Deserialize, Debug, Clone, Default)]
121pub struct Delay {
122    /// The timeout to wait until.
123    pub timeout: Timeout,
124}
125
/// Serde default helper that yields `Some(true)`.
///
/// Referenced by name from a `#[serde(default = "...")]` attribute so that a
/// missing optional boolean deserializes as enabled.
fn default_some_true() -> Option<bool> {
    Option::from(true)
}
130
131#[derive(Serialize, Deserialize, Debug, Clone, Default)]
132pub struct WaitFor {
133    /// Wait until idle networks with a timeout of idleness.
134    pub idle_network: Option<IdleNetwork>,
135    /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
136    pub selector: Option<Selector>,
137    /// Wait for the dom to update
138    pub dom: Option<Selector>,
139    /// Wait until a hard delay.
140    pub delay: Option<Delay>,
141    /// Wait until page navigation happen. Default is true.
142    #[serde(default = "default_some_true")]
143    pub page_navigations: Option<bool>,
144}
145
146/// Query request to get a document.
147#[derive(Serialize, Deserialize, Debug, Clone, Default)]
148pub struct QueryRequest {
149    /// The exact website url.
150    pub url: Option<String>,
151    /// The website domain.
152    pub domain: Option<String>,
153    /// The path of the resource.
154    pub pathname: Option<String>,
155}
156
157/// Enum representing different types of Chunking.
158#[derive(Default, Debug, Deserialize, Serialize, Clone)]
159#[serde(rename_all = "lowercase")]
160pub enum ChunkingType {
161    #[default]
162    /// By the word count.
163    ByWords,
164    /// By the line count.
165    ByLines,
166    /// By the char length.
167    ByCharacterLength,
168    /// By sentence.
169    BySentence,
170}
171
172#[derive(Default, Debug, Deserialize, Serialize, Clone)]
173/// View port handling for chrome.
174pub struct Viewport {
175    /// Device screen Width
176    pub width: u32,
177    /// Device screen size
178    pub height: u32,
179    /// Device scale factor
180    pub device_scale_factor: Option<f64>,
181    /// Emulating Mobile?
182    pub emulating_mobile: bool,
183    /// Use landscape mode instead of portrait.
184    pub is_landscape: bool,
185    /// Touch screen device?
186    pub has_touch: bool,
187}
188
189// Define the CSSSelector struct
190#[derive(Debug, Clone, Default, Deserialize, Serialize)]
191pub struct CSSSelector {
192    /// The name of the selector group
193    pub name: String,
194    /// A vector of CSS selectors
195    pub selectors: Vec<String>,
196}
197
198// Define the CSSExtractionMap type
199pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
200
201/// Represents the settings for a webhook configuration
202#[derive(Debug, Default, Deserialize, Serialize, Clone)]
203pub struct WebhookSettings {
204    /// The destination where the webhook information will be sent
205    destination: String,
206    /// Trigger an action when all credits are depleted
207    on_credits_depleted: bool,
208    /// Trigger an action when half of the credits are depleted
209    on_credits_half_depleted: bool,
210    /// Trigger an action on a website status update event
211    on_website_status: bool,
212    /// Send information about a new page find (such as links and bytes)
213    on_find: bool,
214    /// Handle the metadata of a found page
215    on_find_metadata: bool,
216}
217
218/// Proxy pool selection for outbound request routing.
219/// Choose a pool based on your use case (e.g., stealth, speed, or stability).
220///
221/// - 'residential'         → cost-effective entry-level residential pool
222/// - 'residential_fast'    → faster residential pool for higher throughput
223/// - 'residential_static'  → static residential IPs, rotated daily
224/// - 'residential_premium' → low-latency premium IPs
225/// - 'residential_core'    → balanced plan (quality vs. cost)
226/// - 'residential_plus'    → largest and highest quality core pool
227/// - 'mobile'              → 4G/5G mobile proxies for maximum evasion
228/// - 'isp'                 → ISP-grade datacenters
229#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
230pub enum ProxyType {
231    /// Cost-effective entry-level residential pool.
232    #[serde(rename = "residential")]
233    Residential,
234    /// Higher-throughput residential pool for better performance.
235    #[serde(rename = "residential_fast")]
236    ResidentialFast,
237    /// Static residential IPs, rotated daily for session persistence.
238    #[serde(rename = "residential_static")]
239    ResidentialStatic,
240    /// 4G / 5G mobile proxies for maximum stealth and evasion.
241    #[serde(rename = "mobile")]
242    Mobile,
243    /// ISP-grade residential routing (alias: `datacenter`).
244    #[serde(rename = "isp", alias = "datacenter")]
245    #[default]
246    Isp,
247    /// Premium low-latency residential proxy pool.
248    #[serde(rename = "residential_premium")]
249    ResidentialPremium,
250    /// Core residential plan optimized for balance between cost and quality.
251    #[serde(rename = "residential_core")]
252    ResidentialCore,
253    /// Extended core residential pool with the largest, highest-quality IPs.
254    #[serde(rename = "residential_plus")]
255    ResidentialPlus,
256}
257
258/// List of proxies.
259pub const PROXY_TYPE_LIST: [ProxyType; 10] = [
260    ProxyType::ResidentialStatic,
261    ProxyType::Residential,
262    ProxyType::Isp,
263    ProxyType::Mobile,
264    ProxyType::ResidentialPremium,
265    ProxyType::ResidentialPlus,
266    ProxyType::ResidentialCore,
267    ProxyType::ResidentialFast,
268    ProxyType::ResidentialStatic,
269    ProxyType::Residential,
270];
271
272impl ProxyType {
273    /// Get the canonical string representation of the proxy type.
274    pub fn as_str(&self) -> &'static str {
275        match self {
276            ProxyType::Residential => "residential",
277            ProxyType::ResidentialFast => "residential_fast",
278            ProxyType::ResidentialStatic => "residential_static",
279            ProxyType::Mobile => "mobile",
280            ProxyType::Isp => "isp",
281            ProxyType::ResidentialPremium => "residential_premium",
282            ProxyType::ResidentialCore => "residential_core",
283            ProxyType::ResidentialPlus => "residential_plus",
284        }
285    }
286}
287
288/// Send multiple return formats.
289#[derive(Debug, Deserialize, Serialize, Clone)]
290#[serde(untagged)]
291pub enum ReturnFormatHandling {
292    /// A single return item.
293    Single(ReturnFormat),
294    /// Multiple return formats.
295    Multi(std::collections::HashSet<ReturnFormat>),
296}
297
298impl Default for ReturnFormatHandling {
299    fn default() -> ReturnFormatHandling {
300        ReturnFormatHandling::Single(ReturnFormat::Raw)
301    }
302}
303
304#[derive(Debug, Default, Deserialize, Serialize, Clone)]
305pub struct EventTracker {
306    /// The responses received.
307    pub responses: Option<bool>,
308    /// The request sent.
309    pub requests: Option<bool>,
310    /// Track the automation events with data changes and screenshots.
311    pub automation: Option<bool>,
312}
313
314/// Structure representing request parameters.
315#[derive(Debug, Default, Deserialize, Serialize, Clone)]
316pub struct RequestParams {
317    #[serde(default)]
318    /// The URL to be crawled.
319    pub url: Option<String>,
320    #[serde(default)]
321    /// The type of request to be made.
322    pub request: Option<RequestType>,
323    #[serde(default)]
324    /// The maximum number of pages the crawler should visit.
325    pub limit: Option<u32>,
326    #[serde(default)]
327    /// The format in which the result should be returned.
328    pub return_format: Option<ReturnFormatHandling>,
329    /// The country code for request
330    pub country_code: Option<String>,
331    #[serde(default)]
332    /// Specifies whether to only visit the top-level domain.
333    pub tld: Option<bool>,
334    #[serde(default)]
335    /// The depth of the crawl.
336    pub depth: Option<u32>,
337    #[serde(default)]
338    /// Specifies whether the request should be cached.
339    pub cache: Option<bool>,
340    #[serde(default)]
341    /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
342    pub scroll: Option<u32>,
343    #[serde(default)]
344    /// The budget for various resources.
345    pub budget: Option<HashMap<String, u32>>,
346    #[serde(default)]
347    /// The blacklist routes to ignore. This can be a Regex string pattern.
348    pub blacklist: Option<Vec<String>>,
349    #[serde(default)]
350    /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
351    pub whitelist: Option<Vec<String>>,
352    #[serde(default)]
353    /// The locale to be used during the crawl.
354    pub locale: Option<String>,
355    #[serde(default)]
356    /// The cookies to be set for the request, formatted as a single string.
357    pub cookies: Option<String>,
358    #[serde(default)]
359    /// Specifies whether to use stealth techniques to avoid detection.
360    pub stealth: Option<bool>,
361    #[serde(default)]
362    /// The headers to be used for the request.
363    pub headers: Option<HashMap<String, String>>,
364    #[serde(default)]
365    /// Specifies whether to send data via webhooks.
366    pub webhooks: Option<WebhookSettings>,
367    #[serde(default)]
368    /// Specifies whether to include metadata in the response.
369    pub metadata: Option<bool>,
370    #[serde(default)]
371    /// The dimensions of the viewport.
372    pub viewport: Option<Viewport>,
373    #[serde(default)]
374    /// The encoding to be used for the request.
375    pub encoding: Option<String>,
376    #[serde(default)]
377    /// Specifies whether to include subdomains in the crawl.
378    pub subdomains: Option<bool>,
379    #[serde(default)]
380    /// The user agent string to be used for the request.
381    pub user_agent: Option<String>,
382    #[serde(default)]
383    /// Specifies whether to use fingerprinting protection.
384    pub fingerprint: Option<bool>,
385    #[serde(default)]
386    /// Specifies whether to perform the request without using storage.
387    pub storageless: Option<bool>,
388    #[serde(default)]
389    /// Specifies whether readability optimizations should be applied.
390    pub readability: Option<bool>,
391    #[serde(default)]
392    /// Specifies whether to use a proxy for the request. [Deprecated]: use the 'proxy' param instead.
393    pub proxy_enabled: Option<bool>,
394    #[serde(default)]
395    /// Specifies whether to respect the site's robots.txt file.
396    pub respect_robots: Option<bool>,
397    #[serde(default)]
398    /// CSS selector to be used to filter the content.
399    pub root_selector: Option<String>,
400    #[serde(default)]
401    /// Specifies whether to load all resources of the crawl target.
402    pub full_resources: Option<bool>,
403    #[serde(default)]
404    /// The text string to extract data from.
405    pub text: Option<String>,
406    #[serde(default)]
407    /// Specifies whether to use the sitemap links.
408    pub sitemap: Option<bool>,
409    #[serde(default)]
410    /// External domains to include the crawl.
411    pub external_domains: Option<Vec<String>>,
412    #[serde(default)]
413    /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
414    pub return_embeddings: Option<bool>,
415    #[serde(default)]
416    /// Returns the HTTP response headers.
417    pub return_headers: Option<bool>,
418    #[serde(default)]
419    /// Returns the link(s) found on the page that match the crawler query.
420    pub return_page_links: Option<bool>,
421    #[serde(default)]
422    /// Returns the HTTP response cookies.
423    pub return_cookies: Option<bool>,
424    #[serde(default)]
425    /// The timeout for the request, in seconds.
426    pub request_timeout: Option<u8>,
427    #[serde(default)]
428    /// Specifies whether to run the request in the background.
429    pub run_in_background: Option<bool>,
430    #[serde(default)]
431    /// Specifies whether to skip configuration checks.
432    pub skip_config_checks: Option<bool>,
433    #[serde(default)]
434    /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
435    pub css_extraction_map: Option<CSSExtractionMap>,
436    #[serde(default)]
437    /// The chunking algorithm to use.
438    pub chunking_alg: Option<ChunkingAlgDict>,
439    #[serde(default)]
440    /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
441    pub disable_intercept: Option<bool>,
442    #[serde(default)]
443    /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
444    pub wait_for: Option<WaitFor>,
445    #[serde(default)]
446    /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
447    pub execution_scripts: Option<ExecutionScriptsMap>,
448    #[serde(default)]
449    /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
450    pub automation_scripts: Option<WebAutomationMap>,
451    #[serde(default)]
452    /// The redirect policy for HTTP request. Set the value to Loose to allow all.
453    pub redirect_policy: Option<RedirectPolicy>,
454    #[serde(default)]
455    /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
456    pub event_tracker: Option<EventTracker>,
457    #[serde(default)]
458    /// The timeout to stop the crawl.
459    pub crawl_timeout: Option<Timeout>,
460    #[serde(default)]
461    /// Evaluates given script in every frame upon creation (before loading frame's scripts).
462    pub evaluate_on_new_document: Option<Box<String>>,
463    #[serde(default)]
464    /// Runs the request using lite_mode:Lite mode reduces data transfer costs by 50%, with trade-offs in speed, accuracy,
465    /// geo-targeting, and reliability. It’s best suited for non-urgent data collection or when
466    /// targeting websites with minimal anti-bot protections.
467    pub lite_mode: Option<bool>,
468    #[serde(default)]
469    /// The proxy to use for request.
470    pub proxy: Option<ProxyType>,
471    #[serde(default)]
472    /// Use a remote proxy at ~50% reduced cost for file downloads.
473    /// This requires a user-supplied static IP proxy endpoint.
474    pub remote_proxy: Option<String>,
475    #[serde(default)]
476    /// Set the maximum number of credits to use per page.
477    /// Credits are measured in decimal units, where 10,000 credits equal one dollar (100 credits per penny).
478    /// Credit limiting only applies to request that are Javascript rendered using smart_mode or chrome for the 'request' type.
479    pub max_credits_per_page: Option<f64>,
480}
481
482
483#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
484#[serde(rename_all = "lowercase")]
485pub enum TBS {
486    #[serde(rename = "qdr:h")]
487    PastHour,
488    #[serde(rename = "qdr:d")]
489    Past24Hours,
490    #[serde(rename = "qdr:w")]
491    PastWeek,
492    #[serde(rename = "qdr:m")]
493    PastMonth,
494    #[serde(rename = "qdr:y")]
495    PastYear,
496}
497
498/// The engine to use.
499#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
500pub enum Engine {
501    /// Google
502    Google,
503    /// Brave
504    Brave,
505    /// All
506    #[default]
507    All
508}
509
510/// The structure representing request parameters for a search request.
511#[derive(Debug, Default, Deserialize, Serialize, Clone)]
512pub struct SearchRequestParams {
513    /// The base request parameters.
514    #[serde(default, flatten)]
515    pub base: RequestParams,
516    // The search request.
517    pub search: String,
518    /// The search limit.
519    pub search_limit: Option<u32>,
520    // Fetch the page content. Defaults to true.
521    pub fetch_page_content: Option<bool>,
522    /// The search location of the request
523    pub location: Option<String>,
524    /// The country code of the request
525    pub country: Option<crate::shapes::country_codes::CountryCode>,
526    /// The language code of the request.
527    pub language: Option<String>,
528    /// The number of search results
529    pub num: Option<u32>,
530    /// The time period range.
531    pub tbs: Option<TBS>,
532    /// The page of the search results.
533    pub page: Option<u32>,
534    /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
535    pub website_limit: Option<u32>,
536    /// Prioritize speed over output quantity.
537    pub quick_search: Option<bool>,
538    /// Auto paginate pages ( up to 100 pages ).
539    pub auto_pagination: Option<bool>,
540    /// The search engine to use.
541    pub engine: Option<Engine>
542}
543
544/// Structure representing request parameters for transforming files.
545#[derive(Debug, Default, Deserialize, Serialize, Clone)]
546pub struct TransformParams {
547    #[serde(default)]
548    /// The format in which the result should be returned.
549    pub return_format: Option<ReturnFormat>,
550    #[serde(default)]
551    /// Specifies whether readability optimizations should be applied.
552    pub readability: Option<bool>,
553    #[serde(default)]
554    /// Clean the markdown or text for AI.
555    pub clean: Option<bool>,
556    #[serde(default)]
557    /// Clean the markdown or text for AI removing footers, navigation, and more.
558    pub clean_full: Option<bool>,
559    /// The data being transformed.
560    pub data: Vec<Resource>,
561}
562
563#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Default)]
564/// Transformation resource to use.
565pub struct Resource {
566    #[serde(default)]
567    /// the html to transform
568    pub html: Option<bytes::Bytes>,
569    #[serde(default)]
570    /// the content to transform
571    pub content: Option<bytes::Bytes>,
572    #[serde(default)]
573    /// the url of the html incase of readability to improve transformations.
574    pub url: Option<String>,
575    #[serde(default)]
576    /// the language of the resource.
577    pub lang: Option<String>,
578}
579
580/// the request type to perform
581#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
582#[serde(rename_all = "lowercase")]
583pub enum RequestType {
584    /// Default HTTP request
585    Http,
586    /// Chrome browser rendering
587    Chrome,
588    #[default]
589    /// Smart mode defaulting to HTTP and using Chrome when needed.
590    SmartMode,
591}
592
593/// Enum representing different return formats.
594#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
595#[serde(rename_all = "lowercase")]
596pub enum ReturnFormat {
597    #[default]
598    /// The default return format of the resource.
599    Raw,
600    /// Return the response as Markdown.
601    Markdown,
602    /// Return the response as Commonmark.
603    Commonmark,
604    /// Return the response as Html2text.
605    Html2text,
606    /// Return the response as Text.
607    Text,
608    /// Returns a screenshot as Base64Url
609    Screenshot,
610    /// Return the response as XML.
611    Xml,
612    /// Return the response as Bytes.
613    Bytes,
614}