spider_client/shapes/
request.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
4/// Structure representing the Chunking algorithm dictionary.
5#[derive(Debug, Deserialize, Serialize, Clone)]
6pub struct ChunkingAlgDict {
7    /// The chunking algorithm to use, defined as a specific type.
8    pub r#type: ChunkingType,
9    /// The amount to chunk by.
10    pub value: i32,
11}
12
13// The nested structures
14#[derive(Serialize, Deserialize, Debug, Clone, Default)]
15pub struct Timeout {
16    /// The seconds up to 60.
17    pub secs: u64,
18    /// The nanoseconds.
19    pub nanos: u32,
20}
21
22#[derive(Serialize, Deserialize, Debug, Clone)]
23pub struct IdleNetwork {
24    /// The timeout to wait until.
25    pub timeout: Timeout,
26}
27
28
29/// Represents various web automation actions.
30#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
31pub enum WebAutomation {
32    /// Runs custom JavaScript code.
33    Evaluate(String),
34    /// Clicks on an element.
35    Click(String),
36    /// Clicks on all elements.
37    ClickAll(String),
38    /// Clicks on all elements.
39    ClickAllClickable(),
40    /// Clicks at the position x and y coordinates.
41    ClickPoint {
42        /// The horizontal (X) coordinate.
43        x: f64,
44        /// The vertical (Y) coordinate.
45        y: f64,
46    },
47    Type { 
48        /// The value to type.
49        value: String,
50        /// The click modifier.
51        modifier: Option<i64> 
52    },
53    /// Waits for a fixed duration in milliseconds.
54    Wait(u64),
55    /// Waits for the next navigation event.
56    WaitForNavigation,
57    /// Wait for dom updates to stop.
58    WaitForDom {
59        /// The selector of the element to wait for updates.
60        selector: Option<String>,
61        ///  The timeout to wait for in ms.
62        timeout: u32,
63    },
64    /// Waits for an element to appear.
65    WaitFor(String),
66    /// Waits for an element to appear with a timeout.
67    WaitForWithTimeout {
68        /// The selector of the element to wait for updates.
69        selector: String,
70        ///  The timeout to wait for in ms.
71        timeout: u64,
72    },
73    /// Waits for an element to appear and then clicks on it.
74    WaitForAndClick(String),
75    /// Scrolls the screen in the horizontal axis by a specified amount in pixels.
76    ScrollX(i32),
77    /// Scrolls the screen in the vertical axis by a specified amount in pixels.
78    ScrollY(i32),
79    /// Fills an input element with a specified value.
80    Fill {
81        /// The selector of the input element to fill.
82        selector: String,
83        ///  The value to fill the input element with.
84        value: String,
85    },
86    /// Scrolls the page until the end.
87    InfiniteScroll(u32),
88    /// Perform a screenshot on the page - fullscreen and omit background for params.
89    Screenshot {
90        /// Take a full page screenshot.
91        full_page: bool,
92        /// Omit the background.
93        omit_background: bool,
94        /// The output file to store the screenshot.
95        output: String,
96    },
97    /// Only continue to the next automation if the prior step was valid. Use this intermediate after a step to break out of the chain.
98    ValidateChain,
99}
100
101#[derive(Default, Serialize, Deserialize, Debug, Clone)]
102#[serde(tag = "type", rename_all = "PascalCase")]
103pub enum RedirectPolicy {
104    Loose,
105    #[default]
106    Strict,
107}
108
109pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
110pub type ExecutionScriptsMap = std::collections::HashMap<String, String>;
111
112#[derive(Serialize, Deserialize, Debug, Clone)]
113pub struct Selector {
114    /// The timeout to wait until.
115    pub timeout: Timeout,
116    /// The selector to wait for.
117    pub selector: String,
118}
119
120#[derive(Serialize, Deserialize, Debug, Clone, Default)]
121pub struct Delay {
122    /// The timeout to wait until.
123    pub timeout: Timeout,
124}
125
/// Serde default helper that yields `Some(true)`.
fn default_some_true() -> Option<bool> {
    let enabled = true;
    Some(enabled)
}
130
131#[derive(Serialize, Deserialize, Debug, Clone, Default)]
132pub struct WaitFor {
133    /// Wait until idle networks with a timeout of idleness.
134    pub idle_network: Option<IdleNetwork>,
135    /// Wait until network to be idle with a max timeout.
136    pub idle_network0: Option<IdleNetwork>,
137    /// Wait until network to almost be idle with a max timeout.
138    pub almost_idle_network0: Option<IdleNetwork>,
139    /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
140    pub selector: Option<Selector>,
141    /// Wait for the dom to update
142    pub dom: Option<Selector>,
143    /// Wait until a hard delay.
144    pub delay: Option<Delay>,
145    /// Wait until page navigation happen. Default is true.
146    #[serde(default = "default_some_true")]
147    pub page_navigations: Option<bool>,
148}
149
150/// Query request to get a document.
151#[derive(Serialize, Deserialize, Debug, Clone, Default)]
152pub struct QueryRequest {
153    /// The exact website url.
154    pub url: Option<String>,
155    /// The website domain.
156    pub domain: Option<String>,
157    /// The path of the resource.
158    pub pathname: Option<String>,
159}
160
161/// Enum representing different types of Chunking.
162#[derive(Default, Debug, Deserialize, Serialize, Clone)]
163#[serde(rename_all = "lowercase")]
164pub enum ChunkingType {
165    #[default]
166    /// By the word count.
167    ByWords,
168    /// By the line count.
169    ByLines,
170    /// By the char length.
171    ByCharacterLength,
172    /// By sentence.
173    BySentence,
174}
175
176#[derive(Default, Debug, Deserialize, Serialize, Clone)]
177/// View port handling for chrome.
178pub struct Viewport {
179    /// Device screen Width
180    pub width: u32,
181    /// Device screen size
182    pub height: u32,
183    /// Device scale factor
184    pub device_scale_factor: Option<f64>,
185    /// Emulating Mobile?
186    pub emulating_mobile: bool,
187    /// Use landscape mode instead of portrait.
188    pub is_landscape: bool,
189    /// Touch screen device?
190    pub has_touch: bool,
191}
192
193// Define the CSSSelector struct
194#[derive(Debug, Clone, Default, Deserialize, Serialize)]
195pub struct CSSSelector {
196    /// The name of the selector group
197    pub name: String,
198    /// A vector of CSS selectors
199    pub selectors: Vec<String>,
200}
201
202// Define the CSSExtractionMap type
203pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
204
205/// Represents the settings for a webhook configuration
206#[derive(Debug, Default, Deserialize, Serialize, Clone)]
207pub struct WebhookSettings {
208    /// The destination where the webhook information will be sent
209    destination: String,
210    /// Trigger an action when all credits are depleted
211    on_credits_depleted: bool,
212    /// Trigger an action when half of the credits are depleted
213    on_credits_half_depleted: bool,
214    /// Trigger an action on a website status update event
215    on_website_status: bool,
216    /// Send information about a new page find (such as links and bytes)
217    on_find: bool,
218    /// Handle the metadata of a found page
219    on_find_metadata: bool,
220}
221
222/// Proxy pool selection for outbound request routing.
223/// Choose a pool based on your use case (e.g., stealth, speed, or stability).
224///
225/// - 'residential'         → cost-effective entry-level residential pool
226/// - 'residential_fast'    → faster residential pool for higher throughput
227/// - 'residential_static'  → static residential IPs, rotated daily
228/// - 'residential_premium' → low-latency premium IPs
229/// - 'residential_core'    → balanced plan (quality vs. cost)
230/// - 'residential_plus'    → largest and highest quality core pool
231/// - 'mobile'              → 4G/5G mobile proxies for maximum evasion
232/// - 'isp'                 → ISP-grade datacenters
233#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
234pub enum ProxyType {
235    /// Cost-effective entry-level residential pool.
236    #[serde(rename = "residential")]
237    Residential,
238    /// 4G / 5G mobile proxies for maximum stealth and evasion.
239    #[serde(rename = "mobile")]
240    Mobile,
241    /// ISP-grade residential routing (alias: `datacenter`).
242    #[serde(rename = "isp", alias = "datacenter")]
243    #[default]
244    Isp
245}
246
247/// List of proxies.
248pub const PROXY_TYPE_LIST: [ProxyType; 3] = [
249    ProxyType::Residential,
250    ProxyType::Isp,
251    ProxyType::Mobile
252];
253
254impl ProxyType {
255    /// Get the canonical string representation of the proxy type.
256    pub fn as_str(&self) -> &'static str {
257        match self {
258            ProxyType::Residential => "residential",
259            ProxyType::Mobile => "mobile",
260            ProxyType::Isp => "isp"
261        }
262    }
263}
264
265/// Send multiple return formats.
266#[derive(Debug, Deserialize, Serialize, Clone)]
267#[serde(untagged)]
268pub enum ReturnFormatHandling {
269    /// A single return item.
270    Single(ReturnFormat),
271    /// Multiple return formats.
272    Multi(std::collections::HashSet<ReturnFormat>),
273}
274
275impl Default for ReturnFormatHandling {
276    fn default() -> ReturnFormatHandling {
277        ReturnFormatHandling::Single(ReturnFormat::Raw)
278    }
279}
280
281#[derive(Debug, Default, Deserialize, Serialize, Clone)]
282pub struct EventTracker {
283    /// The responses received.
284    pub responses: Option<bool>,
285    /// The request sent.
286    pub requests: Option<bool>,
287    /// Track the automation events with data changes and screenshots.
288    pub automation: Option<bool>,
289}
290
291#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
292#[serde(tag = "type")]
293pub enum LinkRewriteRule {
294    #[serde(rename = "replace")]
295    /// A string replacer.
296    Replace {
297        /// Only apply when the link's host matches this value.
298        #[serde(default)]
299        host: Option<String>,
300        find: String,
301        replace_with: String,
302    },
303
304    #[serde(rename = "regex")]
305    /// A regex replacer.
306    Regex {
307        /// Only apply when the link's host matches this value.
308        #[serde(default)]
309        host: Option<String>,
310        pattern: String,
311        replace_with: String,
312    },
313}
314
315/// Structure representing request parameters.
316#[derive(Debug, Default, Deserialize, Serialize, Clone)]
317pub struct RequestParams {
318    #[serde(default)]
319    /// The URL to be crawled.
320    pub url: Option<String>,
321    #[serde(default)]
322    /// The type of request to be made.
323    pub request: Option<RequestType>,
324    #[serde(default)]
325    /// The maximum number of pages the crawler should visit.
326    pub limit: Option<u32>,
327    #[serde(default)]
328    /// The format in which the result should be returned.
329    pub return_format: Option<ReturnFormatHandling>,
330    /// The country code for request
331    pub country_code: Option<String>,
332    #[serde(default)]
333    /// Specifies whether to only visit the top-level domain.
334    pub tld: Option<bool>,
335    #[serde(default)]
336    /// The depth of the crawl.
337    pub depth: Option<u32>,
338    #[serde(default)]
339    /// Specifies whether the request should be cached.
340    pub cache: Option<bool>,
341    #[serde(default)]
342    /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
343    pub scroll: Option<u32>,
344    #[serde(default)]
345    /// The budget for various resources.
346    pub budget: Option<HashMap<String, u32>>,
347    #[serde(default)]
348    /// The blacklist routes to ignore. This can be a Regex string pattern.
349    pub blacklist: Option<Vec<String>>,
350    #[serde(default)]
351    /// URL rewrite rule applied to every discovered link.
352    pub link_rewrite: Option<LinkRewriteRule>,
353    #[serde(default)]
354    /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
355    pub whitelist: Option<Vec<String>>,
356    #[serde(default)]
357    /// The locale to be used during the crawl.
358    pub locale: Option<String>,
359    #[serde(default)]
360    /// The cookies to be set for the request, formatted as a single string.
361    pub cookies: Option<String>,
362    #[serde(default)]
363    /// Specifies whether to use stealth techniques to avoid detection.
364    pub stealth: Option<bool>,
365    #[serde(default)]
366    /// The headers to be used for the request.
367    pub headers: Option<HashMap<String, String>>,
368    #[serde(default)]
369    /// Specifies whether to send data via webhooks.
370    pub webhooks: Option<WebhookSettings>,
371    #[serde(default)]
372    /// Specifies whether to include metadata in the response.
373    pub metadata: Option<bool>,
374    #[serde(default)]
375    /// The dimensions of the viewport.
376    pub viewport: Option<Viewport>,
377    #[serde(default)]
378    /// The encoding to be used for the request.
379    pub encoding: Option<String>,
380    #[serde(default)]
381    /// Specifies whether to include subdomains in the crawl.
382    pub subdomains: Option<bool>,
383    #[serde(default)]
384    /// The user agent string to be used for the request.
385    pub user_agent: Option<String>,
386    #[serde(default)]
387    /// Specifies whether to use fingerprinting protection.
388    pub fingerprint: Option<bool>,
389    #[serde(default)]
390    /// Specifies whether to perform the request without using storage.
391    pub storageless: Option<bool>,
392    #[serde(default)]
393    /// Specifies whether readability optimizations should be applied.
394    pub readability: Option<bool>,
395    #[serde(default)]
396    /// Specifies whether to use a proxy for the request. [Deprecated]: use the 'proxy' param instead.
397    pub proxy_enabled: Option<bool>,
398    #[serde(default)]
399    /// Specifies whether to respect the site's robots.txt file.
400    pub respect_robots: Option<bool>,
401    #[serde(default)]
402    /// CSS selector to be used to filter the content.
403    pub root_selector: Option<String>,
404    #[serde(default)]
405    /// Specifies whether to load all resources of the crawl target.
406    pub full_resources: Option<bool>,
407    #[serde(default)]
408    /// The text string to extract data from.
409    pub text: Option<String>,
410    #[serde(default)]
411    /// Specifies whether to use the sitemap links.
412    pub sitemap: Option<bool>,
413    #[serde(default)]
414    /// External domains to include the crawl.
415    pub external_domains: Option<Vec<String>>,
416    #[serde(default)]
417    /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
418    pub return_embeddings: Option<bool>,
419    #[serde(default)]
420    /// Returns the HTTP response headers.
421    pub return_headers: Option<bool>,
422    #[serde(default)]
423    /// Returns the link(s) found on the page that match the crawler query.
424    pub return_page_links: Option<bool>,
425    #[serde(default)]
426    /// Returns the HTTP response cookies.
427    pub return_cookies: Option<bool>,
428    #[serde(default)]
429    /// The timeout for the request, in seconds.
430    pub request_timeout: Option<u8>,
431    #[serde(default)]
432    /// Specifies whether to run the request in the background.
433    pub run_in_background: Option<bool>,
434    #[serde(default)]
435    /// Specifies whether to skip configuration checks.
436    pub skip_config_checks: Option<bool>,
437    #[serde(default)]
438    /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
439    pub css_extraction_map: Option<CSSExtractionMap>,
440    #[serde(default)]
441    /// The chunking algorithm to use.
442    pub chunking_alg: Option<ChunkingAlgDict>,
443    #[serde(default)]
444    /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
445    pub disable_intercept: Option<bool>,
446    #[serde(default)]
447    /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
448    pub wait_for: Option<WaitFor>,
449    #[serde(default)]
450    /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
451    pub execution_scripts: Option<ExecutionScriptsMap>,
452    #[serde(default)]
453    /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
454    pub automation_scripts: Option<WebAutomationMap>,
455    #[serde(default)]
456    /// The redirect policy for HTTP request. Set the value to Loose to allow all.
457    pub redirect_policy: Option<RedirectPolicy>,
458    #[serde(default)]
459    /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
460    pub event_tracker: Option<EventTracker>,
461    #[serde(default)]
462    /// The timeout to stop the crawl.
463    pub crawl_timeout: Option<Timeout>,
464    #[serde(default)]
465    /// Evaluates given script in every frame upon creation (before loading frame's scripts).
466    pub evaluate_on_new_document: Option<Box<String>>,
467    #[serde(default)]
468    /// Runs the request using lite_mode:Lite mode reduces data transfer costs by 50%, with trade-offs in speed, accuracy,
469    /// geo-targeting, and reliability. It’s best suited for non-urgent data collection or when
470    /// targeting websites with minimal anti-bot protections.
471    pub lite_mode: Option<bool>,
472    #[serde(default)]
473    /// The proxy to use for request.
474    pub proxy: Option<ProxyType>,
475    #[serde(default)]
476    /// Use a remote proxy at ~50% reduced cost for file downloads.
477    /// This requires a user-supplied static IP proxy endpoint.
478    pub remote_proxy: Option<String>,
479    #[serde(default)]
480    /// Set the maximum number of credits to use per page.
481    /// Credits are measured in decimal units, where 10,000 credits equal one dollar (100 credits per penny).
482    /// Credit limiting only applies to request that are Javascript rendered using smart_mode or chrome for the 'request' type.
483    pub max_credits_per_page: Option<f64>,
484}
485
486
487#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
488#[serde(rename_all = "lowercase")]
489pub enum TBS {
490    #[serde(rename = "qdr:h")]
491    PastHour,
492    #[serde(rename = "qdr:d")]
493    Past24Hours,
494    #[serde(rename = "qdr:w")]
495    PastWeek,
496    #[serde(rename = "qdr:m")]
497    PastMonth,
498    #[serde(rename = "qdr:y")]
499    PastYear,
500}
501
502/// The engine to use.
503#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
504pub enum Engine {
505    /// Google
506    Google,
507    /// Brave
508    Brave,
509    /// All
510    #[default]
511    All
512}
513
514/// The structure representing request parameters for a search request.
515#[derive(Debug, Default, Deserialize, Serialize, Clone)]
516pub struct SearchRequestParams {
517    /// The base request parameters.
518    #[serde(default, flatten)]
519    pub base: RequestParams,
520    // The search request.
521    pub search: String,
522    /// The search limit.
523    pub search_limit: Option<u32>,
524    // Fetch the page content. Defaults to true.
525    pub fetch_page_content: Option<bool>,
526    /// The search location of the request
527    pub location: Option<String>,
528    /// The country code of the request
529    pub country: Option<crate::shapes::country_codes::CountryCode>,
530    /// The language code of the request.
531    pub language: Option<String>,
532    /// The number of search results
533    pub num: Option<u32>,
534    /// The time period range.
535    pub tbs: Option<TBS>,
536    /// The page of the search results.
537    pub page: Option<u32>,
538    /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
539    pub website_limit: Option<u32>,
540    /// Prioritize speed over output quantity.
541    pub quick_search: Option<bool>,
542    /// Auto paginate pages ( up to 100 pages ).
543    pub auto_pagination: Option<bool>,
544    /// The search engine to use.
545    pub engine: Option<Engine>
546}
547
548/// Structure representing request parameters for transforming files.
549#[derive(Debug, Default, Deserialize, Serialize, Clone)]
550pub struct TransformParams {
551    #[serde(default)]
552    /// The format in which the result should be returned.
553    pub return_format: Option<ReturnFormat>,
554    #[serde(default)]
555    /// Specifies whether readability optimizations should be applied.
556    pub readability: Option<bool>,
557    #[serde(default)]
558    /// Clean the markdown or text for AI.
559    pub clean: Option<bool>,
560    #[serde(default)]
561    /// Clean the markdown or text for AI removing footers, navigation, and more.
562    pub clean_full: Option<bool>,
563    /// The data being transformed.
564    pub data: Vec<Resource>,
565}
566
567#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Default)]
568/// Transformation resource to use.
569pub struct Resource {
570    #[serde(default)]
571    /// the html to transform
572    pub html: Option<bytes::Bytes>,
573    #[serde(default)]
574    /// the content to transform
575    pub content: Option<bytes::Bytes>,
576    #[serde(default)]
577    /// the url of the html incase of readability to improve transformations.
578    pub url: Option<String>,
579    #[serde(default)]
580    /// the language of the resource.
581    pub lang: Option<String>,
582}
583
584/// the request type to perform
585#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
586#[serde(rename_all = "lowercase")]
587pub enum RequestType {
588    /// Default HTTP request
589    Http,
590    /// Chrome browser rendering
591    Chrome,
592    #[default]
593    /// Smart mode defaulting to HTTP and using Chrome when needed.
594    SmartMode,
595}
596
597/// Enum representing different return formats.
598#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
599#[serde(rename_all = "lowercase")]
600pub enum ReturnFormat {
601    #[default]
602    /// The default return format of the resource.
603    Raw,
604    /// Return the response as Markdown.
605    Markdown,
606    /// Return the response as Commonmark.
607    Commonmark,
608    /// Return the response as Html2text.
609    Html2text,
610    /// Return the response as Text.
611    Text,
612    /// Returns a screenshot as Base64Url
613    Screenshot,
614    /// Return the response as XML.
615    Xml,
616    /// Return the response as Bytes.
617    Bytes,
618}
spider_client/shapes/request.rs

spider_client/shapes/
request.rs