spider_client/shapes/
request.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
4/// Structure representing the Chunking algorithm dictionary.
5#[derive(Debug, Deserialize, Serialize, Clone)]
6pub struct ChunkingAlgDict {
7    /// The chunking algorithm to use, defined as a specific type.
8    pub r#type: ChunkingType,
9    /// The amount to chunk by.
10    pub value: i32,
11}
12
13// The nested structures
14#[derive(Serialize, Deserialize, Debug, Clone, Default)]
15pub struct Timeout {
16    /// The seconds up to 60.
17    pub secs: u64,
18    /// The nanoseconds.
19    pub nanos: u32,
20}
21
22#[derive(Serialize, Deserialize, Debug, Clone)]
23pub struct IdleNetwork {
24    /// The timeout to wait until.
25    pub timeout: Timeout,
26}
27
28
29/// Represents various web automation actions.
30#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
31pub enum WebAutomation {
32    /// Runs custom JavaScript code.
33    Evaluate(String),
34    /// Clicks on an element.
35    Click(String),
36    /// Clicks on all elements.
37    ClickAll(String),
38    /// Clicks on all elements.
39    ClickAllClickable(),
40    /// Waits for a fixed duration in milliseconds.
41    Wait(u64),
42    /// Waits for the next navigation event.
43    WaitForNavigation,
44    /// Wait for dom updates to stop.
45    WaitForDom {
46        /// The selector of the element to wait for updates.
47        selector: Option<String>,
48        ///  The timeout to wait for in ms.
49        timeout: u32,
50    },
51    /// Waits for an element to appear.
52    WaitFor(String),
53    /// Waits for an element to appear with a timeout.
54    WaitForWithTimeout {
55        /// The selector of the element to wait for updates.
56        selector: String,
57        ///  The timeout to wait for in ms.
58        timeout: u64,
59    },
60    /// Waits for an element to appear and then clicks on it.
61    WaitForAndClick(String),
62    /// Scrolls the screen in the horizontal axis by a specified amount in pixels.
63    ScrollX(i32),
64    /// Scrolls the screen in the vertical axis by a specified amount in pixels.
65    ScrollY(i32),
66    /// Fills an input element with a specified value.
67    Fill {
68        /// The selector of the input element to fill.
69        selector: String,
70        ///  The value to fill the input element with.
71        value: String,
72    },
73    /// Scrolls the page until the end.
74    InfiniteScroll(u32),
75    /// Perform a screenshot on the page - fullscreen and omit background for params.
76    Screenshot {
77        /// Take a full page screenshot.
78        full_page: bool,
79        /// Omit the background.
80        omit_background: bool,
81        /// The output file to store the screenshot.
82        output: String,
83    },
84    /// Only continue to the next automation if the prior step was valid. Use this intermediate after a step to break out of the chain.
85    ValidateChain,
86}
87
88#[derive(Default, Serialize, Deserialize, Debug, Clone)]
89#[serde(tag = "type", rename_all = "PascalCase")]
90pub enum RedirectPolicy {
91    Loose,
92    #[default]
93    Strict,
94}
95
96pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
/// Custom JavaScript keyed by a url or url path; each key maps to the script
/// source to run.
pub type ExecutionScriptsMap = HashMap<String, String>;
98
99#[derive(Serialize, Deserialize, Debug, Clone)]
100pub struct Selector {
101    /// The timeout to wait until.
102    pub timeout: Timeout,
103    /// The selector to wait for.
104    pub selector: String,
105}
106
107#[derive(Serialize, Deserialize, Debug, Clone, Default)]
108pub struct Delay {
109    /// The timeout to wait until.
110    pub timeout: Timeout,
111}
112
/// Serde default helper: yields `Some(true)` for optional flags that are
/// enabled unless stated otherwise.
fn default_some_true() -> Option<bool> {
    Option::from(true)
}
117
118#[derive(Serialize, Deserialize, Debug, Clone, Default)]
119pub struct WaitFor {
120    /// Wait until idle networks with a timeout of idleness.
121    pub idle_network: Option<IdleNetwork>,
122    /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
123    pub selector: Option<Selector>,
124    /// Wait for the dom to update
125    pub dom: Option<Selector>,
126    /// Wait until a hard delay.
127    pub delay: Option<Delay>,
128    /// Wait until page navigation happen. Default is true.
129    #[serde(default = "default_some_true")]
130    pub page_navigations: Option<bool>,
131}
132
133/// Query request to get a document.
134#[derive(Serialize, Deserialize, Debug, Clone, Default)]
135pub struct QueryRequest {
136    /// The exact website url.
137    pub url: Option<String>,
138    /// The website domain.
139    pub domain: Option<String>,
140    /// The path of the resource.
141    pub pathname: Option<String>,
142}
143
144/// Enum representing different types of Chunking.
145#[derive(Default, Debug, Deserialize, Serialize, Clone)]
146#[serde(rename_all = "lowercase")]
147pub enum ChunkingType {
148    #[default]
149    /// By the word count.
150    ByWords,
151    /// By the line count.
152    ByLines,
153    /// By the char length.
154    ByCharacterLength,
155    /// By sentence.
156    BySentence,
157}
158
159#[derive(Default, Debug, Deserialize, Serialize, Clone)]
160/// View port handling for chrome.
161pub struct Viewport {
162    /// Device screen Width
163    pub width: u32,
164    /// Device screen size
165    pub height: u32,
166    /// Device scale factor
167    pub device_scale_factor: Option<f64>,
168    /// Emulating Mobile?
169    pub emulating_mobile: bool,
170    /// Use landscape mode instead of portrait.
171    pub is_landscape: bool,
172    /// Touch screen device?
173    pub has_touch: bool,
174}
175
176// Define the CSSSelector struct
177#[derive(Debug, Clone, Default, Deserialize, Serialize)]
178pub struct CSSSelector {
179    /// The name of the selector group
180    pub name: String,
181    /// A vector of CSS selectors
182    pub selectors: Vec<String>,
183}
184
185// Define the CSSExtractionMap type
186pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
187
188/// Represents the settings for a webhook configuration
189#[derive(Debug, Default, Deserialize, Serialize, Clone)]
190pub struct WebhookSettings {
191    /// The destination where the webhook information will be sent
192    destination: String,
193    /// Trigger an action when all credits are depleted
194    on_credits_depleted: bool,
195    /// Trigger an action when half of the credits are depleted
196    on_credits_half_depleted: bool,
197    /// Trigger an action on a website status update event
198    on_website_status: bool,
199    /// Send information about a new page find (such as links and bytes)
200    on_find: bool,
201    /// Handle the metadata of a found page
202    on_find_metadata: bool,
203}
204
205/// Proxy pool selection for outbound request routing.
206/// Choose a pool based on your use case (e.g., stealth, speed, or stability).
207///
208/// - 'residential'         → cost-effective entry-level residential pool
209/// - 'residential_fast'    → faster residential pool for higher throughput
210/// - 'residential_static'  → static residential IPs, rotated daily
211/// - 'residential_premium' → low-latency premium IPs
212/// - 'residential_core'    → balanced plan (quality vs. cost)
213/// - 'residential_plus'    → largest and highest quality core pool
214/// - 'mobile'              → 4G/5G mobile proxies for maximum evasion
215/// - 'isp'                 → ISP-grade datacenters
216#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
217pub enum ProxyType {
218    /// Cost-effective entry-level residential pool.
219    #[serde(rename = "residential")]
220    Residential,
221    /// Higher-throughput residential pool for better performance.
222    #[serde(rename = "residential_fast")]
223    ResidentialFast,
224    /// Static residential IPs, rotated daily for session persistence.
225    #[serde(rename = "residential_static")]
226    ResidentialStatic,
227    /// 4G / 5G mobile proxies for maximum stealth and evasion.
228    #[serde(rename = "mobile")]
229    Mobile,
230    /// ISP-grade residential routing (alias: `datacenter`).
231    #[serde(rename = "isp", alias = "datacenter")]
232    #[default]
233    Isp,
234    /// Premium low-latency residential proxy pool.
235    #[serde(rename = "residential_premium")]
236    ResidentialPremium,
237    /// Core residential plan optimized for balance between cost and quality.
238    #[serde(rename = "residential_core")]
239    ResidentialCore,
240    /// Extended core residential pool with the largest, highest-quality IPs.
241    #[serde(rename = "residential_plus")]
242    ResidentialPlus,
243}
244
245/// List of proxies.
246pub const PROXY_TYPE_LIST: [ProxyType; 10] = [
247    ProxyType::ResidentialStatic,
248    ProxyType::Residential,
249    ProxyType::Isp,
250    ProxyType::Mobile,
251    ProxyType::ResidentialPremium,
252    ProxyType::ResidentialPlus,
253    ProxyType::ResidentialCore,
254    ProxyType::ResidentialFast,
255    ProxyType::ResidentialStatic,
256    ProxyType::Residential,
257];
258
259impl ProxyType {
260    /// Get the canonical string representation of the proxy type.
261    pub fn as_str(&self) -> &'static str {
262        match self {
263            ProxyType::Residential => "residential",
264            ProxyType::ResidentialFast => "residential_fast",
265            ProxyType::ResidentialStatic => "residential_static",
266            ProxyType::Mobile => "mobile",
267            ProxyType::Isp => "isp",
268            ProxyType::ResidentialPremium => "residential_premium",
269            ProxyType::ResidentialCore => "residential_core",
270            ProxyType::ResidentialPlus => "residential_plus",
271        }
272    }
273}
274
275/// Send multiple return formats.
276#[derive(Debug, Deserialize, Serialize, Clone)]
277#[serde(untagged)]
278pub enum ReturnFormatHandling {
279    /// A single return item.
280    Single(ReturnFormat),
281    /// Multiple return formats.
282    Multi(std::collections::HashSet<ReturnFormat>),
283}
284
285impl Default for ReturnFormatHandling {
286    fn default() -> ReturnFormatHandling {
287        ReturnFormatHandling::Single(ReturnFormat::Raw)
288    }
289}
290
291#[derive(Debug, Default, Deserialize, Serialize, Clone)]
292pub struct EventTracker {
293    /// The responses received.
294    pub responses: Option<bool>,
295    /// The request sent.
296    pub requests: Option<bool>,
297    /// Track the automation events with data changes and screenshots.
298    pub automation: Option<bool>,
299}
300
301/// Structure representing request parameters.
302#[derive(Debug, Default, Deserialize, Serialize, Clone)]
303pub struct RequestParams {
304    #[serde(default)]
305    /// The URL to be crawled.
306    pub url: Option<String>,
307    #[serde(default)]
308    /// The type of request to be made.
309    pub request: Option<RequestType>,
310    #[serde(default)]
311    /// The maximum number of pages the crawler should visit.
312    pub limit: Option<u32>,
313    #[serde(default)]
314    /// The format in which the result should be returned.
315    pub return_format: Option<ReturnFormatHandling>,
316    /// The country code for request
317    pub country_code: Option<String>,
318    #[serde(default)]
319    /// Specifies whether to only visit the top-level domain.
320    pub tld: Option<bool>,
321    #[serde(default)]
322    /// The depth of the crawl.
323    pub depth: Option<u32>,
324    #[serde(default)]
325    /// Specifies whether the request should be cached.
326    pub cache: Option<bool>,
327    #[serde(default)]
328    /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
329    pub scroll: Option<u32>,
330    #[serde(default)]
331    /// The budget for various resources.
332    pub budget: Option<HashMap<String, u32>>,
333    #[serde(default)]
334    /// The blacklist routes to ignore. This can be a Regex string pattern.
335    pub blacklist: Option<Vec<String>>,
336    #[serde(default)]
337    /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
338    pub whitelist: Option<Vec<String>>,
339    #[serde(default)]
340    /// The locale to be used during the crawl.
341    pub locale: Option<String>,
342    #[serde(default)]
343    /// The cookies to be set for the request, formatted as a single string.
344    pub cookies: Option<String>,
345    #[serde(default)]
346    /// Specifies whether to use stealth techniques to avoid detection.
347    pub stealth: Option<bool>,
348    #[serde(default)]
349    /// The headers to be used for the request.
350    pub headers: Option<HashMap<String, String>>,
351    #[serde(default)]
352    /// Specifies whether anti-bot measures should be used.
353    pub anti_bot: Option<bool>,
354    #[serde(default)]
355    /// Specifies whether to send data via webhooks.
356    pub webhooks: Option<WebhookSettings>,
357    #[serde(default)]
358    /// Specifies whether to include metadata in the response.
359    pub metadata: Option<bool>,
360    #[serde(default)]
361    /// The dimensions of the viewport.
362    pub viewport: Option<Viewport>,
363    #[serde(default)]
364    /// The encoding to be used for the request.
365    pub encoding: Option<String>,
366    #[serde(default)]
367    /// Specifies whether to include subdomains in the crawl.
368    pub subdomains: Option<bool>,
369    #[serde(default)]
370    /// The user agent string to be used for the request.
371    pub user_agent: Option<String>,
372    #[serde(default)]
373    /// Specifies whether to use fingerprinting protection.
374    pub fingerprint: Option<bool>,
375    #[serde(default)]
376    /// Specifies whether to perform the request without using storage.
377    pub storageless: Option<bool>,
378    #[serde(default)]
379    /// Specifies whether readability optimizations should be applied.
380    pub readability: Option<bool>,
381    #[serde(default)]
382    /// Specifies whether to use a proxy for the request. [Deprecated]: use the 'proxy' param instead.
383    pub proxy_enabled: Option<bool>,
384    #[serde(default)]
385    /// Specifies whether to respect the site's robots.txt file.
386    pub respect_robots: Option<bool>,
387    #[serde(default)]
388    /// CSS selector to be used to filter the content.
389    pub root_selector: Option<String>,
390    #[serde(default)]
391    /// Specifies whether to load all resources of the crawl target.
392    pub full_resources: Option<bool>,
393    #[serde(default)]
394    /// The text string to extract data from.
395    pub text: Option<String>,
396    #[serde(default)]
397    /// Specifies whether to use the sitemap links.
398    pub sitemap: Option<bool>,
399    #[serde(default)]
400    /// External domains to include the crawl.
401    pub external_domains: Option<Vec<String>>,
402    #[serde(default)]
403    /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
404    pub return_embeddings: Option<bool>,
405    #[serde(default)]
406    /// Returns the HTTP response headers.
407    pub return_headers: Option<bool>,
408    #[serde(default)]
409    /// Returns the link(s) found on the page that match the crawler query.
410    pub return_page_links: Option<bool>,
411    #[serde(default)]
412    /// Returns the HTTP response cookies.
413    pub return_cookies: Option<bool>,
414    #[serde(default)]
415    /// The timeout for the request, in seconds.
416    pub request_timeout: Option<u8>,
417    #[serde(default)]
418    /// Specifies whether to run the request in the background.
419    pub run_in_background: Option<bool>,
420    #[serde(default)]
421    /// Specifies whether to skip configuration checks.
422    pub skip_config_checks: Option<bool>,
423    #[serde(default)]
424    /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
425    pub css_extraction_map: Option<CSSExtractionMap>,
426    #[serde(default)]
427    /// The chunking algorithm to use.
428    pub chunking_alg: Option<ChunkingAlgDict>,
429    #[serde(default)]
430    /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
431    pub disable_intercept: Option<bool>,
432    #[serde(default)]
433    /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
434    pub wait_for: Option<WaitFor>,
435    #[serde(default)]
436    /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
437    pub execution_scripts: Option<ExecutionScriptsMap>,
438    #[serde(default)]
439    /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
440    pub automation_scripts: Option<WebAutomationMap>,
441    #[serde(default)]
442    /// The redirect policy for HTTP request. Set the value to Loose to allow all.
443    pub redirect_policy: Option<RedirectPolicy>,
444    #[serde(default)]
445    /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
446    pub event_tracker: Option<EventTracker>,
447    #[serde(default)]
448    /// The timeout to stop the crawl.
449    pub crawl_timeout: Option<Timeout>,
450    #[serde(default)]
451    /// Evaluates given script in every frame upon creation (before loading frame's scripts).
452    pub evaluate_on_new_document: Option<Box<String>>,
453    #[serde(default)]
454    /// Runs the request using lite_mode:Lite mode reduces data transfer costs by 50%, with trade-offs in speed, accuracy,
455    /// geo-targeting, and reliability. It’s best suited for non-urgent data collection or when
456    /// targeting websites with minimal anti-bot protections.
457    pub lite_mode: Option<bool>,
458    #[serde(default)]
459    /// The proxy to use for request.
460    pub proxy: Option<ProxyType>,
461    #[serde(default)]
462    /// Use a remote proxy at ~50% reduced cost for file downloads.
463    /// This requires a user-supplied static IP proxy endpoint.
464    pub remote_proxy: Option<String>,
465    #[serde(default)]
466    /// Set the maximum number of credits to use per page.
467    /// Credits are measured in decimal units, where 10,000 credits equal one dollar (100 credits per penny).
468    /// Credit limiting only applies to request that are Javascript rendered using smart_mode or chrome for the 'request' type.
469    pub max_credits_per_page: Option<f64>,
470}
471
472
473#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
474#[serde(rename_all = "lowercase")]
475pub enum TBS {
476    #[serde(rename = "qdr:h")]
477    PastHour,
478    #[serde(rename = "qdr:d")]
479    Past24Hours,
480    #[serde(rename = "qdr:w")]
481    PastWeek,
482    #[serde(rename = "qdr:m")]
483    PastMonth,
484    #[serde(rename = "qdr:y")]
485    PastYear,
486}
487
488/// The engine to use.
489#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
490pub enum Engine {
491    /// Google
492    Google,
493    /// Brave
494    Brave,
495    /// All
496    #[default]
497    All
498}
499
500/// The structure representing request parameters for a search request.
501#[derive(Debug, Default, Deserialize, Serialize, Clone)]
502pub struct SearchRequestParams {
503    /// The base request parameters.
504    #[serde(default, flatten)]
505    pub base: RequestParams,
506    // The search request.
507    pub search: String,
508    /// The search limit.
509    pub search_limit: Option<u32>,
510    // Fetch the page content. Defaults to true.
511    pub fetch_page_content: Option<bool>,
512    /// The search location of the request
513    pub location: Option<String>,
514    /// The country code of the request
515    pub country: Option<crate::shapes::country_codes::CountryCode>,
516    /// The language code of the request.
517    pub language: Option<String>,
518    /// The number of search results
519    pub num: Option<u32>,
520    /// The time period range.
521    pub tbs: Option<TBS>,
522    /// The page of the search results.
523    pub page: Option<u32>,
524    /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
525    pub website_limit: Option<u32>,
526    /// Prioritize speed over output quantity.
527    pub quick_search: Option<bool>,
528    /// Auto paginate pages ( up to 100 pages ).
529    pub auto_pagination: Option<bool>,
530    /// The search engine to use.
531    pub engine: Option<Engine>
532}
533
534/// Structure representing request parameters for transforming files.
535#[derive(Debug, Default, Deserialize, Serialize, Clone)]
536pub struct TransformParams {
537    #[serde(default)]
538    /// The format in which the result should be returned.
539    pub return_format: Option<ReturnFormat>,
540    #[serde(default)]
541    /// Specifies whether readability optimizations should be applied.
542    pub readability: Option<bool>,
543    #[serde(default)]
544    /// Clean the markdown or text for AI.
545    pub clean: Option<bool>,
546    #[serde(default)]
547    /// Clean the markdown or text for AI removing footers, navigation, and more.
548    pub clean_full: Option<bool>,
549    /// The data being transformed.
550    pub data: Vec<Resource>,
551}
552
553#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Default)]
554/// Transformation resource to use.
555pub struct Resource {
556    #[serde(default)]
557    /// the html to transform
558    pub html: Option<bytes::Bytes>,
559    #[serde(default)]
560    /// the content to transform
561    pub content: Option<bytes::Bytes>,
562    #[serde(default)]
563    /// the url of the html incase of readability to improve transformations.
564    pub url: Option<String>,
565    #[serde(default)]
566    /// the language of the resource.
567    pub lang: Option<String>,
568}
569
570/// the request type to perform
571#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
572#[serde(rename_all = "lowercase")]
573pub enum RequestType {
574    /// Default HTTP request
575    Http,
576    /// Chrome browser rendering
577    Chrome,
578    #[default]
579    /// Smart mode defaulting to HTTP and using Chrome when needed.
580    SmartMode,
581}
582
583/// Enum representing different return formats.
584#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
585#[serde(rename_all = "lowercase")]
586pub enum ReturnFormat {
587    #[default]
588    /// The default return format of the resource.
589    Raw,
590    /// Return the response as Markdown.
591    Markdown,
592    /// Return the response as Commonmark.
593    Commonmark,
594    /// Return the response as Html2text.
595    Html2text,
596    /// Return the response as Text.
597    Text,
598    /// Returns a screenshot as Base64Url
599    Screenshot,
600    /// Return the response as XML.
601    Xml,
602    /// Return the response as Bytes.
603    Bytes,
604}
spider_client/shapes/request.rs

spider_client/shapes/
request.rs