spider_client/shapes/
request.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
4/// Structure representing the Chunking algorithm dictionary.
5#[derive(Debug, Deserialize, Serialize, Clone)]
6pub struct ChunkingAlgDict {
7    /// The chunking algorithm to use, defined as a specific type.
8    r#type: ChunkingType,
9    /// The amount to chunk by.
10    value: i32,
11}
12
13// The nested structures
14#[derive(Serialize, Deserialize, Debug, Clone, Default)]
15pub struct Timeout {
16    /// The seconds up to 60.
17    pub secs: u64,
18    /// The nanoseconds.
19    pub nanos: u32,
20}
21
22#[derive(Serialize, Deserialize, Debug, Clone)]
23pub struct IdleNetwork {
24    /// The timeout to wait until.
25    pub timeout: Timeout,
26}
27
28#[derive(Serialize, Deserialize, Debug, Clone)]
29#[serde(tag = "type", rename_all = "PascalCase")]
30pub enum WebAutomation {
31    Evaluate { code: String },
32    Click { selector: String },
33    Wait { duration: u64 },
34    WaitForNavigation,
35    WaitFor { selector: String },
36    WaitForAndClick { selector: String },
37    ScrollX { pixels: i32 },
38    ScrollY { pixels: i32 },
39    Fill { selector: String, value: String },
40    InfiniteScroll { times: u32 },
41}
42
43#[derive(Default, Serialize, Deserialize, Debug, Clone)]
44#[serde(tag = "type", rename_all = "PascalCase")]
45pub enum RedirectPolicy {
46    Loose,
47    #[default]
48    Strict,
49}
50
51pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
52pub type ExecutionScriptsMap = std::collections::HashMap<String, String>;
53
54#[derive(Serialize, Deserialize, Debug, Clone)]
55pub struct Selector {
56    /// The timeout to wait until.
57    pub timeout: Timeout,
58    /// The selector to wait for.
59    pub selector: String,
60}
61
62#[derive(Serialize, Deserialize, Debug, Clone, Default)]
63pub struct Delay {
64    /// The timeout to wait until.
65    pub timeout: Timeout,
66}
67
/// Serde `default = "..."` helper that always yields `true`.
fn default_true() -> bool {
    const ENABLED_BY_DEFAULT: bool = true;
    ENABLED_BY_DEFAULT
}
72
73#[derive(Serialize, Deserialize, Debug, Clone, Default)]
74pub struct WaitFor {
75    /// Wait until idle networks with a timeout of idleness.
76    pub idle_network: Option<IdleNetwork>,
77    /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
78    pub selector: Option<Selector>,
79    /// Wait for the dom to update
80    pub dom: Option<Selector>,
81    /// Wait until a hard delay.
82    pub delay: Option<Delay>,
83    #[serde(default = "default_true")]
84    pub page_navigations: bool
85}
86
87/// Query request to get a document.
88#[derive(Serialize, Deserialize, Debug, Clone, Default)]
89pub struct QueryRequest {
90    /// The exact website url.
91    pub url: Option<String>,
92    /// The website domain.
93    pub domain: Option<String>,
94    /// The path of the resource.
95    pub pathname: Option<String>,
96}
97
98/// Enum representing different types of Chunking.
99#[derive(Default, Debug, Deserialize, Serialize, Clone)]
100#[serde(rename_all = "lowercase")]
101pub enum ChunkingType {
102    #[default]
103    /// By the word count.
104    ByWords,
105    /// By the line count.
106    ByLines,
107    /// By the char length.
108    ByCharacterLength,
109    /// By sentence.
110    BySentence,
111}
112
113#[derive(Default, Debug, Deserialize, Serialize, Clone)]
114/// View port handling for chrome.
115pub struct Viewport {
116    /// Device screen Width
117    pub width: u32,
118    /// Device screen size
119    pub height: u32,
120    /// Device scale factor
121    pub device_scale_factor: Option<f64>,
122    /// Emulating Mobile?
123    pub emulating_mobile: bool,
124    /// Use landscape mode instead of portrait.
125    pub is_landscape: bool,
126    /// Touch screen device?
127    pub has_touch: bool,
128}
129
130// Define the CSSSelector struct
131#[derive(Debug, Clone, Default, Deserialize, Serialize)]
132pub struct CSSSelector {
133    /// The name of the selector group
134    pub name: String,
135    /// A vector of CSS selectors
136    pub selectors: Vec<String>,
137}
138
139// Define the CSSExtractionMap type
140pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
141
142/// Represents the settings for a webhook configuration
143#[derive(Debug, Default, Deserialize, Serialize, Clone)]
144pub struct WebhookSettings {
145    /// The destination where the webhook information will be sent
146    destination: String,
147    /// Trigger an action when all credits are depleted
148    on_credits_depleted: bool,
149    /// Trigger an action when half of the credits are depleted
150    on_credits_half_depleted: bool,
151    /// Trigger an action on a website status update event
152    on_website_status: bool,
153    /// Send information about a new page find (such as links and bytes)
154    on_find: bool,
155    /// Handle the metadata of a found page
156    on_find_metadata: bool,
157}
158
159/// Proxy pool selection for outbound request routing.
160/// Choose a pool based on your use case (e.g., stealth, speed, or stability).
161///
162/// - 'residential'         → cost-effective entry-level residential pool
163/// - 'residential_fast'    → faster residential pool for higher throughput
164/// - 'residential_static'  → static residential IPs, rotated daily
165/// - 'residential_premium' → low-latency premium IPs
166/// - 'residential_core'    → balanced plan (quality vs. cost)
167/// - 'residential_plus'    → largest and highest quality core pool
168/// - 'mobile'              → 4G/5G mobile proxies for maximum evasion
169/// - 'isp'                 → ISP-grade datacenters
170#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
171pub enum ProxyType {
172    /// Cost-effective entry-level residential pool.
173    #[serde(rename = "residential")]
174    Residential,
175    /// Higher-throughput residential pool for better performance.
176    #[serde(rename = "residential_fast")]
177    ResidentialFast,
178    /// Static residential IPs, rotated daily for session persistence.
179    #[serde(rename = "residential_static")]
180    ResidentialStatic,
181    /// 4G / 5G mobile proxies for maximum stealth and evasion.
182    #[serde(rename = "mobile")]
183    Mobile,
184    /// ISP-grade residential routing (alias: `datacenter`).
185    #[serde(rename = "isp", alias = "datacenter")]
186    #[default]
187    Isp,
188    /// Premium low-latency residential proxy pool.
189    #[serde(rename = "residential_premium")]
190    ResidentialPremium,
191    /// Core residential plan optimized for balance between cost and quality.
192    #[serde(rename = "residential_core")]
193    ResidentialCore,
194    /// Extended core residential pool with the largest, highest-quality IPs.
195    #[serde(rename = "residential_plus")]
196    ResidentialPlus,
197}
198
199/// Send multiple return formats.
200#[derive(Debug, Deserialize, Serialize, Clone)]
201#[serde(untagged)]
202pub enum ReturnFormatHandling {
203    /// A single return item.
204    Single(ReturnFormat),
205    /// Multiple return formats.
206    Multi(std::collections::HashSet<ReturnFormat>),
207}
208
209impl Default for ReturnFormatHandling {
210    fn default() -> ReturnFormatHandling {
211        ReturnFormatHandling::Single(ReturnFormat::Raw)
212    }
213}
214
215#[derive(Debug, Default, Deserialize, Serialize, Clone)]
216pub struct EventTracker {
217    /// The responses received.
218    responses: Option<bool>,
219    ///The request sent.
220    requests: Option<bool>,
221}
222
223/// Structure representing request parameters.
224#[derive(Debug, Default, Deserialize, Serialize, Clone)]
225pub struct RequestParams {
226    #[serde(default)]
227    /// The URL to be crawled.
228    pub url: Option<String>,
229    #[serde(default)]
230    /// The type of request to be made.
231    pub request: Option<RequestType>,
232    #[serde(default)]
233    /// The maximum number of pages the crawler should visit.
234    pub limit: Option<u32>,
235    #[serde(default)]
236    /// The format in which the result should be returned.
237    pub return_format: Option<ReturnFormatHandling>,
238    #[serde(default)]
239    /// Specifies whether to only visit the top-level domain.
240    pub tld: Option<bool>,
241    #[serde(default)]
242    /// The depth of the crawl.
243    pub depth: Option<u32>,
244    #[serde(default)]
245    /// Specifies whether the request should be cached.
246    pub cache: Option<bool>,
247    #[serde(default)]
248    /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
249    pub scroll: Option<u32>,
250    #[serde(default)]
251    /// The budget for various resources.
252    pub budget: Option<HashMap<String, u32>>,
253    #[serde(default)]
254    /// The blacklist routes to ignore. This can be a Regex string pattern.
255    pub blacklist: Option<Vec<String>>,
256    #[serde(default)]
257    /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
258    pub whitelist: Option<Vec<String>>,
259    #[serde(default)]
260    /// The locale to be used during the crawl.
261    pub locale: Option<String>,
262    #[serde(default)]
263    /// The cookies to be set for the request, formatted as a single string.
264    pub cookies: Option<String>,
265    #[serde(default)]
266    /// Specifies whether to use stealth techniques to avoid detection.
267    pub stealth: Option<bool>,
268    #[serde(default)]
269    /// The headers to be used for the request.
270    pub headers: Option<HashMap<String, String>>,
271    #[serde(default)]
272    /// Specifies whether anti-bot measures should be used.
273    pub anti_bot: Option<bool>,
274    #[serde(default)]
275    /// Specifies whether to send data via webhooks.
276    pub webhooks: Option<WebhookSettings>,
277    #[serde(default)]
278    /// Specifies whether to include metadata in the response.
279    pub metadata: Option<bool>,
280    #[serde(default)]
281    /// The dimensions of the viewport.
282    pub viewport: Option<Viewport>,
283    #[serde(default)]
284    /// The encoding to be used for the request.
285    pub encoding: Option<String>,
286    #[serde(default)]
287    /// Specifies whether to include subdomains in the crawl.
288    pub subdomains: Option<bool>,
289    #[serde(default)]
290    /// The user agent string to be used for the request.
291    pub user_agent: Option<String>,
292    #[serde(default)]
293    /// Specifies whether the response data should be stored.
294    pub store_data: Option<bool>,
295    #[serde(default)]
296    /// Configuration settings for GPT (general purpose texture mappings).
297    pub gpt_config: Option<HashMap<String, String>>,
298    #[serde(default)]
299    /// Specifies whether to use fingerprinting protection.
300    pub fingerprint: Option<bool>,
301    #[serde(default)]
302    /// Specifies whether to perform the request without using storage.
303    pub storageless: Option<bool>,
304    #[serde(default)]
305    /// Specifies whether readability optimizations should be applied.
306    pub readability: Option<bool>,
307    #[serde(default)]
308    /// Specifies whether to use a proxy for the request. [Deprecated]: use the 'proxy' param instead.
309    pub proxy_enabled: Option<bool>,
310    #[serde(default)]
311    /// Specifies whether to respect the site's robots.txt file.
312    pub respect_robots: Option<bool>,
313    #[serde(default)]
314    /// CSS selector to be used to filter the content.
315    pub root_selector: Option<String>,
316    #[serde(default)]
317    /// Specifies whether to load all resources of the crawl target.
318    pub full_resources: Option<bool>,
319    #[serde(default)]
320    /// The text string to extract data from.
321    pub text: Option<String>,
322    #[serde(default)]
323    /// Specifies whether to use the sitemap links.
324    pub sitemap: Option<bool>,
325    #[serde(default)]
326    /// External domains to include the crawl.
327    pub external_domains: Option<Vec<String>>,
328    #[serde(default)]
329    /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
330    pub return_embeddings: Option<bool>,
331    #[serde(default)]
332    /// Returns the HTTP response headers.
333    pub return_headers: Option<bool>,
334    #[serde(default)]
335    /// Returns the link(s) found on the page that match the crawler query.
336    pub return_page_links: Option<bool>,
337    #[serde(default)]
338    /// Returns the HTTP response cookies.
339    pub return_cookies: Option<bool>,
340    #[serde(default)]
341    /// The timeout for the request, in milliseconds.
342    pub request_timeout: Option<u8>,
343    #[serde(default)]
344    /// Specifies whether to run the request in the background.
345    pub run_in_background: Option<bool>,
346    #[serde(default)]
347    /// Specifies whether to skip configuration checks.
348    pub skip_config_checks: Option<bool>,
349    #[serde(default)]
350    /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
351    pub css_extraction_map: Option<CSSExtractionMap>,
352    #[serde(default)]
353    /// The chunking algorithm to use.
354    pub chunking_alg: Option<ChunkingAlgDict>,
355    #[serde(default)]
356    /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
357    pub disable_intercept: Option<bool>,
358    #[serde(default)]
359    /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
360    pub wait_for: Option<WaitFor>,
361    #[serde(default)]
362    /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
363    pub execution_scripts: Option<ExecutionScriptsMap>,
364    #[serde(default)]
365    /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
366    pub automation_scripts: Option<WebAutomationMap>,
367    #[serde(default)]
368    /// The redirect policy for HTTP request. Set the value to Loose to allow all.
369    pub redirect_policy: Option<RedirectPolicy>,
370    #[serde(default)]
371    /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
372    pub event_tracker: Option<EventTracker>,
373    #[serde(default)]
374    /// The timeout to stop the crawl.
375    pub crawl_timeout: Option<Timeout>,
376    #[serde(default)]
377    /// Evaluates given script in every frame upon creation (before loading frame's scripts).
378    pub evaluate_on_new_document: Option<Box<String>>,
379    #[serde(default)]
380    /// Runs the request using lite_mode:Lite mode reduces data transfer costs by 70%, with trade-offs in speed, accuracy,
381    /// geo-targeting, and reliability. It’s best suited for non-urgent data collection or when
382    /// targeting websites with minimal anti-bot protections.
383    pub lite_mode: Option<bool>,
384    #[serde(default)]
385    /// The proxy to use for request.
386    pub proxy: Option<ProxyType>,
387    #[serde(default)]
388    /// Use a remote proxy at ~70% reduced cost for file downloads.
389    /// This requires a user-supplied static IP proxy endpoint.
390    pub remote_proxy: Option<String>,
391    #[serde(default)]
392    /// Set the maximum number of credits to use per page.
393    /// Credits are measured in decimal units, where 10,000 credits equal one dollar (100 credits per penny).
394    /// Credit limiting only applies to request that are Javascript rendered using smart_mode or chrome for the 'request' type.
395    pub max_credits_per_page: Option<f64>,
396}
397
398/// The structure representing request parameters for a search request.
399#[derive(Debug, Default, Deserialize, Serialize, Clone)]
400pub struct SearchRequestParams {
401    /// The base request parameters.
402    #[serde(default, flatten)]
403    pub base: RequestParams,
404    // The search request.
405    pub search: String,
406    /// The search limit.
407    pub search_limit: Option<u32>,
408    // Fetch the page content. Defaults to true.
409    pub fetch_page_content: Option<bool>,
410    /// The search location of the request
411    pub location: Option<String>,
412    /// The country code of the request
413    pub country: Option<String>,
414    /// The language code of the request.
415    pub language: Option<String>,
416    /// The number of search results
417    pub num: Option<u32>,
418    /// The page of the search results.
419    pub page: Option<u32>,
420    #[serde(default)]
421    /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
422    pub website_limit: Option<u32>,
423}
424
425/// Structure representing request parameters for transforming files.
426#[derive(Debug, Default, Deserialize, Serialize, Clone)]
427pub struct TransformParams {
428    #[serde(default)]
429    /// The format in which the result should be returned.
430    pub return_format: Option<ReturnFormat>,
431    #[serde(default)]
432    /// Specifies whether readability optimizations should be applied.
433    pub readability: Option<bool>,
434    #[serde(default)]
435    /// Clean the markdown or text for AI.
436    pub clean: Option<bool>,
437    #[serde(default)]
438    /// Clean the markdown or text for AI removing footers, navigation, and more.
439    pub clean_full: Option<bool>,
440    /// The data being transformed.
441    pub data: Vec<DataParam>,
442}
443
444#[derive(Serialize, Deserialize, Debug, Clone)]
445pub struct DataParam {
446    /// The HTML resource.
447    pub html: String,
448    /// The website url.
449    pub url: Option<String>,
450}
451
452/// the request type to perform
453#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
454#[serde(rename_all = "lowercase")]
455pub enum RequestType {
456    /// Default HTTP request
457    Http,
458    /// Chrome browser rendering
459    Chrome,
460    #[default]
461    /// Smart mode defaulting to HTTP and using Chrome when needed.
462    SmartMode,
463}
464
465/// Enum representing different return formats.
466#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
467#[serde(rename_all = "lowercase")]
468pub enum ReturnFormat {
469    #[default]
470    /// The default return format of the resource.
471    Raw,
472    /// Return the response as Markdown.
473    Markdown,
474    /// Return the response as Commonmark.
475    Commonmark,
476    /// Return the response as Html2text.
477    Html2text,
478    /// Return the response as Text.
479    Text,
480    /// Returns a screenshot as Base64Url
481    Screenshot,
482    /// Return the response as XML.
483    Xml,
484    /// Return the response as Bytes.
485    Bytes,
486}
spider_client/shapes/request.rs

spider_client/shapes/
request.rs