spider_client/shapes/
request.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
4/// Structure representing the Chunking algorithm dictionary.
5#[derive(Debug, Deserialize, Serialize, Clone)]
6pub struct ChunkingAlgDict {
7    /// The chunking algorithm to use, defined as a specific type.
8    r#type: ChunkingType,
9    /// The amount to chunk by.
10    value: i32,
11}
12
13// The nested structures
14#[derive(Serialize, Deserialize, Debug, Clone)]
15pub struct Timeout {
16    /// The seconds up to 60.
17    pub secs: u64,
18    /// The nanoseconds.
19    pub nanos: u32,
20}
21
22#[derive(Serialize, Deserialize, Debug, Clone)]
23pub struct IdleNetwork {
24    /// The timeout to wait until.
25    pub timeout: Timeout,
26}
27
28#[derive(Serialize, Deserialize, Debug, Clone)]
29#[serde(tag = "type", rename_all = "PascalCase")]
30pub enum WebAutomation {
31    Evaluate { code: String },
32    Click { selector: String },
33    Wait { duration: u64 },
34    WaitForNavigation,
35    WaitFor { selector: String },
36    WaitForAndClick { selector: String },
37    ScrollX { pixels: i32 },
38    ScrollY { pixels: i32 },
39    Fill { selector: String, value: String },
40    InfiniteScroll { times: u32 },
41}
42
43#[derive(Default, Serialize, Deserialize, Debug, Clone)]
44#[serde(tag = "type", rename_all = "PascalCase")]
45pub enum RedirectPolicy {
46    Loose,
47    #[default]
48    Strict,
49}
50
51pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
/// Custom JavaScript to run, keyed by a url or url path.
pub type ExecutionScriptsMap = HashMap<String, String>;
53
54#[derive(Serialize, Deserialize, Debug, Clone)]
55pub struct Selector {
56    /// The timeout to wait until.
57    pub timeout: Timeout,
58    /// The selector to wait for.
59    pub selector: String,
60}
61
62#[derive(Serialize, Deserialize, Debug, Clone)]
63pub struct Delay {
64    /// The timeout to wait until.
65    pub timeout: Timeout,
66}
67
68#[derive(Serialize, Deserialize, Debug, Clone)]
69pub struct WaitFor {
70    /// Wait until idle networks with a timeout of idleness.
71    pub idle_network: Option<IdleNetwork>,
72    /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
73    pub selector: Option<Selector>,
74    /// Wait until a hard delay.
75    pub delay: Option<Delay>,
76    /// Wait until page navigation happen. Default is true.
77    pub page_navigations: Option<bool>,
78}
79
80/// Query request to get a document.
81#[derive(Serialize, Deserialize, Debug, Clone, Default)]
82pub struct QueryRequest {
83    /// The exact website url.
84    pub url: Option<String>,
85    /// The website domain.
86    pub domain: Option<String>,
87    /// The path of the resource.
88    pub pathname: Option<String>,
89}
90
91/// Enum representing different types of Chunking.
92#[derive(Default, Debug, Deserialize, Serialize, Clone)]
93#[serde(rename_all = "lowercase")]
94pub enum ChunkingType {
95    #[default]
96    /// By the word count.
97    ByWords,
98    /// By the line count.
99    ByLines,
100    /// By the char length.
101    ByCharacterLength,
102    /// By sentence.
103    BySentence,
104}
105
106#[derive(Default, Debug, Deserialize, Serialize, Clone)]
107/// View port handling for chrome.
108pub struct Viewport {
109    /// Device screen Width
110    pub width: u32,
111    /// Device screen size
112    pub height: u32,
113    /// Device scale factor
114    pub device_scale_factor: Option<f64>,
115    /// Emulating Mobile?
116    pub emulating_mobile: bool,
117    /// Use landscape mode instead of portrait.
118    pub is_landscape: bool,
119    /// Touch screen device?
120    pub has_touch: bool,
121}
122
123// Define the CSSSelector struct
124#[derive(Debug, Clone, Default, Deserialize, Serialize)]
125pub struct CSSSelector {
126    /// The name of the selector group
127    pub name: String,
128    /// A vector of CSS selectors
129    pub selectors: Vec<String>,
130}
131
/// CSS extraction rules: selector groups keyed by a string.
///
/// NOTE(review): the key is presumably a path or page, as described on the
/// request parameter that uses this map; confirm against the API docs.
pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
134
135/// Represents the settings for a webhook configuration
136#[derive(Debug, Default, Deserialize, Serialize, Clone)]
137pub struct WebhookSettings {
138    /// The destination where the webhook information will be sent
139    destination: String,
140    /// Trigger an action when all credits are depleted
141    on_credits_depleted: bool,
142    /// Trigger an action when half of the credits are depleted
143    on_credits_half_depleted: bool,
144    /// Trigger an action on a website status update event
145    on_website_status: bool,
146    /// Send information about a new page find (such as links and bytes)
147    on_find: bool,
148    /// Handle the metadata of a found page
149    on_find_metadata: bool,
150}
151
152/// Proxy pool selection for outbound request routing.
153/// Choose a pool based on your use case (e.g., stealth, speed, or stability).
154///
155/// - 'residential'         → cost-effective entry-level residential pool
156/// - 'residential_fast'    → faster residential pool for higher throughput
157/// - 'residential_static'  → static residential IPs, rotated daily
158/// - 'residential_premium' → low-latency premium IPs
159/// - 'residential_core'    → balanced plan (quality vs. cost)
160/// - 'residential_plus'    → largest and highest quality core pool
161/// - 'mobile'              → 4G/5G mobile proxies for maximum evasion
162/// - 'isp'                 → ISP-grade datacenters
163#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
164pub enum ProxyType {
165    /// Cost-effective entry-level residential pool.
166    #[serde(rename = "residential")]
167    Residential,
168    /// Higher-throughput residential pool for better performance.
169    #[serde(rename = "residential_fast")]
170    ResidentialFast,
171    /// Static residential IPs, rotated daily for session persistence.
172    #[serde(rename = "residential_static")]
173    ResidentialStatic,
174    /// 4G / 5G mobile proxies for maximum stealth and evasion.
175    #[serde(rename = "mobile")]
176    Mobile,
177    /// ISP-grade residential routing (alias: `datacenter`).
178    #[serde(rename = "isp", alias = "datacenter")]
179    #[default]
180    Isp,
181    /// Premium low-latency residential proxy pool.
182    #[serde(rename = "residential_premium")]
183    ResidentialPremium,
184    /// Core residential plan optimized for balance between cost and quality.
185    #[serde(rename = "residential_core")]
186    ResidentialCore,
187    /// Extended core residential pool with the largest, highest-quality IPs.
188    #[serde(rename = "residential_plus")]
189    ResidentialPlus,
190}
191
192/// Send multiple return formats.
193#[derive(Debug, Deserialize, Serialize, Clone)]
194#[serde(untagged)]
195pub enum ReturnFormatHandling {
196    /// A single return item.
197    Single(ReturnFormat),
198    /// Multiple return formats.
199    Multi(std::collections::HashSet<ReturnFormat>),
200}
201
202impl Default for ReturnFormatHandling {
203    fn default() -> ReturnFormatHandling {
204        ReturnFormatHandling::Single(ReturnFormat::Raw)
205    }
206}
207
208#[derive(Debug, Default, Deserialize, Serialize, Clone)]
209pub struct EventTracker {
210    /// The responses received.
211    responses: Option<bool>,
212    ///The request sent.
213    requests: Option<bool>,
214}
215
216/// Structure representing request parameters.
217#[derive(Debug, Default, Deserialize, Serialize, Clone)]
218pub struct RequestParams {
219    #[serde(default)]
220    /// The URL to be crawled.
221    pub url: Option<String>,
222    #[serde(default)]
223    /// The type of request to be made.
224    pub request: Option<RequestType>,
225    #[serde(default)]
226    /// The maximum number of pages the crawler should visit.
227    pub limit: Option<u32>,
228    #[serde(default)]
229    /// The format in which the result should be returned.
230    pub return_format: Option<ReturnFormatHandling>,
231    #[serde(default)]
232    /// Specifies whether to only visit the top-level domain.
233    pub tld: Option<bool>,
234    #[serde(default)]
235    /// The depth of the crawl.
236    pub depth: Option<u32>,
237    #[serde(default)]
238    /// Specifies whether the request should be cached.
239    pub cache: Option<bool>,
240    #[serde(default)]
241    /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
242    pub scroll: Option<u32>,
243    #[serde(default)]
244    /// The budget for various resources.
245    pub budget: Option<HashMap<String, u32>>,
246    #[serde(default)]
247    /// The blacklist routes to ignore. This can be a Regex string pattern.
248    pub blacklist: Option<Vec<String>>,
249    #[serde(default)]
250    /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
251    pub whitelist: Option<Vec<String>>,
252    #[serde(default)]
253    /// The locale to be used during the crawl.
254    pub locale: Option<String>,
255    #[serde(default)]
256    /// The cookies to be set for the request, formatted as a single string.
257    pub cookies: Option<String>,
258    #[serde(default)]
259    /// Specifies whether to use stealth techniques to avoid detection.
260    pub stealth: Option<bool>,
261    #[serde(default)]
262    /// The headers to be used for the request.
263    pub headers: Option<HashMap<String, String>>,
264    #[serde(default)]
265    /// Specifies whether anti-bot measures should be used.
266    pub anti_bot: Option<bool>,
267    #[serde(default)]
268    /// Specifies whether to send data via webhooks.
269    pub webhooks: Option<WebhookSettings>,
270    #[serde(default)]
271    /// Specifies whether to include metadata in the response.
272    pub metadata: Option<bool>,
273    #[serde(default)]
274    /// The dimensions of the viewport.
275    pub viewport: Option<Viewport>,
276    #[serde(default)]
277    /// The encoding to be used for the request.
278    pub encoding: Option<String>,
279    #[serde(default)]
280    /// Specifies whether to include subdomains in the crawl.
281    pub subdomains: Option<bool>,
282    #[serde(default)]
283    /// The user agent string to be used for the request.
284    pub user_agent: Option<String>,
285    #[serde(default)]
286    /// Specifies whether the response data should be stored.
287    pub store_data: Option<bool>,
288    #[serde(default)]
289    /// Configuration settings for GPT (general purpose texture mappings).
290    pub gpt_config: Option<HashMap<String, String>>,
291    #[serde(default)]
292    /// Specifies whether to use fingerprinting protection.
293    pub fingerprint: Option<bool>,
294    #[serde(default)]
295    /// Specifies whether to perform the request without using storage.
296    pub storageless: Option<bool>,
297    #[serde(default)]
298    /// Specifies whether readability optimizations should be applied.
299    pub readability: Option<bool>,
300    #[serde(default)]
301    /// Specifies whether to use a proxy for the request. [Deprecated]: use the 'proxy' param instead.
302    pub proxy_enabled: Option<bool>,
303    #[serde(default)]
304    /// Specifies whether to respect the site's robots.txt file.
305    pub respect_robots: Option<bool>,
306    #[serde(default)]
307    /// CSS selector to be used to filter the content.
308    pub root_selector: Option<String>,
309    #[serde(default)]
310    /// Specifies whether to load all resources of the crawl target.
311    pub full_resources: Option<bool>,
312    #[serde(default)]
313    /// The text string to extract data from.
314    pub text: Option<String>,
315    #[serde(default)]
316    /// Specifies whether to use the sitemap links.
317    pub sitemap: Option<bool>,
318    #[serde(default)]
319    /// External domains to include the crawl.
320    pub external_domains: Option<Vec<String>>,
321    #[serde(default)]
322    /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
323    pub return_embeddings: Option<bool>,
324    #[serde(default)]
325    /// Returns the HTTP response headers.
326    pub return_headers: Option<bool>,
327    #[serde(default)]
328    /// Returns the link(s) found on the page that match the crawler query.
329    pub return_page_links: Option<bool>,
330    #[serde(default)]
331    /// Returns the HTTP response cookies.
332    pub return_cookies: Option<bool>,
333    #[serde(default)]
334    /// The timeout for the request, in milliseconds.
335    pub request_timeout: Option<u8>,
336    #[serde(default)]
337    /// Specifies whether to run the request in the background.
338    pub run_in_background: Option<bool>,
339    #[serde(default)]
340    /// Specifies whether to skip configuration checks.
341    pub skip_config_checks: Option<bool>,
342    #[serde(default)]
343    /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
344    pub css_extraction_map: Option<CSSExtractionMap>,
345    #[serde(default)]
346    /// The chunking algorithm to use.
347    pub chunking_alg: Option<ChunkingAlgDict>,
348    #[serde(default)]
349    /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
350    pub disable_intercept: Option<bool>,
351    #[serde(default)]
352    /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
353    pub wait_for: Option<WaitFor>,
354    #[serde(default)]
355    /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
356    pub execution_scripts: Option<ExecutionScriptsMap>,
357    #[serde(default)]
358    /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
359    pub automation_scripts: Option<WebAutomationMap>,
360    #[serde(default)]
361    /// The redirect policy for HTTP request. Set the value to Loose to allow all.
362    pub redirect_policy: Option<RedirectPolicy>,
363    #[serde(default)]
364    /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
365    pub event_tracker: Option<EventTracker>,
366    #[serde(default)]
367    /// The timeout to stop the crawl.
368    pub crawl_timeout: Option<Timeout>,
369    #[serde(default)]
370    /// Evaluates given script in every frame upon creation (before loading frame's scripts).
371    pub evaluate_on_new_document: Option<Box<String>>,
372    #[serde(default)]
373    /// Runs the request using lite_mode:Lite mode reduces data transfer costs by 70%, with trade-offs in speed, accuracy,
374    /// geo-targeting, and reliability. It’s best suited for non-urgent data collection or when
375    /// targeting websites with minimal anti-bot protections.
376    pub lite_mode: Option<bool>,
377    #[serde(default)]
378    /// The proxy to use for request.
379    pub proxy: Option<ProxyType>,
380    #[serde(default)]
381    /// Use a remote proxy at ~70% reduced cost for file downloads.
382    /// This requires a user-supplied static IP proxy endpoint.
383    pub remote_proxy: Option<String>,
384    #[serde(default)]
385    /// Set the maximum number of credits to use per page.
386    /// Credits are measured in decimal units, where 10,000 credits equal one dollar (100 credits per penny).
387    /// Credit limiting only applies to request that are Javascript rendered using smart_mode or chrome for the 'request' type.
388    pub max_credits_per_page: Option<f64>,
389}
390
391/// The structure representing request parameters for a search request.
392#[derive(Debug, Default, Deserialize, Serialize, Clone)]
393pub struct SearchRequestParams {
394    /// The base request parameters.
395    #[serde(default, flatten)]
396    pub base: RequestParams,
397    // The search request.
398    pub search: String,
399    /// The search limit.
400    pub search_limit: Option<u32>,
401    // Fetch the page content. Defaults to true.
402    pub fetch_page_content: Option<bool>,
403    /// The search location of the request
404    pub location: Option<String>,
405    /// The country code of the request
406    pub country: Option<String>,
407    /// The language code of the request.
408    pub language: Option<String>,
409    /// The number of search results
410    pub num: Option<u32>,
411    /// The page of the search results.
412    pub page: Option<u32>,
413    #[serde(default)]
414    /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
415    pub website_limit: Option<u32>,
416}
417
418/// Structure representing request parameters for transforming files.
419#[derive(Debug, Default, Deserialize, Serialize, Clone)]
420pub struct TransformParams {
421    #[serde(default)]
422    /// The format in which the result should be returned.
423    pub return_format: Option<ReturnFormat>,
424    #[serde(default)]
425    /// Specifies whether readability optimizations should be applied.
426    pub readability: Option<bool>,
427    #[serde(default)]
428    /// Clean the markdown or text for AI.
429    pub clean: Option<bool>,
430    #[serde(default)]
431    /// Clean the markdown or text for AI removing footers, navigation, and more.
432    pub clean_full: Option<bool>,
433    /// The data being transformed.
434    pub data: Vec<DataParam>,
435}
436
437#[derive(Serialize, Deserialize, Debug, Clone)]
438pub struct DataParam {
439    /// The HTML resource.
440    pub html: String,
441    /// The website url.
442    pub url: Option<String>,
443}
444
445/// the request type to perform
446#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
447#[serde(rename_all = "lowercase")]
448pub enum RequestType {
449    /// Default HTTP request
450    Http,
451    /// Chrome browser rendering
452    Chrome,
453    #[default]
454    /// Smart mode defaulting to HTTP and using Chrome when needed.
455    SmartMode,
456}
457
458/// Enum representing different return formats.
459#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
460#[serde(rename_all = "lowercase")]
461pub enum ReturnFormat {
462    #[default]
463    /// The default return format of the resource.
464    Raw,
465    /// Return the response as Markdown.
466    Markdown,
467    /// Return the response as Commonmark.
468    Commonmark,
469    /// Return the response as Html2text.
470    Html2text,
471    /// Return the response as Text.
472    Text,
473    /// Returns a screenshot as Base64Url
474    Screenshot,
475    /// Return the response as XML.
476    Xml,
477    /// Return the response as Bytes.
478    Bytes,
479}
spider_client/shapes/request.rs

spider_client/shapes/
request.rs