spider_client/shapes/
request.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
4/// Structure representing the Chunking algorithm dictionary.
5#[derive(Debug, Deserialize, Serialize, Clone)]
6pub struct ChunkingAlgDict {
7    /// The chunking algorithm to use, defined as a specific type.
8    r#type: ChunkingType,
9    /// The amount to chunk by.
10    value: i32,
11}
12
13// The nested structures
14#[derive(Serialize, Deserialize, Debug, Clone, Default)]
15pub struct Timeout {
16    /// The seconds up to 60.
17    pub secs: u64,
18    /// The nanoseconds.
19    pub nanos: u32,
20}
21
22#[derive(Serialize, Deserialize, Debug, Clone)]
23pub struct IdleNetwork {
24    /// The timeout to wait until.
25    pub timeout: Timeout,
26}
27
28#[derive(Serialize, Deserialize, Debug, Clone)]
29#[serde(tag = "type", rename_all = "PascalCase")]
30pub enum WebAutomation {
31    Evaluate { code: String },
32    Click { selector: String },
33    Wait { duration: u64 },
34    WaitForNavigation,
35    WaitFor { selector: String },
36    WaitForAndClick { selector: String },
37    ScrollX { pixels: i32 },
38    ScrollY { pixels: i32 },
39    Fill { selector: String, value: String },
40    InfiniteScroll { times: u32 },
41}
42
43#[derive(Default, Serialize, Deserialize, Debug, Clone)]
44#[serde(tag = "type", rename_all = "PascalCase")]
45pub enum RedirectPolicy {
46    Loose,
47    #[default]
48    Strict,
49}
50
51pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
52pub type ExecutionScriptsMap = std::collections::HashMap<String, String>;
53
54#[derive(Serialize, Deserialize, Debug, Clone)]
55pub struct Selector {
56    /// The timeout to wait until.
57    pub timeout: Timeout,
58    /// The selector to wait for.
59    pub selector: String,
60}
61
62#[derive(Serialize, Deserialize, Debug, Clone, Default)]
63pub struct Delay {
64    /// The timeout to wait until.
65    pub timeout: Timeout,
66}
67
68#[derive(Serialize, Deserialize, Debug, Clone, Default)]
69pub struct WaitFor {
70    /// Wait until idle networks with a timeout of idleness.
71    pub idle_network: Option<IdleNetwork>,
72    /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
73    pub selector: Option<Selector>,
74    /// Wait for the dom to update
75    pub dom: Option<Selector>,
76    /// Wait until a hard delay.
77    pub delay: Option<Delay>,
78    /// Wait until page navigation happen. Default is true.
79    pub page_navigations: Option<bool>,
80}
81
82/// Query request to get a document.
83#[derive(Serialize, Deserialize, Debug, Clone, Default)]
84pub struct QueryRequest {
85    /// The exact website url.
86    pub url: Option<String>,
87    /// The website domain.
88    pub domain: Option<String>,
89    /// The path of the resource.
90    pub pathname: Option<String>,
91}
92
93/// Enum representing different types of Chunking.
94#[derive(Default, Debug, Deserialize, Serialize, Clone)]
95#[serde(rename_all = "lowercase")]
96pub enum ChunkingType {
97    #[default]
98    /// By the word count.
99    ByWords,
100    /// By the line count.
101    ByLines,
102    /// By the char length.
103    ByCharacterLength,
104    /// By sentence.
105    BySentence,
106}
107
108#[derive(Default, Debug, Deserialize, Serialize, Clone)]
109/// View port handling for chrome.
110pub struct Viewport {
111    /// Device screen Width
112    pub width: u32,
113    /// Device screen size
114    pub height: u32,
115    /// Device scale factor
116    pub device_scale_factor: Option<f64>,
117    /// Emulating Mobile?
118    pub emulating_mobile: bool,
119    /// Use landscape mode instead of portrait.
120    pub is_landscape: bool,
121    /// Touch screen device?
122    pub has_touch: bool,
123}
124
125// Define the CSSSelector struct
126#[derive(Debug, Clone, Default, Deserialize, Serialize)]
127pub struct CSSSelector {
128    /// The name of the selector group
129    pub name: String,
130    /// A vector of CSS selectors
131    pub selectors: Vec<String>,
132}
133
134// Define the CSSExtractionMap type
135pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
136
137/// Represents the settings for a webhook configuration
138#[derive(Debug, Default, Deserialize, Serialize, Clone)]
139pub struct WebhookSettings {
140    /// The destination where the webhook information will be sent
141    destination: String,
142    /// Trigger an action when all credits are depleted
143    on_credits_depleted: bool,
144    /// Trigger an action when half of the credits are depleted
145    on_credits_half_depleted: bool,
146    /// Trigger an action on a website status update event
147    on_website_status: bool,
148    /// Send information about a new page find (such as links and bytes)
149    on_find: bool,
150    /// Handle the metadata of a found page
151    on_find_metadata: bool,
152}
153
154/// Proxy pool selection for outbound request routing.
155/// Choose a pool based on your use case (e.g., stealth, speed, or stability).
156///
157/// - 'residential'         → cost-effective entry-level residential pool
158/// - 'residential_fast'    → faster residential pool for higher throughput
159/// - 'residential_static'  → static residential IPs, rotated daily
160/// - 'residential_premium' → low-latency premium IPs
161/// - 'residential_core'    → balanced plan (quality vs. cost)
162/// - 'residential_plus'    → largest and highest quality core pool
163/// - 'mobile'              → 4G/5G mobile proxies for maximum evasion
164/// - 'isp'                 → ISP-grade datacenters
165#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
166pub enum ProxyType {
167    /// Cost-effective entry-level residential pool.
168    #[serde(rename = "residential")]
169    Residential,
170    /// Higher-throughput residential pool for better performance.
171    #[serde(rename = "residential_fast")]
172    ResidentialFast,
173    /// Static residential IPs, rotated daily for session persistence.
174    #[serde(rename = "residential_static")]
175    ResidentialStatic,
176    /// 4G / 5G mobile proxies for maximum stealth and evasion.
177    #[serde(rename = "mobile")]
178    Mobile,
179    /// ISP-grade residential routing (alias: `datacenter`).
180    #[serde(rename = "isp", alias = "datacenter")]
181    #[default]
182    Isp,
183    /// Premium low-latency residential proxy pool.
184    #[serde(rename = "residential_premium")]
185    ResidentialPremium,
186    /// Core residential plan optimized for balance between cost and quality.
187    #[serde(rename = "residential_core")]
188    ResidentialCore,
189    /// Extended core residential pool with the largest, highest-quality IPs.
190    #[serde(rename = "residential_plus")]
191    ResidentialPlus,
192}
193
194/// Send multiple return formats.
195#[derive(Debug, Deserialize, Serialize, Clone)]
196#[serde(untagged)]
197pub enum ReturnFormatHandling {
198    /// A single return item.
199    Single(ReturnFormat),
200    /// Multiple return formats.
201    Multi(std::collections::HashSet<ReturnFormat>),
202}
203
204impl Default for ReturnFormatHandling {
205    fn default() -> ReturnFormatHandling {
206        ReturnFormatHandling::Single(ReturnFormat::Raw)
207    }
208}
209
210#[derive(Debug, Default, Deserialize, Serialize, Clone)]
211pub struct EventTracker {
212    /// The responses received.
213    responses: Option<bool>,
214    ///The request sent.
215    requests: Option<bool>,
216}
217
218/// Structure representing request parameters.
219#[derive(Debug, Default, Deserialize, Serialize, Clone)]
220pub struct RequestParams {
221    #[serde(default)]
222    /// The URL to be crawled.
223    pub url: Option<String>,
224    #[serde(default)]
225    /// The type of request to be made.
226    pub request: Option<RequestType>,
227    #[serde(default)]
228    /// The maximum number of pages the crawler should visit.
229    pub limit: Option<u32>,
230    #[serde(default)]
231    /// The format in which the result should be returned.
232    pub return_format: Option<ReturnFormatHandling>,
233    #[serde(default)]
234    /// Specifies whether to only visit the top-level domain.
235    pub tld: Option<bool>,
236    #[serde(default)]
237    /// The depth of the crawl.
238    pub depth: Option<u32>,
239    #[serde(default)]
240    /// Specifies whether the request should be cached.
241    pub cache: Option<bool>,
242    #[serde(default)]
243    /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
244    pub scroll: Option<u32>,
245    #[serde(default)]
246    /// The budget for various resources.
247    pub budget: Option<HashMap<String, u32>>,
248    #[serde(default)]
249    /// The blacklist routes to ignore. This can be a Regex string pattern.
250    pub blacklist: Option<Vec<String>>,
251    #[serde(default)]
252    /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
253    pub whitelist: Option<Vec<String>>,
254    #[serde(default)]
255    /// The locale to be used during the crawl.
256    pub locale: Option<String>,
257    #[serde(default)]
258    /// The cookies to be set for the request, formatted as a single string.
259    pub cookies: Option<String>,
260    #[serde(default)]
261    /// Specifies whether to use stealth techniques to avoid detection.
262    pub stealth: Option<bool>,
263    #[serde(default)]
264    /// The headers to be used for the request.
265    pub headers: Option<HashMap<String, String>>,
266    #[serde(default)]
267    /// Specifies whether anti-bot measures should be used.
268    pub anti_bot: Option<bool>,
269    #[serde(default)]
270    /// Specifies whether to send data via webhooks.
271    pub webhooks: Option<WebhookSettings>,
272    #[serde(default)]
273    /// Specifies whether to include metadata in the response.
274    pub metadata: Option<bool>,
275    #[serde(default)]
276    /// The dimensions of the viewport.
277    pub viewport: Option<Viewport>,
278    #[serde(default)]
279    /// The encoding to be used for the request.
280    pub encoding: Option<String>,
281    #[serde(default)]
282    /// Specifies whether to include subdomains in the crawl.
283    pub subdomains: Option<bool>,
284    #[serde(default)]
285    /// The user agent string to be used for the request.
286    pub user_agent: Option<String>,
287    #[serde(default)]
288    /// Specifies whether the response data should be stored.
289    pub store_data: Option<bool>,
290    #[serde(default)]
291    /// Configuration settings for GPT (general purpose texture mappings).
292    pub gpt_config: Option<HashMap<String, String>>,
293    #[serde(default)]
294    /// Specifies whether to use fingerprinting protection.
295    pub fingerprint: Option<bool>,
296    #[serde(default)]
297    /// Specifies whether to perform the request without using storage.
298    pub storageless: Option<bool>,
299    #[serde(default)]
300    /// Specifies whether readability optimizations should be applied.
301    pub readability: Option<bool>,
302    #[serde(default)]
303    /// Specifies whether to use a proxy for the request. [Deprecated]: use the 'proxy' param instead.
304    pub proxy_enabled: Option<bool>,
305    #[serde(default)]
306    /// Specifies whether to respect the site's robots.txt file.
307    pub respect_robots: Option<bool>,
308    #[serde(default)]
309    /// CSS selector to be used to filter the content.
310    pub root_selector: Option<String>,
311    #[serde(default)]
312    /// Specifies whether to load all resources of the crawl target.
313    pub full_resources: Option<bool>,
314    #[serde(default)]
315    /// The text string to extract data from.
316    pub text: Option<String>,
317    #[serde(default)]
318    /// Specifies whether to use the sitemap links.
319    pub sitemap: Option<bool>,
320    #[serde(default)]
321    /// External domains to include the crawl.
322    pub external_domains: Option<Vec<String>>,
323    #[serde(default)]
324    /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
325    pub return_embeddings: Option<bool>,
326    #[serde(default)]
327    /// Returns the HTTP response headers.
328    pub return_headers: Option<bool>,
329    #[serde(default)]
330    /// Returns the link(s) found on the page that match the crawler query.
331    pub return_page_links: Option<bool>,
332    #[serde(default)]
333    /// Returns the HTTP response cookies.
334    pub return_cookies: Option<bool>,
335    #[serde(default)]
336    /// The timeout for the request, in milliseconds.
337    pub request_timeout: Option<u8>,
338    #[serde(default)]
339    /// Specifies whether to run the request in the background.
340    pub run_in_background: Option<bool>,
341    #[serde(default)]
342    /// Specifies whether to skip configuration checks.
343    pub skip_config_checks: Option<bool>,
344    #[serde(default)]
345    /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
346    pub css_extraction_map: Option<CSSExtractionMap>,
347    #[serde(default)]
348    /// The chunking algorithm to use.
349    pub chunking_alg: Option<ChunkingAlgDict>,
350    #[serde(default)]
351    /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
352    pub disable_intercept: Option<bool>,
353    #[serde(default)]
354    /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
355    pub wait_for: Option<WaitFor>,
356    #[serde(default)]
357    /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
358    pub execution_scripts: Option<ExecutionScriptsMap>,
359    #[serde(default)]
360    /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
361    pub automation_scripts: Option<WebAutomationMap>,
362    #[serde(default)]
363    /// The redirect policy for HTTP request. Set the value to Loose to allow all.
364    pub redirect_policy: Option<RedirectPolicy>,
365    #[serde(default)]
366    /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
367    pub event_tracker: Option<EventTracker>,
368    #[serde(default)]
369    /// The timeout to stop the crawl.
370    pub crawl_timeout: Option<Timeout>,
371    #[serde(default)]
372    /// Evaluates given script in every frame upon creation (before loading frame's scripts).
373    pub evaluate_on_new_document: Option<Box<String>>,
374    #[serde(default)]
375    /// Runs the request using lite_mode:Lite mode reduces data transfer costs by 70%, with trade-offs in speed, accuracy,
376    /// geo-targeting, and reliability. It’s best suited for non-urgent data collection or when
377    /// targeting websites with minimal anti-bot protections.
378    pub lite_mode: Option<bool>,
379    #[serde(default)]
380    /// The proxy to use for request.
381    pub proxy: Option<ProxyType>,
382    #[serde(default)]
383    /// Use a remote proxy at ~70% reduced cost for file downloads.
384    /// This requires a user-supplied static IP proxy endpoint.
385    pub remote_proxy: Option<String>,
386    #[serde(default)]
387    /// Set the maximum number of credits to use per page.
388    /// Credits are measured in decimal units, where 10,000 credits equal one dollar (100 credits per penny).
389    /// Credit limiting only applies to request that are Javascript rendered using smart_mode or chrome for the 'request' type.
390    pub max_credits_per_page: Option<f64>,
391}
392
393/// The structure representing request parameters for a search request.
394#[derive(Debug, Default, Deserialize, Serialize, Clone)]
395pub struct SearchRequestParams {
396    /// The base request parameters.
397    #[serde(default, flatten)]
398    pub base: RequestParams,
399    // The search request.
400    pub search: String,
401    /// The search limit.
402    pub search_limit: Option<u32>,
403    // Fetch the page content. Defaults to true.
404    pub fetch_page_content: Option<bool>,
405    /// The search location of the request
406    pub location: Option<String>,
407    /// The country code of the request
408    pub country: Option<String>,
409    /// The language code of the request.
410    pub language: Option<String>,
411    /// The number of search results
412    pub num: Option<u32>,
413    /// The page of the search results.
414    pub page: Option<u32>,
415    #[serde(default)]
416    /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
417    pub website_limit: Option<u32>,
418}
419
420/// Structure representing request parameters for transforming files.
421#[derive(Debug, Default, Deserialize, Serialize, Clone)]
422pub struct TransformParams {
423    #[serde(default)]
424    /// The format in which the result should be returned.
425    pub return_format: Option<ReturnFormat>,
426    #[serde(default)]
427    /// Specifies whether readability optimizations should be applied.
428    pub readability: Option<bool>,
429    #[serde(default)]
430    /// Clean the markdown or text for AI.
431    pub clean: Option<bool>,
432    #[serde(default)]
433    /// Clean the markdown or text for AI removing footers, navigation, and more.
434    pub clean_full: Option<bool>,
435    /// The data being transformed.
436    pub data: Vec<DataParam>,
437}
438
439#[derive(Serialize, Deserialize, Debug, Clone)]
440pub struct DataParam {
441    /// The HTML resource.
442    pub html: String,
443    /// The website url.
444    pub url: Option<String>,
445}
446
447/// the request type to perform
448#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
449#[serde(rename_all = "lowercase")]
450pub enum RequestType {
451    /// Default HTTP request
452    Http,
453    /// Chrome browser rendering
454    Chrome,
455    #[default]
456    /// Smart mode defaulting to HTTP and using Chrome when needed.
457    SmartMode,
458}
459
460/// Enum representing different return formats.
461#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
462#[serde(rename_all = "lowercase")]
463pub enum ReturnFormat {
464    #[default]
465    /// The default return format of the resource.
466    Raw,
467    /// Return the response as Markdown.
468    Markdown,
469    /// Return the response as Commonmark.
470    Commonmark,
471    /// Return the response as Html2text.
472    Html2text,
473    /// Return the response as Text.
474    Text,
475    /// Returns a screenshot as Base64Url
476    Screenshot,
477    /// Return the response as XML.
478    Xml,
479    /// Return the response as Bytes.
480    Bytes,
481}
spider_client/shapes/request.rs

spider_client/shapes/
request.rs