spider_client/shapes/request.rs
1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
4/// Structure representing the Chunking algorithm dictionary.
5#[derive(Debug, Deserialize, Serialize, Clone)]
6pub struct ChunkingAlgDict {
7 /// The chunking algorithm to use, defined as a specific type.
8 r#type: ChunkingType,
9 /// The amount to chunk by.
10 value: i32,
11}
12
13// The nested structures
14#[derive(Serialize, Deserialize, Debug, Clone)]
15pub struct Timeout {
16 /// The seconds up to 60.
17 pub secs: u64,
18 /// The nanoseconds.
19 pub nanos: u32,
20}
21
22#[derive(Serialize, Deserialize, Debug, Clone)]
23pub struct IdleNetwork {
24 /// The timeout to wait until.
25 pub timeout: Timeout,
26}
27
28#[derive(Serialize, Deserialize, Debug, Clone)]
29#[serde(tag = "type", rename_all = "PascalCase")]
30pub enum WebAutomation {
31 Evaluate { code: String },
32 Click { selector: String },
33 Wait { duration: u64 },
34 WaitForNavigation,
35 WaitFor { selector: String },
36 WaitForAndClick { selector: String },
37 ScrollX { pixels: i32 },
38 ScrollY { pixels: i32 },
39 Fill { selector: String, value: String },
40 InfiniteScroll { times: u32 },
41}
42
43#[derive(Default, Serialize, Deserialize, Debug, Clone)]
44#[serde(tag = "type", rename_all = "PascalCase")]
45pub enum RedirectPolicy {
46 Loose,
47 #[default]
48 Strict,
49}
50
51pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
/// Maps a string key to a script string.
pub type ExecutionScriptsMap = HashMap<String, String>;
53
54#[derive(Serialize, Deserialize, Debug, Clone)]
55pub struct Selector {
56 /// The timeout to wait until.
57 pub timeout: Timeout,
58 /// The selector to wait for.
59 pub selector: String,
60}
61
62#[derive(Serialize, Deserialize, Debug, Clone)]
63pub struct Delay {
64 /// The timeout to wait until.
65 pub timeout: Timeout,
66}
67
68#[derive(Serialize, Deserialize, Debug, Clone)]
69pub struct WaitFor {
70 /// Wait until idle networks with a timeout of idleness.
71 pub idle_network: Option<IdleNetwork>,
72 /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
73 pub selector: Option<Selector>,
74 /// Wait until a hard delay.
75 pub delay: Option<Delay>,
76 /// Wait until page navigation happen. Default is true.
77 pub page_navigations: Option<bool>,
78}
79
80/// Query request to get a document.
81#[derive(Serialize, Deserialize, Debug, Clone, Default)]
82pub struct QueryRequest {
83 /// The exact website url.
84 pub url: Option<String>,
85 /// The website domain.
86 pub domain: Option<String>,
87 /// The path of the resource.
88 pub pathname: Option<String>,
89}
90
91/// Enum representing different types of Chunking.
92#[derive(Default, Debug, Deserialize, Serialize, Clone)]
93#[serde(rename_all = "lowercase")]
94pub enum ChunkingType {
95 #[default]
96 /// By the word count.
97 ByWords,
98 /// By the line count.
99 ByLines,
100 /// By the char length.
101 ByCharacterLength,
102 /// By sentence.
103 BySentence,
104}
105
106#[derive(Default, Debug, Deserialize, Serialize, Clone)]
107/// View port handling for chrome.
108pub struct Viewport {
109 /// Device screen Width
110 pub width: u32,
111 /// Device screen size
112 pub height: u32,
113 /// Device scale factor
114 pub device_scale_factor: Option<f64>,
115 /// Emulating Mobile?
116 pub emulating_mobile: bool,
117 /// Use landscape mode instead of portrait.
118 pub is_landscape: bool,
119 /// Touch screen device?
120 pub has_touch: bool,
121}
122
123// Define the CSSSelector struct
124#[derive(Debug, Clone, Default, Deserialize, Serialize)]
125pub struct CSSSelector {
126 /// The name of the selector group
127 pub name: String,
128 /// A vector of CSS selectors
129 pub selectors: Vec<String>,
130}
131
132// Define the CSSExtractionMap type
133pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
134
135/// Represents the settings for a webhook configuration
136#[derive(Debug, Default, Deserialize, Serialize, Clone)]
137pub struct WebhookSettings {
138 /// The destination where the webhook information will be sent
139 destination: String,
140 /// Trigger an action when all credits are depleted
141 on_credits_depleted: bool,
142 /// Trigger an action when half of the credits are depleted
143 on_credits_half_depleted: bool,
144 /// Trigger an action on a website status update event
145 on_website_status: bool,
146 /// Send information about a new page find (such as links and bytes)
147 on_find: bool,
148 /// Handle the metadata of a found page
149 on_find_metadata: bool,
150}
151
152/// Proxy pool selection for outbound request routing.
153/// Choose a pool based on your use case (e.g., stealth, speed, or stability).
154///
155/// - 'residential' → cost-effective entry-level residential pool
156/// - 'residential_fast' → faster residential pool for higher throughput
157/// - 'residential_static' → static residential IPs, rotated daily
158/// - 'residential_premium' → low-latency premium IPs
159/// - 'residential_core' → balanced plan (quality vs. cost)
160/// - 'residential_plus' → largest and highest quality core pool
161/// - 'mobile' → 4G/5G mobile proxies for maximum evasion
162/// - 'isp' → ISP-grade datacenters
163#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
164pub enum ProxyType {
165 /// Cost-effective entry-level residential pool.
166 #[serde(rename = "residential")]
167 Residential,
168 /// Higher-throughput residential pool for better performance.
169 #[serde(rename = "residential_fast")]
170 ResidentialFast,
171 /// Static residential IPs, rotated daily for session persistence.
172 #[serde(rename = "residential_static")]
173 ResidentialStatic,
174 /// 4G / 5G mobile proxies for maximum stealth and evasion.
175 #[serde(rename = "mobile")]
176 Mobile,
177 /// ISP-grade residential routing (alias: `datacenter`).
178 #[serde(rename = "isp", alias = "datacenter")]
179 #[default]
180 Isp,
181 /// Premium low-latency residential proxy pool.
182 #[serde(rename = "residential_premium")]
183 ResidentialPremium,
184 /// Core residential plan optimized for balance between cost and quality.
185 #[serde(rename = "residential_core")]
186 ResidentialCore,
187 /// Extended core residential pool with the largest, highest-quality IPs.
188 #[serde(rename = "residential_plus")]
189 ResidentialPlus,
190}
191
192/// Send multiple return formats.
193#[derive(Debug, Deserialize, Serialize, Clone)]
194#[serde(untagged)]
195pub enum ReturnFormatHandling {
196 /// A single return item.
197 Single(ReturnFormat),
198 /// Multiple return formats.
199 Multi(std::collections::HashSet<ReturnFormat>),
200}
201
202impl Default for ReturnFormatHandling {
203 fn default() -> ReturnFormatHandling {
204 ReturnFormatHandling::Single(ReturnFormat::Raw)
205 }
206}
207
208#[derive(Debug, Default, Deserialize, Serialize, Clone)]
209pub struct EventTracker {
210 /// The responses received.
211 responses: Option<bool>,
212 ///The request sent.
213 requests: Option<bool>,
214}
215
216/// Structure representing request parameters.
217#[derive(Debug, Default, Deserialize, Serialize, Clone)]
218pub struct RequestParams {
219 #[serde(default)]
220 /// The URL to be crawled.
221 pub url: Option<String>,
222 #[serde(default)]
223 /// The type of request to be made.
224 pub request: Option<RequestType>,
225 #[serde(default)]
226 /// The maximum number of pages the crawler should visit.
227 pub limit: Option<u32>,
228 #[serde(default)]
229 /// The format in which the result should be returned.
230 pub return_format: Option<ReturnFormatHandling>,
231 #[serde(default)]
232 /// Specifies whether to only visit the top-level domain.
233 pub tld: Option<bool>,
234 #[serde(default)]
235 /// The depth of the crawl.
236 pub depth: Option<u32>,
237 #[serde(default)]
238 /// Specifies whether the request should be cached.
239 pub cache: Option<bool>,
240 #[serde(default)]
241 /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
242 pub scroll: Option<u32>,
243 #[serde(default)]
244 /// The budget for various resources.
245 pub budget: Option<HashMap<String, u32>>,
246 #[serde(default)]
247 /// The blacklist routes to ignore. This can be a Regex string pattern.
248 pub blacklist: Option<Vec<String>>,
249 #[serde(default)]
250 /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
251 pub whitelist: Option<Vec<String>>,
252 #[serde(default)]
253 /// The locale to be used during the crawl.
254 pub locale: Option<String>,
255 #[serde(default)]
256 /// The cookies to be set for the request, formatted as a single string.
257 pub cookies: Option<String>,
258 #[serde(default)]
259 /// Specifies whether to use stealth techniques to avoid detection.
260 pub stealth: Option<bool>,
261 #[serde(default)]
262 /// The headers to be used for the request.
263 pub headers: Option<HashMap<String, String>>,
264 #[serde(default)]
265 /// Specifies whether anti-bot measures should be used.
266 pub anti_bot: Option<bool>,
267 #[serde(default)]
268 /// Specifies whether to send data via webhooks.
269 pub webhooks: Option<WebhookSettings>,
270 #[serde(default)]
271 /// Specifies whether to include metadata in the response.
272 pub metadata: Option<bool>,
273 #[serde(default)]
274 /// The dimensions of the viewport.
275 pub viewport: Option<Viewport>,
276 #[serde(default)]
277 /// The encoding to be used for the request.
278 pub encoding: Option<String>,
279 #[serde(default)]
280 /// Specifies whether to include subdomains in the crawl.
281 pub subdomains: Option<bool>,
282 #[serde(default)]
283 /// The user agent string to be used for the request.
284 pub user_agent: Option<String>,
285 #[serde(default)]
286 /// Specifies whether the response data should be stored.
287 pub store_data: Option<bool>,
288 #[serde(default)]
289 /// Configuration settings for GPT (general purpose texture mappings).
290 pub gpt_config: Option<HashMap<String, String>>,
291 #[serde(default)]
292 /// Specifies whether to use fingerprinting protection.
293 pub fingerprint: Option<bool>,
294 #[serde(default)]
295 /// Specifies whether to perform the request without using storage.
296 pub storageless: Option<bool>,
297 #[serde(default)]
298 /// Specifies whether readability optimizations should be applied.
299 pub readability: Option<bool>,
300 #[serde(default)]
301 /// Specifies whether to use a proxy for the request. [Deprecated]: use the 'proxy' param instead.
302 pub proxy_enabled: Option<bool>,
303 #[serde(default)]
304 /// Specifies whether to respect the site's robots.txt file.
305 pub respect_robots: Option<bool>,
306 #[serde(default)]
307 /// CSS selector to be used to filter the content.
308 pub root_selector: Option<String>,
309 #[serde(default)]
310 /// Specifies whether to load all resources of the crawl target.
311 pub full_resources: Option<bool>,
312 #[serde(default)]
313 /// The text string to extract data from.
314 pub text: Option<String>,
315 #[serde(default)]
316 /// Specifies whether to use the sitemap links.
317 pub sitemap: Option<bool>,
318 #[serde(default)]
319 /// External domains to include the crawl.
320 pub external_domains: Option<Vec<String>>,
321 #[serde(default)]
322 /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
323 pub return_embeddings: Option<bool>,
324 #[serde(default)]
325 /// Returns the HTTP response headers.
326 pub return_headers: Option<bool>,
327 #[serde(default)]
328 /// Returns the link(s) found on the page that match the crawler query.
329 pub return_page_links: Option<bool>,
330 #[serde(default)]
331 /// Returns the HTTP response cookies.
332 pub return_cookies: Option<bool>,
333 #[serde(default)]
334 /// The timeout for the request, in milliseconds.
335 pub request_timeout: Option<u8>,
336 #[serde(default)]
337 /// Specifies whether to run the request in the background.
338 pub run_in_background: Option<bool>,
339 #[serde(default)]
340 /// Specifies whether to skip configuration checks.
341 pub skip_config_checks: Option<bool>,
342 #[serde(default)]
343 /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
344 pub css_extraction_map: Option<CSSExtractionMap>,
345 #[serde(default)]
346 /// The chunking algorithm to use.
347 pub chunking_alg: Option<ChunkingAlgDict>,
348 #[serde(default)]
349 /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
350 pub disable_intercept: Option<bool>,
351 #[serde(default)]
352 /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
353 pub wait_for: Option<WaitFor>,
354 #[serde(default)]
355 /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
356 pub execution_scripts: Option<ExecutionScriptsMap>,
357 #[serde(default)]
358 /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
359 pub automation_scripts: Option<WebAutomationMap>,
360 #[serde(default)]
361 /// The redirect policy for HTTP request. Set the value to Loose to allow all.
362 pub redirect_policy: Option<RedirectPolicy>,
363 #[serde(default)]
364 /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
365 pub event_tracker: Option<EventTracker>,
366 #[serde(default)]
367 /// The timeout to stop the crawl.
368 pub crawl_timeout: Option<Timeout>,
369 #[serde(default)]
370 /// Evaluates given script in every frame upon creation (before loading frame's scripts).
371 pub evaluate_on_new_document: Option<Box<String>>,
372 #[serde(default)]
373 /// Runs the request using lite_mode:Lite mode reduces data transfer costs by 70%, with trade-offs in speed, accuracy,
374 /// geo-targeting, and reliability. It’s best suited for non-urgent data collection or when
375 /// targeting websites with minimal anti-bot protections.
376 pub lite_mode: Option<bool>,
377 #[serde(default)]
378 /// The proxy to use for request.
379 pub proxy: Option<ProxyType>,
380 #[serde(default)]
381 /// Use a remote proxy at ~70% reduced cost for file downloads.
382 /// This requires a user-supplied static IP proxy endpoint.
383 pub remote_proxy: Option<String>,
384 #[serde(default)]
385 /// Set the maximum number of credits to use per page.
386 /// Credits are measured in decimal units, where 10,000 credits equal one dollar (100 credits per penny).
387 /// Credit limiting only applies to request that are Javascript rendered using smart_mode or chrome for the 'request' type.
388 pub max_credits_per_page: Option<f64>,
389}
390
391/// The structure representing request parameters for a search request.
392#[derive(Debug, Default, Deserialize, Serialize, Clone)]
393pub struct SearchRequestParams {
394 /// The base request parameters.
395 #[serde(default, flatten)]
396 pub base: RequestParams,
397 // The search request.
398 pub search: String,
399 /// The search limit.
400 pub search_limit: Option<u32>,
401 // Fetch the page content. Defaults to true.
402 pub fetch_page_content: Option<bool>,
403 /// The search location of the request
404 pub location: Option<String>,
405 /// The country code of the request
406 pub country: Option<String>,
407 /// The language code of the request.
408 pub language: Option<String>,
409 /// The number of search results
410 pub num: Option<u32>,
411 /// The page of the search results.
412 pub page: Option<u32>,
413 #[serde(default)]
414 /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
415 pub website_limit: Option<u32>,
416}
417
418/// Structure representing request parameters for transforming files.
419#[derive(Debug, Default, Deserialize, Serialize, Clone)]
420pub struct TransformParams {
421 #[serde(default)]
422 /// The format in which the result should be returned.
423 pub return_format: Option<ReturnFormat>,
424 #[serde(default)]
425 /// Specifies whether readability optimizations should be applied.
426 pub readability: Option<bool>,
427 #[serde(default)]
428 /// Clean the markdown or text for AI.
429 pub clean: Option<bool>,
430 #[serde(default)]
431 /// Clean the markdown or text for AI removing footers, navigation, and more.
432 pub clean_full: Option<bool>,
433 /// The data being transformed.
434 pub data: Vec<DataParam>,
435}
436
437#[derive(Serialize, Deserialize, Debug, Clone)]
438pub struct DataParam {
439 /// The HTML resource.
440 pub html: String,
441 /// The website url.
442 pub url: Option<String>,
443}
444
445/// the request type to perform
446#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
447#[serde(rename_all = "lowercase")]
448pub enum RequestType {
449 /// Default HTTP request
450 Http,
451 /// Chrome browser rendering
452 Chrome,
453 #[default]
454 /// Smart mode defaulting to HTTP and using Chrome when needed.
455 SmartMode,
456}
457
458/// Enum representing different return formats.
459#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
460#[serde(rename_all = "lowercase")]
461pub enum ReturnFormat {
462 #[default]
463 /// The default return format of the resource.
464 Raw,
465 /// Return the response as Markdown.
466 Markdown,
467 /// Return the response as Commonmark.
468 Commonmark,
469 /// Return the response as Html2text.
470 Html2text,
471 /// Return the response as Text.
472 Text,
473 /// Returns a screenshot as Base64Url
474 Screenshot,
475 /// Return the response as XML.
476 Xml,
477 /// Return the response as Bytes.
478 Bytes,
479}