spider_client/shapes/request.rs
1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
4/// Structure representing the Chunking algorithm dictionary.
5#[derive(Debug, Deserialize, Serialize, Clone)]
6pub struct ChunkingAlgDict {
7 /// The chunking algorithm to use, defined as a specific type.
8 r#type: ChunkingType,
9 /// The amount to chunk by.
10 value: i32,
11}
12
13// The nested structures
14#[derive(Serialize, Deserialize, Debug, Clone, Default)]
15pub struct Timeout {
16 /// The seconds up to 60.
17 pub secs: u64,
18 /// The nanoseconds.
19 pub nanos: u32,
20}
21
22#[derive(Serialize, Deserialize, Debug, Clone)]
23pub struct IdleNetwork {
24 /// The timeout to wait until.
25 pub timeout: Timeout,
26}
27
28
29/// Represents various web automation actions.
30#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
31pub enum WebAutomation {
32 /// Runs custom JavaScript code.
33 Evaluate(String),
34 /// Clicks on an element.
35 Click(String),
36 /// Clicks on all elements.
37 ClickAll(String),
38 /// Clicks on all elements.
39 ClickAllClickable(),
40 /// Waits for a fixed duration in milliseconds.
41 Wait(u64),
42 /// Waits for the next navigation event.
43 WaitForNavigation,
44 /// Wait for dom updates to stop.
45 WaitForDom {
46 /// The selector of the element to wait for updates.
47 selector: Option<String>,
48 /// The timeout to wait for in ms.
49 timeout: u32,
50 },
51 /// Waits for an element to appear.
52 WaitFor(String),
53 /// Waits for an element to appear with a timeout.
54 WaitForWithTimeout {
55 /// The selector of the element to wait for updates.
56 selector: String,
57 /// The timeout to wait for in ms.
58 timeout: u64,
59 },
60 /// Waits for an element to appear and then clicks on it.
61 WaitForAndClick(String),
62 /// Scrolls the screen in the horizontal axis by a specified amount in pixels.
63 ScrollX(i32),
64 /// Scrolls the screen in the vertical axis by a specified amount in pixels.
65 ScrollY(i32),
66 /// Fills an input element with a specified value.
67 Fill {
68 /// The selector of the input element to fill.
69 selector: String,
70 /// The value to fill the input element with.
71 value: String,
72 },
73 /// Scrolls the page until the end.
74 InfiniteScroll(u32),
75 /// Perform a screenshot on the page - fullscreen and omit background for params.
76 Screenshot {
77 /// Take a full page screenshot.
78 full_page: bool,
79 /// Omit the background.
80 omit_background: bool,
81 /// The output file to store the screenshot.
82 output: String,
83 },
84 /// Only continue to the next automation if the prior step was valid. Use this intermediate after a step to break out of the chain.
85 ValidateChain,
86}
87
88#[derive(Default, Serialize, Deserialize, Debug, Clone)]
89#[serde(tag = "type", rename_all = "PascalCase")]
90pub enum RedirectPolicy {
91 Loose,
92 #[default]
93 Strict,
94}
95
96pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
/// Map from a string key to a script string.
pub type ExecutionScriptsMap = HashMap<String, String>;
98
99#[derive(Serialize, Deserialize, Debug, Clone)]
100pub struct Selector {
101 /// The timeout to wait until.
102 pub timeout: Timeout,
103 /// The selector to wait for.
104 pub selector: String,
105}
106
107#[derive(Serialize, Deserialize, Debug, Clone, Default)]
108pub struct Delay {
109 /// The timeout to wait until.
110 pub timeout: Timeout,
111}
112
/// Serde default provider that yields `Some(true)`.
fn default_some_true() -> Option<bool> {
    Option::from(true)
}
117
118#[derive(Serialize, Deserialize, Debug, Clone, Default)]
119pub struct WaitFor {
120 /// Wait until idle networks with a timeout of idleness.
121 pub idle_network: Option<IdleNetwork>,
122 /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
123 pub selector: Option<Selector>,
124 /// Wait for the dom to update
125 pub dom: Option<Selector>,
126 /// Wait until a hard delay.
127 pub delay: Option<Delay>,
128 /// Wait until page navigation happen. Default is true.
129 #[serde(default = "default_some_true")]
130 pub page_navigations: Option<bool>,
131}
132
133/// Query request to get a document.
134#[derive(Serialize, Deserialize, Debug, Clone, Default)]
135pub struct QueryRequest {
136 /// The exact website url.
137 pub url: Option<String>,
138 /// The website domain.
139 pub domain: Option<String>,
140 /// The path of the resource.
141 pub pathname: Option<String>,
142}
143
144/// Enum representing different types of Chunking.
145#[derive(Default, Debug, Deserialize, Serialize, Clone)]
146#[serde(rename_all = "lowercase")]
147pub enum ChunkingType {
148 #[default]
149 /// By the word count.
150 ByWords,
151 /// By the line count.
152 ByLines,
153 /// By the char length.
154 ByCharacterLength,
155 /// By sentence.
156 BySentence,
157}
158
159#[derive(Default, Debug, Deserialize, Serialize, Clone)]
160/// View port handling for chrome.
161pub struct Viewport {
162 /// Device screen Width
163 pub width: u32,
164 /// Device screen size
165 pub height: u32,
166 /// Device scale factor
167 pub device_scale_factor: Option<f64>,
168 /// Emulating Mobile?
169 pub emulating_mobile: bool,
170 /// Use landscape mode instead of portrait.
171 pub is_landscape: bool,
172 /// Touch screen device?
173 pub has_touch: bool,
174}
175
176// Define the CSSSelector struct
177#[derive(Debug, Clone, Default, Deserialize, Serialize)]
178pub struct CSSSelector {
179 /// The name of the selector group
180 pub name: String,
181 /// A vector of CSS selectors
182 pub selectors: Vec<String>,
183}
184
185// Define the CSSExtractionMap type
186pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
187
188/// Represents the settings for a webhook configuration
189#[derive(Debug, Default, Deserialize, Serialize, Clone)]
190pub struct WebhookSettings {
191 /// The destination where the webhook information will be sent
192 destination: String,
193 /// Trigger an action when all credits are depleted
194 on_credits_depleted: bool,
195 /// Trigger an action when half of the credits are depleted
196 on_credits_half_depleted: bool,
197 /// Trigger an action on a website status update event
198 on_website_status: bool,
199 /// Send information about a new page find (such as links and bytes)
200 on_find: bool,
201 /// Handle the metadata of a found page
202 on_find_metadata: bool,
203}
204
205/// Proxy pool selection for outbound request routing.
206/// Choose a pool based on your use case (e.g., stealth, speed, or stability).
207///
208/// - 'residential' → cost-effective entry-level residential pool
209/// - 'residential_fast' → faster residential pool for higher throughput
210/// - 'residential_static' → static residential IPs, rotated daily
211/// - 'residential_premium' → low-latency premium IPs
212/// - 'residential_core' → balanced plan (quality vs. cost)
213/// - 'residential_plus' → largest and highest quality core pool
214/// - 'mobile' → 4G/5G mobile proxies for maximum evasion
215/// - 'isp' → ISP-grade datacenters
216#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
217pub enum ProxyType {
218 /// Cost-effective entry-level residential pool.
219 #[serde(rename = "residential")]
220 Residential,
221 /// Higher-throughput residential pool for better performance.
222 #[serde(rename = "residential_fast")]
223 ResidentialFast,
224 /// Static residential IPs, rotated daily for session persistence.
225 #[serde(rename = "residential_static")]
226 ResidentialStatic,
227 /// 4G / 5G mobile proxies for maximum stealth and evasion.
228 #[serde(rename = "mobile")]
229 Mobile,
230 /// ISP-grade residential routing (alias: `datacenter`).
231 #[serde(rename = "isp", alias = "datacenter")]
232 #[default]
233 Isp,
234 /// Premium low-latency residential proxy pool.
235 #[serde(rename = "residential_premium")]
236 ResidentialPremium,
237 /// Core residential plan optimized for balance between cost and quality.
238 #[serde(rename = "residential_core")]
239 ResidentialCore,
240 /// Extended core residential pool with the largest, highest-quality IPs.
241 #[serde(rename = "residential_plus")]
242 ResidentialPlus,
243}
244
245/// List of proxies.
246pub const PROXY_TYPE_LIST: [ProxyType; 10] = [
247 ProxyType::ResidentialStatic,
248 ProxyType::Residential,
249 ProxyType::Isp,
250 ProxyType::Mobile,
251 ProxyType::ResidentialPremium,
252 ProxyType::ResidentialPlus,
253 ProxyType::ResidentialCore,
254 ProxyType::ResidentialFast,
255 ProxyType::ResidentialStatic,
256 ProxyType::Residential,
257];
258
259impl ProxyType {
260 /// Get the canonical string representation of the proxy type.
261 pub fn as_str(&self) -> &'static str {
262 match self {
263 ProxyType::Residential => "residential",
264 ProxyType::ResidentialFast => "residential_fast",
265 ProxyType::ResidentialStatic => "residential_static",
266 ProxyType::Mobile => "mobile",
267 ProxyType::Isp => "isp",
268 ProxyType::ResidentialPremium => "residential_premium",
269 ProxyType::ResidentialCore => "residential_core",
270 ProxyType::ResidentialPlus => "residential_plus",
271 }
272 }
273}
274
275/// Send multiple return formats.
276#[derive(Debug, Deserialize, Serialize, Clone)]
277#[serde(untagged)]
278pub enum ReturnFormatHandling {
279 /// A single return item.
280 Single(ReturnFormat),
281 /// Multiple return formats.
282 Multi(std::collections::HashSet<ReturnFormat>),
283}
284
285impl Default for ReturnFormatHandling {
286 fn default() -> ReturnFormatHandling {
287 ReturnFormatHandling::Single(ReturnFormat::Raw)
288 }
289}
290
291#[derive(Debug, Default, Deserialize, Serialize, Clone)]
292pub struct EventTracker {
293 /// The responses received.
294 responses: Option<bool>,
295 /// The request sent.
296 requests: Option<bool>,
297 /// Track the automation events with data changes and screenshots.
298 automation: Option<bool>,
299}
300
301/// Structure representing request parameters.
302#[derive(Debug, Default, Deserialize, Serialize, Clone)]
303pub struct RequestParams {
304 #[serde(default)]
305 /// The URL to be crawled.
306 pub url: Option<String>,
307 #[serde(default)]
308 /// The type of request to be made.
309 pub request: Option<RequestType>,
310 #[serde(default)]
311 /// The maximum number of pages the crawler should visit.
312 pub limit: Option<u32>,
313 #[serde(default)]
314 /// The format in which the result should be returned.
315 pub return_format: Option<ReturnFormatHandling>,
316 /// The country code for request
317 pub country_code: Option<String>,
318 #[serde(default)]
319 /// Specifies whether to only visit the top-level domain.
320 pub tld: Option<bool>,
321 #[serde(default)]
322 /// The depth of the crawl.
323 pub depth: Option<u32>,
324 #[serde(default)]
325 /// Specifies whether the request should be cached.
326 pub cache: Option<bool>,
327 #[serde(default)]
328 /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
329 pub scroll: Option<u32>,
330 #[serde(default)]
331 /// The budget for various resources.
332 pub budget: Option<HashMap<String, u32>>,
333 #[serde(default)]
334 /// The blacklist routes to ignore. This can be a Regex string pattern.
335 pub blacklist: Option<Vec<String>>,
336 #[serde(default)]
337 /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
338 pub whitelist: Option<Vec<String>>,
339 #[serde(default)]
340 /// The locale to be used during the crawl.
341 pub locale: Option<String>,
342 #[serde(default)]
343 /// The cookies to be set for the request, formatted as a single string.
344 pub cookies: Option<String>,
345 #[serde(default)]
346 /// Specifies whether to use stealth techniques to avoid detection.
347 pub stealth: Option<bool>,
348 #[serde(default)]
349 /// The headers to be used for the request.
350 pub headers: Option<HashMap<String, String>>,
351 #[serde(default)]
352 /// Specifies whether anti-bot measures should be used.
353 pub anti_bot: Option<bool>,
354 #[serde(default)]
355 /// Specifies whether to send data via webhooks.
356 pub webhooks: Option<WebhookSettings>,
357 #[serde(default)]
358 /// Specifies whether to include metadata in the response.
359 pub metadata: Option<bool>,
360 #[serde(default)]
361 /// The dimensions of the viewport.
362 pub viewport: Option<Viewport>,
363 #[serde(default)]
364 /// The encoding to be used for the request.
365 pub encoding: Option<String>,
366 #[serde(default)]
367 /// Specifies whether to include subdomains in the crawl.
368 pub subdomains: Option<bool>,
369 #[serde(default)]
370 /// The user agent string to be used for the request.
371 pub user_agent: Option<String>,
372 #[serde(default)]
373 /// Specifies whether to use fingerprinting protection.
374 pub fingerprint: Option<bool>,
375 #[serde(default)]
376 /// Specifies whether to perform the request without using storage.
377 pub storageless: Option<bool>,
378 #[serde(default)]
379 /// Specifies whether readability optimizations should be applied.
380 pub readability: Option<bool>,
381 #[serde(default)]
382 /// Specifies whether to use a proxy for the request. [Deprecated]: use the 'proxy' param instead.
383 pub proxy_enabled: Option<bool>,
384 #[serde(default)]
385 /// Specifies whether to respect the site's robots.txt file.
386 pub respect_robots: Option<bool>,
387 #[serde(default)]
388 /// CSS selector to be used to filter the content.
389 pub root_selector: Option<String>,
390 #[serde(default)]
391 /// Specifies whether to load all resources of the crawl target.
392 pub full_resources: Option<bool>,
393 #[serde(default)]
394 /// The text string to extract data from.
395 pub text: Option<String>,
396 #[serde(default)]
397 /// Specifies whether to use the sitemap links.
398 pub sitemap: Option<bool>,
399 #[serde(default)]
400 /// External domains to include the crawl.
401 pub external_domains: Option<Vec<String>>,
402 #[serde(default)]
403 /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
404 pub return_embeddings: Option<bool>,
405 #[serde(default)]
406 /// Returns the HTTP response headers.
407 pub return_headers: Option<bool>,
408 #[serde(default)]
409 /// Returns the link(s) found on the page that match the crawler query.
410 pub return_page_links: Option<bool>,
411 #[serde(default)]
412 /// Returns the HTTP response cookies.
413 pub return_cookies: Option<bool>,
414 #[serde(default)]
415 /// The timeout for the request, in seconds.
416 pub request_timeout: Option<u8>,
417 #[serde(default)]
418 /// Specifies whether to run the request in the background.
419 pub run_in_background: Option<bool>,
420 #[serde(default)]
421 /// Specifies whether to skip configuration checks.
422 pub skip_config_checks: Option<bool>,
423 #[serde(default)]
424 /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
425 pub css_extraction_map: Option<CSSExtractionMap>,
426 #[serde(default)]
427 /// The chunking algorithm to use.
428 pub chunking_alg: Option<ChunkingAlgDict>,
429 #[serde(default)]
430 /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
431 pub disable_intercept: Option<bool>,
432 #[serde(default)]
433 /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
434 pub wait_for: Option<WaitFor>,
435 #[serde(default)]
436 /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
437 pub execution_scripts: Option<ExecutionScriptsMap>,
438 #[serde(default)]
439 /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
440 pub automation_scripts: Option<WebAutomationMap>,
441 #[serde(default)]
442 /// The redirect policy for HTTP request. Set the value to Loose to allow all.
443 pub redirect_policy: Option<RedirectPolicy>,
444 #[serde(default)]
445 /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
446 pub event_tracker: Option<EventTracker>,
447 #[serde(default)]
448 /// The timeout to stop the crawl.
449 pub crawl_timeout: Option<Timeout>,
450 #[serde(default)]
451 /// Evaluates given script in every frame upon creation (before loading frame's scripts).
452 pub evaluate_on_new_document: Option<Box<String>>,
453 #[serde(default)]
454 /// Runs the request using lite_mode:Lite mode reduces data transfer costs by 50%, with trade-offs in speed, accuracy,
455 /// geo-targeting, and reliability. It’s best suited for non-urgent data collection or when
456 /// targeting websites with minimal anti-bot protections.
457 pub lite_mode: Option<bool>,
458 #[serde(default)]
459 /// The proxy to use for request.
460 pub proxy: Option<ProxyType>,
461 #[serde(default)]
462 /// Use a remote proxy at ~50% reduced cost for file downloads.
463 /// This requires a user-supplied static IP proxy endpoint.
464 pub remote_proxy: Option<String>,
465 #[serde(default)]
466 /// Set the maximum number of credits to use per page.
467 /// Credits are measured in decimal units, where 10,000 credits equal one dollar (100 credits per penny).
468 /// Credit limiting only applies to request that are Javascript rendered using smart_mode or chrome for the 'request' type.
469 pub max_credits_per_page: Option<f64>,
470}
471
472
473#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
474#[serde(rename_all = "lowercase")]
475pub enum TBS {
476 #[serde(rename = "qdr:h")]
477 PastHour,
478 #[serde(rename = "qdr:d")]
479 Past24Hours,
480 #[serde(rename = "qdr:w")]
481 PastWeek,
482 #[serde(rename = "qdr:m")]
483 PastMonth,
484 #[serde(rename = "qdr:y")]
485 PastYear,
486}
487
488/// The engine to use.
489#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
490pub enum Engine {
491 /// Google
492 Google,
493 /// Brave
494 Brave,
495 /// All
496 #[default]
497 All
498}
499
500/// The structure representing request parameters for a search request.
501#[derive(Debug, Default, Deserialize, Serialize, Clone)]
502pub struct SearchRequestParams {
503 /// The base request parameters.
504 #[serde(default, flatten)]
505 pub base: RequestParams,
506 // The search request.
507 pub search: String,
508 /// The search limit.
509 pub search_limit: Option<u32>,
510 // Fetch the page content. Defaults to true.
511 pub fetch_page_content: Option<bool>,
512 /// The search location of the request
513 pub location: Option<String>,
514 /// The country code of the request
515 pub country: Option<crate::shapes::country_codes::CountryCode>,
516 /// The language code of the request.
517 pub language: Option<String>,
518 /// The number of search results
519 pub num: Option<u32>,
520 /// The time period range.
521 pub tbs: Option<TBS>,
522 /// The page of the search results.
523 pub page: Option<u32>,
524 /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
525 pub website_limit: Option<u32>,
526 /// Prioritize speed over output quantity.
527 pub quick_search: Option<bool>,
528 /// Auto paginate pages ( up to 100 pages ).
529 pub auto_pagination: Option<bool>,
530 /// The search engine to use.
531 pub engine: Option<Engine>
532}
533
534/// Structure representing request parameters for transforming files.
535#[derive(Debug, Default, Deserialize, Serialize, Clone)]
536pub struct TransformParams {
537 #[serde(default)]
538 /// The format in which the result should be returned.
539 pub return_format: Option<ReturnFormat>,
540 #[serde(default)]
541 /// Specifies whether readability optimizations should be applied.
542 pub readability: Option<bool>,
543 #[serde(default)]
544 /// Clean the markdown or text for AI.
545 pub clean: Option<bool>,
546 #[serde(default)]
547 /// Clean the markdown or text for AI removing footers, navigation, and more.
548 pub clean_full: Option<bool>,
549 /// The data being transformed.
550 pub data: Vec<Resource>,
551}
552
553#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Default)]
554/// Transformation resource to use.
555pub struct Resource {
556 #[serde(default)]
557 /// the html to transform
558 pub html: Option<bytes::Bytes>,
559 #[serde(default)]
560 /// the content to transform
561 pub content: Option<bytes::Bytes>,
562 #[serde(default)]
563 /// the url of the html incase of readability to improve transformations.
564 pub url: Option<String>,
565 #[serde(default)]
566 /// the language of the resource.
567 pub lang: Option<String>,
568}
569
570/// the request type to perform
571#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
572#[serde(rename_all = "lowercase")]
573pub enum RequestType {
574 /// Default HTTP request
575 Http,
576 /// Chrome browser rendering
577 Chrome,
578 #[default]
579 /// Smart mode defaulting to HTTP and using Chrome when needed.
580 SmartMode,
581}
582
583/// Enum representing different return formats.
584#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
585#[serde(rename_all = "lowercase")]
586pub enum ReturnFormat {
587 #[default]
588 /// The default return format of the resource.
589 Raw,
590 /// Return the response as Markdown.
591 Markdown,
592 /// Return the response as Commonmark.
593 Commonmark,
594 /// Return the response as Html2text.
595 Html2text,
596 /// Return the response as Text.
597 Text,
598 /// Returns a screenshot as Base64Url
599 Screenshot,
600 /// Return the response as XML.
601 Xml,
602 /// Return the response as Bytes.
603 Bytes,
604}