spider_client/shapes/request.rs
1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
/// Structure representing the Chunking algorithm dictionary.
///
/// Pairs a chunking algorithm with the amount to chunk by. The algorithm lives in a
/// field named `type`; because `type` is a Rust keyword it is spelled with the raw
/// identifier `r#type`.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct ChunkingAlgDict {
    /// The chunking algorithm to use, defined as a specific type.
    pub r#type: ChunkingType,
    /// The amount to chunk by.
    ///
    /// NOTE(review): presumably counted in the unit named by the selected
    /// [`ChunkingType`] (words, lines, ...) — confirm against the API.
    pub value: i32,
}
12
// The nested structures

/// A timeout expressed as whole seconds plus a nanosecond component.
///
/// NOTE(review): the `secs`/`nanos` pair looks like it mirrors `std::time::Duration` —
/// confirm before treating the two as interchangeable.
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
pub struct Timeout {
    /// The seconds up to 60.
    pub secs: u64,
    /// The nanoseconds.
    pub nanos: u32,
}
21
/// A network-idle wait condition bounded by a timeout.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct IdleNetwork {
    /// The timeout to wait until.
    pub timeout: Timeout,
}
27
/// Represents various web automation actions.
///
/// No serde container attributes are set on this enum, so the variants use serde's
/// default enum representation.
///
/// NOTE(review): the `String` payloads of the click/wait variants are presumably CSS
/// selectors (the struct-style variants document theirs as such) — confirm.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum WebAutomation {
    /// Runs custom JavaScript code.
    Evaluate(String),
    /// Clicks on an element.
    Click(String),
    /// Clicks on all elements.
    ClickAll(String),
    /// Clicks on all clickable elements. Unlike [`WebAutomation::ClickAll`], this
    /// variant carries no payload (it is an empty tuple variant, not a unit variant).
    ClickAllClickable(),
    /// Clicks at the position x and y coordinates.
    ClickPoint {
        /// The horizontal (X) coordinate.
        x: f64,
        /// The vertical (Y) coordinate.
        y: f64,
    },
    /// Click and hold on an element found by selector for a duration in ms.
    ClickHold {
        /// The CSS selector to target.
        selector: String,
        /// Duration to hold in milliseconds.
        hold_for_ms: u64,
    },
    /// Click and hold at a specific point for a duration in ms.
    ClickHoldPoint {
        /// The horizontal (X) coordinate.
        x: f64,
        /// The vertical (Y) coordinate.
        y: f64,
        /// Duration to hold in milliseconds.
        hold_for_ms: u64,
    },
    /// Click-and-drag from one element to another (selector-based).
    ClickDrag {
        /// CSS selector for the start element.
        from: String,
        /// CSS selector for the destination element.
        to: String,
        /// Optional key modifier (`Option<i64>`).
        modifier: Option<i64>,
    },
    /// Click-and-drag from one point to another.
    ClickDragPoint {
        /// Start X coordinate.
        from_x: f64,
        /// Start Y coordinate.
        from_y: f64,
        /// End X coordinate.
        to_x: f64,
        /// End Y coordinate.
        to_y: f64,
        /// Optional key modifier (`Option<i64>`).
        modifier: Option<i64>,
    },
    /// Types a value, optionally combined with a modifier.
    Type {
        /// The value to type.
        value: String,
        /// The click modifier.
        modifier: Option<i64>,
    },
    /// Waits for a fixed duration in milliseconds.
    Wait(u64),
    /// Waits for the next navigation event.
    WaitForNavigation,
    /// Wait for dom updates to stop.
    WaitForDom {
        /// The selector of the element to wait for updates.
        selector: Option<String>,
        /// The timeout to wait for in ms.
        timeout: u32,
    },
    /// Waits for an element to appear.
    WaitFor(String),
    /// Waits for an element to appear with a timeout.
    WaitForWithTimeout {
        /// The selector of the element to wait for updates.
        selector: String,
        /// The timeout to wait for in ms.
        timeout: u64,
    },
    /// Waits for an element to appear and then clicks on it.
    WaitForAndClick(String),
    /// Scrolls the screen in the horizontal axis by a specified amount in pixels.
    ScrollX(i32),
    /// Scrolls the screen in the vertical axis by a specified amount in pixels.
    ScrollY(i32),
    /// Fills an input element with a specified value.
    Fill {
        /// The selector of the input element to fill.
        selector: String,
        /// The value to fill the input element with.
        value: String,
    },
    /// Scrolls the page until the end.
    InfiniteScroll(u32),
    /// Perform a screenshot on the page - fullscreen and omit background for params.
    Screenshot {
        /// Take a full page screenshot.
        full_page: bool,
        /// Omit the background.
        omit_background: bool,
        /// The output file to store the screenshot.
        output: String,
    },
    /// Only continue to the next automation if the prior step was valid. Use this intermediate after a step to break out of the chain.
    ValidateChain,
}
137
/// Redirect handling policy for HTTP requests. [`RedirectPolicy::Strict`] is the default.
///
/// NOTE(review): `#[serde(tag = "type")]` selects serde's internally tagged
/// representation, so a value is presumably written as an object such as
/// `{"type": "Strict"}` rather than a bare string — confirm this is the shape the API
/// expects.
#[derive(Default, Serialize, Deserialize, Debug, Clone)]
#[serde(tag = "type", rename_all = "PascalCase")]
pub enum RedirectPolicy {
    /// The loose policy.
    Loose,
    /// The strict policy (default).
    #[default]
    Strict,
}
145
146pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
/// Custom JavaScript source keyed by a url or url path.
pub type ExecutionScriptsMap = HashMap<String, String>;
148
/// A selector-based wait condition: the selector to wait for plus a timeout.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Selector {
    /// The timeout to wait until.
    pub timeout: Timeout,
    /// The selector to wait for.
    pub selector: String,
}
156
/// A fixed delay, expressed as a [`Timeout`].
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
pub struct Delay {
    /// The timeout to wait until.
    pub timeout: Timeout,
}
162
/// Serde default helper: produces `Some(true)` for optional flags that default to on.
fn default_some_true() -> Option<bool> {
    Option::from(true)
}
167
/// Conditions to wait for on a page before continuing.
///
/// Note that the two default paths differ for `page_navigations`: the derived
/// `Default` implementation leaves it as `None`, while deserializing input that omits
/// the field fills it with `Some(true)` through `default_some_true`.
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
pub struct WaitFor {
    /// Wait until idle networks with a timeout of idleness.
    pub idle_network: Option<IdleNetwork>,
    /// Wait until network to be idle with a max timeout.
    pub idle_network0: Option<IdleNetwork>,
    /// Wait until network to almost be idle with a max timeout.
    pub almost_idle_network0: Option<IdleNetwork>,
    /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
    pub selector: Option<Selector>,
    /// Wait for the dom to update
    pub dom: Option<Selector>,
    /// Wait until a hard delay.
    pub delay: Option<Delay>,
    /// Wait until page navigation happen. Default is true.
    #[serde(default = "default_some_true")]
    pub page_navigations: Option<bool>,
}
186
/// Query request to get a document.
///
/// Every field is optional; the derived `Default` leaves all of them as `None`.
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
pub struct QueryRequest {
    /// The exact website url.
    pub url: Option<String>,
    /// The website domain.
    pub domain: Option<String>,
    /// The path of the resource.
    pub pathname: Option<String>,
}
197
/// Enum representing different types of Chunking.
///
/// `rename_all = "lowercase"` lowercases each variant name for serde, e.g. `ByWords`
/// becomes `bywords`. [`ChunkingType::ByWords`] is the default.
#[derive(Default, Debug, Deserialize, Serialize, Clone)]
#[serde(rename_all = "lowercase")]
pub enum ChunkingType {
    /// By the word count (default).
    #[default]
    ByWords,
    /// By the line count.
    ByLines,
    /// By the char length.
    ByCharacterLength,
    /// By sentence.
    BySentence,
}
212
/// View port handling for chrome.
#[derive(Default, Debug, Deserialize, Serialize, Clone)]
pub struct Viewport {
    /// Device screen width.
    pub width: u32,
    /// Device screen height.
    pub height: u32,
    /// Device scale factor.
    pub device_scale_factor: Option<f64>,
    /// Emulating Mobile?
    pub emulating_mobile: bool,
    /// Use landscape mode instead of portrait.
    pub is_landscape: bool,
    /// Touch screen device?
    pub has_touch: bool,
}
229
/// A named group of CSS selectors.
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
pub struct CSSSelector {
    /// The name of the selector group
    pub name: String,
    /// A vector of CSS selectors
    pub selectors: Vec<String>,
}
238
/// CSS extraction map: string keys mapped to the selector groups to apply.
///
/// NOTE(review): the keys are presumably paths or pages, as described on
/// `RequestParams::css_extraction_map` — confirm.
pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
241
242/// Represents the settings for a webhook configuration
243#[derive(Debug, Default, Deserialize, Serialize, Clone)]
244pub struct WebhookSettings {
245 /// The destination where the webhook information will be sent
246 destination: String,
247 /// Trigger an action when all credits are depleted
248 on_credits_depleted: bool,
249 /// Trigger an action when half of the credits are depleted
250 on_credits_half_depleted: bool,
251 /// Trigger an action on a website status update event
252 on_website_status: bool,
253 /// Send information about a new page find (such as links and bytes)
254 on_find: bool,
255 /// Handle the metadata of a found page
256 on_find_metadata: bool,
257}
258
/// Proxy pool selection for outbound request routing.
/// Choose a pool based on your use case (e.g., stealth, speed, or stability).
///
/// This enum has three variants:
///
/// - 'residential' → cost-effective entry-level residential pool
/// - 'mobile' → 4G/5G mobile proxies for maximum evasion
/// - 'isp' → ISP-grade datacenters (default; `datacenter` is accepted as an alias when
///   deserializing)
///
/// The notes for this type also describe the following pools. None of them has a
/// variant here, so they cannot be selected through this enum:
///
/// - 'residential_fast' → faster residential pool for higher throughput
/// - 'residential_static' → static residential IPs, rotated daily
/// - 'residential_premium' → low-latency premium IPs
/// - 'residential_core' → balanced plan (quality vs. cost)
/// - 'residential_plus' → largest and highest quality core pool
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
pub enum ProxyType {
    /// Cost-effective entry-level residential pool.
    #[serde(rename = "residential")]
    Residential,
    /// 4G / 5G mobile proxies for maximum stealth and evasion.
    #[serde(rename = "mobile")]
    Mobile,
    /// ISP-grade residential routing (alias: `datacenter`).
    #[serde(rename = "isp", alias = "datacenter")]
    #[default]
    Isp,
}
283
/// List of proxies: every [`ProxyType`] variant, in the order residential, ISP, mobile.
pub const PROXY_TYPE_LIST: [ProxyType; 3] =
    [ProxyType::Residential, ProxyType::Isp, ProxyType::Mobile];
287
288impl ProxyType {
289 /// Get the canonical string representation of the proxy type.
290 pub fn as_str(&self) -> &'static str {
291 match self {
292 ProxyType::Residential => "residential",
293 ProxyType::Mobile => "mobile",
294 ProxyType::Isp => "isp",
295 }
296 }
297}
298
/// Send multiple return formats.
///
/// The enum is `#[serde(untagged)]`, so no variant name appears in the serialized
/// form: the payload of the active variant is written directly.
#[derive(Debug, Deserialize, Serialize, Clone)]
#[serde(untagged)]
pub enum ReturnFormatHandling {
    /// A single return item.
    Single(ReturnFormat),
    /// Multiple return formats.
    Multi(std::collections::HashSet<ReturnFormat>),
}
308
309impl Default for ReturnFormatHandling {
310 fn default() -> ReturnFormatHandling {
311 ReturnFormatHandling::Single(ReturnFormat::Raw)
312 }
313}
314
/// Optional switches selecting which events to track: responses, requests and
/// automation.
#[derive(Debug, Default, Deserialize, Serialize, Clone)]
pub struct EventTracker {
    /// The responses received.
    pub responses: Option<bool>,
    /// The request sent.
    pub requests: Option<bool>,
    /// Track the automation events with data changes and screenshots.
    pub automation: Option<bool>,
}
324
/// A link rewrite rule, either a plain string replacement or a regex replacement.
///
/// The enum is internally tagged through a `type` field whose value is `replace` or
/// `regex`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum LinkRewriteRule {
    /// A string replacer.
    #[serde(rename = "replace")]
    Replace {
        /// Only apply when the link's host matches this value.
        #[serde(default)]
        host: Option<String>,
        /// The text to look for in the link.
        find: String,
        /// The text to substitute for each match.
        replace_with: String,
    },

    /// A regex replacer.
    #[serde(rename = "regex")]
    Regex {
        /// Only apply when the link's host matches this value.
        #[serde(default)]
        host: Option<String>,
        /// The regex pattern to look for in the link.
        pattern: String,
        /// The text to substitute for each match.
        replace_with: String,
    },
}
348
/// Structure representing request parameters.
///
/// Every field is an `Option`; the derived `Default` leaves all of them as `None`, so a
/// value is typically built with struct-update syntax
/// (`RequestParams { limit: Some(1), ..Default::default() }`).
#[derive(Debug, Default, Deserialize, Serialize, Clone)]
pub struct RequestParams {
    /// The URL to be crawled.
    #[serde(default)]
    pub url: Option<String>,
    /// The type of request to be made.
    #[serde(default)]
    pub request: Option<RequestType>,
    /// The maximum number of pages the crawler should visit.
    #[serde(default)]
    pub limit: Option<u32>,
    /// The format in which the result should be returned.
    #[serde(default)]
    pub return_format: Option<ReturnFormatHandling>,
    /// The country code for the request.
    pub country_code: Option<String>,
    /// Specifies whether to only visit the top-level domain.
    #[serde(default)]
    pub tld: Option<bool>,
    /// The depth of the crawl.
    #[serde(default)]
    pub depth: Option<u32>,
    /// Specifies whether the request should be cached.
    #[serde(default)]
    pub cache: Option<bool>,
    /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
    #[serde(default)]
    pub scroll: Option<u32>,
    /// The budget for various resources.
    #[serde(default)]
    pub budget: Option<HashMap<String, u32>>,
    /// The blacklist routes to ignore. This can be a Regex string pattern.
    #[serde(default)]
    pub blacklist: Option<Vec<String>>,
    /// URL rewrite rule applied to every discovered link.
    #[serde(default)]
    pub link_rewrite: Option<LinkRewriteRule>,
    /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
    #[serde(default)]
    pub whitelist: Option<Vec<String>>,
    /// The locale to be used during the crawl.
    #[serde(default)]
    pub locale: Option<String>,
    /// The cookies to be set for the request, formatted as a single string.
    #[serde(default)]
    pub cookies: Option<String>,
    /// Specifies whether to use stealth techniques to avoid detection.
    #[serde(default)]
    pub stealth: Option<bool>,
    /// The headers to be used for the request.
    #[serde(default)]
    pub headers: Option<HashMap<String, String>>,
    /// The webhook settings used to send data via webhooks.
    #[serde(default)]
    pub webhooks: Option<WebhookSettings>,
    /// Specifies whether to include metadata in the response.
    #[serde(default)]
    pub metadata: Option<bool>,
    /// The dimensions of the viewport.
    #[serde(default)]
    pub viewport: Option<Viewport>,
    /// The encoding to be used for the request.
    #[serde(default)]
    pub encoding: Option<String>,
    /// Specifies whether to include subdomains in the crawl.
    #[serde(default)]
    pub subdomains: Option<bool>,
    /// The user agent string to be used for the request.
    #[serde(default)]
    pub user_agent: Option<String>,
    /// Specifies whether to use fingerprinting protection.
    #[serde(default)]
    pub fingerprint: Option<bool>,
    /// Specifies whether to perform the request without using storage.
    #[serde(default)]
    pub storageless: Option<bool>,
    /// Specifies whether readability optimizations should be applied.
    #[serde(default)]
    pub readability: Option<bool>,
    /// Specifies whether to use a proxy for the request. [Deprecated]: use the 'proxy' param instead.
    #[serde(default)]
    pub proxy_enabled: Option<bool>,
    /// Specifies whether to respect the site's robots.txt file.
    #[serde(default)]
    pub respect_robots: Option<bool>,
    /// CSS selector to be used to filter the content.
    #[serde(default)]
    pub root_selector: Option<String>,
    /// Specifies whether to load all resources of the crawl target.
    #[serde(default)]
    pub full_resources: Option<bool>,
    /// The text string to extract data from.
    #[serde(default)]
    pub text: Option<String>,
    /// Specifies whether to use the sitemap links.
    #[serde(default)]
    pub sitemap: Option<bool>,
    /// External domains to include the crawl.
    #[serde(default)]
    pub external_domains: Option<Vec<String>>,
    /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
    #[serde(default)]
    pub return_embeddings: Option<bool>,
    /// Returns the HTTP response headers.
    #[serde(default)]
    pub return_headers: Option<bool>,
    /// Returns the link(s) found on the page that match the crawler query.
    #[serde(default)]
    pub return_page_links: Option<bool>,
    /// Returns the HTTP response cookies.
    #[serde(default)]
    pub return_cookies: Option<bool>,
    /// The timeout for the request, in seconds. Stored as a `u8`, so at most 255.
    #[serde(default)]
    pub request_timeout: Option<u8>,
    /// Specifies whether to run the request in the background.
    #[serde(default)]
    pub run_in_background: Option<bool>,
    /// Specifies whether to skip configuration checks.
    #[serde(default)]
    pub skip_config_checks: Option<bool>,
    /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
    #[serde(default)]
    pub css_extraction_map: Option<CSSExtractionMap>,
    /// The chunking algorithm to use.
    #[serde(default)]
    pub chunking_alg: Option<ChunkingAlgDict>,
    /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
    #[serde(default)]
    pub disable_intercept: Option<bool>,
    /// Disables service-provided hints that add request optimizations to improve crawl outcomes,
    /// such as network blacklists, request-type selection, geo handling, and more.
    #[serde(default)]
    pub disable_hints: Option<bool>,
    /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
    #[serde(default)]
    pub wait_for: Option<WaitFor>,
    /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
    #[serde(default)]
    pub execution_scripts: Option<ExecutionScriptsMap>,
    /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
    #[serde(default)]
    pub automation_scripts: Option<WebAutomationMap>,
    /// The redirect policy for HTTP request. Set the value to Loose to allow all.
    #[serde(default)]
    pub redirect_policy: Option<RedirectPolicy>,
    /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
    #[serde(default)]
    pub event_tracker: Option<EventTracker>,
    /// The timeout to stop the crawl.
    #[serde(default)]
    pub crawl_timeout: Option<Timeout>,
    /// Evaluates given script in every frame upon creation (before loading frame's scripts).
    #[serde(default)]
    pub evaluate_on_new_document: Option<Box<String>>,
    /// Runs the request using lite mode. Lite mode reduces data transfer costs by 50%, with trade-offs in speed, accuracy,
    /// geo-targeting, and reliability. It's best suited for non-urgent data collection or when
    /// targeting websites with minimal anti-bot protections.
    #[serde(default)]
    pub lite_mode: Option<bool>,
    /// The proxy to use for request.
    #[serde(default)]
    pub proxy: Option<ProxyType>,
    /// Use a remote proxy at ~50% reduced cost for file downloads.
    /// This requires a user-supplied static IP proxy endpoint.
    #[serde(default)]
    pub remote_proxy: Option<String>,
    /// Set the maximum number of credits to use per page.
    /// Credits are measured in decimal units, where 10,000 credits equal one dollar (100 credits per penny).
    /// Credit limiting only applies to request that are Javascript rendered using smart_mode or chrome for the 'request' type.
    #[serde(default)]
    pub max_credits_per_page: Option<f64>,
}
523
/// The time period range for search results.
///
/// Each variant carries an explicit `#[serde(rename = ...)]`, so it is serialized as
/// the `qdr:*` string shown on that variant rather than as a lowercased variant name.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum TBS {
    /// The past hour (`qdr:h`).
    #[serde(rename = "qdr:h")]
    PastHour,
    /// The past 24 hours (`qdr:d`).
    #[serde(rename = "qdr:d")]
    Past24Hours,
    /// The past week (`qdr:w`).
    #[serde(rename = "qdr:w")]
    PastWeek,
    /// The past month (`qdr:m`).
    #[serde(rename = "qdr:m")]
    PastMonth,
    /// The past year (`qdr:y`).
    #[serde(rename = "qdr:y")]
    PastYear,
}
538
/// The engine to use.
///
/// No serde rename attributes are applied, so the variant names are used as written
/// (`Google`, `Brave`, `All`). [`Engine::All`] is the default.
#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
pub enum Engine {
    /// Google
    Google,
    /// Brave
    Brave,
    /// All (default).
    #[default]
    All,
}
550
/// The structure representing request parameters for a search request.
#[derive(Debug, Default, Deserialize, Serialize, Clone)]
pub struct SearchRequestParams {
    /// The base request parameters. Marked `flatten`, so its fields are read and
    /// written at the same level as the search fields below instead of under a `base` key.
    #[serde(default, flatten)]
    pub base: RequestParams,
    /// The search request.
    pub search: String,
    /// The search limit.
    pub search_limit: Option<u32>,
    /// Fetch the page content. Defaults to true.
    pub fetch_page_content: Option<bool>,
    /// The search location of the request
    pub location: Option<String>,
    /// The country code of the request
    pub country: Option<crate::shapes::country_codes::CountryCode>,
    /// The language code of the request.
    pub language: Option<String>,
    /// The number of search results
    pub num: Option<u32>,
    /// The time period range.
    pub tbs: Option<TBS>,
    /// The page of the search results.
    pub page: Option<u32>,
    /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
    pub website_limit: Option<u32>,
    /// Prioritize speed over output quantity.
    pub quick_search: Option<bool>,
    /// Auto paginate pages ( up to 100 pages ).
    pub auto_pagination: Option<bool>,
    /// The search engine to use.
    pub engine: Option<Engine>,
}
584
/// Structure representing request parameters for transforming files.
#[derive(Debug, Default, Deserialize, Serialize, Clone)]
pub struct TransformParams {
    /// The format in which the result should be returned.
    #[serde(default)]
    pub return_format: Option<ReturnFormat>,
    /// Specifies whether readability optimizations should be applied.
    #[serde(default)]
    pub readability: Option<bool>,
    /// Clean the markdown or text for AI.
    #[serde(default)]
    pub clean: Option<bool>,
    /// Clean the markdown or text for AI removing footers, navigation, and more.
    #[serde(default)]
    pub clean_full: Option<bool>,
    /// The data being transformed. This is the only field here without
    /// `#[serde(default)]`.
    pub data: Vec<Resource>,
}
603
/// Transformation resource to use.
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Default)]
pub struct Resource {
    /// The html to transform.
    #[serde(default)]
    pub html: Option<bytes::Bytes>,
    /// The content to transform.
    #[serde(default)]
    pub content: Option<bytes::Bytes>,
    /// The url of the html, used in case of readability to improve transformations.
    #[serde(default)]
    pub url: Option<String>,
    /// The language of the resource.
    #[serde(default)]
    pub lang: Option<String>,
}
620
621/// the request type to perform
622#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
623#[serde(rename_all = "lowercase")]
624pub enum RequestType {
625 /// Default HTTP request
626 Http,
627 /// Chrome browser rendering
628 Chrome,
629 #[default]
630 /// Smart mode defaulting to HTTP and using Chrome when needed.
631 SmartMode,
632}
633
/// Enum representing different return formats.
///
/// `rename_all = "lowercase"` lowercases each variant name for serde, e.g. `Html2text`
/// becomes `html2text`. [`ReturnFormat::Raw`] is the default.
#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
#[serde(rename_all = "lowercase")]
pub enum ReturnFormat {
    /// The default return format of the resource.
    #[default]
    Raw,
    /// Return the response as Markdown.
    Markdown,
    /// Return the response as Commonmark.
    Commonmark,
    /// Return the response as Html2text.
    Html2text,
    /// Return the response as Text.
    Text,
    /// Returns a screenshot as Base64Url
    Screenshot,
    /// Return the response as XML.
    Xml,
    /// Return the response as Bytes.
    Bytes,
}
655}