spider_client/shapes/request.rs
1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
4/// Structure representing the Chunking algorithm dictionary.
5#[derive(Debug, Deserialize, Serialize, Clone)]
6pub struct ChunkingAlgDict {
7 /// The chunking algorithm to use, defined as a specific type.
8 pub r#type: ChunkingType,
9 /// The amount to chunk by.
10 pub value: i32,
11}
12
13// The nested structures
14#[derive(Serialize, Deserialize, Debug, Clone, Default)]
15pub struct Timeout {
16 /// The seconds up to 60.
17 pub secs: u64,
18 /// The nanoseconds.
19 pub nanos: u32,
20}
21
22#[derive(Serialize, Deserialize, Debug, Clone)]
23pub struct IdleNetwork {
24 /// The timeout to wait until.
25 pub timeout: Timeout,
26}
27
28
29/// Represents various web automation actions.
30#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
31pub enum WebAutomation {
32 /// Runs custom JavaScript code.
33 Evaluate(String),
34 /// Clicks on an element.
35 Click(String),
36 /// Clicks on all elements.
37 ClickAll(String),
38 /// Clicks on all elements.
39 ClickAllClickable(),
40 /// Clicks at the position x and y coordinates.
41 ClickPoint {
42 /// The horizontal (X) coordinate.
43 x: f64,
44 /// The vertical (Y) coordinate.
45 y: f64,
46 },
47 Type {
48 /// The value to type.
49 value: String,
50 /// The click modifier.
51 modifier: Option<i64>
52 },
53 /// Waits for a fixed duration in milliseconds.
54 Wait(u64),
55 /// Waits for the next navigation event.
56 WaitForNavigation,
57 /// Wait for dom updates to stop.
58 WaitForDom {
59 /// The selector of the element to wait for updates.
60 selector: Option<String>,
61 /// The timeout to wait for in ms.
62 timeout: u32,
63 },
64 /// Waits for an element to appear.
65 WaitFor(String),
66 /// Waits for an element to appear with a timeout.
67 WaitForWithTimeout {
68 /// The selector of the element to wait for updates.
69 selector: String,
70 /// The timeout to wait for in ms.
71 timeout: u64,
72 },
73 /// Waits for an element to appear and then clicks on it.
74 WaitForAndClick(String),
75 /// Scrolls the screen in the horizontal axis by a specified amount in pixels.
76 ScrollX(i32),
77 /// Scrolls the screen in the vertical axis by a specified amount in pixels.
78 ScrollY(i32),
79 /// Fills an input element with a specified value.
80 Fill {
81 /// The selector of the input element to fill.
82 selector: String,
83 /// The value to fill the input element with.
84 value: String,
85 },
86 /// Scrolls the page until the end.
87 InfiniteScroll(u32),
88 /// Perform a screenshot on the page - fullscreen and omit background for params.
89 Screenshot {
90 /// Take a full page screenshot.
91 full_page: bool,
92 /// Omit the background.
93 omit_background: bool,
94 /// The output file to store the screenshot.
95 output: String,
96 },
97 /// Only continue to the next automation if the prior step was valid. Use this intermediate after a step to break out of the chain.
98 ValidateChain,
99}
100
101#[derive(Default, Serialize, Deserialize, Debug, Clone)]
102#[serde(tag = "type", rename_all = "PascalCase")]
103pub enum RedirectPolicy {
104 Loose,
105 #[default]
106 Strict,
107}
108
109pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
/// Script sources keyed by a string (a url or url path, per the `execution_scripts` request field).
pub type ExecutionScriptsMap = HashMap<String, String>;
111
112#[derive(Serialize, Deserialize, Debug, Clone)]
113pub struct Selector {
114 /// The timeout to wait until.
115 pub timeout: Timeout,
116 /// The selector to wait for.
117 pub selector: String,
118}
119
120#[derive(Serialize, Deserialize, Debug, Clone, Default)]
121pub struct Delay {
122 /// The timeout to wait until.
123 pub timeout: Timeout,
124}
125
/// Serde default helper: yields `Some(true)`.
fn default_some_true() -> Option<bool> {
    Option::from(true)
}
130
131#[derive(Serialize, Deserialize, Debug, Clone, Default)]
132pub struct WaitFor {
133 /// Wait until idle networks with a timeout of idleness.
134 pub idle_network: Option<IdleNetwork>,
135 /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
136 pub selector: Option<Selector>,
137 /// Wait for the dom to update
138 pub dom: Option<Selector>,
139 /// Wait until a hard delay.
140 pub delay: Option<Delay>,
141 /// Wait until page navigation happen. Default is true.
142 #[serde(default = "default_some_true")]
143 pub page_navigations: Option<bool>,
144}
145
146/// Query request to get a document.
147#[derive(Serialize, Deserialize, Debug, Clone, Default)]
148pub struct QueryRequest {
149 /// The exact website url.
150 pub url: Option<String>,
151 /// The website domain.
152 pub domain: Option<String>,
153 /// The path of the resource.
154 pub pathname: Option<String>,
155}
156
157/// Enum representing different types of Chunking.
158#[derive(Default, Debug, Deserialize, Serialize, Clone)]
159#[serde(rename_all = "lowercase")]
160pub enum ChunkingType {
161 #[default]
162 /// By the word count.
163 ByWords,
164 /// By the line count.
165 ByLines,
166 /// By the char length.
167 ByCharacterLength,
168 /// By sentence.
169 BySentence,
170}
171
172#[derive(Default, Debug, Deserialize, Serialize, Clone)]
173/// View port handling for chrome.
174pub struct Viewport {
175 /// Device screen Width
176 pub width: u32,
177 /// Device screen size
178 pub height: u32,
179 /// Device scale factor
180 pub device_scale_factor: Option<f64>,
181 /// Emulating Mobile?
182 pub emulating_mobile: bool,
183 /// Use landscape mode instead of portrait.
184 pub is_landscape: bool,
185 /// Touch screen device?
186 pub has_touch: bool,
187}
188
189// Define the CSSSelector struct
190#[derive(Debug, Clone, Default, Deserialize, Serialize)]
191pub struct CSSSelector {
192 /// The name of the selector group
193 pub name: String,
194 /// A vector of CSS selectors
195 pub selectors: Vec<String>,
196}
197
198// Define the CSSExtractionMap type
199pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
200
201/// Represents the settings for a webhook configuration
202#[derive(Debug, Default, Deserialize, Serialize, Clone)]
203pub struct WebhookSettings {
204 /// The destination where the webhook information will be sent
205 destination: String,
206 /// Trigger an action when all credits are depleted
207 on_credits_depleted: bool,
208 /// Trigger an action when half of the credits are depleted
209 on_credits_half_depleted: bool,
210 /// Trigger an action on a website status update event
211 on_website_status: bool,
212 /// Send information about a new page find (such as links and bytes)
213 on_find: bool,
214 /// Handle the metadata of a found page
215 on_find_metadata: bool,
216}
217
218/// Proxy pool selection for outbound request routing.
219/// Choose a pool based on your use case (e.g., stealth, speed, or stability).
220///
221/// - 'residential' → cost-effective entry-level residential pool
222/// - 'residential_fast' → faster residential pool for higher throughput
223/// - 'residential_static' → static residential IPs, rotated daily
224/// - 'residential_premium' → low-latency premium IPs
225/// - 'residential_core' → balanced plan (quality vs. cost)
226/// - 'residential_plus' → largest and highest quality core pool
227/// - 'mobile' → 4G/5G mobile proxies for maximum evasion
228/// - 'isp' → ISP-grade datacenters
229#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
230pub enum ProxyType {
231 /// Cost-effective entry-level residential pool.
232 #[serde(rename = "residential")]
233 Residential,
234 /// Higher-throughput residential pool for better performance.
235 #[serde(rename = "residential_fast")]
236 ResidentialFast,
237 /// Static residential IPs, rotated daily for session persistence.
238 #[serde(rename = "residential_static")]
239 ResidentialStatic,
240 /// 4G / 5G mobile proxies for maximum stealth and evasion.
241 #[serde(rename = "mobile")]
242 Mobile,
243 /// ISP-grade residential routing (alias: `datacenter`).
244 #[serde(rename = "isp", alias = "datacenter")]
245 #[default]
246 Isp,
247 /// Premium low-latency residential proxy pool.
248 #[serde(rename = "residential_premium")]
249 ResidentialPremium,
250 /// Core residential plan optimized for balance between cost and quality.
251 #[serde(rename = "residential_core")]
252 ResidentialCore,
253 /// Extended core residential pool with the largest, highest-quality IPs.
254 #[serde(rename = "residential_plus")]
255 ResidentialPlus,
256}
257
258/// List of proxies.
259pub const PROXY_TYPE_LIST: [ProxyType; 10] = [
260 ProxyType::ResidentialStatic,
261 ProxyType::Residential,
262 ProxyType::Isp,
263 ProxyType::Mobile,
264 ProxyType::ResidentialPremium,
265 ProxyType::ResidentialPlus,
266 ProxyType::ResidentialCore,
267 ProxyType::ResidentialFast,
268 ProxyType::ResidentialStatic,
269 ProxyType::Residential,
270];
271
272impl ProxyType {
273 /// Get the canonical string representation of the proxy type.
274 pub fn as_str(&self) -> &'static str {
275 match self {
276 ProxyType::Residential => "residential",
277 ProxyType::ResidentialFast => "residential_fast",
278 ProxyType::ResidentialStatic => "residential_static",
279 ProxyType::Mobile => "mobile",
280 ProxyType::Isp => "isp",
281 ProxyType::ResidentialPremium => "residential_premium",
282 ProxyType::ResidentialCore => "residential_core",
283 ProxyType::ResidentialPlus => "residential_plus",
284 }
285 }
286}
287
288/// Send multiple return formats.
289#[derive(Debug, Deserialize, Serialize, Clone)]
290#[serde(untagged)]
291pub enum ReturnFormatHandling {
292 /// A single return item.
293 Single(ReturnFormat),
294 /// Multiple return formats.
295 Multi(std::collections::HashSet<ReturnFormat>),
296}
297
298impl Default for ReturnFormatHandling {
299 fn default() -> ReturnFormatHandling {
300 ReturnFormatHandling::Single(ReturnFormat::Raw)
301 }
302}
303
304#[derive(Debug, Default, Deserialize, Serialize, Clone)]
305pub struct EventTracker {
306 /// The responses received.
307 pub responses: Option<bool>,
308 /// The request sent.
309 pub requests: Option<bool>,
310 /// Track the automation events with data changes and screenshots.
311 pub automation: Option<bool>,
312}
313
314/// Structure representing request parameters.
315#[derive(Debug, Default, Deserialize, Serialize, Clone)]
316pub struct RequestParams {
317 #[serde(default)]
318 /// The URL to be crawled.
319 pub url: Option<String>,
320 #[serde(default)]
321 /// The type of request to be made.
322 pub request: Option<RequestType>,
323 #[serde(default)]
324 /// The maximum number of pages the crawler should visit.
325 pub limit: Option<u32>,
326 #[serde(default)]
327 /// The format in which the result should be returned.
328 pub return_format: Option<ReturnFormatHandling>,
329 /// The country code for request
330 pub country_code: Option<String>,
331 #[serde(default)]
332 /// Specifies whether to only visit the top-level domain.
333 pub tld: Option<bool>,
334 #[serde(default)]
335 /// The depth of the crawl.
336 pub depth: Option<u32>,
337 #[serde(default)]
338 /// Specifies whether the request should be cached.
339 pub cache: Option<bool>,
340 #[serde(default)]
341 /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
342 pub scroll: Option<u32>,
343 #[serde(default)]
344 /// The budget for various resources.
345 pub budget: Option<HashMap<String, u32>>,
346 #[serde(default)]
347 /// The blacklist routes to ignore. This can be a Regex string pattern.
348 pub blacklist: Option<Vec<String>>,
349 #[serde(default)]
350 /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
351 pub whitelist: Option<Vec<String>>,
352 #[serde(default)]
353 /// The locale to be used during the crawl.
354 pub locale: Option<String>,
355 #[serde(default)]
356 /// The cookies to be set for the request, formatted as a single string.
357 pub cookies: Option<String>,
358 #[serde(default)]
359 /// Specifies whether to use stealth techniques to avoid detection.
360 pub stealth: Option<bool>,
361 #[serde(default)]
362 /// The headers to be used for the request.
363 pub headers: Option<HashMap<String, String>>,
364 #[serde(default)]
365 /// Specifies whether anti-bot measures should be used.
366 pub anti_bot: Option<bool>,
367 #[serde(default)]
368 /// Specifies whether to send data via webhooks.
369 pub webhooks: Option<WebhookSettings>,
370 #[serde(default)]
371 /// Specifies whether to include metadata in the response.
372 pub metadata: Option<bool>,
373 #[serde(default)]
374 /// The dimensions of the viewport.
375 pub viewport: Option<Viewport>,
376 #[serde(default)]
377 /// The encoding to be used for the request.
378 pub encoding: Option<String>,
379 #[serde(default)]
380 /// Specifies whether to include subdomains in the crawl.
381 pub subdomains: Option<bool>,
382 #[serde(default)]
383 /// The user agent string to be used for the request.
384 pub user_agent: Option<String>,
385 #[serde(default)]
386 /// Specifies whether to use fingerprinting protection.
387 pub fingerprint: Option<bool>,
388 #[serde(default)]
389 /// Specifies whether to perform the request without using storage.
390 pub storageless: Option<bool>,
391 #[serde(default)]
392 /// Specifies whether readability optimizations should be applied.
393 pub readability: Option<bool>,
394 #[serde(default)]
395 /// Specifies whether to use a proxy for the request. [Deprecated]: use the 'proxy' param instead.
396 pub proxy_enabled: Option<bool>,
397 #[serde(default)]
398 /// Specifies whether to respect the site's robots.txt file.
399 pub respect_robots: Option<bool>,
400 #[serde(default)]
401 /// CSS selector to be used to filter the content.
402 pub root_selector: Option<String>,
403 #[serde(default)]
404 /// Specifies whether to load all resources of the crawl target.
405 pub full_resources: Option<bool>,
406 #[serde(default)]
407 /// The text string to extract data from.
408 pub text: Option<String>,
409 #[serde(default)]
410 /// Specifies whether to use the sitemap links.
411 pub sitemap: Option<bool>,
412 #[serde(default)]
413 /// External domains to include the crawl.
414 pub external_domains: Option<Vec<String>>,
415 #[serde(default)]
416 /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
417 pub return_embeddings: Option<bool>,
418 #[serde(default)]
419 /// Returns the HTTP response headers.
420 pub return_headers: Option<bool>,
421 #[serde(default)]
422 /// Returns the link(s) found on the page that match the crawler query.
423 pub return_page_links: Option<bool>,
424 #[serde(default)]
425 /// Returns the HTTP response cookies.
426 pub return_cookies: Option<bool>,
427 #[serde(default)]
428 /// The timeout for the request, in seconds.
429 pub request_timeout: Option<u8>,
430 #[serde(default)]
431 /// Specifies whether to run the request in the background.
432 pub run_in_background: Option<bool>,
433 #[serde(default)]
434 /// Specifies whether to skip configuration checks.
435 pub skip_config_checks: Option<bool>,
436 #[serde(default)]
437 /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
438 pub css_extraction_map: Option<CSSExtractionMap>,
439 #[serde(default)]
440 /// The chunking algorithm to use.
441 pub chunking_alg: Option<ChunkingAlgDict>,
442 #[serde(default)]
443 /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
444 pub disable_intercept: Option<bool>,
445 #[serde(default)]
446 /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
447 pub wait_for: Option<WaitFor>,
448 #[serde(default)]
449 /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
450 pub execution_scripts: Option<ExecutionScriptsMap>,
451 #[serde(default)]
452 /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
453 pub automation_scripts: Option<WebAutomationMap>,
454 #[serde(default)]
455 /// The redirect policy for HTTP request. Set the value to Loose to allow all.
456 pub redirect_policy: Option<RedirectPolicy>,
457 #[serde(default)]
458 /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
459 pub event_tracker: Option<EventTracker>,
460 #[serde(default)]
461 /// The timeout to stop the crawl.
462 pub crawl_timeout: Option<Timeout>,
463 #[serde(default)]
464 /// Evaluates given script in every frame upon creation (before loading frame's scripts).
465 pub evaluate_on_new_document: Option<Box<String>>,
466 #[serde(default)]
467 /// Runs the request using lite_mode:Lite mode reduces data transfer costs by 50%, with trade-offs in speed, accuracy,
468 /// geo-targeting, and reliability. It’s best suited for non-urgent data collection or when
469 /// targeting websites with minimal anti-bot protections.
470 pub lite_mode: Option<bool>,
471 #[serde(default)]
472 /// The proxy to use for request.
473 pub proxy: Option<ProxyType>,
474 #[serde(default)]
475 /// Use a remote proxy at ~50% reduced cost for file downloads.
476 /// This requires a user-supplied static IP proxy endpoint.
477 pub remote_proxy: Option<String>,
478 #[serde(default)]
479 /// Set the maximum number of credits to use per page.
480 /// Credits are measured in decimal units, where 10,000 credits equal one dollar (100 credits per penny).
481 /// Credit limiting only applies to request that are Javascript rendered using smart_mode or chrome for the 'request' type.
482 pub max_credits_per_page: Option<f64>,
483}
484
485
486#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
487#[serde(rename_all = "lowercase")]
488pub enum TBS {
489 #[serde(rename = "qdr:h")]
490 PastHour,
491 #[serde(rename = "qdr:d")]
492 Past24Hours,
493 #[serde(rename = "qdr:w")]
494 PastWeek,
495 #[serde(rename = "qdr:m")]
496 PastMonth,
497 #[serde(rename = "qdr:y")]
498 PastYear,
499}
500
501/// The engine to use.
502#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
503pub enum Engine {
504 /// Google
505 Google,
506 /// Brave
507 Brave,
508 /// All
509 #[default]
510 All
511}
512
513/// The structure representing request parameters for a search request.
514#[derive(Debug, Default, Deserialize, Serialize, Clone)]
515pub struct SearchRequestParams {
516 /// The base request parameters.
517 #[serde(default, flatten)]
518 pub base: RequestParams,
519 // The search request.
520 pub search: String,
521 /// The search limit.
522 pub search_limit: Option<u32>,
523 // Fetch the page content. Defaults to true.
524 pub fetch_page_content: Option<bool>,
525 /// The search location of the request
526 pub location: Option<String>,
527 /// The country code of the request
528 pub country: Option<crate::shapes::country_codes::CountryCode>,
529 /// The language code of the request.
530 pub language: Option<String>,
531 /// The number of search results
532 pub num: Option<u32>,
533 /// The time period range.
534 pub tbs: Option<TBS>,
535 /// The page of the search results.
536 pub page: Option<u32>,
537 /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
538 pub website_limit: Option<u32>,
539 /// Prioritize speed over output quantity.
540 pub quick_search: Option<bool>,
541 /// Auto paginate pages ( up to 100 pages ).
542 pub auto_pagination: Option<bool>,
543 /// The search engine to use.
544 pub engine: Option<Engine>
545}
546
547/// Structure representing request parameters for transforming files.
548#[derive(Debug, Default, Deserialize, Serialize, Clone)]
549pub struct TransformParams {
550 #[serde(default)]
551 /// The format in which the result should be returned.
552 pub return_format: Option<ReturnFormat>,
553 #[serde(default)]
554 /// Specifies whether readability optimizations should be applied.
555 pub readability: Option<bool>,
556 #[serde(default)]
557 /// Clean the markdown or text for AI.
558 pub clean: Option<bool>,
559 #[serde(default)]
560 /// Clean the markdown or text for AI removing footers, navigation, and more.
561 pub clean_full: Option<bool>,
562 /// The data being transformed.
563 pub data: Vec<Resource>,
564}
565
566#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Default)]
567/// Transformation resource to use.
568pub struct Resource {
569 #[serde(default)]
570 /// the html to transform
571 pub html: Option<bytes::Bytes>,
572 #[serde(default)]
573 /// the content to transform
574 pub content: Option<bytes::Bytes>,
575 #[serde(default)]
576 /// the url of the html incase of readability to improve transformations.
577 pub url: Option<String>,
578 #[serde(default)]
579 /// the language of the resource.
580 pub lang: Option<String>,
581}
582
583/// the request type to perform
584#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
585#[serde(rename_all = "lowercase")]
586pub enum RequestType {
587 /// Default HTTP request
588 Http,
589 /// Chrome browser rendering
590 Chrome,
591 #[default]
592 /// Smart mode defaulting to HTTP and using Chrome when needed.
593 SmartMode,
594}
595
596/// Enum representing different return formats.
597#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
598#[serde(rename_all = "lowercase")]
599pub enum ReturnFormat {
600 #[default]
601 /// The default return format of the resource.
602 Raw,
603 /// Return the response as Markdown.
604 Markdown,
605 /// Return the response as Commonmark.
606 Commonmark,
607 /// Return the response as Html2text.
608 Html2text,
609 /// Return the response as Text.
610 Text,
611 /// Returns a screenshot as Base64Url
612 Screenshot,
613 /// Return the response as XML.
614 Xml,
615 /// Return the response as Bytes.
616 Bytes,
617}