spider_client/shapes/request.rs
1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
4/// Structure representing the Chunking algorithm dictionary.
5#[derive(Debug, Deserialize, Serialize, Clone)]
6pub struct ChunkingAlgDict {
7 /// The chunking algorithm to use, defined as a specific type.
8 r#type: ChunkingType,
9 /// The amount to chunk by.
10 value: i32,
11}
12
13// The nested structures
14#[derive(Serialize, Deserialize, Debug, Clone, Default)]
15pub struct Timeout {
16 /// The seconds up to 60.
17 pub secs: u64,
18 /// The nanoseconds.
19 pub nanos: u32,
20}
21
22#[derive(Serialize, Deserialize, Debug, Clone)]
23pub struct IdleNetwork {
24 /// The timeout to wait until.
25 pub timeout: Timeout,
26}
27
28#[derive(Serialize, Deserialize, Debug, Clone)]
29#[serde(tag = "type", rename_all = "PascalCase")]
30pub enum WebAutomation {
31 Evaluate { code: String },
32 Click { selector: String },
33 Wait { duration: u64 },
34 WaitForNavigation,
35 WaitFor { selector: String },
36 WaitForAndClick { selector: String },
37 ScrollX { pixels: i32 },
38 ScrollY { pixels: i32 },
39 Fill { selector: String, value: String },
40 InfiniteScroll { times: u32 },
41}
42
43#[derive(Default, Serialize, Deserialize, Debug, Clone)]
44#[serde(tag = "type", rename_all = "PascalCase")]
45pub enum RedirectPolicy {
46 Loose,
47 #[default]
48 Strict,
49}
50
51pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
/// Maps a string key to a script string.
pub type ExecutionScriptsMap = HashMap<String, String>;
53
54#[derive(Serialize, Deserialize, Debug, Clone)]
55pub struct Selector {
56 /// The timeout to wait until.
57 pub timeout: Timeout,
58 /// The selector to wait for.
59 pub selector: String,
60}
61
62#[derive(Serialize, Deserialize, Debug, Clone, Default)]
63pub struct Delay {
64 /// The timeout to wait until.
65 pub timeout: Timeout,
66}
67
/// Serde default provider that yields `Some(true)`.
fn default_some_true() -> Option<bool> {
    let enabled = true;
    Some(enabled)
}
72
73#[derive(Serialize, Deserialize, Debug, Clone, Default)]
74pub struct WaitFor {
75 /// Wait until idle networks with a timeout of idleness.
76 pub idle_network: Option<IdleNetwork>,
77 /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
78 pub selector: Option<Selector>,
79 /// Wait for the dom to update
80 pub dom: Option<Selector>,
81 /// Wait until a hard delay.
82 pub delay: Option<Delay>,
83 /// Wait until page navigation happen. Default is true.
84 #[serde(default = "default_some_true")]
85 pub page_navigations: Option<bool>,
86}
87
88/// Query request to get a document.
89#[derive(Serialize, Deserialize, Debug, Clone, Default)]
90pub struct QueryRequest {
91 /// The exact website url.
92 pub url: Option<String>,
93 /// The website domain.
94 pub domain: Option<String>,
95 /// The path of the resource.
96 pub pathname: Option<String>,
97}
98
99/// Enum representing different types of Chunking.
100#[derive(Default, Debug, Deserialize, Serialize, Clone)]
101#[serde(rename_all = "lowercase")]
102pub enum ChunkingType {
103 #[default]
104 /// By the word count.
105 ByWords,
106 /// By the line count.
107 ByLines,
108 /// By the char length.
109 ByCharacterLength,
110 /// By sentence.
111 BySentence,
112}
113
114#[derive(Default, Debug, Deserialize, Serialize, Clone)]
115/// View port handling for chrome.
116pub struct Viewport {
117 /// Device screen Width
118 pub width: u32,
119 /// Device screen size
120 pub height: u32,
121 /// Device scale factor
122 pub device_scale_factor: Option<f64>,
123 /// Emulating Mobile?
124 pub emulating_mobile: bool,
125 /// Use landscape mode instead of portrait.
126 pub is_landscape: bool,
127 /// Touch screen device?
128 pub has_touch: bool,
129}
130
131// Define the CSSSelector struct
132#[derive(Debug, Clone, Default, Deserialize, Serialize)]
133pub struct CSSSelector {
134 /// The name of the selector group
135 pub name: String,
136 /// A vector of CSS selectors
137 pub selectors: Vec<String>,
138}
139
140// Define the CSSExtractionMap type
141pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
142
143/// Represents the settings for a webhook configuration
144#[derive(Debug, Default, Deserialize, Serialize, Clone)]
145pub struct WebhookSettings {
146 /// The destination where the webhook information will be sent
147 destination: String,
148 /// Trigger an action when all credits are depleted
149 on_credits_depleted: bool,
150 /// Trigger an action when half of the credits are depleted
151 on_credits_half_depleted: bool,
152 /// Trigger an action on a website status update event
153 on_website_status: bool,
154 /// Send information about a new page find (such as links and bytes)
155 on_find: bool,
156 /// Handle the metadata of a found page
157 on_find_metadata: bool,
158}
159
160/// Proxy pool selection for outbound request routing.
161/// Choose a pool based on your use case (e.g., stealth, speed, or stability).
162///
163/// - 'residential' → cost-effective entry-level residential pool
164/// - 'residential_fast' → faster residential pool for higher throughput
165/// - 'residential_static' → static residential IPs, rotated daily
166/// - 'residential_premium' → low-latency premium IPs
167/// - 'residential_core' → balanced plan (quality vs. cost)
168/// - 'residential_plus' → largest and highest quality core pool
169/// - 'mobile' → 4G/5G mobile proxies for maximum evasion
170/// - 'isp' → ISP-grade datacenters
171#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
172pub enum ProxyType {
173 /// Cost-effective entry-level residential pool.
174 #[serde(rename = "residential")]
175 Residential,
176 /// Higher-throughput residential pool for better performance.
177 #[serde(rename = "residential_fast")]
178 ResidentialFast,
179 /// Static residential IPs, rotated daily for session persistence.
180 #[serde(rename = "residential_static")]
181 ResidentialStatic,
182 /// 4G / 5G mobile proxies for maximum stealth and evasion.
183 #[serde(rename = "mobile")]
184 Mobile,
185 /// ISP-grade residential routing (alias: `datacenter`).
186 #[serde(rename = "isp", alias = "datacenter")]
187 #[default]
188 Isp,
189 /// Premium low-latency residential proxy pool.
190 #[serde(rename = "residential_premium")]
191 ResidentialPremium,
192 /// Core residential plan optimized for balance between cost and quality.
193 #[serde(rename = "residential_core")]
194 ResidentialCore,
195 /// Extended core residential pool with the largest, highest-quality IPs.
196 #[serde(rename = "residential_plus")]
197 ResidentialPlus,
198}
199
200/// List of proxies.
201pub const PROXY_TYPE_LIST: [ProxyType; 10] = [
202 ProxyType::ResidentialStatic,
203 ProxyType::Residential,
204 ProxyType::Isp,
205 ProxyType::Mobile,
206 ProxyType::ResidentialPremium,
207 ProxyType::ResidentialPlus,
208 ProxyType::ResidentialCore,
209 ProxyType::ResidentialFast,
210 ProxyType::ResidentialStatic,
211 ProxyType::Residential,
212];
213
214impl ProxyType {
215 /// Get the canonical string representation of the proxy type.
216 pub fn as_str(&self) -> &'static str {
217 match self {
218 ProxyType::Residential => "residential",
219 ProxyType::ResidentialFast => "residential_fast",
220 ProxyType::ResidentialStatic => "residential_static",
221 ProxyType::Mobile => "mobile",
222 ProxyType::Isp => "isp",
223 ProxyType::ResidentialPremium => "residential_premium",
224 ProxyType::ResidentialCore => "residential_core",
225 ProxyType::ResidentialPlus => "residential_plus",
226 }
227 }
228}
229
230/// Send multiple return formats.
231#[derive(Debug, Deserialize, Serialize, Clone)]
232#[serde(untagged)]
233pub enum ReturnFormatHandling {
234 /// A single return item.
235 Single(ReturnFormat),
236 /// Multiple return formats.
237 Multi(std::collections::HashSet<ReturnFormat>),
238}
239
240impl Default for ReturnFormatHandling {
241 fn default() -> ReturnFormatHandling {
242 ReturnFormatHandling::Single(ReturnFormat::Raw)
243 }
244}
245
246#[derive(Debug, Default, Deserialize, Serialize, Clone)]
247pub struct EventTracker {
248 /// The responses received.
249 responses: Option<bool>,
250 ///The request sent.
251 requests: Option<bool>,
252}
253
254/// Structure representing request parameters.
255#[derive(Debug, Default, Deserialize, Serialize, Clone)]
256pub struct RequestParams {
257 #[serde(default)]
258 /// The URL to be crawled.
259 pub url: Option<String>,
260 #[serde(default)]
261 /// The type of request to be made.
262 pub request: Option<RequestType>,
263 #[serde(default)]
264 /// The maximum number of pages the crawler should visit.
265 pub limit: Option<u32>,
266 #[serde(default)]
267 /// The format in which the result should be returned.
268 pub return_format: Option<ReturnFormatHandling>,
269 /// The country code for request
270 pub country_code: Option<String>,
271 #[serde(default)]
272 /// Specifies whether to only visit the top-level domain.
273 pub tld: Option<bool>,
274 #[serde(default)]
275 /// The depth of the crawl.
276 pub depth: Option<u32>,
277 #[serde(default)]
278 /// Specifies whether the request should be cached.
279 pub cache: Option<bool>,
280 #[serde(default)]
281 /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
282 pub scroll: Option<u32>,
283 #[serde(default)]
284 /// The budget for various resources.
285 pub budget: Option<HashMap<String, u32>>,
286 #[serde(default)]
287 /// The blacklist routes to ignore. This can be a Regex string pattern.
288 pub blacklist: Option<Vec<String>>,
289 #[serde(default)]
290 /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
291 pub whitelist: Option<Vec<String>>,
292 #[serde(default)]
293 /// The locale to be used during the crawl.
294 pub locale: Option<String>,
295 #[serde(default)]
296 /// The cookies to be set for the request, formatted as a single string.
297 pub cookies: Option<String>,
298 #[serde(default)]
299 /// Specifies whether to use stealth techniques to avoid detection.
300 pub stealth: Option<bool>,
301 #[serde(default)]
302 /// The headers to be used for the request.
303 pub headers: Option<HashMap<String, String>>,
304 #[serde(default)]
305 /// Specifies whether anti-bot measures should be used.
306 pub anti_bot: Option<bool>,
307 #[serde(default)]
308 /// Specifies whether to send data via webhooks.
309 pub webhooks: Option<WebhookSettings>,
310 #[serde(default)]
311 /// Specifies whether to include metadata in the response.
312 pub metadata: Option<bool>,
313 #[serde(default)]
314 /// The dimensions of the viewport.
315 pub viewport: Option<Viewport>,
316 #[serde(default)]
317 /// The encoding to be used for the request.
318 pub encoding: Option<String>,
319 #[serde(default)]
320 /// Specifies whether to include subdomains in the crawl.
321 pub subdomains: Option<bool>,
322 #[serde(default)]
323 /// The user agent string to be used for the request.
324 pub user_agent: Option<String>,
325 #[serde(default)]
326 /// Specifies whether to use fingerprinting protection.
327 pub fingerprint: Option<bool>,
328 #[serde(default)]
329 /// Specifies whether to perform the request without using storage.
330 pub storageless: Option<bool>,
331 #[serde(default)]
332 /// Specifies whether readability optimizations should be applied.
333 pub readability: Option<bool>,
334 #[serde(default)]
335 /// Specifies whether to use a proxy for the request. [Deprecated]: use the 'proxy' param instead.
336 pub proxy_enabled: Option<bool>,
337 #[serde(default)]
338 /// Specifies whether to respect the site's robots.txt file.
339 pub respect_robots: Option<bool>,
340 #[serde(default)]
341 /// CSS selector to be used to filter the content.
342 pub root_selector: Option<String>,
343 #[serde(default)]
344 /// Specifies whether to load all resources of the crawl target.
345 pub full_resources: Option<bool>,
346 #[serde(default)]
347 /// The text string to extract data from.
348 pub text: Option<String>,
349 #[serde(default)]
350 /// Specifies whether to use the sitemap links.
351 pub sitemap: Option<bool>,
352 #[serde(default)]
353 /// External domains to include the crawl.
354 pub external_domains: Option<Vec<String>>,
355 #[serde(default)]
356 /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
357 pub return_embeddings: Option<bool>,
358 #[serde(default)]
359 /// Returns the HTTP response headers.
360 pub return_headers: Option<bool>,
361 #[serde(default)]
362 /// Returns the link(s) found on the page that match the crawler query.
363 pub return_page_links: Option<bool>,
364 #[serde(default)]
365 /// Returns the HTTP response cookies.
366 pub return_cookies: Option<bool>,
367 #[serde(default)]
368 /// The timeout for the request, in seconds.
369 pub request_timeout: Option<u8>,
370 #[serde(default)]
371 /// Specifies whether to run the request in the background.
372 pub run_in_background: Option<bool>,
373 #[serde(default)]
374 /// Specifies whether to skip configuration checks.
375 pub skip_config_checks: Option<bool>,
376 #[serde(default)]
377 /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
378 pub css_extraction_map: Option<CSSExtractionMap>,
379 #[serde(default)]
380 /// The chunking algorithm to use.
381 pub chunking_alg: Option<ChunkingAlgDict>,
382 #[serde(default)]
383 /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
384 pub disable_intercept: Option<bool>,
385 #[serde(default)]
386 /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
387 pub wait_for: Option<WaitFor>,
388 #[serde(default)]
389 /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
390 pub execution_scripts: Option<ExecutionScriptsMap>,
391 #[serde(default)]
392 /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
393 pub automation_scripts: Option<WebAutomationMap>,
394 #[serde(default)]
395 /// The redirect policy for HTTP request. Set the value to Loose to allow all.
396 pub redirect_policy: Option<RedirectPolicy>,
397 #[serde(default)]
398 /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
399 pub event_tracker: Option<EventTracker>,
400 #[serde(default)]
401 /// The timeout to stop the crawl.
402 pub crawl_timeout: Option<Timeout>,
403 #[serde(default)]
404 /// Evaluates given script in every frame upon creation (before loading frame's scripts).
405 pub evaluate_on_new_document: Option<Box<String>>,
406 #[serde(default)]
407 /// Runs the request using lite_mode:Lite mode reduces data transfer costs by 50%, with trade-offs in speed, accuracy,
408 /// geo-targeting, and reliability. It’s best suited for non-urgent data collection or when
409 /// targeting websites with minimal anti-bot protections.
410 pub lite_mode: Option<bool>,
411 #[serde(default)]
412 /// The proxy to use for request.
413 pub proxy: Option<ProxyType>,
414 #[serde(default)]
415 /// Use a remote proxy at ~50% reduced cost for file downloads.
416 /// This requires a user-supplied static IP proxy endpoint.
417 pub remote_proxy: Option<String>,
418 #[serde(default)]
419 /// Set the maximum number of credits to use per page.
420 /// Credits are measured in decimal units, where 10,000 credits equal one dollar (100 credits per penny).
421 /// Credit limiting only applies to request that are Javascript rendered using smart_mode or chrome for the 'request' type.
422 pub max_credits_per_page: Option<f64>,
423}
424
425
426#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
427#[serde(rename_all = "lowercase")]
428pub enum TBS {
429 #[serde(rename = "qdr:h")]
430 PastHour,
431 #[serde(rename = "qdr:d")]
432 Past24Hours,
433 #[serde(rename = "qdr:w")]
434 PastWeek,
435 #[serde(rename = "qdr:m")]
436 PastMonth,
437 #[serde(rename = "qdr:y")]
438 PastYear,
439}
440
441/// The engine to use.
442#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
443pub enum Engine {
444 /// Google
445 Google,
446 /// Brave
447 Brave,
448 /// All
449 #[default]
450 All
451}
452
453/// The structure representing request parameters for a search request.
454#[derive(Debug, Default, Deserialize, Serialize, Clone)]
455pub struct SearchRequestParams {
456 /// The base request parameters.
457 #[serde(default, flatten)]
458 pub base: RequestParams,
459 // The search request.
460 pub search: String,
461 /// The search limit.
462 pub search_limit: Option<u32>,
463 // Fetch the page content. Defaults to true.
464 pub fetch_page_content: Option<bool>,
465 /// The search location of the request
466 pub location: Option<String>,
467 /// The country code of the request
468 pub country: Option<crate::shapes::country_codes::CountryCode>,
469 /// The language code of the request.
470 pub language: Option<String>,
471 /// The number of search results
472 pub num: Option<u32>,
473 /// The time period range.
474 pub tbs: Option<TBS>,
475 /// The page of the search results.
476 pub page: Option<u32>,
477 /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
478 pub website_limit: Option<u32>,
479 /// Prioritize speed over output quantity.
480 pub quick_search: Option<bool>,
481 /// Auto paginate pages ( up to 100 pages ).
482 pub auto_pagination: Option<bool>,
483 /// The search engine to use.
484 pub engine: Option<Engine>
485}
486
487/// Structure representing request parameters for transforming files.
488#[derive(Debug, Default, Deserialize, Serialize, Clone)]
489pub struct TransformParams {
490 #[serde(default)]
491 /// The format in which the result should be returned.
492 pub return_format: Option<ReturnFormat>,
493 #[serde(default)]
494 /// Specifies whether readability optimizations should be applied.
495 pub readability: Option<bool>,
496 #[serde(default)]
497 /// Clean the markdown or text for AI.
498 pub clean: Option<bool>,
499 #[serde(default)]
500 /// Clean the markdown or text for AI removing footers, navigation, and more.
501 pub clean_full: Option<bool>,
502 /// The data being transformed.
503 pub data: Vec<Resource>,
504}
505
506#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Default)]
507/// Transformation resource to use.
508pub struct Resource {
509 #[serde(default)]
510 /// the html to transform
511 pub html: Option<bytes::Bytes>,
512 #[serde(default)]
513 /// the content to transform
514 pub content: Option<bytes::Bytes>,
515 #[serde(default)]
516 /// the url of the html incase of readability to improve transformations.
517 pub url: Option<String>,
518 #[serde(default)]
519 /// the language of the resource.
520 pub lang: Option<String>,
521}
522
523/// the request type to perform
524#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
525#[serde(rename_all = "lowercase")]
526pub enum RequestType {
527 /// Default HTTP request
528 Http,
529 /// Chrome browser rendering
530 Chrome,
531 #[default]
532 /// Smart mode defaulting to HTTP and using Chrome when needed.
533 SmartMode,
534}
535
536/// Enum representing different return formats.
537#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
538#[serde(rename_all = "lowercase")]
539pub enum ReturnFormat {
540 #[default]
541 /// The default return format of the resource.
542 Raw,
543 /// Return the response as Markdown.
544 Markdown,
545 /// Return the response as Commonmark.
546 Commonmark,
547 /// Return the response as Html2text.
548 Html2text,
549 /// Return the response as Text.
550 Text,
551 /// Returns a screenshot as Base64Url
552 Screenshot,
553 /// Return the response as XML.
554 Xml,
555 /// Return the response as Bytes.
556 Bytes,
557}