spider_client/
lib.rs

1//! The `spider-client` module provides the primary interface and
2//! functionalities for the Spider web crawler library, which is
3//! designed for rapid and efficient crawling of web pages to gather
4//! links using isolated contexts.
5//!
6//! ### Features
7//!
8//! - **Multi-threaded Crawling:** Spider can utilize multiple
9//!   threads to parallelize the crawling process, drastically
10//!   improving performance and allowing the ability to gather
11//!   millions of pages in a short time.
12//!
13//! - **Configurable:** The library provides various options to
14//!   configure the crawling behavior, such as setting the depth
15//!   of crawling, user-agent strings, delays between requests,
16//!   and more.
17//!
18//! - **Link Gathering:** One of the primary objectives of Spider is to
19//!   gather and manage links from the web pages it crawls,
20//!   compiling them into a structured format for further use.
21//!
22//! ### Examples
23//!
24//! Basic usage of the Spider client might look like this:
25//!
26//! ```rust
27//! use spider_client::{Spider, RequestType, RequestParams};
28//! use tokio;
29//!
30//!  # #[ignore]
31//! #[tokio::main]
32//! async fn main() {
33//!     let spider = Spider::new(Some("myspiderapikey".into())).expect("API key must be provided");
34//!
35//!     let url = "https://spider.cloud";
36//!
37//!     // Scrape a single URL
38//!     let scraped_data = spider.scrape_url(url, None, "application/json").await.expect("Failed to scrape the URL");
39//!
40//!     println!("Scraped Data: {:?}", scraped_data);
41//!
42//!     // Crawl a website
43//!     let crawler_params = RequestParams {
44//!         limit: Some(1),
45//!         proxy_enabled: Some(true),
46//!         store_data: Some(false),
47//!         metadata: Some(false),
48//!         request: Some(RequestType::Http),
49//!         ..Default::default()
50//!     };
51//!
52//!     let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL");
53//!
54//!     println!("Crawl Result: {:?}", crawl_result);
55//! }
56//! ```
57//!
58//! ### Modules
59//!
60//! - `config`: Contains the configuration options for the Spider client.
61//! - `utils`: Utility functions used by the Spider client.
62//!
63
64use backon::ExponentialBuilder;
65use backon::Retryable;
66use reqwest::Client;
67use reqwest::{Error, Response};
68use serde::{Deserialize, Serialize};
69use std::collections::HashMap;
70use tokio_stream::StreamExt;
71
72/// Structure representing the Chunking algorithm dictionary.
73#[derive(Debug, Deserialize, Serialize, Clone)]
74pub struct ChunkingAlgDict {
75    /// The chunking algorithm to use, defined as a specific type.
76    r#type: ChunkingType,
77    /// The amount to chunk by.
78    value: i32,
79}
80
81// The nested structures
82#[derive(Serialize, Deserialize, Debug, Clone)]
83pub struct Timeout {
84    /// The seconds up to 60.
85    pub secs: u64,
86    /// The nanoseconds.
87    pub nanos: u32,
88}
89
90#[derive(Serialize, Deserialize, Debug, Clone)]
91pub struct IdleNetwork {
92    /// The timeout to wait until.
93    pub timeout: Timeout,
94}
95
96#[derive(Serialize, Deserialize, Debug, Clone)]
97#[serde(tag = "type", rename_all = "PascalCase")]
98pub enum WebAutomation {
99    Evaluate { code: String },
100    Click { selector: String },
101    Wait { duration: u64 },
102    WaitForNavigation,
103    WaitFor { selector: String },
104    WaitForAndClick { selector: String },
105    ScrollX { pixels: i32 },
106    ScrollY { pixels: i32 },
107    Fill { selector: String, value: String },
108    InfiniteScroll { times: u32 },
109}
110
111#[derive(Default, Serialize, Deserialize, Debug, Clone)]
112#[serde(tag = "type", rename_all = "PascalCase")]
113pub enum RedirectPolicy {
114    Loose,
115    #[default]
116    Strict,
117}
118
119pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
120pub type ExecutionScriptsMap = std::collections::HashMap<String, String>;
121
122#[derive(Serialize, Deserialize, Debug, Clone)]
123pub struct Selector {
124    /// The timeout to wait until.
125    pub timeout: Timeout,
126    /// The selector to wait for.
127    pub selector: String,
128}
129
130#[derive(Serialize, Deserialize, Debug, Clone)]
131pub struct Delay {
132    /// The timeout to wait until.
133    pub timeout: Timeout,
134}
135
136#[derive(Serialize, Deserialize, Debug, Clone)]
137pub struct WaitFor {
138    /// Wait until idle networks with a timeout of idleness.
139    pub idle_network: Option<IdleNetwork>,
140    /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
141    pub selector: Option<Selector>,
142    /// Wait until a hard delay.
143    pub delay: Option<Delay>,
144    /// Wait until page navigation happen. Default is true.
145    pub page_navigations: Option<bool>,
146}
147
148/// Query request to get a document.
149#[derive(Serialize, Deserialize, Debug, Clone, Default)]
150pub struct QueryRequest {
151    /// The exact website url.
152    pub url: Option<String>,
153    /// The website domain.
154    pub domain: Option<String>,
155    /// The path of the resource.
156    pub pathname: Option<String>,
157}
158
159/// Enum representing different types of Chunking.
160#[derive(Default, Debug, Deserialize, Serialize, Clone)]
161#[serde(rename_all = "lowercase")]
162pub enum ChunkingType {
163    #[default]
164    /// By the word count.
165    ByWords,
166    /// By the line count.
167    ByLines,
168    /// By the char length.
169    ByCharacterLength,
170    /// By sentence.
171    BySentence,
172}
173
174#[derive(Default, Debug, Deserialize, Serialize, Clone)]
175/// View port handling for chrome.
176pub struct Viewport {
177    /// Device screen Width
178    pub width: u32,
179    /// Device screen size
180    pub height: u32,
181    /// Device scale factor
182    pub device_scale_factor: Option<f64>,
183    /// Emulating Mobile?
184    pub emulating_mobile: bool,
185    /// Use landscape mode instead of portrait.
186    pub is_landscape: bool,
187    /// Touch screen device?
188    pub has_touch: bool,
189}
190
/// Base URL that every API endpoint path is appended to.
const API_URL: &str = "https://api.spider.cloud";
193
194// Define the CSSSelector struct
195#[derive(Debug, Clone, Default, Deserialize, Serialize)]
196pub struct CSSSelector {
197    /// The name of the selector group
198    pub name: String,
199    /// A vector of CSS selectors
200    pub selectors: Vec<String>,
201}
202
203// Define the CSSExtractionMap type
204pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
205
206/// Represents the settings for a webhook configuration
207#[derive(Debug, Default, Deserialize, Serialize, Clone)]
208pub struct WebhookSettings {
209    /// The destination where the webhook information will be sent
210    destination: String,
211    /// Trigger an action when all credits are depleted
212    on_credits_depleted: bool,
213    /// Trigger an action when half of the credits are depleted
214    on_credits_half_depleted: bool,
215    /// Trigger an action on a website status update event
216    on_website_status: bool,
217    /// Send information about a new page find (such as links and bytes)
218    on_find: bool,
219    /// Handle the metadata of a found page
220    on_find_metadata: bool,
221}
222
223/// Proxy pool selection for outbound request routing.
224/// Choose a pool based on your use case (e.g., stealth, speed, or stability).
225///
226/// - 'residential'         → cost-effective entry-level residential pool
227/// - 'residential_fast'    → faster residential pool for higher throughput
228/// - 'residential_static'  → static residential IPs, rotated daily
229/// - 'residential_premium' → low-latency premium IPs
230/// - 'residential_core'    → balanced plan (quality vs. cost)
231/// - 'residential_plus'    → largest and highest quality core pool
232/// - 'mobile'              → 4G/5G mobile proxies for maximum evasion
233/// - 'isp'                 → ISP-grade datacenters
234#[derive(
235    Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize,
236)]
237pub enum ProxyType {
238    /// Cost-effective entry-level residential pool.
239    #[serde(rename = "residential")]
240    Residential,
241    /// Higher-throughput residential pool for better performance.
242    #[serde(rename = "residential_fast")]
243    ResidentialFast,
244    /// Static residential IPs, rotated daily for session persistence.
245    #[serde(rename = "residential_static")]
246    ResidentialStatic,
247    /// 4G / 5G mobile proxies for maximum stealth and evasion.
248    #[serde(rename = "mobile")]
249    Mobile,
250    /// ISP-grade residential routing (alias: `datacenter`).
251    #[serde(rename = "isp", alias = "datacenter")]
252    #[default]
253    Isp,
254    /// Premium low-latency residential proxy pool.
255    #[serde(rename = "residential_premium")]
256    ResidentialPremium,
257    /// Core residential plan optimized for balance between cost and quality.
258    #[serde(rename = "residential_core")]
259    ResidentialCore,
260    /// Extended core residential pool with the largest, highest-quality IPs.
261    #[serde(rename = "residential_plus")]
262    ResidentialPlus,
263}
264
265/// Send multiple return formats.
266#[derive(Debug, Deserialize, Serialize, Clone)]
267#[serde(untagged)]
268pub enum ReturnFormatHandling {
269    /// A single return item.
270    Single(ReturnFormat),
271    /// Multiple return formats.
272    Multi(std::collections::HashSet<ReturnFormat>),
273}
274
275impl Default for ReturnFormatHandling {
276    fn default() -> ReturnFormatHandling {
277        ReturnFormatHandling::Single(ReturnFormat::Raw)
278    }
279}
280
281#[derive(Debug, Default, Deserialize, Serialize, Clone)]
282pub struct EventTracker {
283    /// The responses received.
284    responses: Option<bool>,
285    ///The request sent.
286    requests: Option<bool>
287}
288
289/// Structure representing request parameters.
290#[derive(Debug, Default, Deserialize, Serialize, Clone)]
291pub struct RequestParams {
292    #[serde(default)]
293    /// The URL to be crawled.
294    pub url: Option<String>,
295    #[serde(default)]
296    /// The type of request to be made.
297    pub request: Option<RequestType>,
298    #[serde(default)]
299    /// The maximum number of pages the crawler should visit.
300    pub limit: Option<u32>,
301    #[serde(default)]
302    /// The format in which the result should be returned.
303    pub return_format: Option<ReturnFormatHandling>,
304    #[serde(default)]
305    /// Specifies whether to only visit the top-level domain.
306    pub tld: Option<bool>,
307    #[serde(default)]
308    /// The depth of the crawl.
309    pub depth: Option<u32>,
310    #[serde(default)]
311    /// Specifies whether the request should be cached.
312    pub cache: Option<bool>,
313    #[serde(default)]
314    /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
315    pub scroll: Option<u32>,
316    #[serde(default)]
317    /// The budget for various resources.
318    pub budget: Option<HashMap<String, u32>>,
319    #[serde(default)]
320    /// The blacklist routes to ignore. This can be a Regex string pattern.
321    pub blacklist: Option<Vec<String>>,
322    #[serde(default)]
323    /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
324    pub whitelist: Option<Vec<String>>,
325    #[serde(default)]
326    /// The locale to be used during the crawl.
327    pub locale: Option<String>,
328    #[serde(default)]
329    /// The cookies to be set for the request, formatted as a single string.
330    pub cookies: Option<String>,
331    #[serde(default)]
332    /// Specifies whether to use stealth techniques to avoid detection.
333    pub stealth: Option<bool>,
334    #[serde(default)]
335    /// The headers to be used for the request.
336    pub headers: Option<HashMap<String, String>>,
337    #[serde(default)]
338    /// Specifies whether anti-bot measures should be used.
339    pub anti_bot: Option<bool>,
340    #[serde(default)]
341    /// Specifies whether to send data via webhooks.
342    pub webhooks: Option<WebhookSettings>,
343    #[serde(default)]
344    /// Specifies whether to include metadata in the response.
345    pub metadata: Option<bool>,
346    #[serde(default)]
347    /// The dimensions of the viewport.
348    pub viewport: Option<Viewport>,
349    #[serde(default)]
350    /// The encoding to be used for the request.
351    pub encoding: Option<String>,
352    #[serde(default)]
353    /// Specifies whether to include subdomains in the crawl.
354    pub subdomains: Option<bool>,
355    #[serde(default)]
356    /// The user agent string to be used for the request.
357    pub user_agent: Option<String>,
358    #[serde(default)]
359    /// Specifies whether the response data should be stored.
360    pub store_data: Option<bool>,
361    #[serde(default)]
362    /// Configuration settings for GPT (general purpose texture mappings).
363    pub gpt_config: Option<HashMap<String, String>>,
364    #[serde(default)]
365    /// Specifies whether to use fingerprinting protection.
366    pub fingerprint: Option<bool>,
367    #[serde(default)]
368    /// Specifies whether to perform the request without using storage.
369    pub storageless: Option<bool>,
370    #[serde(default)]
371    /// Specifies whether readability optimizations should be applied.
372    pub readability: Option<bool>,
373    #[serde(default)]
374    /// Specifies whether to use a proxy for the request.
375    pub proxy_enabled: Option<bool>,
376    #[serde(default)]
377    /// Specifies whether to respect the site's robots.txt file.
378    pub respect_robots: Option<bool>,
379    #[serde(default)]
380    /// CSS selector to be used to filter the content.
381    pub root_selector: Option<String>,
382    #[serde(default)]
383    /// Specifies whether to load all resources of the crawl target.
384    pub full_resources: Option<bool>,
385    #[serde(default)]
386    /// The text string to extract data from.
387    pub text: Option<String>,
388    #[serde(default)]
389    /// Specifies whether to use the sitemap links.
390    pub sitemap: Option<bool>,
391    #[serde(default)]
392    /// External domains to include the crawl.
393    pub external_domains: Option<Vec<String>>,
394    #[serde(default)]
395    /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
396    pub return_embeddings: Option<bool>,
397    #[serde(default)]
398    /// Returns the HTTP response headers.
399    pub return_headers: Option<bool>,
400    #[serde(default)]
401    /// Returns the link(s) found on the page that match the crawler query.
402    pub return_page_links: Option<bool>,
403    #[serde(default)]
404    /// Returns the HTTP response cookies.
405    pub return_cookies: Option<bool>,
406    #[serde(default)]
407    /// The timeout for the request, in milliseconds.
408    pub request_timeout: Option<u8>,
409    #[serde(default)]
410    /// Specifies whether to run the request in the background.
411    pub run_in_background: Option<bool>,
412    #[serde(default)]
413    /// Specifies whether to skip configuration checks.
414    pub skip_config_checks: Option<bool>,
415    #[serde(default)]
416    /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
417    pub css_extraction_map: Option<CSSExtractionMap>,
418    #[serde(default)]
419    /// The chunking algorithm to use.
420    pub chunking_alg: Option<ChunkingAlgDict>,
421    #[serde(default)]
422    /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
423    pub disable_intercept: Option<bool>,
424    #[serde(default)]
425    /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
426    pub wait_for: Option<WaitFor>,
427    #[serde(default)]
428    /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
429    pub execution_scripts: Option<ExecutionScriptsMap>,
430    #[serde(default)]
431    /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
432    pub automation_scripts: Option<WebAutomationMap>,
433    #[serde(default)]
434    /// The redirect policy for HTTP request. Set the value to Loose to allow all.
435    pub redirect_policy: Option<RedirectPolicy>,
436    #[serde(default)]
437    /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
438    pub event_tracker: Option<EventTracker>,
439    #[serde(default)]
440    /// The timeout to stop the crawl.
441    pub crawl_timeout: Option<Timeout>,
442    #[serde(default)]
443    /// Evaluates given script in every frame upon creation (before loading frame's scripts).
444    pub evaluate_on_new_document: Option<Box<String>>,
445    #[serde(default)]
446    /// Runs the request using lite_mode:Lite mode reduces data transfer costs by 70%, with trade-offs in speed, accuracy,
447    /// geo-targeting, and reliability. It’s best suited for non-urgent data collection or when
448    /// targeting websites with minimal anti-bot protections.
449    pub lite_mode: Option<bool>,
450    /// The proxy to use for request.
451    pub proxy: Option<ProxyType>,
452    /// Use a remote proxy at ~70% reduced cost for file downloads.
453    /// This requires a user-supplied static IP proxy endpoint.
454    pub remote_proxy: Option<String>,
455}
456
457/// The structure representing request parameters for a search request.
458#[derive(Debug, Default, Deserialize, Serialize, Clone)]
459pub struct SearchRequestParams {
460    /// The base request parameters.
461    #[serde(default, flatten)]
462    pub base: RequestParams,
463    // The search request.
464    pub search: String,
465    /// The search limit.
466    pub search_limit: Option<u32>,
467    // Fetch the page content. Defaults to true.
468    pub fetch_page_content: Option<bool>,
469    /// The search location of the request
470    pub location: Option<String>,
471    /// The country code of the request
472    pub country: Option<String>,
473    /// The language code of the request.
474    pub language: Option<String>,
475    /// The number of search results
476    pub num: Option<u32>,
477    /// The page of the search results.
478    pub page: Option<u32>,
479    #[serde(default)]
480    /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
481    pub website_limit: Option<u32>,
482}
483
484/// Structure representing request parameters for transforming files.
485#[derive(Debug, Default, Deserialize, Serialize, Clone)]
486pub struct TransformParams {
487    #[serde(default)]
488    /// The format in which the result should be returned.
489    pub return_format: Option<ReturnFormat>,
490    #[serde(default)]
491    /// Specifies whether readability optimizations should be applied.
492    pub readability: Option<bool>,
493    #[serde(default)]
494    /// Clean the markdown or text for AI.
495    pub clean: Option<bool>,
496    #[serde(default)]
497    /// Clean the markdown or text for AI removing footers, navigation, and more.
498    pub clean_full: Option<bool>,
499    /// The data being transformed.
500    pub data: Vec<DataParam>,
501}
502
503#[derive(Serialize, Deserialize, Debug, Clone)]
504pub struct DataParam {
505    /// The HTML resource.
506    pub html: String,
507    /// The website url.
508    pub url: Option<String>,
509}
510
511/// the request type to perform
512#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
513#[serde(rename_all = "lowercase")]
514pub enum RequestType {
515    /// Default HTTP request
516    Http,
517    /// Chrome browser rendering
518    Chrome,
519    #[default]
520    /// Smart mode defaulting to HTTP and using Chrome when needed.
521    SmartMode,
522}
523
524/// Enum representing different return formats.
525#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
526#[serde(rename_all = "lowercase")]
527pub enum ReturnFormat {
528    #[default]
529    /// The default return format of the resource.
530    Raw,
531    /// Return the response as Markdown.
532    Markdown,
533    /// Return the response as Commonmark.
534    Commonmark,
535    /// Return the response as Html2text.
536    Html2text,
537    /// Return the response as Text.
538    Text,
539    /// Return the response as XML.
540    Xml,
541    /// Return the response as Bytes.
542    Bytes,
543}
544
545/// Represents a Spider with API key and HTTP client.
546#[derive(Debug, Default)]
547pub struct Spider {
548    /// The Spider API key.
549    pub api_key: String,
550    /// The Spider Client to re-use.
551    pub client: Client,
552}
553
554impl Spider {
555    /// Creates a new instance of Spider.
556    ///
557    /// # Arguments
558    ///
559    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
560    ///
561    /// # Returns
562    ///
563    /// A new instance of Spider or an error string if no API key is provided.
564    pub fn new(api_key: Option<String>) -> Result<Self, &'static str> {
565        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
566
567        match api_key {
568            Some(key) => Ok(Self {
569                api_key: key,
570                client: Client::new(),
571            }),
572            None => Err("No API key provided"),
573        }
574    }
575
576    /// Creates a new instance of Spider.
577    ///
578    /// # Arguments
579    ///
580    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
581    /// * `client` - A custom client to pass in.
582    ///
583    /// # Returns
584    ///
585    /// A new instance of Spider or an error string if no API key is provided.
586    pub fn new_with_client(api_key: Option<String>, client: Client) -> Result<Self, &'static str> {
587        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
588
589        match api_key {
590            Some(key) => Ok(Self {
591                api_key: key,
592                client,
593            }),
594            None => Err("No API key provided"),
595        }
596    }
597
598    /// Sends a POST request to the API.
599    ///
600    /// # Arguments
601    ///
602    /// * `endpoint` - The API endpoint.
603    /// * `data` - The request data as a HashMap.
604    /// * `stream` - Whether streaming is enabled.
605    /// * `content_type` - The content type of the request.
606    ///
607    /// # Returns
608    ///
609    /// The response from the API.
610    async fn api_post_base(
611        &self,
612        endpoint: &str,
613        data: impl Serialize + Sized + std::fmt::Debug,
614        content_type: &str,
615    ) -> Result<Response, Error> {
616        let url: String = format!("{API_URL}/{}", endpoint);
617
618        self.client
619            .post(&url)
620            .header(
621                "User-Agent",
622                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
623            )
624            .header("Content-Type", content_type)
625            .header("Authorization", format!("Bearer {}", self.api_key))
626            .json(&data)
627            .send()
628            .await
629    }
630
631    /// Sends a POST request to the API.
632    ///
633    /// # Arguments
634    ///
635    /// * `endpoint` - The API endpoint.
636    /// * `data` - The request data as a HashMap.
637    /// * `stream` - Whether streaming is enabled.
638    /// * `content_type` - The content type of the request.
639    ///
640    /// # Returns
641    ///
642    /// The response from the API.
643    async fn api_post(
644        &self,
645        endpoint: &str,
646        data: impl Serialize + std::fmt::Debug + Clone + Send + Sync,
647        content_type: &str,
648    ) -> Result<Response, Error> {
649        let fetch = || async {
650            self.api_post_base(endpoint, data.to_owned(), content_type)
651                .await
652        };
653
654        fetch
655            .retry(ExponentialBuilder::default().with_max_times(5))
656            .when(|err: &reqwest::Error| {
657                if let Some(status) = err.status() {
658                    status.is_server_error()
659                } else {
660                    err.is_timeout()
661                }
662            })
663            .await
664    }
665
666    /// Sends a GET request to the API.
667    ///
668    /// # Arguments
669    ///
670    /// * `endpoint` - The API endpoint.
671    ///
672    /// # Returns
673    ///
674    /// The response from the API as a JSON value.
675    async fn api_get_base<T: Serialize>(
676        &self,
677        endpoint: &str,
678        query_params: Option<&T>,
679    ) -> Result<serde_json::Value, reqwest::Error> {
680        let url = format!("{API_URL}/{}", endpoint);
681        let res = self
682            .client
683            .get(&url)
684            .query(&query_params)
685            .header(
686                "User-Agent",
687                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
688            )
689            .header("Content-Type", "application/json")
690            .header("Authorization", format!("Bearer {}", self.api_key))
691            .send()
692            .await?;
693        res.json().await
694    }
695
696    /// Sends a GET request to the API.
697    ///
698    /// # Arguments
699    ///
700    /// * `endpoint` - The API endpoint.
701    ///
702    /// # Returns
703    ///
704    /// The response from the API as a JSON value.
705    async fn api_get<T: Serialize>(
706        &self,
707        endpoint: &str,
708        query_params: Option<&T>,
709    ) -> Result<serde_json::Value, reqwest::Error> {
710        let fetch = || async { self.api_get_base(endpoint, query_params.to_owned()).await };
711
712        fetch
713            .retry(ExponentialBuilder::default().with_max_times(5))
714            .when(|err: &reqwest::Error| {
715                if let Some(status) = err.status() {
716                    status.is_server_error()
717                } else {
718                    err.is_timeout()
719                }
720            })
721            .await
722    }
723
724    /// Sends a DELETE request to the API.
725    ///
726    /// # Arguments
727    ///
728    /// * `endpoint` - The API endpoint.
729    /// * `params` - Optional request parameters.
730    /// * `stream` - Whether streaming is enabled.
731    /// * `content_type` - The content type of the request.
732    ///
733    /// # Returns
734    ///
735    /// The response from the API.
736    async fn api_delete_base(
737        &self,
738        endpoint: &str,
739        params: Option<HashMap<String, serde_json::Value>>,
740    ) -> Result<Response, Error> {
741        let url = format!("{API_URL}/v1/{}", endpoint);
742        let request_builder = self
743            .client
744            .delete(&url)
745            .header(
746                "User-Agent",
747                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
748            )
749            .header("Content-Type", "application/json")
750            .header("Authorization", format!("Bearer {}", self.api_key));
751
752        let request_builder = if let Some(params) = params {
753            request_builder.json(&params)
754        } else {
755            request_builder
756        };
757
758        request_builder.send().await
759    }
760
761    /// Sends a DELETE request to the API.
762    ///
763    /// # Arguments
764    ///
765    /// * `endpoint` - The API endpoint.
766    /// * `params` - Optional request parameters.
767    /// * `stream` - Whether streaming is enabled.
768    /// * `content_type` - The content type of the request.
769    ///
770    /// # Returns
771    ///
772    /// The response from the API.
773    async fn api_delete(
774        &self,
775        endpoint: &str,
776        params: Option<HashMap<String, serde_json::Value>>,
777    ) -> Result<Response, Error> {
778        let fetch = || async { self.api_delete_base(endpoint, params.to_owned()).await };
779
780        fetch
781            .retry(ExponentialBuilder::default().with_max_times(5))
782            .when(|err: &reqwest::Error| {
783                if let Some(status) = err.status() {
784                    status.is_server_error()
785                } else {
786                    err.is_timeout()
787                }
788            })
789            .await
790    }
791
792    /// Scrapes a URL.
793    ///
794    /// # Arguments
795    ///
796    /// * `url` - The URL to scrape.
797    /// * `params` - Optional request parameters.
798    /// * `stream` - Whether streaming is enabled.
799    /// * `content_type` - The content type of the request.
800    ///
801    /// # Returns
802    ///
803    /// The response from the API as a JSON value.
804    pub async fn scrape_url(
805        &self,
806        url: &str,
807        params: Option<RequestParams>,
808        content_type: &str,
809    ) -> Result<serde_json::Value, reqwest::Error> {
810        let mut data = HashMap::new();
811
812        data.insert(
813            "url".to_string(),
814            serde_json::Value::String(url.to_string()),
815        );
816        data.insert("limit".to_string(), serde_json::Value::Number(1.into()));
817
818        if let Ok(params) = serde_json::to_value(params) {
819            if let Some(ref p) = params.as_object() {
820                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
821            }
822        }
823
824        let res = self.api_post("crawl", data, content_type).await?;
825        res.json().await
826    }
827
828    /// Crawls a URL.
829    ///
830    /// # Arguments
831    ///
832    /// * `url` - The URL to crawl.
833    /// * `params` - Optional request parameters.
834    /// * `stream` - Whether streaming is enabled.
835    /// * `content_type` - The content type of the request.
836    /// * `callback` - Optional callback function to handle each streamed chunk.
837    ///
838    /// # Returns
839    ///
840    /// The response from the API as a JSON value.
841    pub async fn crawl_url(
842        &self,
843        url: &str,
844        params: Option<RequestParams>,
845        stream: bool,
846        content_type: &str,
847        callback: Option<impl Fn(serde_json::Value) + Send>,
848    ) -> Result<serde_json::Value, reqwest::Error> {
849        use tokio_util::codec::{FramedRead, LinesCodec};
850
851        let mut data = HashMap::new();
852
853        if let Ok(params) = serde_json::to_value(params) {
854            if let Some(ref p) = params.as_object() {
855                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
856            }
857        }
858
859        data.insert("url".into(), serde_json::Value::String(url.to_string()));
860
861        let res = self.api_post("crawl", data, content_type).await?;
862
863        if stream {
864            if let Some(callback) = callback {
865                let stream = res.bytes_stream();
866
867                let stream_reader = tokio_util::io::StreamReader::new(
868                    stream.map(|r| r.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))),
869                );
870
871                let mut lines = FramedRead::new(stream_reader, LinesCodec::new());
872
873                while let Some(line_result) = lines.next().await {
874                    match line_result {
875                        Ok(line) => {
876                            match serde_json::from_str::<serde_json::Value>(&line) {
877                                Ok(value) => {
878                                    callback(value);
879                                }
880                                Err(_e) => {
881                                    continue;
882                                }
883                            }
884                        }
885                        Err(_e) => {
886                            return Ok(serde_json::Value::Null)
887                        }
888                    }
889                }
890
891                Ok(serde_json::Value::Null)
892            } else {
893                Ok(serde_json::Value::Null)
894            }
895        } else {
896            res.json().await
897        }
898    }
899
900    /// Fetches links from a URL.
901    ///
902    /// # Arguments
903    ///
904    /// * `url` - The URL to fetch links from.
905    /// * `params` - Optional request parameters.
906    /// * `stream` - Whether streaming is enabled.
907    /// * `content_type` - The content type of the request.
908    ///
909    /// # Returns
910    ///
911    /// The response from the API as a JSON value.
912    pub async fn links(
913        &self,
914        url: &str,
915        params: Option<RequestParams>,
916        _stream: bool,
917        content_type: &str,
918    ) -> Result<serde_json::Value, reqwest::Error> {
919        let mut data = HashMap::new();
920
921        if let Ok(params) = serde_json::to_value(params) {
922            if let Some(ref p) = params.as_object() {
923                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
924            }
925        }
926
927        data.insert("url".into(), serde_json::Value::String(url.to_string()));
928
929        let res = self.api_post("links", data, content_type).await?;
930        res.json().await
931    }
932
933    /// Takes a screenshot of a URL.
934    ///
935    /// # Arguments
936    ///
937    /// * `url` - The URL to take a screenshot of.
938    /// * `params` - Optional request parameters.
939    /// * `stream` - Whether streaming is enabled.
940    /// * `content_type` - The content type of the request.
941    ///
942    /// # Returns
943    ///
944    /// The response from the API as a JSON value.
945    pub async fn screenshot(
946        &self,
947        url: &str,
948        params: Option<RequestParams>,
949        _stream: bool,
950        content_type: &str,
951    ) -> Result<serde_json::Value, reqwest::Error> {
952        let mut data = HashMap::new();
953
954        if let Ok(params) = serde_json::to_value(params) {
955            if let Some(ref p) = params.as_object() {
956                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
957            }
958        }
959
960        data.insert("url".into(), serde_json::Value::String(url.to_string()));
961
962        let res = self.api_post("screenshot", data, content_type).await?;
963        res.json().await
964    }
965
966    /// Searches for a query.
967    ///
968    /// # Arguments
969    ///
970    /// * `q` - The query to search for.
971    /// * `params` - Optional request parameters.
972    /// * `stream` - Whether streaming is enabled.
973    /// * `content_type` - The content type of the request.
974    ///
975    /// # Returns
976    ///
977    /// The response from the API as a JSON value.
978    pub async fn search(
979        &self,
980        q: &str,
981        params: Option<SearchRequestParams>,
982        _stream: bool,
983        content_type: &str,
984    ) -> Result<serde_json::Value, reqwest::Error> {
985        let body = match params {
986            Some(mut params) => {
987                params.search = q.to_string();
988                params
989            }
990            _ => {
991                let mut params = SearchRequestParams::default();
992                params.search = q.to_string();
993                params
994            }
995        };
996
997        let res = self.api_post("search", body, content_type).await?;
998
999        res.json().await
1000    }
1001
1002    /// Transforms data.
1003    ///
1004    /// # Arguments
1005    ///
1006    /// * `data` - The data to transform.
1007    /// * `params` - Optional request parameters.
1008    /// * `stream` - Whether streaming is enabled.
1009    /// * `content_type` - The content type of the request.
1010    ///
1011    /// # Returns
1012    ///
1013    /// The response from the API as a JSON value.
1014    pub async fn transform(
1015        &self,
1016        data: Vec<HashMap<&str, &str>>,
1017        params: Option<TransformParams>,
1018        _stream: bool,
1019        content_type: &str,
1020    ) -> Result<serde_json::Value, reqwest::Error> {
1021        let mut payload = HashMap::new();
1022
1023        if let Ok(params) = serde_json::to_value(params) {
1024            if let Some(ref p) = params.as_object() {
1025                payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1026            }
1027        }
1028
1029        if let Ok(d) = serde_json::to_value(data) {
1030            payload.insert("data".into(), d);
1031        }
1032
1033        let res = self.api_post("transform", payload, content_type).await?;
1034
1035        res.json().await
1036    }
1037
1038    /// Extracts contacts from a URL.
1039    ///
1040    /// # Arguments
1041    ///
1042    /// * `url` - The URL to extract contacts from.
1043    /// * `params` - Optional request parameters.
1044    /// * `stream` - Whether streaming is enabled.
1045    /// * `content_type` - The content type of the request.
1046    ///
1047    /// # Returns
1048    ///
1049    /// The response from the API as a JSON value.
1050    pub async fn extract_contacts(
1051        &self,
1052        url: &str,
1053        params: Option<RequestParams>,
1054        _stream: bool,
1055        content_type: &str,
1056    ) -> Result<serde_json::Value, reqwest::Error> {
1057        let mut data = HashMap::new();
1058
1059        if let Ok(params) = serde_json::to_value(params) {
1060            if let Ok(params) = serde_json::to_value(params) {
1061                if let Some(ref p) = params.as_object() {
1062                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1063                }
1064            }
1065        }
1066
1067        match serde_json::to_value(url) {
1068            Ok(u) => {
1069                data.insert("url".into(), u);
1070            }
1071            _ => (),
1072        }
1073
1074        let res = self
1075            .api_post("pipeline/extract-contacts", data, content_type)
1076            .await?;
1077        res.json().await
1078    }
1079
1080    /// Labels data from a URL.
1081    ///
1082    /// # Arguments
1083    ///
1084    /// * `url` - The URL to label data from.
1085    /// * `params` - Optional request parameters.
1086    /// * `stream` - Whether streaming is enabled.
1087    /// * `content_type` - The content type of the request.
1088    ///
1089    /// # Returns
1090    ///
1091    /// The response from the API as a JSON value.
1092    pub async fn label(
1093        &self,
1094        url: &str,
1095        params: Option<RequestParams>,
1096        _stream: bool,
1097        content_type: &str,
1098    ) -> Result<serde_json::Value, reqwest::Error> {
1099        let mut data = HashMap::new();
1100
1101        if let Ok(params) = serde_json::to_value(params) {
1102            if let Ok(params) = serde_json::to_value(params) {
1103                if let Some(ref p) = params.as_object() {
1104                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1105                }
1106            }
1107        }
1108
1109        data.insert("url".into(), serde_json::Value::String(url.to_string()));
1110
1111        let res = self.api_post("pipeline/label", data, content_type).await?;
1112        res.json().await
1113    }
1114
1115    /// Download a record from storage.
1116    ///
1117    /// # Arguments
1118    ///
1119    /// * `url` - Optional exact url of the file in storage.
1120    /// * `options` - Optional options.
1121    /// * `stream` - Whether streaming is enabled.
1122    ///
1123    /// # Returns
1124    ///
1125    /// The response from the API.
1126    pub async fn download(
1127        &self,
1128        url: Option<&str>,
1129        options: Option<HashMap<&str, i32>>,
1130    ) -> Result<reqwest::Response, reqwest::Error> {
1131        let mut params = HashMap::new();
1132
1133        if let Some(url) = url {
1134            params.insert("url".to_string(), url.to_string());
1135        }
1136
1137        if let Some(options) = options {
1138            for (key, value) in options {
1139                params.insert(key.to_string(), value.to_string());
1140            }
1141        }
1142
1143        let url = format!("{API_URL}/v1/data/download");
1144        let request = self
1145            .client
1146            .get(&url)
1147            .header(
1148                "User-Agent",
1149                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
1150            )
1151            .header("Content-Type", "application/octet-stream")
1152            .header("Authorization", format!("Bearer {}", self.api_key))
1153            .query(&params);
1154
1155        let res = request.send().await?;
1156
1157        Ok(res)
1158    }
1159
1160    /// Creates a signed URL of a file from storage.
1161    ///
1162    /// # Arguments
1163    ///
1164    /// * `url` - Optional exact url of the file in storage.
1165    /// * `options` - Optional options.
1166    /// * `stream` - Whether streaming is enabled.
1167    ///
1168    /// # Returns
1169    ///
1170    /// The response from the API.
1171    pub async fn create_signed_url(
1172        &self,
1173        url: Option<&str>,
1174        options: Option<HashMap<&str, i32>>,
1175    ) -> Result<serde_json::Value, reqwest::Error> {
1176        let mut params = HashMap::new();
1177
1178        if let Some(options) = options {
1179            for (key, value) in options {
1180                params.insert(key.to_string(), value.to_string());
1181            }
1182        }
1183
1184        if let Some(url) = url {
1185            params.insert("url".to_string(), url.to_string());
1186        }
1187
1188        let url = format!("{API_URL}/v1/data/sign-url");
1189        let request = self
1190            .client
1191            .get(&url)
1192            .header(
1193                "User-Agent",
1194                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
1195            )
1196            .header("Authorization", format!("Bearer {}", self.api_key))
1197            .query(&params);
1198
1199        let res = request.send().await?;
1200
1201        res.json().await
1202    }
1203
1204    /// Gets the crawl state of a URL.
1205    ///
1206    /// # Arguments
1207    ///
1208    /// * `url` - The URL to get the crawl state of.
1209    /// * `params` - Optional request parameters.
1210    /// * `stream` - Whether streaming is enabled.
1211    /// * `content_type` - The content type of the request.
1212    ///
1213    /// # Returns
1214    ///
1215    pub async fn get_crawl_state(
1216        &self,
1217        url: &str,
1218        params: Option<RequestParams>,
1219        content_type: &str,
1220    ) -> Result<serde_json::Value, reqwest::Error> {
1221        let mut payload = HashMap::new();
1222        payload.insert("url".into(), serde_json::Value::String(url.to_string()));
1223        payload.insert(
1224            "contentType".into(),
1225            serde_json::Value::String(content_type.to_string()),
1226        );
1227
1228        if let Ok(params) = serde_json::to_value(params) {
1229            if let Ok(params) = serde_json::to_value(params) {
1230                if let Some(ref p) = params.as_object() {
1231                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1232                }
1233            }
1234        }
1235
1236        let res = self
1237            .api_post("data/crawl_state", payload, content_type)
1238            .await?;
1239        res.json().await
1240    }
1241
1242    /// Get the account credits left.
1243    pub async fn get_credits(&self) -> Result<serde_json::Value, reqwest::Error> {
1244        self.api_get::<serde_json::Value>("data/credits", None)
1245            .await
1246    }
1247
1248    /// Send a request for a data record.
1249    pub async fn data_post(
1250        &self,
1251        table: &str,
1252        data: Option<RequestParams>,
1253    ) -> Result<serde_json::Value, reqwest::Error> {
1254        let res = self
1255            .api_post(&format!("data/{}", table), data, "application/json")
1256            .await?;
1257        res.json().await
1258    }
1259
1260    /// Query a record from the global DB.
1261    pub async fn query(&self, params: &QueryRequest) -> Result<serde_json::Value, reqwest::Error> {
1262        let res = self
1263            .api_get::<QueryRequest>(&"data/query", Some(params))
1264            .await?;
1265
1266        Ok(res)
1267    }
1268
1269    /// Get a table record.
1270    pub async fn data_get(
1271        &self,
1272        table: &str,
1273        params: Option<RequestParams>,
1274    ) -> Result<serde_json::Value, reqwest::Error> {
1275        let mut payload = HashMap::new();
1276
1277        if let Some(params) = params {
1278            if let Ok(p) = serde_json::to_value(params) {
1279                if let Some(o) = p.as_object() {
1280                    payload.extend(o.iter().map(|(k, v)| (k.as_str(), v.clone())));
1281                }
1282            }
1283        }
1284
1285        let res = self
1286            .api_get::<serde_json::Value>(&format!("data/{}", table), None)
1287            .await?;
1288        Ok(res)
1289    }
1290
1291    /// Delete a record.
1292    pub async fn data_delete(
1293        &self,
1294        table: &str,
1295        params: Option<RequestParams>,
1296    ) -> Result<serde_json::Value, reqwest::Error> {
1297        let mut payload = HashMap::new();
1298
1299        if let Ok(params) = serde_json::to_value(params) {
1300            if let Ok(params) = serde_json::to_value(params) {
1301                if let Some(ref p) = params.as_object() {
1302                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1303                }
1304            }
1305        }
1306
1307        let res = self
1308            .api_delete(&format!("data/{}", table), Some(payload))
1309            .await?;
1310        res.json().await
1311    }
1312}
1313
#[cfg(test)]
mod tests {
    use super::*;
    use dotenv::dotenv;
    use lazy_static::lazy_static;
    use reqwest::ClientBuilder;

    /// Content type sent with every request issued by these tests.
    const JSON: &str = "application/json";

    /// Page targeted by the URL-based endpoint tests.
    const TARGET: &str = "https://example.com";

    lazy_static! {
        /// Client shared by all tests. `.env` is loaded first (a missing file
        /// is ignored) and no API key is passed explicitly.
        static ref SPIDER_CLIENT: Spider = {
            dotenv().ok();
            let http = ClientBuilder::new()
                .user_agent("SpiderBot")
                .build()
                .unwrap();

            Spider::new_with_client(None, http).expect("client to build")
        };
    }

    /// Scraping a single page succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_scrape_url() {
        let outcome = SPIDER_CLIENT.scrape_url(TARGET, None, JSON).await;
        assert!(outcome.is_ok());
    }

    /// A non-streaming crawl without parameters or callback succeeds.
    #[tokio::test]
    async fn test_crawl_url() {
        let outcome = SPIDER_CLIENT
            .crawl_url(TARGET, None, false, JSON, None::<fn(serde_json::Value)>)
            .await;
        assert!(outcome.is_ok());
    }

    /// Link extraction succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_links() {
        let outcome: Result<serde_json::Value, Error> =
            SPIDER_CLIENT.links(TARGET, None, false, JSON).await;
        assert!(outcome.is_ok());
    }

    /// A screenshot limited to a single page succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_screenshot() {
        let params = RequestParams {
            limit: Some(1),
            ..Default::default()
        };

        let outcome = SPIDER_CLIENT
            .screenshot(TARGET, Some(params), false, JSON)
            .await;
        assert!(outcome.is_ok());
    }

    // #[tokio::test(flavor = "multi_thread")]
    // async fn test_search() {
    //     let mut params = SearchRequestParams::default();

    //     params.search_limit = Some(1);
    //     params.num = Some(1);
    //     params.fetch_page_content = Some(false);

    //     let response = SPIDER_CLIENT
    //         .search("a sports website", Some(params), false, "application/json")
    //         .await;

    //     assert!(response.is_ok());
    // }

    /// Transforming a single HTML record succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_transform() {
        let record = HashMap::from([(
            "<html><body><h1>Transformation</h1></body></html>",
            "",
        )]);

        let outcome = SPIDER_CLIENT
            .transform(vec![record], None, false, JSON)
            .await;
        assert!(outcome.is_ok());
    }

    /// Contact extraction succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_extract_contacts() {
        let outcome = SPIDER_CLIENT
            .extract_contacts(TARGET, None, false, JSON)
            .await;
        assert!(outcome.is_ok());
    }

    /// Labelling succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_label() {
        let outcome = SPIDER_CLIENT.label(TARGET, None, false, JSON).await;
        assert!(outcome.is_ok());
    }

    /// Signing a storage URL without extra options succeeds.
    #[tokio::test]
    async fn test_create_signed_url() {
        let outcome = SPIDER_CLIENT
            .create_signed_url(Some("example.com"), None)
            .await;
        assert!(outcome.is_ok());
    }

    /// Looking up the crawl state without extra parameters succeeds.
    #[tokio::test]
    async fn test_get_crawl_state() {
        let outcome = SPIDER_CLIENT.get_crawl_state(TARGET, None, JSON).await;
        assert!(outcome.is_ok());
    }

    /// Querying the global DB by domain succeeds.
    #[tokio::test]
    async fn test_query() {
        let request = QueryRequest {
            domain: Some("spider.cloud".into()),
            ..Default::default()
        };

        let outcome = SPIDER_CLIENT.query(&request).await;
        assert!(outcome.is_ok());
    }

    /// Fetching the remaining account credits succeeds.
    #[tokio::test]
    async fn test_get_credits() {
        let outcome = SPIDER_CLIENT.get_credits().await;
        assert!(outcome.is_ok());
    }
}