spider_client/
lib.rs

1//! The `spider-client` module provides the primary interface and
2//! functionalities for the Spider web crawler library, which is
3//! designed for rapid and efficient crawling of web pages to gather
4//! links using isolated contexts.
5//!
6//! ### Features
7//!
8//! - **Multi-threaded Crawling:** Spider can utilize multiple
9//!   threads to parallelize the crawling process, drastically
10//!   improving performance and allowing the ability to gather
11//!   millions of pages in a short time.
12//!
13//! - **Configurable:** The library provides various options to
14//!   configure the crawling behavior, such as setting the depth
15//!   of crawling, user-agent strings, delays between requests,
16//!   and more.
17//!
18//! - **Link Gathering:** One of the primary objectives of Spider is to
19//!   gather and manage links from the web pages it crawls,
20//!   compiling them into a structured format for further use.
21//!
22//! ### Examples
23//!
24//! Basic usage of the Spider client might look like this:
25//!
26//! ```rust
27//! use spider_client::{Spider, RequestType, RequestParams};
28//! use tokio;
29//!
30//!  # #[ignore]
31//! #[tokio::main]
32//! async fn main() {
33//!     let spider = Spider::new(Some("myspiderapikey".into())).expect("API key must be provided");
34//!
35//!     let url = "https://spider.cloud";
36//!
37//!     // Scrape a single URL
38//!     let scraped_data = spider.scrape_url(url, None, "application/json").await.expect("Failed to scrape the URL");
39//!
40//!     println!("Scraped Data: {:?}", scraped_data);
41//!
42//!     // Crawl a website
43//!     let crawler_params = RequestParams {
44//!         limit: Some(1),
45//!         proxy_enabled: Some(true),
46//!         store_data: Some(false),
47//!         metadata: Some(false),
48//!         request: Some(RequestType::Http),
49//!         ..Default::default()
50//!     };
51//!
52//!     let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL");
53//!
54//!     println!("Crawl Result: {:?}", crawl_result);
55//! }
56//! ```
57//!
58//! ### Modules
59//!
60//! - `config`: Contains the configuration options for the Spider client.
61//! - `utils`: Utility functions used by the Spider client.
62//!
63
64use backon::ExponentialBuilder;
65use backon::Retryable;
66use reqwest::Client;
67use reqwest::{Error, Response};
68use serde::{Deserialize, Serialize};
69use std::collections::HashMap;
70use tokio_stream::StreamExt;
71
72/// Structure representing the Chunking algorithm dictionary.
73#[derive(Debug, Deserialize, Serialize, Clone)]
74pub struct ChunkingAlgDict {
75    /// The chunking algorithm to use, defined as a specific type.
76    r#type: ChunkingType,
77    /// The amount to chunk by.
78    value: i32,
79}
80
81// The nested structures
82#[derive(Serialize, Deserialize, Debug, Clone)]
83pub struct Timeout {
84    /// The seconds up to 60.
85    pub secs: u64,
86    /// The nanoseconds.
87    pub nanos: u32,
88}
89
90#[derive(Serialize, Deserialize, Debug, Clone)]
91pub struct IdleNetwork {
92    /// The timeout to wait until.
93    pub timeout: Timeout,
94}
95
96#[derive(Serialize, Deserialize, Debug, Clone)]
97#[serde(tag = "type", rename_all = "PascalCase")]
98pub enum WebAutomation {
99    Evaluate { code: String },
100    Click { selector: String },
101    Wait { duration: u64 },
102    WaitForNavigation,
103    WaitFor { selector: String },
104    WaitForAndClick { selector: String },
105    ScrollX { pixels: i32 },
106    ScrollY { pixels: i32 },
107    Fill { selector: String, value: String },
108    InfiniteScroll { times: u32 },
109}
110
111#[derive(Default, Serialize, Deserialize, Debug, Clone)]
112#[serde(tag = "type", rename_all = "PascalCase")]
113pub enum RedirectPolicy {
114    Loose,
115    #[default]
116    Strict,
117}
118
119pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
/// Custom JavaScript source keyed by a url or url path.
pub type ExecutionScriptsMap = HashMap<String, String>;
121
122#[derive(Serialize, Deserialize, Debug, Clone)]
123pub struct Selector {
124    /// The timeout to wait until.
125    pub timeout: Timeout,
126    /// The selector to wait for.
127    pub selector: String,
128}
129
130#[derive(Serialize, Deserialize, Debug, Clone)]
131pub struct Delay {
132    /// The timeout to wait until.
133    pub timeout: Timeout,
134}
135
136#[derive(Serialize, Deserialize, Debug, Clone)]
137pub struct WaitFor {
138    /// Wait until idle networks with a timeout of idleness.
139    pub idle_network: Option<IdleNetwork>,
140    /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
141    pub selector: Option<Selector>,
142    /// Wait until a hard delay.
143    pub delay: Option<Delay>,
144    /// Wait until page navigation happen. Default is true.
145    pub page_navigations: Option<bool>,
146}
147
148/// Query request to get a document.
149#[derive(Serialize, Deserialize, Debug, Clone, Default)]
150pub struct QueryRequest {
151    /// The exact website url.
152    pub url: Option<String>,
153    /// The website domain.
154    pub domain: Option<String>,
155    /// The path of the resource.
156    pub pathname: Option<String>,
157}
158
159/// Enum representing different types of Chunking.
160#[derive(Default, Debug, Deserialize, Serialize, Clone)]
161#[serde(rename_all = "lowercase")]
162pub enum ChunkingType {
163    #[default]
164    /// By the word count.
165    ByWords,
166    /// By the line count.
167    ByLines,
168    /// By the char length.
169    ByCharacterLength,
170    /// By sentence.
171    BySentence,
172}
173
174#[derive(Default, Debug, Deserialize, Serialize, Clone)]
175/// View port handling for chrome.
176pub struct Viewport {
177    /// Device screen Width
178    pub width: u32,
179    /// Device screen size
180    pub height: u32,
181    /// Device scale factor
182    pub device_scale_factor: Option<f64>,
183    /// Emulating Mobile?
184    pub emulating_mobile: bool,
185    /// Use landscape mode instead of portrait.
186    pub is_landscape: bool,
187    /// Touch screen device?
188    pub has_touch: bool,
189}
190
/// Base URL that every API endpoint path is appended to.
const API_URL: &str = "https://api.spider.cloud";
193
194// Define the CSSSelector struct
195#[derive(Debug, Clone, Default, Deserialize, Serialize)]
196pub struct CSSSelector {
197    /// The name of the selector group
198    pub name: String,
199    /// A vector of CSS selectors
200    pub selectors: Vec<String>,
201}
202
203// Define the CSSExtractionMap type
204pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
205
206/// Represents the settings for a webhook configuration
207#[derive(Debug, Default, Deserialize, Serialize, Clone)]
208pub struct WebhookSettings {
209    /// The destination where the webhook information will be sent
210    destination: String,
211    /// Trigger an action when all credits are depleted
212    on_credits_depleted: bool,
213    /// Trigger an action when half of the credits are depleted
214    on_credits_half_depleted: bool,
215    /// Trigger an action on a website status update event
216    on_website_status: bool,
217    /// Send information about a new page find (such as links and bytes)
218    on_find: bool,
219    /// Handle the metadata of a found page
220    on_find_metadata: bool,
221}
222
223/// Send multiple return formats.
224#[derive(Debug, Deserialize, Serialize, Clone)]
225#[serde(untagged)]
226pub enum ReturnFormatHandling {
227    /// A single return item.
228    Single(ReturnFormat),
229    /// Multiple return formats.
230    Multi(std::collections::HashSet<ReturnFormat>),
231}
232
233impl Default for ReturnFormatHandling {
234    fn default() -> ReturnFormatHandling {
235        ReturnFormatHandling::Single(ReturnFormat::Raw)
236    }
237}
238
239#[derive(Debug, Default, Deserialize, Serialize, Clone)]
240pub struct EventTracker {
241    /// The responses received.
242    responses: Option<bool>,
243    ///The request sent.
244    requests: Option<bool>
245}
246
247/// Structure representing request parameters.
248#[derive(Debug, Default, Deserialize, Serialize, Clone)]
249pub struct RequestParams {
250    #[serde(default)]
251    /// The URL to be crawled.
252    pub url: Option<String>,
253    #[serde(default)]
254    /// The type of request to be made.
255    pub request: Option<RequestType>,
256    #[serde(default)]
257    /// The maximum number of pages the crawler should visit.
258    pub limit: Option<u32>,
259    #[serde(default)]
260    /// The format in which the result should be returned.
261    pub return_format: Option<ReturnFormatHandling>,
262    #[serde(default)]
263    /// Specifies whether to only visit the top-level domain.
264    pub tld: Option<bool>,
265    #[serde(default)]
266    /// The depth of the crawl.
267    pub depth: Option<u32>,
268    #[serde(default)]
269    /// Specifies whether the request should be cached.
270    pub cache: Option<bool>,
271    #[serde(default)]
272    /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
273    pub scroll: Option<u32>,
274    #[serde(default)]
275    /// The budget for various resources.
276    pub budget: Option<HashMap<String, u32>>,
277    #[serde(default)]
278    /// The blacklist routes to ignore. This can be a Regex string pattern.
279    pub blacklist: Option<Vec<String>>,
280    #[serde(default)]
281    /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
282    pub whitelist: Option<Vec<String>>,
283    #[serde(default)]
284    /// The locale to be used during the crawl.
285    pub locale: Option<String>,
286    #[serde(default)]
287    /// The cookies to be set for the request, formatted as a single string.
288    pub cookies: Option<String>,
289    #[serde(default)]
290    /// Specifies whether to use stealth techniques to avoid detection.
291    pub stealth: Option<bool>,
292    #[serde(default)]
293    /// The headers to be used for the request.
294    pub headers: Option<HashMap<String, String>>,
295    #[serde(default)]
296    /// Specifies whether anti-bot measures should be used.
297    pub anti_bot: Option<bool>,
298    #[serde(default)]
299    /// Specifies whether to send data via webhooks.
300    pub webhooks: Option<WebhookSettings>,
301    #[serde(default)]
302    /// Specifies whether to include metadata in the response.
303    pub metadata: Option<bool>,
304    #[serde(default)]
305    /// The dimensions of the viewport.
306    pub viewport: Option<Viewport>,
307    #[serde(default)]
308    /// The encoding to be used for the request.
309    pub encoding: Option<String>,
310    #[serde(default)]
311    /// Specifies whether to include subdomains in the crawl.
312    pub subdomains: Option<bool>,
313    #[serde(default)]
314    /// The user agent string to be used for the request.
315    pub user_agent: Option<String>,
316    #[serde(default)]
317    /// Specifies whether the response data should be stored.
318    pub store_data: Option<bool>,
319    #[serde(default)]
320    /// Configuration settings for GPT (general purpose texture mappings).
321    pub gpt_config: Option<HashMap<String, String>>,
322    #[serde(default)]
323    /// Specifies whether to use fingerprinting protection.
324    pub fingerprint: Option<bool>,
325    #[serde(default)]
326    /// Specifies whether to perform the request without using storage.
327    pub storageless: Option<bool>,
328    #[serde(default)]
329    /// Specifies whether readability optimizations should be applied.
330    pub readability: Option<bool>,
331    #[serde(default)]
332    /// Specifies whether to use a proxy for the request.
333    pub proxy_enabled: Option<bool>,
334    #[serde(default)]
335    /// Specifies whether to respect the site's robots.txt file.
336    pub respect_robots: Option<bool>,
337    #[serde(default)]
338    /// CSS selector to be used to filter the content.
339    pub root_selector: Option<String>,
340    #[serde(default)]
341    /// Specifies whether to load all resources of the crawl target.
342    pub full_resources: Option<bool>,
343    #[serde(default)]
344    /// The text string to extract data from.
345    pub text: Option<String>,
346    #[serde(default)]
347    /// Specifies whether to use the sitemap links.
348    pub sitemap: Option<bool>,
349    #[serde(default)]
350    /// External domains to include the crawl.
351    pub external_domains: Option<Vec<String>>,
352    #[serde(default)]
353    /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
354    pub return_embeddings: Option<bool>,
355    #[serde(default)]
356    /// Returns the HTTP response headers.
357    pub return_headers: Option<bool>,
358    #[serde(default)]
359    /// Returns the link(s) found on the page that match the crawler query.
360    pub return_page_links: Option<bool>,
361    #[serde(default)]
362    /// Returns the HTTP response cookies.
363    pub return_cookies: Option<bool>,
364    #[serde(default)]
365    /// The timeout for the request, in milliseconds.
366    pub request_timeout: Option<u8>,
367    #[serde(default)]
368    /// Specifies whether to run the request in the background.
369    pub run_in_background: Option<bool>,
370    #[serde(default)]
371    /// Specifies whether to skip configuration checks.
372    pub skip_config_checks: Option<bool>,
373    #[serde(default)]
374    /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
375    pub css_extraction_map: Option<CSSExtractionMap>,
376    #[serde(default)]
377    /// The chunking algorithm to use.
378    pub chunking_alg: Option<ChunkingAlgDict>,
379    #[serde(default)]
380    /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
381    pub disable_intercept: Option<bool>,
382    #[serde(default)]
383    /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
384    pub wait_for: Option<WaitFor>,
385    #[serde(default)]
386    /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
387    pub execution_scripts: Option<ExecutionScriptsMap>,
388    #[serde(default)]
389    /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
390    pub automation_scripts: Option<WebAutomationMap>,
391    #[serde(default)]
392    /// The redirect policy for HTTP request. Set the value to Loose to allow all.
393    pub redirect_policy: Option<RedirectPolicy>,
394    #[serde(default)]
395    /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
396    pub event_tracker: Option<EventTracker>,
397    #[serde(default)]
398    /// The timeout to stop the crawl.
399    pub crawl_timeout: Option<Timeout>,
400    #[serde(default)]
401    /// Evaluates given script in every frame upon creation (before loading frame's scripts).
402    pub evaluate_on_new_document: Option<Box<String>>,
403    #[serde(default)]
404    /// Runs the request using lite_mode:Lite mode reduces data transfer costs by 70%, with trade-offs in speed, accuracy,
405    /// geo-targeting, and reliability. It’s best suited for non-urgent data collection or when
406    /// targeting websites with minimal anti-bot protections.
407    pub lite_mode: Option<bool>,
408}
409
410/// The structure representing request parameters for a search request.
411#[derive(Debug, Default, Deserialize, Serialize, Clone)]
412pub struct SearchRequestParams {
413    /// The base request parameters.
414    #[serde(default, flatten)]
415    pub base: RequestParams,
416    // The search request.
417    pub search: String,
418    /// The search limit.
419    pub search_limit: Option<u32>,
420    // Fetch the page content. Defaults to true.
421    pub fetch_page_content: Option<bool>,
422    /// The search location of the request
423    pub location: Option<String>,
424    /// The country code of the request
425    pub country: Option<String>,
426    /// The language code of the request.
427    pub language: Option<String>,
428    /// The number of search results
429    pub num: Option<u32>,
430    /// The page of the search results.
431    pub page: Option<u32>,
432    #[serde(default)]
433    /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
434    pub website_limit: Option<u32>,
435}
436
437/// Structure representing request parameters for transforming files.
438#[derive(Debug, Default, Deserialize, Serialize, Clone)]
439pub struct TransformParams {
440    #[serde(default)]
441    /// The format in which the result should be returned.
442    pub return_format: Option<ReturnFormat>,
443    #[serde(default)]
444    /// Specifies whether readability optimizations should be applied.
445    pub readability: Option<bool>,
446    #[serde(default)]
447    /// Clean the markdown or text for AI.
448    pub clean: Option<bool>,
449    #[serde(default)]
450    /// Clean the markdown or text for AI removing footers, navigation, and more.
451    pub clean_full: Option<bool>,
452    /// The data being transformed.
453    pub data: Vec<DataParam>,
454}
455
456#[derive(Serialize, Deserialize, Debug, Clone)]
457pub struct DataParam {
458    /// The HTML resource.
459    pub html: String,
460    /// The website url.
461    pub url: Option<String>,
462}
463
464/// the request type to perform
465#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
466#[serde(rename_all = "lowercase")]
467pub enum RequestType {
468    /// Default HTTP request
469    Http,
470    /// Chrome browser rendering
471    Chrome,
472    #[default]
473    /// Smart mode defaulting to HTTP and using Chrome when needed.
474    SmartMode,
475}
476
477/// Enum representing different return formats.
478#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
479#[serde(rename_all = "lowercase")]
480pub enum ReturnFormat {
481    #[default]
482    /// The default return format of the resource.
483    Raw,
484    /// Return the response as Markdown.
485    Markdown,
486    /// Return the response as Commonmark.
487    Commonmark,
488    /// Return the response as Html2text.
489    Html2text,
490    /// Return the response as Text.
491    Text,
492    /// Return the response as XML.
493    Xml,
494    /// Return the response as Bytes.
495    Bytes,
496}
497
498/// Represents a Spider with API key and HTTP client.
499#[derive(Debug, Default)]
500pub struct Spider {
501    /// The Spider API key.
502    pub api_key: String,
503    /// The Spider Client to re-use.
504    pub client: Client,
505}
506
507impl Spider {
508    /// Creates a new instance of Spider.
509    ///
510    /// # Arguments
511    ///
512    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
513    ///
514    /// # Returns
515    ///
516    /// A new instance of Spider or an error string if no API key is provided.
517    pub fn new(api_key: Option<String>) -> Result<Self, &'static str> {
518        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
519
520        match api_key {
521            Some(key) => Ok(Self {
522                api_key: key,
523                client: Client::new(),
524            }),
525            None => Err("No API key provided"),
526        }
527    }
528
529    /// Creates a new instance of Spider.
530    ///
531    /// # Arguments
532    ///
533    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
534    /// * `client` - A custom client to pass in.
535    ///
536    /// # Returns
537    ///
538    /// A new instance of Spider or an error string if no API key is provided.
539    pub fn new_with_client(api_key: Option<String>, client: Client) -> Result<Self, &'static str> {
540        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
541
542        match api_key {
543            Some(key) => Ok(Self {
544                api_key: key,
545                client,
546            }),
547            None => Err("No API key provided"),
548        }
549    }
550
551    /// Sends a POST request to the API.
552    ///
553    /// # Arguments
554    ///
555    /// * `endpoint` - The API endpoint.
556    /// * `data` - The request data as a HashMap.
557    /// * `stream` - Whether streaming is enabled.
558    /// * `content_type` - The content type of the request.
559    ///
560    /// # Returns
561    ///
562    /// The response from the API.
563    async fn api_post_base(
564        &self,
565        endpoint: &str,
566        data: impl Serialize + Sized + std::fmt::Debug,
567        content_type: &str,
568    ) -> Result<Response, Error> {
569        let url: String = format!("{API_URL}/{}", endpoint);
570
571        self.client
572            .post(&url)
573            .header(
574                "User-Agent",
575                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
576            )
577            .header("Content-Type", content_type)
578            .header("Authorization", format!("Bearer {}", self.api_key))
579            .json(&data)
580            .send()
581            .await
582    }
583
584    /// Sends a POST request to the API.
585    ///
586    /// # Arguments
587    ///
588    /// * `endpoint` - The API endpoint.
589    /// * `data` - The request data as a HashMap.
590    /// * `stream` - Whether streaming is enabled.
591    /// * `content_type` - The content type of the request.
592    ///
593    /// # Returns
594    ///
595    /// The response from the API.
596    async fn api_post(
597        &self,
598        endpoint: &str,
599        data: impl Serialize + std::fmt::Debug + Clone + Send + Sync,
600        content_type: &str,
601    ) -> Result<Response, Error> {
602        let fetch = || async {
603            self.api_post_base(endpoint, data.to_owned(), content_type)
604                .await
605        };
606
607        fetch
608            .retry(ExponentialBuilder::default().with_max_times(5))
609            .when(|err: &reqwest::Error| {
610                if let Some(status) = err.status() {
611                    status.is_server_error()
612                } else {
613                    err.is_timeout()
614                }
615            })
616            .await
617    }
618
619    /// Sends a GET request to the API.
620    ///
621    /// # Arguments
622    ///
623    /// * `endpoint` - The API endpoint.
624    ///
625    /// # Returns
626    ///
627    /// The response from the API as a JSON value.
628    async fn api_get_base<T: Serialize>(
629        &self,
630        endpoint: &str,
631        query_params: Option<&T>,
632    ) -> Result<serde_json::Value, reqwest::Error> {
633        let url = format!("{API_URL}/{}", endpoint);
634        let res = self
635            .client
636            .get(&url)
637            .query(&query_params)
638            .header(
639                "User-Agent",
640                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
641            )
642            .header("Content-Type", "application/json")
643            .header("Authorization", format!("Bearer {}", self.api_key))
644            .send()
645            .await?;
646        res.json().await
647    }
648
649    /// Sends a GET request to the API.
650    ///
651    /// # Arguments
652    ///
653    /// * `endpoint` - The API endpoint.
654    ///
655    /// # Returns
656    ///
657    /// The response from the API as a JSON value.
658    async fn api_get<T: Serialize>(
659        &self,
660        endpoint: &str,
661        query_params: Option<&T>,
662    ) -> Result<serde_json::Value, reqwest::Error> {
663        let fetch = || async { self.api_get_base(endpoint, query_params.to_owned()).await };
664
665        fetch
666            .retry(ExponentialBuilder::default().with_max_times(5))
667            .when(|err: &reqwest::Error| {
668                if let Some(status) = err.status() {
669                    status.is_server_error()
670                } else {
671                    err.is_timeout()
672                }
673            })
674            .await
675    }
676
677    /// Sends a DELETE request to the API.
678    ///
679    /// # Arguments
680    ///
681    /// * `endpoint` - The API endpoint.
682    /// * `params` - Optional request parameters.
683    /// * `stream` - Whether streaming is enabled.
684    /// * `content_type` - The content type of the request.
685    ///
686    /// # Returns
687    ///
688    /// The response from the API.
689    async fn api_delete_base(
690        &self,
691        endpoint: &str,
692        params: Option<HashMap<String, serde_json::Value>>,
693    ) -> Result<Response, Error> {
694        let url = format!("{API_URL}/v1/{}", endpoint);
695        let request_builder = self
696            .client
697            .delete(&url)
698            .header(
699                "User-Agent",
700                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
701            )
702            .header("Content-Type", "application/json")
703            .header("Authorization", format!("Bearer {}", self.api_key));
704
705        let request_builder = if let Some(params) = params {
706            request_builder.json(&params)
707        } else {
708            request_builder
709        };
710
711        request_builder.send().await
712    }
713
714    /// Sends a DELETE request to the API.
715    ///
716    /// # Arguments
717    ///
718    /// * `endpoint` - The API endpoint.
719    /// * `params` - Optional request parameters.
720    /// * `stream` - Whether streaming is enabled.
721    /// * `content_type` - The content type of the request.
722    ///
723    /// # Returns
724    ///
725    /// The response from the API.
726    async fn api_delete(
727        &self,
728        endpoint: &str,
729        params: Option<HashMap<String, serde_json::Value>>,
730    ) -> Result<Response, Error> {
731        let fetch = || async { self.api_delete_base(endpoint, params.to_owned()).await };
732
733        fetch
734            .retry(ExponentialBuilder::default().with_max_times(5))
735            .when(|err: &reqwest::Error| {
736                if let Some(status) = err.status() {
737                    status.is_server_error()
738                } else {
739                    err.is_timeout()
740                }
741            })
742            .await
743    }
744
745    /// Scrapes a URL.
746    ///
747    /// # Arguments
748    ///
749    /// * `url` - The URL to scrape.
750    /// * `params` - Optional request parameters.
751    /// * `stream` - Whether streaming is enabled.
752    /// * `content_type` - The content type of the request.
753    ///
754    /// # Returns
755    ///
756    /// The response from the API as a JSON value.
757    pub async fn scrape_url(
758        &self,
759        url: &str,
760        params: Option<RequestParams>,
761        content_type: &str,
762    ) -> Result<serde_json::Value, reqwest::Error> {
763        let mut data = HashMap::new();
764
765        data.insert(
766            "url".to_string(),
767            serde_json::Value::String(url.to_string()),
768        );
769        data.insert("limit".to_string(), serde_json::Value::Number(1.into()));
770
771        if let Ok(params) = serde_json::to_value(params) {
772            if let Some(ref p) = params.as_object() {
773                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
774            }
775        }
776
777        let res = self.api_post("crawl", data, content_type).await?;
778        res.json().await
779    }
780
781    /// Crawls a URL.
782    ///
783    /// # Arguments
784    ///
785    /// * `url` - The URL to crawl.
786    /// * `params` - Optional request parameters.
787    /// * `stream` - Whether streaming is enabled.
788    /// * `content_type` - The content type of the request.
789    /// * `callback` - Optional callback function to handle each streamed chunk.
790    ///
791    /// # Returns
792    ///
793    /// The response from the API as a JSON value.
794    pub async fn crawl_url(
795        &self,
796        url: &str,
797        params: Option<RequestParams>,
798        stream: bool,
799        content_type: &str,
800        callback: Option<impl Fn(serde_json::Value) + Send>,
801    ) -> Result<serde_json::Value, reqwest::Error> {
802        use tokio_util::codec::{FramedRead, LinesCodec};
803
804        let mut data = HashMap::new();
805
806        if let Ok(params) = serde_json::to_value(params) {
807            if let Some(ref p) = params.as_object() {
808                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
809            }
810        }
811
812        data.insert("url".into(), serde_json::Value::String(url.to_string()));
813
814        let res = self.api_post("crawl", data, content_type).await?;
815
816        if stream {
817            if let Some(callback) = callback {
818                let stream = res.bytes_stream();
819
820                let stream_reader = tokio_util::io::StreamReader::new(
821                    stream.map(|r| r.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))),
822                );
823
824                let mut lines = FramedRead::new(stream_reader, LinesCodec::new());
825
826                while let Some(line_result) = lines.next().await {
827                    match line_result {
828                        Ok(line) => {
829                            match serde_json::from_str::<serde_json::Value>(&line) {
830                                Ok(value) => {
831                                    callback(value);
832                                }
833                                Err(_e) => {
834                                    continue;
835                                }
836                            }
837                        }
838                        Err(_e) => {
839                            return Ok(serde_json::Value::Null)
840                        }
841                    }
842                }
843
844                Ok(serde_json::Value::Null)
845            } else {
846                Ok(serde_json::Value::Null)
847            }
848        } else {
849            res.json().await
850        }
851    }
852
853    /// Fetches links from a URL.
854    ///
855    /// # Arguments
856    ///
857    /// * `url` - The URL to fetch links from.
858    /// * `params` - Optional request parameters.
859    /// * `stream` - Whether streaming is enabled.
860    /// * `content_type` - The content type of the request.
861    ///
862    /// # Returns
863    ///
864    /// The response from the API as a JSON value.
865    pub async fn links(
866        &self,
867        url: &str,
868        params: Option<RequestParams>,
869        _stream: bool,
870        content_type: &str,
871    ) -> Result<serde_json::Value, reqwest::Error> {
872        let mut data = HashMap::new();
873
874        if let Ok(params) = serde_json::to_value(params) {
875            if let Some(ref p) = params.as_object() {
876                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
877            }
878        }
879
880        data.insert("url".into(), serde_json::Value::String(url.to_string()));
881
882        let res = self.api_post("links", data, content_type).await?;
883        res.json().await
884    }
885
886    /// Takes a screenshot of a URL.
887    ///
888    /// # Arguments
889    ///
890    /// * `url` - The URL to take a screenshot of.
891    /// * `params` - Optional request parameters.
892    /// * `stream` - Whether streaming is enabled.
893    /// * `content_type` - The content type of the request.
894    ///
895    /// # Returns
896    ///
897    /// The response from the API as a JSON value.
898    pub async fn screenshot(
899        &self,
900        url: &str,
901        params: Option<RequestParams>,
902        _stream: bool,
903        content_type: &str,
904    ) -> Result<serde_json::Value, reqwest::Error> {
905        let mut data = HashMap::new();
906
907        if let Ok(params) = serde_json::to_value(params) {
908            if let Some(ref p) = params.as_object() {
909                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
910            }
911        }
912
913        data.insert("url".into(), serde_json::Value::String(url.to_string()));
914
915        let res = self.api_post("screenshot", data, content_type).await?;
916        res.json().await
917    }
918
919    /// Searches for a query.
920    ///
921    /// # Arguments
922    ///
923    /// * `q` - The query to search for.
924    /// * `params` - Optional request parameters.
925    /// * `stream` - Whether streaming is enabled.
926    /// * `content_type` - The content type of the request.
927    ///
928    /// # Returns
929    ///
930    /// The response from the API as a JSON value.
931    pub async fn search(
932        &self,
933        q: &str,
934        params: Option<SearchRequestParams>,
935        _stream: bool,
936        content_type: &str,
937    ) -> Result<serde_json::Value, reqwest::Error> {
938        let body = match params {
939            Some(mut params) => {
940                params.search = q.to_string();
941                params
942            }
943            _ => {
944                let mut params = SearchRequestParams::default();
945                params.search = q.to_string();
946                params
947            }
948        };
949
950        let res = self.api_post("search", body, content_type).await?;
951
952        res.json().await
953    }
954
955    /// Transforms data.
956    ///
957    /// # Arguments
958    ///
959    /// * `data` - The data to transform.
960    /// * `params` - Optional request parameters.
961    /// * `stream` - Whether streaming is enabled.
962    /// * `content_type` - The content type of the request.
963    ///
964    /// # Returns
965    ///
966    /// The response from the API as a JSON value.
967    pub async fn transform(
968        &self,
969        data: Vec<HashMap<&str, &str>>,
970        params: Option<TransformParams>,
971        _stream: bool,
972        content_type: &str,
973    ) -> Result<serde_json::Value, reqwest::Error> {
974        let mut payload = HashMap::new();
975
976        if let Ok(params) = serde_json::to_value(params) {
977            if let Some(ref p) = params.as_object() {
978                payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
979            }
980        }
981
982        if let Ok(d) = serde_json::to_value(data) {
983            payload.insert("data".into(), d);
984        }
985
986        let res = self.api_post("transform", payload, content_type).await?;
987
988        res.json().await
989    }
990
991    /// Extracts contacts from a URL.
992    ///
993    /// # Arguments
994    ///
995    /// * `url` - The URL to extract contacts from.
996    /// * `params` - Optional request parameters.
997    /// * `stream` - Whether streaming is enabled.
998    /// * `content_type` - The content type of the request.
999    ///
1000    /// # Returns
1001    ///
1002    /// The response from the API as a JSON value.
1003    pub async fn extract_contacts(
1004        &self,
1005        url: &str,
1006        params: Option<RequestParams>,
1007        _stream: bool,
1008        content_type: &str,
1009    ) -> Result<serde_json::Value, reqwest::Error> {
1010        let mut data = HashMap::new();
1011
1012        if let Ok(params) = serde_json::to_value(params) {
1013            if let Ok(params) = serde_json::to_value(params) {
1014                if let Some(ref p) = params.as_object() {
1015                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1016                }
1017            }
1018        }
1019
1020        match serde_json::to_value(url) {
1021            Ok(u) => {
1022                data.insert("url".into(), u);
1023            }
1024            _ => (),
1025        }
1026
1027        let res = self
1028            .api_post("pipeline/extract-contacts", data, content_type)
1029            .await?;
1030        res.json().await
1031    }
1032
1033    /// Labels data from a URL.
1034    ///
1035    /// # Arguments
1036    ///
1037    /// * `url` - The URL to label data from.
1038    /// * `params` - Optional request parameters.
1039    /// * `stream` - Whether streaming is enabled.
1040    /// * `content_type` - The content type of the request.
1041    ///
1042    /// # Returns
1043    ///
1044    /// The response from the API as a JSON value.
1045    pub async fn label(
1046        &self,
1047        url: &str,
1048        params: Option<RequestParams>,
1049        _stream: bool,
1050        content_type: &str,
1051    ) -> Result<serde_json::Value, reqwest::Error> {
1052        let mut data = HashMap::new();
1053
1054        if let Ok(params) = serde_json::to_value(params) {
1055            if let Ok(params) = serde_json::to_value(params) {
1056                if let Some(ref p) = params.as_object() {
1057                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1058                }
1059            }
1060        }
1061
1062        data.insert("url".into(), serde_json::Value::String(url.to_string()));
1063
1064        let res = self.api_post("pipeline/label", data, content_type).await?;
1065        res.json().await
1066    }
1067
1068    /// Download a record from storage.
1069    ///
1070    /// # Arguments
1071    ///
1072    /// * `url` - Optional exact url of the file in storage.
1073    /// * `options` - Optional options.
1074    /// * `stream` - Whether streaming is enabled.
1075    ///
1076    /// # Returns
1077    ///
1078    /// The response from the API.
1079    pub async fn download(
1080        &self,
1081        url: Option<&str>,
1082        options: Option<HashMap<&str, i32>>,
1083    ) -> Result<reqwest::Response, reqwest::Error> {
1084        let mut params = HashMap::new();
1085
1086        if let Some(url) = url {
1087            params.insert("url".to_string(), url.to_string());
1088        }
1089
1090        if let Some(options) = options {
1091            for (key, value) in options {
1092                params.insert(key.to_string(), value.to_string());
1093            }
1094        }
1095
1096        let url = format!("{API_URL}/v1/data/download");
1097        let request = self
1098            .client
1099            .get(&url)
1100            .header(
1101                "User-Agent",
1102                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
1103            )
1104            .header("Content-Type", "application/octet-stream")
1105            .header("Authorization", format!("Bearer {}", self.api_key))
1106            .query(&params);
1107
1108        let res = request.send().await?;
1109
1110        Ok(res)
1111    }
1112
1113    /// Creates a signed URL of a file from storage.
1114    ///
1115    /// # Arguments
1116    ///
1117    /// * `url` - Optional exact url of the file in storage.
1118    /// * `options` - Optional options.
1119    /// * `stream` - Whether streaming is enabled.
1120    ///
1121    /// # Returns
1122    ///
1123    /// The response from the API.
1124    pub async fn create_signed_url(
1125        &self,
1126        url: Option<&str>,
1127        options: Option<HashMap<&str, i32>>,
1128    ) -> Result<serde_json::Value, reqwest::Error> {
1129        let mut params = HashMap::new();
1130
1131        if let Some(options) = options {
1132            for (key, value) in options {
1133                params.insert(key.to_string(), value.to_string());
1134            }
1135        }
1136
1137        if let Some(url) = url {
1138            params.insert("url".to_string(), url.to_string());
1139        }
1140
1141        let url = format!("{API_URL}/v1/data/sign-url");
1142        let request = self
1143            .client
1144            .get(&url)
1145            .header(
1146                "User-Agent",
1147                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
1148            )
1149            .header("Authorization", format!("Bearer {}", self.api_key))
1150            .query(&params);
1151
1152        let res = request.send().await?;
1153
1154        res.json().await
1155    }
1156
1157    /// Gets the crawl state of a URL.
1158    ///
1159    /// # Arguments
1160    ///
1161    /// * `url` - The URL to get the crawl state of.
1162    /// * `params` - Optional request parameters.
1163    /// * `stream` - Whether streaming is enabled.
1164    /// * `content_type` - The content type of the request.
1165    ///
1166    /// # Returns
1167    ///
1168    pub async fn get_crawl_state(
1169        &self,
1170        url: &str,
1171        params: Option<RequestParams>,
1172        content_type: &str,
1173    ) -> Result<serde_json::Value, reqwest::Error> {
1174        let mut payload = HashMap::new();
1175        payload.insert("url".into(), serde_json::Value::String(url.to_string()));
1176        payload.insert(
1177            "contentType".into(),
1178            serde_json::Value::String(content_type.to_string()),
1179        );
1180
1181        if let Ok(params) = serde_json::to_value(params) {
1182            if let Ok(params) = serde_json::to_value(params) {
1183                if let Some(ref p) = params.as_object() {
1184                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1185                }
1186            }
1187        }
1188
1189        let res = self
1190            .api_post("data/crawl_state", payload, content_type)
1191            .await?;
1192        res.json().await
1193    }
1194
1195    /// Get the account credits left.
1196    pub async fn get_credits(&self) -> Result<serde_json::Value, reqwest::Error> {
1197        self.api_get::<serde_json::Value>("data/credits", None)
1198            .await
1199    }
1200
1201    /// Send a request for a data record.
1202    pub async fn data_post(
1203        &self,
1204        table: &str,
1205        data: Option<RequestParams>,
1206    ) -> Result<serde_json::Value, reqwest::Error> {
1207        let res = self
1208            .api_post(&format!("data/{}", table), data, "application/json")
1209            .await?;
1210        res.json().await
1211    }
1212
1213    /// Query a record from the global DB.
1214    pub async fn query(&self, params: &QueryRequest) -> Result<serde_json::Value, reqwest::Error> {
1215        let res = self
1216            .api_get::<QueryRequest>(&"data/query", Some(params))
1217            .await?;
1218
1219        Ok(res)
1220    }
1221
1222    /// Get a table record.
1223    pub async fn data_get(
1224        &self,
1225        table: &str,
1226        params: Option<RequestParams>,
1227    ) -> Result<serde_json::Value, reqwest::Error> {
1228        let mut payload = HashMap::new();
1229
1230        if let Some(params) = params {
1231            if let Ok(p) = serde_json::to_value(params) {
1232                if let Some(o) = p.as_object() {
1233                    payload.extend(o.iter().map(|(k, v)| (k.as_str(), v.clone())));
1234                }
1235            }
1236        }
1237
1238        let res = self
1239            .api_get::<serde_json::Value>(&format!("data/{}", table), None)
1240            .await?;
1241        Ok(res)
1242    }
1243
1244    /// Delete a record.
1245    pub async fn data_delete(
1246        &self,
1247        table: &str,
1248        params: Option<RequestParams>,
1249    ) -> Result<serde_json::Value, reqwest::Error> {
1250        let mut payload = HashMap::new();
1251
1252        if let Ok(params) = serde_json::to_value(params) {
1253            if let Ok(params) = serde_json::to_value(params) {
1254                if let Some(ref p) = params.as_object() {
1255                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1256                }
1257            }
1258        }
1259
1260        let res = self
1261            .api_delete(&format!("data/{}", table), Some(payload))
1262            .await?;
1263        res.json().await
1264    }
1265}
1266
#[cfg(test)]
mod tests {
    use super::*;
    use dotenv::dotenv;
    use lazy_static::lazy_static;
    use reqwest::ClientBuilder;

    lazy_static! {
        // One client shared by every test. `dotenv` is loaded first and no
        // API key is passed explicitly to the constructor.
        static ref SPIDER_CLIENT: Spider = {
            dotenv().ok();
            let http = ClientBuilder::new()
                .user_agent("SpiderBot")
                .build()
                .unwrap();

            Spider::new_with_client(None, http).expect("client to build")
        };
    }

    /// Scraping a single page should succeed.
    #[tokio::test]
    #[ignore]
    async fn test_scrape_url() {
        let outcome = SPIDER_CLIENT
            .scrape_url("https://example.com", None, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    /// A non-streaming crawl without a callback should succeed.
    #[tokio::test]
    async fn test_crawl_url() {
        let outcome = SPIDER_CLIENT
            .crawl_url(
                "https://example.com",
                None,
                false,
                "application/json",
                None::<fn(serde_json::Value)>,
            )
            .await;
        assert!(outcome.is_ok());
    }

    /// Fetching links should succeed.
    #[tokio::test]
    #[ignore]
    async fn test_links() {
        let outcome: Result<serde_json::Value, Error> = SPIDER_CLIENT
            .links("https://example.com", None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    /// Taking a screenshot with a page limit of one should succeed.
    #[tokio::test]
    #[ignore]
    async fn test_screenshot() {
        let params = RequestParams {
            limit: Some(1),
            ..Default::default()
        };

        let outcome = SPIDER_CLIENT
            .screenshot(
                "https://example.com",
                Some(params),
                false,
                "application/json",
            )
            .await;
        assert!(outcome.is_ok());
    }

    // #[tokio::test(flavor = "multi_thread")]
    // async fn test_search() {
    //     let mut params = SearchRequestParams::default();

    //     params.search_limit = Some(1);
    //     params.num = Some(1);
    //     params.fetch_page_content = Some(false);

    //     let response = SPIDER_CLIENT
    //         .search("a sports website", Some(params), false, "application/json")
    //         .await;

    //     assert!(response.is_ok());
    // }

    /// Transforming a single HTML record should succeed.
    #[tokio::test]
    #[ignore]
    async fn test_transform() {
        let record = HashMap::from([("<html><body><h1>Transformation</h1></body></html>", "")]);
        let outcome = SPIDER_CLIENT
            .transform(vec![record], None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    /// Extracting contacts should succeed.
    #[tokio::test]
    #[ignore]
    async fn test_extract_contacts() {
        let outcome = SPIDER_CLIENT
            .extract_contacts("https://example.com", None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    /// Labeling a page should succeed.
    #[tokio::test]
    #[ignore]
    async fn test_label() {
        let outcome = SPIDER_CLIENT
            .label("https://example.com", None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    /// Creating a signed URL should succeed.
    #[tokio::test]
    async fn test_create_signed_url() {
        let outcome = SPIDER_CLIENT
            .create_signed_url(Some("example.com"), None)
            .await;
        assert!(outcome.is_ok());
    }

    /// Reading the crawl state should succeed.
    #[tokio::test]
    async fn test_get_crawl_state() {
        let outcome = SPIDER_CLIENT
            .get_crawl_state("https://example.com", None, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    /// Querying by domain should succeed.
    #[tokio::test]
    async fn test_query() {
        let request = QueryRequest {
            domain: Some("spider.cloud".into()),
            ..Default::default()
        };

        let outcome = SPIDER_CLIENT.query(&request).await;
        assert!(outcome.is_ok());
    }

    /// Reading the remaining credits should succeed.
    #[tokio::test]
    async fn test_get_credits() {
        let outcome = SPIDER_CLIENT.get_credits().await;
        assert!(outcome.is_ok());
    }
}
1410}
spider_client/lib.rs

spider_client/
lib.rs