spider_client/
lib.rs

1//! The `spider-client` module provides the primary interface and
2//! functionalities for the Spider web crawler library, which is
3//! designed for rapid and efficient crawling of web pages to gather
4//! links using isolated contexts.
5//!
6//! ### Features
7//!
//! - **Multi-threaded Crawling:** Spider can use multiple
//!   threads to parallelize the crawling process, drastically
//!   improving performance and making it possible to gather
//!   millions of pages in a short time.
12//!
13//! - **Configurable:** The library provides various options to
14//!   configure the crawling behavior, such as setting the depth
15//!   of crawling, user-agent strings, delays between requests,
16//!   and more.
17//!
18//! - **Link Gathering:** One of the primary objectives of Spider is to
19//!   gather and manage links from the web pages it crawls,
20//!   compiling them into a structured format for further use.
21//!
22//! ### Examples
23//!
24//! Basic usage of the Spider client might look like this:
25//!
26//! ```rust
27//! use spider_client::{Spider, RequestType, RequestParams};
28//! use tokio;
29//!
30//!  # #[ignore]
31//! #[tokio::main]
32//! async fn main() {
33//!     let spider = Spider::new(Some("myspiderapikey".into())).expect("API key must be provided");
34//!
35//!     let url = "https://spider.cloud";
36//!
37//!     // Scrape a single URL
38//!     let scraped_data = spider.scrape_url(url, None, "application/json").await.expect("Failed to scrape the URL");
39//!
40//!     println!("Scraped Data: {:?}", scraped_data);
41//!
42//!     // Crawl a website
43//!     let crawler_params = RequestParams {
44//!         limit: Some(1),
45//!         proxy_enabled: Some(true),
46//!         store_data: Some(false),
47//!         metadata: Some(false),
48//!         request: Some(RequestType::Http),
49//!         ..Default::default()
50//!     };
51//!
52//!     let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL");
53//!
54//!     println!("Crawl Result: {:?}", crawl_result);
55//! }
56//! ```
57//!
58//! ### Modules
59//!
60//! - `config`: Contains the configuration options for the Spider client.
61//! - `utils`: Utility functions used by the Spider client.
62//!
63
64use backon::ExponentialBuilder;
65use backon::Retryable;
66use reqwest::Client;
67use reqwest::{Error, Response};
68use serde::{Deserialize, Serialize};
69use std::collections::HashMap;
70use tokio_stream::StreamExt;
71
72/// Structure representing the Chunking algorithm dictionary.
73#[derive(Debug, Deserialize, Serialize, Clone)]
74pub struct ChunkingAlgDict {
75    /// The chunking algorithm to use, defined as a specific type.
76    r#type: ChunkingType,
77    /// The amount to chunk by.
78    value: i32,
79}
80
81// The nested structures
82#[derive(Serialize, Deserialize, Debug, Clone)]
83pub struct Timeout {
84    /// The seconds up to 60.
85    pub secs: u64,
86    /// The nanoseconds.
87    pub nanos: u32,
88}
89
90#[derive(Serialize, Deserialize, Debug, Clone)]
91pub struct IdleNetwork {
92    /// The timeout to wait until.
93    pub timeout: Timeout,
94}
95
96#[derive(Serialize, Deserialize, Debug, Clone)]
97#[serde(tag = "type", rename_all = "PascalCase")]
98pub enum WebAutomation {
99    Evaluate { code: String },
100    Click { selector: String },
101    Wait { duration: u64 },
102    WaitForNavigation,
103    WaitFor { selector: String },
104    WaitForAndClick { selector: String },
105    ScrollX { pixels: i32 },
106    ScrollY { pixels: i32 },
107    Fill { selector: String, value: String },
108    InfiniteScroll { times: u32 },
109}
110
111#[derive(Default, Serialize, Deserialize, Debug, Clone)]
112#[serde(tag = "type", rename_all = "PascalCase")]
113pub enum RedirectPolicy {
114    Loose,
115    #[default]
116    Strict,
117}
118
119pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
120pub type ExecutionScriptsMap = std::collections::HashMap<String, String>;
121
122#[derive(Serialize, Deserialize, Debug, Clone)]
123pub struct Selector {
124    /// The timeout to wait until.
125    pub timeout: Timeout,
126    /// The selector to wait for.
127    pub selector: String,
128}
129
130#[derive(Serialize, Deserialize, Debug, Clone)]
131pub struct Delay {
132    /// The timeout to wait until.
133    pub timeout: Timeout,
134}
135
136#[derive(Serialize, Deserialize, Debug, Clone)]
137pub struct WaitFor {
138    /// Wait until idle networks with a timeout of idleness.
139    pub idle_network: Option<IdleNetwork>,
140    /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
141    pub selector: Option<Selector>,
142    /// Wait until a hard delay.
143    pub delay: Option<Delay>,
144    /// Wait until page navigation happen. Default is true.
145    pub page_navigations: Option<bool>,
146}
147
148/// Query request to get a document.
149#[derive(Serialize, Deserialize, Debug, Clone, Default)]
150pub struct QueryRequest {
151    /// The exact website url.
152    pub url: Option<String>,
153    /// The website domain.
154    pub domain: Option<String>,
155    /// The path of the resource.
156    pub pathname: Option<String>,
157}
158
159/// Enum representing different types of Chunking.
160#[derive(Default, Debug, Deserialize, Serialize, Clone)]
161#[serde(rename_all = "lowercase")]
162pub enum ChunkingType {
163    #[default]
164    /// By the word count.
165    ByWords,
166    /// By the line count.
167    ByLines,
168    /// By the char length.
169    ByCharacterLength,
170    /// By sentence.
171    BySentence,
172}
173
174#[derive(Default, Debug, Deserialize, Serialize, Clone)]
175/// View port handling for chrome.
176pub struct Viewport {
177    /// Device screen Width
178    pub width: u32,
179    /// Device screen size
180    pub height: u32,
181    /// Device scale factor
182    pub device_scale_factor: Option<f64>,
183    /// Emulating Mobile?
184    pub emulating_mobile: bool,
185    /// Use landscape mode instead of portrait.
186    pub is_landscape: bool,
187    /// Touch screen device?
188    pub has_touch: bool,
189}
190
/// Base URL of the Spider API; request helpers append the endpoint path to it.
const API_URL: &str = "https://api.spider.cloud";
193
194// Define the CSSSelector struct
195#[derive(Debug, Clone, Default, Deserialize, Serialize)]
196pub struct CSSSelector {
197    /// The name of the selector group
198    pub name: String,
199    /// A vector of CSS selectors
200    pub selectors: Vec<String>,
201}
202
203// Define the CSSExtractionMap type
204pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
205
206/// Represents the settings for a webhook configuration
207#[derive(Debug, Default, Deserialize, Serialize, Clone)]
208pub struct WebhookSettings {
209    /// The destination where the webhook information will be sent
210    destination: String,
211    /// Trigger an action when all credits are depleted
212    on_credits_depleted: bool,
213    /// Trigger an action when half of the credits are depleted
214    on_credits_half_depleted: bool,
215    /// Trigger an action on a website status update event
216    on_website_status: bool,
217    /// Send information about a new page find (such as links and bytes)
218    on_find: bool,
219    /// Handle the metadata of a found page
220    on_find_metadata: bool,
221}
222
223/// Send multiple return formats.
224#[derive(Debug, Deserialize, Serialize, Clone)]
225#[serde(untagged)]
226pub enum ReturnFormatHandling {
227    /// A single return item.
228    Single(ReturnFormat),
229    /// Multiple return formats.
230    Multi(std::collections::HashSet<ReturnFormat>),
231}
232
233impl Default for ReturnFormatHandling {
234    fn default() -> ReturnFormatHandling {
235        ReturnFormatHandling::Single(ReturnFormat::Raw)
236    }
237}
238
239#[derive(Debug, Default, Deserialize, Serialize, Clone)]
240pub struct EventTracker {
241    /// The responses received.
242    responses: Option<bool>,
243    ///The request sent.
244    requests: Option<bool>
245}
246
247/// Structure representing request parameters.
248#[derive(Debug, Default, Deserialize, Serialize, Clone)]
249pub struct RequestParams {
250    #[serde(default)]
251    /// The URL to be crawled.
252    pub url: Option<String>,
253    #[serde(default)]
254    /// The type of request to be made.
255    pub request: Option<RequestType>,
256    #[serde(default)]
257    /// The maximum number of pages the crawler should visit.
258    pub limit: Option<u32>,
259    #[serde(default)]
260    /// The format in which the result should be returned.
261    pub return_format: Option<ReturnFormatHandling>,
262    #[serde(default)]
263    /// Specifies whether to only visit the top-level domain.
264    pub tld: Option<bool>,
265    #[serde(default)]
266    /// The depth of the crawl.
267    pub depth: Option<u32>,
268    #[serde(default)]
269    /// Specifies whether the request should be cached.
270    pub cache: Option<bool>,
271    #[serde(default)]
272    /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
273    pub scroll: Option<u32>,
274    #[serde(default)]
275    /// The budget for various resources.
276    pub budget: Option<HashMap<String, u32>>,
277    #[serde(default)]
278    /// The blacklist routes to ignore. This can be a Regex string pattern.
279    pub blacklist: Option<Vec<String>>,
280    #[serde(default)]
281    /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
282    pub whitelist: Option<Vec<String>>,
283    #[serde(default)]
284    /// The locale to be used during the crawl.
285    pub locale: Option<String>,
286    #[serde(default)]
287    /// The cookies to be set for the request, formatted as a single string.
288    pub cookies: Option<String>,
289    #[serde(default)]
290    /// Specifies whether to use stealth techniques to avoid detection.
291    pub stealth: Option<bool>,
292    #[serde(default)]
293    /// The headers to be used for the request.
294    pub headers: Option<HashMap<String, String>>,
295    #[serde(default)]
296    /// Specifies whether anti-bot measures should be used.
297    pub anti_bot: Option<bool>,
298    #[serde(default)]
299    /// Specifies whether to send data via webhooks.
300    pub webhooks: Option<WebhookSettings>,
301    #[serde(default)]
302    /// Specifies whether to include metadata in the response.
303    pub metadata: Option<bool>,
304    #[serde(default)]
305    /// The dimensions of the viewport.
306    pub viewport: Option<Viewport>,
307    #[serde(default)]
308    /// The encoding to be used for the request.
309    pub encoding: Option<String>,
310    #[serde(default)]
311    /// Specifies whether to include subdomains in the crawl.
312    pub subdomains: Option<bool>,
313    #[serde(default)]
314    /// The user agent string to be used for the request.
315    pub user_agent: Option<String>,
316    #[serde(default)]
317    /// Specifies whether the response data should be stored.
318    pub store_data: Option<bool>,
319    #[serde(default)]
320    /// Configuration settings for GPT (general purpose texture mappings).
321    pub gpt_config: Option<HashMap<String, String>>,
322    #[serde(default)]
323    /// Specifies whether to use fingerprinting protection.
324    pub fingerprint: Option<bool>,
325    #[serde(default)]
326    /// Specifies whether to perform the request without using storage.
327    pub storageless: Option<bool>,
328    #[serde(default)]
329    /// Specifies whether readability optimizations should be applied.
330    pub readability: Option<bool>,
331    #[serde(default)]
332    /// Specifies whether to use a proxy for the request.
333    pub proxy_enabled: Option<bool>,
334    #[serde(default)]
335    /// Specifies whether to respect the site's robots.txt file.
336    pub respect_robots: Option<bool>,
337    #[serde(default)]
338    /// CSS selector to be used to filter the content.
339    pub root_selector: Option<String>,
340    #[serde(default)]
341    /// Specifies whether to load all resources of the crawl target.
342    pub full_resources: Option<bool>,
343    #[serde(default)]
344    /// The text string to extract data from.
345    pub text: Option<String>,
346    #[serde(default)]
347    /// Specifies whether to use the sitemap links.
348    pub sitemap: Option<bool>,
349    #[serde(default)]
350    /// External domains to include the crawl.
351    pub external_domains: Option<Vec<String>>,
352    #[serde(default)]
353    /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
354    pub return_embeddings: Option<bool>,
355    #[serde(default)]
356    /// Returns the HTTP response headers.
357    pub return_headers: Option<bool>,
358    #[serde(default)]
359    /// Returns the link(s) found on the page that match the crawler query.
360    pub return_page_links: Option<bool>,
361    #[serde(default)]
362    /// Returns the HTTP response cookies.
363    pub return_cookies: Option<bool>,
364    #[serde(default)]
365    /// The timeout for the request, in milliseconds.
366    pub request_timeout: Option<u8>,
367    #[serde(default)]
368    /// Specifies whether to run the request in the background.
369    pub run_in_background: Option<bool>,
370    #[serde(default)]
371    /// Specifies whether to skip configuration checks.
372    pub skip_config_checks: Option<bool>,
373    #[serde(default)]
374    /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
375    pub css_extraction_map: Option<CSSExtractionMap>,
376    #[serde(default)]
377    /// The chunking algorithm to use.
378    pub chunking_alg: Option<ChunkingAlgDict>,
379    #[serde(default)]
380    /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
381    pub disable_intercept: Option<bool>,
382    #[serde(default)]
383    /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
384    pub wait_for: Option<WaitFor>,
385    #[serde(default)]
386    /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
387    pub execution_scripts: Option<ExecutionScriptsMap>,
388    #[serde(default)]
389    /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
390    pub automation_scripts: Option<WebAutomationMap>,
391    #[serde(default)]
392    /// The redirect policy for HTTP request. Set the value to Loose to allow all.
393    pub redirect_policy: Option<RedirectPolicy>,
394    #[serde(default)]
395    /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
396    pub event_tracker: Option<EventTracker>,
397    #[serde(default)]
398    /// The timeout to stop the crawl.
399    pub crawl_timeout: Option<Timeout>,
400    #[serde(default)]
401    /// Evaluates given script in every frame upon creation (before loading frame's scripts).
402    pub evaluate_on_new_document: Option<Box<String>>
403}
404
405/// The structure representing request parameters for a search request.
406#[derive(Debug, Default, Deserialize, Serialize, Clone)]
407pub struct SearchRequestParams {
408    /// The base request parameters.
409    #[serde(default, flatten)]
410    pub base: RequestParams,
411    // The search request.
412    pub search: String,
413    /// The search limit.
414    pub search_limit: Option<u32>,
415    // Fetch the page content. Defaults to true.
416    pub fetch_page_content: Option<bool>,
417    /// The search location of the request
418    pub location: Option<String>,
419    /// The country code of the request
420    pub country: Option<String>,
421    /// The language code of the request.
422    pub language: Option<String>,
423    /// The number of search results
424    pub num: Option<u32>,
425    /// The page of the search results.
426    pub page: Option<u32>,
427    #[serde(default)]
428    /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
429    pub website_limit: Option<u32>,
430}
431
432/// Structure representing request parameters for transforming files.
433#[derive(Debug, Default, Deserialize, Serialize, Clone)]
434pub struct TransformParams {
435    #[serde(default)]
436    /// The format in which the result should be returned.
437    pub return_format: Option<ReturnFormat>,
438    #[serde(default)]
439    /// Specifies whether readability optimizations should be applied.
440    pub readability: Option<bool>,
441    #[serde(default)]
442    /// Clean the markdown or text for AI.
443    pub clean: Option<bool>,
444    #[serde(default)]
445    /// Clean the markdown or text for AI removing footers, navigation, and more.
446    pub clean_full: Option<bool>,
447    /// The data being transformed.
448    pub data: Vec<DataParam>,
449}
450
451#[derive(Serialize, Deserialize, Debug, Clone)]
452pub struct DataParam {
453    /// The HTML resource.
454    pub html: String,
455    /// The website url.
456    pub url: Option<String>,
457}
458
459/// the request type to perform
460#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
461#[serde(rename_all = "lowercase")]
462pub enum RequestType {
463    /// Default HTTP request
464    Http,
465    /// Chrome browser rendering
466    Chrome,
467    #[default]
468    /// Smart mode defaulting to HTTP and using Chrome when needed.
469    SmartMode,
470}
471
472/// Enum representing different return formats.
473#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
474#[serde(rename_all = "lowercase")]
475pub enum ReturnFormat {
476    #[default]
477    /// The default return format of the resource.
478    Raw,
479    /// Return the response as Markdown.
480    Markdown,
481    /// Return the response as Commonmark.
482    Commonmark,
483    /// Return the response as Html2text.
484    Html2text,
485    /// Return the response as Text.
486    Text,
487    /// Return the response as XML.
488    Xml,
489    /// Return the response as Bytes.
490    Bytes,
491}
492
493/// Represents a Spider with API key and HTTP client.
494#[derive(Debug, Default)]
495pub struct Spider {
496    /// The Spider API key.
497    pub api_key: String,
498    /// The Spider Client to re-use.
499    pub client: Client,
500}
501
502impl Spider {
503    /// Creates a new instance of Spider.
504    ///
505    /// # Arguments
506    ///
507    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
508    ///
509    /// # Returns
510    ///
511    /// A new instance of Spider or an error string if no API key is provided.
512    pub fn new(api_key: Option<String>) -> Result<Self, &'static str> {
513        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
514
515        match api_key {
516            Some(key) => Ok(Self {
517                api_key: key,
518                client: Client::new(),
519            }),
520            None => Err("No API key provided"),
521        }
522    }
523
524    /// Creates a new instance of Spider.
525    ///
526    /// # Arguments
527    ///
528    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
529    /// * `client` - A custom client to pass in.
530    ///
531    /// # Returns
532    ///
533    /// A new instance of Spider or an error string if no API key is provided.
534    pub fn new_with_client(api_key: Option<String>, client: Client) -> Result<Self, &'static str> {
535        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
536
537        match api_key {
538            Some(key) => Ok(Self {
539                api_key: key,
540                client,
541            }),
542            None => Err("No API key provided"),
543        }
544    }
545
546    /// Sends a POST request to the API.
547    ///
548    /// # Arguments
549    ///
550    /// * `endpoint` - The API endpoint.
551    /// * `data` - The request data as a HashMap.
552    /// * `stream` - Whether streaming is enabled.
553    /// * `content_type` - The content type of the request.
554    ///
555    /// # Returns
556    ///
557    /// The response from the API.
558    async fn api_post_base(
559        &self,
560        endpoint: &str,
561        data: impl Serialize + Sized + std::fmt::Debug,
562        content_type: &str,
563    ) -> Result<Response, Error> {
564        let url: String = format!("{API_URL}/{}", endpoint);
565
566        self.client
567            .post(&url)
568            .header(
569                "User-Agent",
570                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
571            )
572            .header("Content-Type", content_type)
573            .header("Authorization", format!("Bearer {}", self.api_key))
574            .json(&data)
575            .send()
576            .await
577    }
578
579    /// Sends a POST request to the API.
580    ///
581    /// # Arguments
582    ///
583    /// * `endpoint` - The API endpoint.
584    /// * `data` - The request data as a HashMap.
585    /// * `stream` - Whether streaming is enabled.
586    /// * `content_type` - The content type of the request.
587    ///
588    /// # Returns
589    ///
590    /// The response from the API.
591    async fn api_post(
592        &self,
593        endpoint: &str,
594        data: impl Serialize + std::fmt::Debug + Clone + Send + Sync,
595        content_type: &str,
596    ) -> Result<Response, Error> {
597        let fetch = || async {
598            self.api_post_base(endpoint, data.to_owned(), content_type)
599                .await
600        };
601
602        fetch
603            .retry(ExponentialBuilder::default().with_max_times(5))
604            .when(|err: &reqwest::Error| {
605                if let Some(status) = err.status() {
606                    status.is_server_error()
607                } else {
608                    err.is_timeout()
609                }
610            })
611            .await
612    }
613
614    /// Sends a GET request to the API.
615    ///
616    /// # Arguments
617    ///
618    /// * `endpoint` - The API endpoint.
619    ///
620    /// # Returns
621    ///
622    /// The response from the API as a JSON value.
623    async fn api_get_base<T: Serialize>(
624        &self,
625        endpoint: &str,
626        query_params: Option<&T>,
627    ) -> Result<serde_json::Value, reqwest::Error> {
628        let url = format!("{API_URL}/{}", endpoint);
629        let res = self
630            .client
631            .get(&url)
632            .query(&query_params)
633            .header(
634                "User-Agent",
635                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
636            )
637            .header("Content-Type", "application/json")
638            .header("Authorization", format!("Bearer {}", self.api_key))
639            .send()
640            .await?;
641        res.json().await
642    }
643
644    /// Sends a GET request to the API.
645    ///
646    /// # Arguments
647    ///
648    /// * `endpoint` - The API endpoint.
649    ///
650    /// # Returns
651    ///
652    /// The response from the API as a JSON value.
653    async fn api_get<T: Serialize>(
654        &self,
655        endpoint: &str,
656        query_params: Option<&T>,
657    ) -> Result<serde_json::Value, reqwest::Error> {
658        let fetch = || async { self.api_get_base(endpoint, query_params.to_owned()).await };
659
660        fetch
661            .retry(ExponentialBuilder::default().with_max_times(5))
662            .when(|err: &reqwest::Error| {
663                if let Some(status) = err.status() {
664                    status.is_server_error()
665                } else {
666                    err.is_timeout()
667                }
668            })
669            .await
670    }
671
672    /// Sends a DELETE request to the API.
673    ///
674    /// # Arguments
675    ///
676    /// * `endpoint` - The API endpoint.
677    /// * `params` - Optional request parameters.
678    /// * `stream` - Whether streaming is enabled.
679    /// * `content_type` - The content type of the request.
680    ///
681    /// # Returns
682    ///
683    /// The response from the API.
684    async fn api_delete_base(
685        &self,
686        endpoint: &str,
687        params: Option<HashMap<String, serde_json::Value>>,
688    ) -> Result<Response, Error> {
689        let url = format!("{API_URL}/v1/{}", endpoint);
690        let request_builder = self
691            .client
692            .delete(&url)
693            .header(
694                "User-Agent",
695                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
696            )
697            .header("Content-Type", "application/json")
698            .header("Authorization", format!("Bearer {}", self.api_key));
699
700        let request_builder = if let Some(params) = params {
701            request_builder.json(&params)
702        } else {
703            request_builder
704        };
705
706        request_builder.send().await
707    }
708
709    /// Sends a DELETE request to the API.
710    ///
711    /// # Arguments
712    ///
713    /// * `endpoint` - The API endpoint.
714    /// * `params` - Optional request parameters.
715    /// * `stream` - Whether streaming is enabled.
716    /// * `content_type` - The content type of the request.
717    ///
718    /// # Returns
719    ///
720    /// The response from the API.
721    async fn api_delete(
722        &self,
723        endpoint: &str,
724        params: Option<HashMap<String, serde_json::Value>>,
725    ) -> Result<Response, Error> {
726        let fetch = || async { self.api_delete_base(endpoint, params.to_owned()).await };
727
728        fetch
729            .retry(ExponentialBuilder::default().with_max_times(5))
730            .when(|err: &reqwest::Error| {
731                if let Some(status) = err.status() {
732                    status.is_server_error()
733                } else {
734                    err.is_timeout()
735                }
736            })
737            .await
738    }
739
740    /// Scrapes a URL.
741    ///
742    /// # Arguments
743    ///
744    /// * `url` - The URL to scrape.
745    /// * `params` - Optional request parameters.
746    /// * `stream` - Whether streaming is enabled.
747    /// * `content_type` - The content type of the request.
748    ///
749    /// # Returns
750    ///
751    /// The response from the API as a JSON value.
752    pub async fn scrape_url(
753        &self,
754        url: &str,
755        params: Option<RequestParams>,
756        content_type: &str,
757    ) -> Result<serde_json::Value, reqwest::Error> {
758        let mut data = HashMap::new();
759
760        data.insert(
761            "url".to_string(),
762            serde_json::Value::String(url.to_string()),
763        );
764        data.insert("limit".to_string(), serde_json::Value::Number(1.into()));
765
766        if let Ok(params) = serde_json::to_value(params) {
767            if let Some(ref p) = params.as_object() {
768                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
769            }
770        }
771
772        let res = self.api_post("crawl", data, content_type).await?;
773        res.json().await
774    }
775
776    /// Crawls a URL.
777    ///
778    /// # Arguments
779    ///
780    /// * `url` - The URL to crawl.
781    /// * `params` - Optional request parameters.
782    /// * `stream` - Whether streaming is enabled.
783    /// * `content_type` - The content type of the request.
784    /// * `callback` - Optional callback function to handle each streamed chunk.
785    ///
786    /// # Returns
787    ///
788    /// The response from the API as a JSON value.
789    pub async fn crawl_url(
790        &self,
791        url: &str,
792        params: Option<RequestParams>,
793        stream: bool,
794        content_type: &str,
795        callback: Option<impl Fn(serde_json::Value) + Send>,
796    ) -> Result<serde_json::Value, reqwest::Error> {
797        use tokio_util::codec::{FramedRead, LinesCodec};
798
799        let mut data = HashMap::new();
800
801        if let Ok(params) = serde_json::to_value(params) {
802            if let Some(ref p) = params.as_object() {
803                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
804            }
805        }
806
807        data.insert("url".into(), serde_json::Value::String(url.to_string()));
808
809        let res = self.api_post("crawl", data, content_type).await?;
810
811        if stream {
812            if let Some(callback) = callback {
813                let stream = res.bytes_stream();
814
815                let stream_reader = tokio_util::io::StreamReader::new(
816                    stream.map(|r| r.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))),
817                );
818
819                let mut lines = FramedRead::new(stream_reader, LinesCodec::new());
820
821                while let Some(line_result) = lines.next().await {
822                    match line_result {
823                        Ok(line) => {
824                            match serde_json::from_str::<serde_json::Value>(&line) {
825                                Ok(value) => {
826                                    callback(value);
827                                }
828                                Err(_e) => {
829                                    continue;
830                                }
831                            }
832                        }
833                        Err(_e) => {
834                            return Ok(serde_json::Value::Null)
835                        }
836                    }
837                }
838
839                Ok(serde_json::Value::Null)
840            } else {
841                Ok(serde_json::Value::Null)
842            }
843        } else {
844            res.json().await
845        }
846    }
847
848    /// Fetches links from a URL.
849    ///
850    /// # Arguments
851    ///
852    /// * `url` - The URL to fetch links from.
853    /// * `params` - Optional request parameters.
854    /// * `stream` - Whether streaming is enabled.
855    /// * `content_type` - The content type of the request.
856    ///
857    /// # Returns
858    ///
859    /// The response from the API as a JSON value.
860    pub async fn links(
861        &self,
862        url: &str,
863        params: Option<RequestParams>,
864        _stream: bool,
865        content_type: &str,
866    ) -> Result<serde_json::Value, reqwest::Error> {
867        let mut data = HashMap::new();
868
869        if let Ok(params) = serde_json::to_value(params) {
870            if let Some(ref p) = params.as_object() {
871                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
872            }
873        }
874
875        data.insert("url".into(), serde_json::Value::String(url.to_string()));
876
877        let res = self.api_post("links", data, content_type).await?;
878        res.json().await
879    }
880
881    /// Takes a screenshot of a URL.
882    ///
883    /// # Arguments
884    ///
885    /// * `url` - The URL to take a screenshot of.
886    /// * `params` - Optional request parameters.
887    /// * `stream` - Whether streaming is enabled.
888    /// * `content_type` - The content type of the request.
889    ///
890    /// # Returns
891    ///
892    /// The response from the API as a JSON value.
893    pub async fn screenshot(
894        &self,
895        url: &str,
896        params: Option<RequestParams>,
897        _stream: bool,
898        content_type: &str,
899    ) -> Result<serde_json::Value, reqwest::Error> {
900        let mut data = HashMap::new();
901
902        if let Ok(params) = serde_json::to_value(params) {
903            if let Some(ref p) = params.as_object() {
904                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
905            }
906        }
907
908        data.insert("url".into(), serde_json::Value::String(url.to_string()));
909
910        let res = self.api_post("screenshot", data, content_type).await?;
911        res.json().await
912    }
913
914    /// Searches for a query.
915    ///
916    /// # Arguments
917    ///
918    /// * `q` - The query to search for.
919    /// * `params` - Optional request parameters.
920    /// * `stream` - Whether streaming is enabled.
921    /// * `content_type` - The content type of the request.
922    ///
923    /// # Returns
924    ///
925    /// The response from the API as a JSON value.
926    pub async fn search(
927        &self,
928        q: &str,
929        params: Option<SearchRequestParams>,
930        _stream: bool,
931        content_type: &str,
932    ) -> Result<serde_json::Value, reqwest::Error> {
933        let body = match params {
934            Some(mut params) => {
935                params.search = q.to_string();
936                params
937            }
938            _ => {
939                let mut params = SearchRequestParams::default();
940                params.search = q.to_string();
941                params
942            }
943        };
944
945        let res = self.api_post("search", body, content_type).await?;
946
947        res.json().await
948    }
949
950    /// Transforms data.
951    ///
952    /// # Arguments
953    ///
954    /// * `data` - The data to transform.
955    /// * `params` - Optional request parameters.
956    /// * `stream` - Whether streaming is enabled.
957    /// * `content_type` - The content type of the request.
958    ///
959    /// # Returns
960    ///
961    /// The response from the API as a JSON value.
962    pub async fn transform(
963        &self,
964        data: Vec<HashMap<&str, &str>>,
965        params: Option<TransformParams>,
966        _stream: bool,
967        content_type: &str,
968    ) -> Result<serde_json::Value, reqwest::Error> {
969        let mut payload = HashMap::new();
970
971        if let Ok(params) = serde_json::to_value(params) {
972            if let Some(ref p) = params.as_object() {
973                payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
974            }
975        }
976
977        if let Ok(d) = serde_json::to_value(data) {
978            payload.insert("data".into(), d);
979        }
980
981        let res = self.api_post("transform", payload, content_type).await?;
982
983        res.json().await
984    }
985
986    /// Extracts contacts from a URL.
987    ///
988    /// # Arguments
989    ///
990    /// * `url` - The URL to extract contacts from.
991    /// * `params` - Optional request parameters.
992    /// * `stream` - Whether streaming is enabled.
993    /// * `content_type` - The content type of the request.
994    ///
995    /// # Returns
996    ///
997    /// The response from the API as a JSON value.
998    pub async fn extract_contacts(
999        &self,
1000        url: &str,
1001        params: Option<RequestParams>,
1002        _stream: bool,
1003        content_type: &str,
1004    ) -> Result<serde_json::Value, reqwest::Error> {
1005        let mut data = HashMap::new();
1006
1007        if let Ok(params) = serde_json::to_value(params) {
1008            if let Ok(params) = serde_json::to_value(params) {
1009                if let Some(ref p) = params.as_object() {
1010                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1011                }
1012            }
1013        }
1014
1015        match serde_json::to_value(url) {
1016            Ok(u) => {
1017                data.insert("url".into(), u);
1018            }
1019            _ => (),
1020        }
1021
1022        let res = self
1023            .api_post("pipeline/extract-contacts", data, content_type)
1024            .await?;
1025        res.json().await
1026    }
1027
1028    /// Labels data from a URL.
1029    ///
1030    /// # Arguments
1031    ///
1032    /// * `url` - The URL to label data from.
1033    /// * `params` - Optional request parameters.
1034    /// * `stream` - Whether streaming is enabled.
1035    /// * `content_type` - The content type of the request.
1036    ///
1037    /// # Returns
1038    ///
1039    /// The response from the API as a JSON value.
1040    pub async fn label(
1041        &self,
1042        url: &str,
1043        params: Option<RequestParams>,
1044        _stream: bool,
1045        content_type: &str,
1046    ) -> Result<serde_json::Value, reqwest::Error> {
1047        let mut data = HashMap::new();
1048
1049        if let Ok(params) = serde_json::to_value(params) {
1050            if let Ok(params) = serde_json::to_value(params) {
1051                if let Some(ref p) = params.as_object() {
1052                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1053                }
1054            }
1055        }
1056
1057        data.insert("url".into(), serde_json::Value::String(url.to_string()));
1058
1059        let res = self.api_post("pipeline/label", data, content_type).await?;
1060        res.json().await
1061    }
1062
1063    /// Download a record from storage.
1064    ///
1065    /// # Arguments
1066    ///
1067    /// * `url` - Optional exact url of the file in storage.
1068    /// * `options` - Optional options.
1069    /// * `stream` - Whether streaming is enabled.
1070    ///
1071    /// # Returns
1072    ///
1073    /// The response from the API.
1074    pub async fn download(
1075        &self,
1076        url: Option<&str>,
1077        options: Option<HashMap<&str, i32>>,
1078    ) -> Result<reqwest::Response, reqwest::Error> {
1079        let mut params = HashMap::new();
1080
1081        if let Some(url) = url {
1082            params.insert("url".to_string(), url.to_string());
1083        }
1084
1085        if let Some(options) = options {
1086            for (key, value) in options {
1087                params.insert(key.to_string(), value.to_string());
1088            }
1089        }
1090
1091        let url = format!("{API_URL}/v1/data/download");
1092        let request = self
1093            .client
1094            .get(&url)
1095            .header(
1096                "User-Agent",
1097                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
1098            )
1099            .header("Content-Type", "application/octet-stream")
1100            .header("Authorization", format!("Bearer {}", self.api_key))
1101            .query(&params);
1102
1103        let res = request.send().await?;
1104
1105        Ok(res)
1106    }
1107
1108    /// Creates a signed URL of a file from storage.
1109    ///
1110    /// # Arguments
1111    ///
1112    /// * `url` - Optional exact url of the file in storage.
1113    /// * `options` - Optional options.
1114    /// * `stream` - Whether streaming is enabled.
1115    ///
1116    /// # Returns
1117    ///
1118    /// The response from the API.
1119    pub async fn create_signed_url(
1120        &self,
1121        url: Option<&str>,
1122        options: Option<HashMap<&str, i32>>,
1123    ) -> Result<serde_json::Value, reqwest::Error> {
1124        let mut params = HashMap::new();
1125
1126        if let Some(options) = options {
1127            for (key, value) in options {
1128                params.insert(key.to_string(), value.to_string());
1129            }
1130        }
1131
1132        if let Some(url) = url {
1133            params.insert("url".to_string(), url.to_string());
1134        }
1135
1136        let url = format!("{API_URL}/v1/data/sign-url");
1137        let request = self
1138            .client
1139            .get(&url)
1140            .header(
1141                "User-Agent",
1142                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
1143            )
1144            .header("Authorization", format!("Bearer {}", self.api_key))
1145            .query(&params);
1146
1147        let res = request.send().await?;
1148
1149        res.json().await
1150    }
1151
1152    /// Gets the crawl state of a URL.
1153    ///
1154    /// # Arguments
1155    ///
1156    /// * `url` - The URL to get the crawl state of.
1157    /// * `params` - Optional request parameters.
1158    /// * `stream` - Whether streaming is enabled.
1159    /// * `content_type` - The content type of the request.
1160    ///
1161    /// # Returns
1162    ///
1163    pub async fn get_crawl_state(
1164        &self,
1165        url: &str,
1166        params: Option<RequestParams>,
1167        content_type: &str,
1168    ) -> Result<serde_json::Value, reqwest::Error> {
1169        let mut payload = HashMap::new();
1170        payload.insert("url".into(), serde_json::Value::String(url.to_string()));
1171        payload.insert(
1172            "contentType".into(),
1173            serde_json::Value::String(content_type.to_string()),
1174        );
1175
1176        if let Ok(params) = serde_json::to_value(params) {
1177            if let Ok(params) = serde_json::to_value(params) {
1178                if let Some(ref p) = params.as_object() {
1179                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1180                }
1181            }
1182        }
1183
1184        let res = self
1185            .api_post("data/crawl_state", payload, content_type)
1186            .await?;
1187        res.json().await
1188    }
1189
1190    /// Get the account credits left.
1191    pub async fn get_credits(&self) -> Result<serde_json::Value, reqwest::Error> {
1192        self.api_get::<serde_json::Value>("data/credits", None)
1193            .await
1194    }
1195
1196    /// Send a request for a data record.
1197    pub async fn data_post(
1198        &self,
1199        table: &str,
1200        data: Option<RequestParams>,
1201    ) -> Result<serde_json::Value, reqwest::Error> {
1202        let res = self
1203            .api_post(&format!("data/{}", table), data, "application/json")
1204            .await?;
1205        res.json().await
1206    }
1207
1208    /// Query a record from the global DB.
1209    pub async fn query(&self, params: &QueryRequest) -> Result<serde_json::Value, reqwest::Error> {
1210        let res = self
1211            .api_get::<QueryRequest>(&"data/query", Some(params))
1212            .await?;
1213
1214        Ok(res)
1215    }
1216
1217    /// Get a table record.
1218    pub async fn data_get(
1219        &self,
1220        table: &str,
1221        params: Option<RequestParams>,
1222    ) -> Result<serde_json::Value, reqwest::Error> {
1223        let mut payload = HashMap::new();
1224
1225        if let Some(params) = params {
1226            if let Ok(p) = serde_json::to_value(params) {
1227                if let Some(o) = p.as_object() {
1228                    payload.extend(o.iter().map(|(k, v)| (k.as_str(), v.clone())));
1229                }
1230            }
1231        }
1232
1233        let res = self
1234            .api_get::<serde_json::Value>(&format!("data/{}", table), None)
1235            .await?;
1236        Ok(res)
1237    }
1238
1239    /// Delete a record.
1240    pub async fn data_delete(
1241        &self,
1242        table: &str,
1243        params: Option<RequestParams>,
1244    ) -> Result<serde_json::Value, reqwest::Error> {
1245        let mut payload = HashMap::new();
1246
1247        if let Ok(params) = serde_json::to_value(params) {
1248            if let Ok(params) = serde_json::to_value(params) {
1249                if let Some(ref p) = params.as_object() {
1250                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1251                }
1252            }
1253        }
1254
1255        let res = self
1256            .api_delete(&format!("data/{}", table), Some(payload))
1257            .await?;
1258        res.json().await
1259    }
1260}
1261
#[cfg(test)]
mod tests {
    //! Integration-style tests: every test issues a real request through the shared client.

    use super::*;
    use dotenv::dotenv;
    use lazy_static::lazy_static;
    use reqwest::ClientBuilder;

    lazy_static! {
        /// Client shared by all tests. `.env` is loaded first, and no API key is passed
        /// explicitly to the constructor.
        static ref SPIDER_CLIENT: Spider = {
            dotenv().ok();
            let client = ClientBuilder::new()
                .user_agent("SpiderBot")
                .build()
                .unwrap();

            Spider::new_with_client(None, client).expect("client to build")
        };
    }

    #[tokio::test]
    #[ignore]
    async fn test_scrape_url() {
        let response = SPIDER_CLIENT
            .scrape_url("https://example.com", None, "application/json")
            .await;
        assert!(response.is_ok(), "scrape_url failed: {:?}", response.as_ref().err());
    }

    #[tokio::test]
    async fn test_crawl_url() {
        let response = SPIDER_CLIENT
            .crawl_url(
                "https://example.com",
                None,
                false,
                "application/json",
                None::<fn(serde_json::Value)>,
            )
            .await;
        assert!(response.is_ok(), "crawl_url failed: {:?}", response.as_ref().err());
    }

    #[tokio::test]
    #[ignore]
    async fn test_links() {
        let response = SPIDER_CLIENT
            .links("https://example.com", None, false, "application/json")
            .await;
        assert!(response.is_ok(), "links failed: {:?}", response.as_ref().err());
    }

    #[tokio::test]
    #[ignore]
    async fn test_screenshot() {
        let params = RequestParams {
            limit: Some(1),
            ..Default::default()
        };

        let response = SPIDER_CLIENT
            .screenshot(
                "https://example.com",
                Some(params),
                false,
                "application/json",
            )
            .await;
        assert!(response.is_ok(), "screenshot failed: {:?}", response.as_ref().err());
    }

    // NOTE(review): the search test below is disabled and the reason is not recorded
    // here; either re-enable it (with `#[ignore]` if it needs credentials) or delete it.
    //
    // #[tokio::test(flavor = "multi_thread")]
    // async fn test_search() {
    //     let mut params = SearchRequestParams::default();

    //     params.search_limit = Some(1);
    //     params.num = Some(1);
    //     params.fetch_page_content = Some(false);

    //     let response = SPIDER_CLIENT
    //         .search("a sports website", Some(params), false, "application/json")
    //         .await;

    //     assert!(response.is_ok());
    // }

    #[tokio::test]
    #[ignore]
    async fn test_transform() {
        // Each entry carries its markup under the `html` key; the previous payload used
        // the markup itself as the key with an empty value.
        let data = vec![HashMap::from([(
            "html",
            "<html><body><h1>Transformation</h1></body></html>",
        )])];
        let response = SPIDER_CLIENT
            .transform(data, None, false, "application/json")
            .await;
        assert!(response.is_ok(), "transform failed: {:?}", response.as_ref().err());
    }

    #[tokio::test]
    #[ignore]
    async fn test_extract_contacts() {
        let response = SPIDER_CLIENT
            .extract_contacts("https://example.com", None, false, "application/json")
            .await;
        assert!(
            response.is_ok(),
            "extract_contacts failed: {:?}",
            response.as_ref().err()
        );
    }

    #[tokio::test]
    #[ignore]
    async fn test_label() {
        let response = SPIDER_CLIENT
            .label("https://example.com", None, false, "application/json")
            .await;
        assert!(response.is_ok(), "label failed: {:?}", response.as_ref().err());
    }

    #[tokio::test]
    async fn test_create_signed_url() {
        let response = SPIDER_CLIENT
            .create_signed_url(Some("example.com"), None)
            .await;
        assert!(
            response.is_ok(),
            "create_signed_url failed: {:?}",
            response.as_ref().err()
        );
    }

    #[tokio::test]
    async fn test_get_crawl_state() {
        let response = SPIDER_CLIENT
            .get_crawl_state("https://example.com", None, "application/json")
            .await;
        assert!(
            response.is_ok(),
            "get_crawl_state failed: {:?}",
            response.as_ref().err()
        );
    }

    #[tokio::test]
    async fn test_query() {
        let query = QueryRequest {
            domain: Some("spider.cloud".into()),
            ..QueryRequest::default()
        };

        let response = SPIDER_CLIENT.query(&query).await;
        assert!(response.is_ok(), "query failed: {:?}", response.as_ref().err());
    }

    #[tokio::test]
    async fn test_get_credits() {
        let response = SPIDER_CLIENT.get_credits().await;
        assert!(response.is_ok(), "get_credits failed: {:?}", response.as_ref().err());
    }
}
spider_client/lib.rs

spider_client/
lib.rs