spider_client/
lib.rs

1//! The `spider-client` module provides the primary interface and
2//! functionalities for the Spider web crawler library, which is
3//! designed for rapid and efficient crawling of web pages to gather
4//! links using isolated contexts.
5//!
6//! ### Features
7//!
8//! - **Multi-threaded Crawling:** Spider can utilize multiple
9//!   threads to parallelize the crawling process, drastically
10//!   improving performance and allowing the ability to gather
11//!   millions of pages in a short time.
12//!
13//! - **Configurable:** The library provides various options to
14//!   configure the crawling behavior, such as setting the depth
15//!   of crawling, user-agent strings, delays between requests,
16//!   and more.
17//!
18//! - **Link Gathering:** One of the primary objectives of Spider is to
19//!   gather and manage links from the web pages it crawls,
20//!   compiling them into a structured format for further use.
21//!
22//! ### Examples
23//!
24//! Basic usage of the Spider client might look like this:
25//!
26//! ```rust
27//! use spider_client::{Spider, RequestType, RequestParams};
28//! use tokio;
29//!
30//!  # #[ignore]
31//! #[tokio::main]
32//! async fn main() {
33//!     let spider = Spider::new(Some("myspiderapikey".into())).expect("API key must be provided");
34//!
35//!     let url = "https://spider.cloud";
36//!
37//!     // Scrape a single URL
38//!     let scraped_data = spider.scrape_url(url, None, "application/json").await.expect("Failed to scrape the URL");
39//!
40//!     println!("Scraped Data: {:?}", scraped_data);
41//!
42//!     // Crawl a website
43//!     let crawler_params = RequestParams {
44//!         limit: Some(1),
45//!         proxy_enabled: Some(true),
46//!         store_data: Some(false),
47//!         metadata: Some(false),
48//!         request: Some(RequestType::Http),
49//!         ..Default::default()
50//!     };
51//!
52//!     let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL");
53//!
54//!     println!("Crawl Result: {:?}", crawl_result);
55//! }
56//! ```
57//!
58//! ### Modules
59//!
60//! - `config`: Contains the configuration options for the Spider client.
61//! - `utils`: Utility functions used by the Spider client.
62//!
63
64use backon::ExponentialBuilder;
65use backon::Retryable;
66use reqwest::Client;
67use reqwest::{Error, Response};
68use serde::{Deserialize, Serialize};
69use std::collections::HashMap;
70use tokio_stream::StreamExt;
71
72/// Structure representing the Chunking algorithm dictionary.
73#[derive(Debug, Deserialize, Serialize, Clone)]
74pub struct ChunkingAlgDict {
75    /// The chunking algorithm to use, defined as a specific type.
76    r#type: ChunkingType,
77    /// The amount to chunk by.
78    value: i32,
79}
80
81// The nested structures
82#[derive(Serialize, Deserialize, Debug, Clone)]
83pub struct Timeout {
84    /// The seconds up to 60.
85    pub secs: u64,
86    /// The nanoseconds.
87    pub nanos: u32,
88}
89
90#[derive(Serialize, Deserialize, Debug, Clone)]
91pub struct IdleNetwork {
92    /// The timeout to wait until.
93    pub timeout: Timeout,
94}
95
96#[derive(Serialize, Deserialize, Debug, Clone)]
97#[serde(tag = "type", rename_all = "PascalCase")]
98pub enum WebAutomation {
99    Evaluate { code: String },
100    Click { selector: String },
101    Wait { duration: u64 },
102    WaitForNavigation,
103    WaitFor { selector: String },
104    WaitForAndClick { selector: String },
105    ScrollX { pixels: i32 },
106    ScrollY { pixels: i32 },
107    Fill { selector: String, value: String },
108    InfiniteScroll { times: u32 },
109}
110
111#[derive(Default, Serialize, Deserialize, Debug, Clone)]
112#[serde(tag = "type", rename_all = "PascalCase")]
113pub enum RedirectPolicy {
114    Loose,
115    #[default]
116    Strict,
117}
118
119pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
120pub type ExecutionScriptsMap = std::collections::HashMap<String, String>;
121
122#[derive(Serialize, Deserialize, Debug, Clone)]
123pub struct Selector {
124    /// The timeout to wait until.
125    pub timeout: Timeout,
126    /// The selector to wait for.
127    pub selector: String,
128}
129
130#[derive(Serialize, Deserialize, Debug, Clone)]
131pub struct Delay {
132    /// The timeout to wait until.
133    pub timeout: Timeout,
134}
135
136#[derive(Serialize, Deserialize, Debug, Clone)]
137pub struct WaitFor {
138    /// Wait until idle networks with a timeout of idleness.
139    pub idle_network: Option<IdleNetwork>,
140    /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
141    pub selector: Option<Selector>,
142    /// Wait until a hard delay.
143    pub delay: Option<Delay>,
144    /// Wait until page navigation happen. Default is true.
145    pub page_navigations: Option<bool>,
146}
147
148/// Query request to get a document.
149#[derive(Serialize, Deserialize, Debug, Clone, Default)]
150pub struct QueryRequest {
151    /// The exact website url.
152    pub url: Option<String>,
153    /// The website domain.
154    pub domain: Option<String>,
155    /// The path of the resource.
156    pub pathname: Option<String>,
157}
158
159/// Enum representing different types of Chunking.
160#[derive(Default, Debug, Deserialize, Serialize, Clone)]
161#[serde(rename_all = "lowercase")]
162pub enum ChunkingType {
163    #[default]
164    /// By the word count.
165    ByWords,
166    /// By the line count.
167    ByLines,
168    /// By the char length.
169    ByCharacterLength,
170    /// By sentence.
171    BySentence,
172}
173
174#[derive(Default, Debug, Deserialize, Serialize, Clone)]
175/// View port handling for chrome.
176pub struct Viewport {
177    /// Device screen Width
178    pub width: u32,
179    /// Device screen size
180    pub height: u32,
181    /// Device scale factor
182    pub device_scale_factor: Option<f64>,
183    /// Emulating Mobile?
184    pub emulating_mobile: bool,
185    /// Use landscape mode instead of portrait.
186    pub is_landscape: bool,
187    /// Touch screen device?
188    pub has_touch: bool,
189}
190
/// Base URL of the Spider API; endpoint paths are appended after a `/`.
const API_URL: &str = "https://api.spider.cloud";
193
194// Define the CSSSelector struct
195#[derive(Debug, Clone, Default, Deserialize, Serialize)]
196pub struct CSSSelector {
197    /// The name of the selector group
198    pub name: String,
199    /// A vector of CSS selectors
200    pub selectors: Vec<String>,
201}
202
/// Maps a path or page to the [`CSSSelector`] groups used to extract content from it.
pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
205
206/// Represents the settings for a webhook configuration
207#[derive(Debug, Default, Deserialize, Serialize, Clone)]
208pub struct WebhookSettings {
209    /// The destination where the webhook information will be sent
210    destination: String,
211    /// Trigger an action when all credits are depleted
212    on_credits_depleted: bool,
213    /// Trigger an action when half of the credits are depleted
214    on_credits_half_depleted: bool,
215    /// Trigger an action on a website status update event
216    on_website_status: bool,
217    /// Send information about a new page find (such as links and bytes)
218    on_find: bool,
219    /// Handle the metadata of a found page
220    on_find_metadata: bool,
221}
222
223/// Send multiple return formats.
224#[derive(Debug, Deserialize, Serialize, Clone)]
225#[serde(untagged)]
226pub enum ReturnFormatHandling {
227    /// A single return item.
228    Single(ReturnFormat),
229    /// Multiple return formats.
230    Multi(std::collections::HashSet<ReturnFormat>),
231}
232
233impl Default for ReturnFormatHandling {
234    fn default() -> ReturnFormatHandling {
235        ReturnFormatHandling::Single(ReturnFormat::Raw)
236    }
237}
238
239#[derive(Debug, Default, Deserialize, Serialize, Clone)]
240pub struct EventTracker {
241    /// The responses received.
242    responses: Option<bool>,
243    ///The request sent.
244    requests: Option<bool>
245}
246
247/// Structure representing request parameters.
248#[derive(Debug, Default, Deserialize, Serialize, Clone)]
249pub struct RequestParams {
250    #[serde(default)]
251    /// The URL to be crawled.
252    pub url: Option<String>,
253    #[serde(default)]
254    /// The type of request to be made.
255    pub request: Option<RequestType>,
256    #[serde(default)]
257    /// The maximum number of pages the crawler should visit.
258    pub limit: Option<u32>,
259    #[serde(default)]
260    /// The format in which the result should be returned.
261    pub return_format: Option<ReturnFormatHandling>,
262    #[serde(default)]
263    /// Specifies whether to only visit the top-level domain.
264    pub tld: Option<bool>,
265    #[serde(default)]
266    /// The depth of the crawl.
267    pub depth: Option<u32>,
268    #[serde(default)]
269    /// Specifies whether the request should be cached.
270    pub cache: Option<bool>,
271    #[serde(default)]
272    /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
273    pub scroll: Option<u32>,
274    #[serde(default)]
275    /// The budget for various resources.
276    pub budget: Option<HashMap<String, u32>>,
277    #[serde(default)]
278    /// The blacklist routes to ignore. This can be a Regex string pattern.
279    pub blacklist: Option<Vec<String>>,
280    #[serde(default)]
281    /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
282    pub whitelist: Option<Vec<String>>,
283    #[serde(default)]
284    /// The locale to be used during the crawl.
285    pub locale: Option<String>,
286    #[serde(default)]
287    /// The cookies to be set for the request, formatted as a single string.
288    pub cookies: Option<String>,
289    #[serde(default)]
290    /// Specifies whether to use stealth techniques to avoid detection.
291    pub stealth: Option<bool>,
292    #[serde(default)]
293    /// The headers to be used for the request.
294    pub headers: Option<HashMap<String, String>>,
295    #[serde(default)]
296    /// Specifies whether anti-bot measures should be used.
297    pub anti_bot: Option<bool>,
298    #[serde(default)]
299    /// Specifies whether to send data via webhooks.
300    pub webhooks: Option<WebhookSettings>,
301    #[serde(default)]
302    /// Specifies whether to include metadata in the response.
303    pub metadata: Option<bool>,
304    #[serde(default)]
305    /// The dimensions of the viewport.
306    pub viewport: Option<Viewport>,
307    #[serde(default)]
308    /// The encoding to be used for the request.
309    pub encoding: Option<String>,
310    #[serde(default)]
311    /// Specifies whether to include subdomains in the crawl.
312    pub subdomains: Option<bool>,
313    #[serde(default)]
314    /// The user agent string to be used for the request.
315    pub user_agent: Option<String>,
316    #[serde(default)]
317    /// Specifies whether the response data should be stored.
318    pub store_data: Option<bool>,
319    #[serde(default)]
320    /// Configuration settings for GPT (general purpose texture mappings).
321    pub gpt_config: Option<HashMap<String, String>>,
322    #[serde(default)]
323    /// Specifies whether to use fingerprinting protection.
324    pub fingerprint: Option<bool>,
325    #[serde(default)]
326    /// Specifies whether to perform the request without using storage.
327    pub storageless: Option<bool>,
328    #[serde(default)]
329    /// Specifies whether readability optimizations should be applied.
330    pub readability: Option<bool>,
331    #[serde(default)]
332    /// Specifies whether to use a proxy for the request.
333    pub proxy_enabled: Option<bool>,
334    #[serde(default)]
335    /// Specifies whether to respect the site's robots.txt file.
336    pub respect_robots: Option<bool>,
337    #[serde(default)]
338    /// CSS selector to be used to filter the content.
339    pub root_selector: Option<String>,
340    #[serde(default)]
341    /// Specifies whether to load all resources of the crawl target.
342    pub full_resources: Option<bool>,
343    #[serde(default)]
344    /// The text string to extract data from.
345    pub text: Option<String>,
346    #[serde(default)]
347    /// Specifies whether to use the sitemap links.
348    pub sitemap: Option<bool>,
349    #[serde(default)]
350    /// External domains to include the crawl.
351    pub external_domains: Option<Vec<String>>,
352    #[serde(default)]
353    /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
354    pub return_embeddings: Option<bool>,
355    #[serde(default)]
356    /// Returns the HTTP response headers.
357    pub return_headers: Option<bool>,
358    #[serde(default)]
359    /// Returns the link(s) found on the page that match the crawler query.
360    pub return_page_links: Option<bool>,
361    #[serde(default)]
362    /// Returns the HTTP response cookies.
363    pub return_cookies: Option<bool>,
364    #[serde(default)]
365    /// The timeout for the request, in milliseconds.
366    pub request_timeout: Option<u8>,
367    #[serde(default)]
368    /// Specifies whether to run the request in the background.
369    pub run_in_background: Option<bool>,
370    #[serde(default)]
371    /// Specifies whether to skip configuration checks.
372    pub skip_config_checks: Option<bool>,
373    #[serde(default)]
374    /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
375    pub css_extraction_map: Option<CSSExtractionMap>,
376    #[serde(default)]
377    /// The chunking algorithm to use.
378    pub chunking_alg: Option<ChunkingAlgDict>,
379    #[serde(default)]
380    /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
381    pub disable_intercept: Option<bool>,
382    #[serde(default)]
383    /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
384    pub wait_for: Option<WaitFor>,
385    #[serde(default)]
386    /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
387    pub execution_scripts: Option<ExecutionScriptsMap>,
388    #[serde(default)]
389    /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
390    pub automation_scripts: Option<WebAutomationMap>,
391    #[serde(default)]
392    /// The redirect policy for HTTP request. Set the value to Loose to allow all.
393    pub redirect_policy: Option<RedirectPolicy>,
394    #[serde(default)]
395    /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
396    pub event_tracker: Option<EventTracker>,
397    #[serde(default)]
398    /// The timeout to stop the crawl.
399    pub crawl_timeout: Option<Timeout>,
400}
401
402/// The structure representing request parameters for a search request.
403#[derive(Debug, Default, Deserialize, Serialize, Clone)]
404pub struct SearchRequestParams {
405    /// The base request parameters.
406    #[serde(default, flatten)]
407    pub base: RequestParams,
408    // The search request.
409    pub search: String,
410    /// The search limit.
411    pub search_limit: Option<u32>,
412    // Fetch the page content. Defaults to true.
413    pub fetch_page_content: Option<bool>,
414    /// The search location of the request
415    pub location: Option<String>,
416    /// The country code of the request
417    pub country: Option<String>,
418    /// The language code of the request.
419    pub language: Option<String>,
420    /// The number of search results
421    pub num: Option<u32>,
422    /// The page of the search results.
423    pub page: Option<u32>,
424    #[serde(default)]
425    /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
426    pub website_limit: Option<u32>,
427}
428
429/// Structure representing request parameters for transforming files.
430#[derive(Debug, Default, Deserialize, Serialize, Clone)]
431pub struct TransformParams {
432    #[serde(default)]
433    /// The format in which the result should be returned.
434    pub return_format: Option<ReturnFormat>,
435    #[serde(default)]
436    /// Specifies whether readability optimizations should be applied.
437    pub readability: Option<bool>,
438    #[serde(default)]
439    /// Clean the markdown or text for AI.
440    pub clean: Option<bool>,
441    #[serde(default)]
442    /// Clean the markdown or text for AI removing footers, navigation, and more.
443    pub clean_full: Option<bool>,
444    /// The data being transformed.
445    pub data: Vec<DataParam>,
446}
447
448#[derive(Serialize, Deserialize, Debug, Clone)]
449pub struct DataParam {
450    /// The HTML resource.
451    pub html: String,
452    /// The website url.
453    pub url: Option<String>,
454}
455
456/// the request type to perform
457#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
458#[serde(rename_all = "lowercase")]
459pub enum RequestType {
460    /// Default HTTP request
461    Http,
462    /// Chrome browser rendering
463    Chrome,
464    #[default]
465    /// Smart mode defaulting to HTTP and using Chrome when needed.
466    SmartMode,
467}
468
469/// Enum representing different return formats.
470#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
471#[serde(rename_all = "lowercase")]
472pub enum ReturnFormat {
473    #[default]
474    /// The default return format of the resource.
475    Raw,
476    /// Return the response as Markdown.
477    Markdown,
478    /// Return the response as Commonmark.
479    Commonmark,
480    /// Return the response as Html2text.
481    Html2text,
482    /// Return the response as Text.
483    Text,
484    /// Return the response as XML.
485    Xml,
486    /// Return the response as Bytes.
487    Bytes,
488}
489
490/// Represents a Spider with API key and HTTP client.
491#[derive(Debug, Default)]
492pub struct Spider {
493    /// The Spider API key.
494    pub api_key: String,
495    /// The Spider Client to re-use.
496    pub client: Client,
497}
498
499impl Spider {
500    /// Creates a new instance of Spider.
501    ///
502    /// # Arguments
503    ///
504    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
505    ///
506    /// # Returns
507    ///
508    /// A new instance of Spider or an error string if no API key is provided.
509    pub fn new(api_key: Option<String>) -> Result<Self, &'static str> {
510        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
511
512        match api_key {
513            Some(key) => Ok(Self {
514                api_key: key,
515                client: Client::new(),
516            }),
517            None => Err("No API key provided"),
518        }
519    }
520
521    /// Creates a new instance of Spider.
522    ///
523    /// # Arguments
524    ///
525    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
526    /// * `client` - A custom client to pass in.
527    ///
528    /// # Returns
529    ///
530    /// A new instance of Spider or an error string if no API key is provided.
531    pub fn new_with_client(api_key: Option<String>, client: Client) -> Result<Self, &'static str> {
532        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
533
534        match api_key {
535            Some(key) => Ok(Self {
536                api_key: key,
537                client,
538            }),
539            None => Err("No API key provided"),
540        }
541    }
542
543    /// Sends a POST request to the API.
544    ///
545    /// # Arguments
546    ///
547    /// * `endpoint` - The API endpoint.
548    /// * `data` - The request data as a HashMap.
549    /// * `stream` - Whether streaming is enabled.
550    /// * `content_type` - The content type of the request.
551    ///
552    /// # Returns
553    ///
554    /// The response from the API.
555    async fn api_post_base(
556        &self,
557        endpoint: &str,
558        data: impl Serialize + Sized + std::fmt::Debug,
559        content_type: &str,
560    ) -> Result<Response, Error> {
561        let url: String = format!("{API_URL}/{}", endpoint);
562
563        self.client
564            .post(&url)
565            .header(
566                "User-Agent",
567                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
568            )
569            .header("Content-Type", content_type)
570            .header("Authorization", format!("Bearer {}", self.api_key))
571            .json(&data)
572            .send()
573            .await
574    }
575
576    /// Sends a POST request to the API.
577    ///
578    /// # Arguments
579    ///
580    /// * `endpoint` - The API endpoint.
581    /// * `data` - The request data as a HashMap.
582    /// * `stream` - Whether streaming is enabled.
583    /// * `content_type` - The content type of the request.
584    ///
585    /// # Returns
586    ///
587    /// The response from the API.
588    async fn api_post(
589        &self,
590        endpoint: &str,
591        data: impl Serialize + std::fmt::Debug + Clone + Send + Sync,
592        content_type: &str,
593    ) -> Result<Response, Error> {
594        let fetch = || async {
595            self.api_post_base(endpoint, data.to_owned(), content_type)
596                .await
597        };
598
599        fetch
600            .retry(ExponentialBuilder::default().with_max_times(5))
601            .when(|err: &reqwest::Error| {
602                if let Some(status) = err.status() {
603                    status.is_server_error()
604                } else {
605                    err.is_timeout()
606                }
607            })
608            .await
609    }
610
611    /// Sends a GET request to the API.
612    ///
613    /// # Arguments
614    ///
615    /// * `endpoint` - The API endpoint.
616    ///
617    /// # Returns
618    ///
619    /// The response from the API as a JSON value.
620    async fn api_get_base<T: Serialize>(
621        &self,
622        endpoint: &str,
623        query_params: Option<&T>,
624    ) -> Result<serde_json::Value, reqwest::Error> {
625        let url = format!("{API_URL}/{}", endpoint);
626        let res = self
627            .client
628            .get(&url)
629            .query(&query_params)
630            .header(
631                "User-Agent",
632                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
633            )
634            .header("Content-Type", "application/json")
635            .header("Authorization", format!("Bearer {}", self.api_key))
636            .send()
637            .await?;
638        res.json().await
639    }
640
641    /// Sends a GET request to the API.
642    ///
643    /// # Arguments
644    ///
645    /// * `endpoint` - The API endpoint.
646    ///
647    /// # Returns
648    ///
649    /// The response from the API as a JSON value.
650    async fn api_get<T: Serialize>(
651        &self,
652        endpoint: &str,
653        query_params: Option<&T>,
654    ) -> Result<serde_json::Value, reqwest::Error> {
655        let fetch = || async { self.api_get_base(endpoint, query_params.to_owned()).await };
656
657        fetch
658            .retry(ExponentialBuilder::default().with_max_times(5))
659            .when(|err: &reqwest::Error| {
660                if let Some(status) = err.status() {
661                    status.is_server_error()
662                } else {
663                    err.is_timeout()
664                }
665            })
666            .await
667    }
668
669    /// Sends a DELETE request to the API.
670    ///
671    /// # Arguments
672    ///
673    /// * `endpoint` - The API endpoint.
674    /// * `params` - Optional request parameters.
675    /// * `stream` - Whether streaming is enabled.
676    /// * `content_type` - The content type of the request.
677    ///
678    /// # Returns
679    ///
680    /// The response from the API.
681    async fn api_delete_base(
682        &self,
683        endpoint: &str,
684        params: Option<HashMap<String, serde_json::Value>>,
685    ) -> Result<Response, Error> {
686        let url = format!("{API_URL}/v1/{}", endpoint);
687        let request_builder = self
688            .client
689            .delete(&url)
690            .header(
691                "User-Agent",
692                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
693            )
694            .header("Content-Type", "application/json")
695            .header("Authorization", format!("Bearer {}", self.api_key));
696
697        let request_builder = if let Some(params) = params {
698            request_builder.json(&params)
699        } else {
700            request_builder
701        };
702
703        request_builder.send().await
704    }
705
706    /// Sends a DELETE request to the API.
707    ///
708    /// # Arguments
709    ///
710    /// * `endpoint` - The API endpoint.
711    /// * `params` - Optional request parameters.
712    /// * `stream` - Whether streaming is enabled.
713    /// * `content_type` - The content type of the request.
714    ///
715    /// # Returns
716    ///
717    /// The response from the API.
718    async fn api_delete(
719        &self,
720        endpoint: &str,
721        params: Option<HashMap<String, serde_json::Value>>,
722    ) -> Result<Response, Error> {
723        let fetch = || async { self.api_delete_base(endpoint, params.to_owned()).await };
724
725        fetch
726            .retry(ExponentialBuilder::default().with_max_times(5))
727            .when(|err: &reqwest::Error| {
728                if let Some(status) = err.status() {
729                    status.is_server_error()
730                } else {
731                    err.is_timeout()
732                }
733            })
734            .await
735    }
736
737    /// Scrapes a URL.
738    ///
739    /// # Arguments
740    ///
741    /// * `url` - The URL to scrape.
742    /// * `params` - Optional request parameters.
743    /// * `stream` - Whether streaming is enabled.
744    /// * `content_type` - The content type of the request.
745    ///
746    /// # Returns
747    ///
748    /// The response from the API as a JSON value.
749    pub async fn scrape_url(
750        &self,
751        url: &str,
752        params: Option<RequestParams>,
753        content_type: &str,
754    ) -> Result<serde_json::Value, reqwest::Error> {
755        let mut data = HashMap::new();
756
757        data.insert(
758            "url".to_string(),
759            serde_json::Value::String(url.to_string()),
760        );
761        data.insert("limit".to_string(), serde_json::Value::Number(1.into()));
762
763        if let Ok(params) = serde_json::to_value(params) {
764            if let Some(ref p) = params.as_object() {
765                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
766            }
767        }
768
769        let res = self.api_post("crawl", data, content_type).await?;
770        res.json().await
771    }
772
773    /// Crawls a URL.
774    ///
775    /// # Arguments
776    ///
777    /// * `url` - The URL to crawl.
778    /// * `params` - Optional request parameters.
779    /// * `stream` - Whether streaming is enabled.
780    /// * `content_type` - The content type of the request.
781    /// * `callback` - Optional callback function to handle each streamed chunk.
782    ///
783    /// # Returns
784    ///
785    /// The response from the API as a JSON value.
786    pub async fn crawl_url(
787        &self,
788        url: &str,
789        params: Option<RequestParams>,
790        stream: bool,
791        content_type: &str,
792        callback: Option<impl Fn(serde_json::Value) + Send>,
793    ) -> Result<serde_json::Value, reqwest::Error> {
794        use tokio_util::codec::{FramedRead, LinesCodec};
795
796        let mut data = HashMap::new();
797
798        if let Ok(params) = serde_json::to_value(params) {
799            if let Some(ref p) = params.as_object() {
800                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
801            }
802        }
803
804        data.insert("url".into(), serde_json::Value::String(url.to_string()));
805
806        let res = self.api_post("crawl", data, content_type).await?;
807
808        if stream {
809            if let Some(callback) = callback {
810                let stream = res.bytes_stream();
811
812                let stream_reader = tokio_util::io::StreamReader::new(
813                    stream.map(|r| r.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))),
814                );
815
816                let mut lines = FramedRead::new(stream_reader, LinesCodec::new());
817
818                while let Some(line_result) = lines.next().await {
819                    match line_result {
820                        Ok(line) => {
821                            match serde_json::from_str::<serde_json::Value>(&line) {
822                                Ok(value) => {
823                                    callback(value);
824                                }
825                                Err(_e) => {
826                                    continue;
827                                }
828                            }
829                        }
830                        Err(_e) => {
831                            return Ok(serde_json::Value::Null)
832                        }
833                    }
834                }
835
836                Ok(serde_json::Value::Null)
837            } else {
838                Ok(serde_json::Value::Null)
839            }
840        } else {
841            res.json().await
842        }
843    }
844
845    /// Fetches links from a URL.
846    ///
847    /// # Arguments
848    ///
849    /// * `url` - The URL to fetch links from.
850    /// * `params` - Optional request parameters.
851    /// * `stream` - Whether streaming is enabled.
852    /// * `content_type` - The content type of the request.
853    ///
854    /// # Returns
855    ///
856    /// The response from the API as a JSON value.
857    pub async fn links(
858        &self,
859        url: &str,
860        params: Option<RequestParams>,
861        _stream: bool,
862        content_type: &str,
863    ) -> Result<serde_json::Value, reqwest::Error> {
864        let mut data = HashMap::new();
865
866        if let Ok(params) = serde_json::to_value(params) {
867            if let Some(ref p) = params.as_object() {
868                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
869            }
870        }
871
872        data.insert("url".into(), serde_json::Value::String(url.to_string()));
873
874        let res = self.api_post("links", data, content_type).await?;
875        res.json().await
876    }
877
878    /// Takes a screenshot of a URL.
879    ///
880    /// # Arguments
881    ///
882    /// * `url` - The URL to take a screenshot of.
883    /// * `params` - Optional request parameters.
884    /// * `stream` - Whether streaming is enabled.
885    /// * `content_type` - The content type of the request.
886    ///
887    /// # Returns
888    ///
889    /// The response from the API as a JSON value.
890    pub async fn screenshot(
891        &self,
892        url: &str,
893        params: Option<RequestParams>,
894        _stream: bool,
895        content_type: &str,
896    ) -> Result<serde_json::Value, reqwest::Error> {
897        let mut data = HashMap::new();
898
899        if let Ok(params) = serde_json::to_value(params) {
900            if let Some(ref p) = params.as_object() {
901                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
902            }
903        }
904
905        data.insert("url".into(), serde_json::Value::String(url.to_string()));
906
907        let res = self.api_post("screenshot", data, content_type).await?;
908        res.json().await
909    }
910
911    /// Searches for a query.
912    ///
913    /// # Arguments
914    ///
915    /// * `q` - The query to search for.
916    /// * `params` - Optional request parameters.
917    /// * `stream` - Whether streaming is enabled.
918    /// * `content_type` - The content type of the request.
919    ///
920    /// # Returns
921    ///
922    /// The response from the API as a JSON value.
923    pub async fn search(
924        &self,
925        q: &str,
926        params: Option<SearchRequestParams>,
927        _stream: bool,
928        content_type: &str,
929    ) -> Result<serde_json::Value, reqwest::Error> {
930        let body = match params {
931            Some(mut params) => {
932                params.search = q.to_string();
933                params
934            }
935            _ => {
936                let mut params = SearchRequestParams::default();
937                params.search = q.to_string();
938                params
939            }
940        };
941
942        let res = self.api_post("search", body, content_type).await?;
943
944        res.json().await
945    }
946
947    /// Transforms data.
948    ///
949    /// # Arguments
950    ///
951    /// * `data` - The data to transform.
952    /// * `params` - Optional request parameters.
953    /// * `stream` - Whether streaming is enabled.
954    /// * `content_type` - The content type of the request.
955    ///
956    /// # Returns
957    ///
958    /// The response from the API as a JSON value.
959    pub async fn transform(
960        &self,
961        data: Vec<HashMap<&str, &str>>,
962        params: Option<TransformParams>,
963        _stream: bool,
964        content_type: &str,
965    ) -> Result<serde_json::Value, reqwest::Error> {
966        let mut payload = HashMap::new();
967
968        if let Ok(params) = serde_json::to_value(params) {
969            if let Some(ref p) = params.as_object() {
970                payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
971            }
972        }
973
974        if let Ok(d) = serde_json::to_value(data) {
975            payload.insert("data".into(), d);
976        }
977
978        let res = self.api_post("transform", payload, content_type).await?;
979
980        res.json().await
981    }
982
983    /// Extracts contacts from a URL.
984    ///
985    /// # Arguments
986    ///
987    /// * `url` - The URL to extract contacts from.
988    /// * `params` - Optional request parameters.
989    /// * `stream` - Whether streaming is enabled.
990    /// * `content_type` - The content type of the request.
991    ///
992    /// # Returns
993    ///
994    /// The response from the API as a JSON value.
995    pub async fn extract_contacts(
996        &self,
997        url: &str,
998        params: Option<RequestParams>,
999        _stream: bool,
1000        content_type: &str,
1001    ) -> Result<serde_json::Value, reqwest::Error> {
1002        let mut data = HashMap::new();
1003
1004        if let Ok(params) = serde_json::to_value(params) {
1005            if let Ok(params) = serde_json::to_value(params) {
1006                if let Some(ref p) = params.as_object() {
1007                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1008                }
1009            }
1010        }
1011
1012        match serde_json::to_value(url) {
1013            Ok(u) => {
1014                data.insert("url".into(), u);
1015            }
1016            _ => (),
1017        }
1018
1019        let res = self
1020            .api_post("pipeline/extract-contacts", data, content_type)
1021            .await?;
1022        res.json().await
1023    }
1024
1025    /// Labels data from a URL.
1026    ///
1027    /// # Arguments
1028    ///
1029    /// * `url` - The URL to label data from.
1030    /// * `params` - Optional request parameters.
1031    /// * `stream` - Whether streaming is enabled.
1032    /// * `content_type` - The content type of the request.
1033    ///
1034    /// # Returns
1035    ///
1036    /// The response from the API as a JSON value.
1037    pub async fn label(
1038        &self,
1039        url: &str,
1040        params: Option<RequestParams>,
1041        _stream: bool,
1042        content_type: &str,
1043    ) -> Result<serde_json::Value, reqwest::Error> {
1044        let mut data = HashMap::new();
1045
1046        if let Ok(params) = serde_json::to_value(params) {
1047            if let Ok(params) = serde_json::to_value(params) {
1048                if let Some(ref p) = params.as_object() {
1049                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1050                }
1051            }
1052        }
1053
1054        data.insert("url".into(), serde_json::Value::String(url.to_string()));
1055
1056        let res = self.api_post("pipeline/label", data, content_type).await?;
1057        res.json().await
1058    }
1059
1060    /// Download a record from storage.
1061    ///
1062    /// # Arguments
1063    ///
1064    /// * `url` - Optional exact url of the file in storage.
1065    /// * `options` - Optional options.
1066    /// * `stream` - Whether streaming is enabled.
1067    ///
1068    /// # Returns
1069    ///
1070    /// The response from the API.
1071    pub async fn download(
1072        &self,
1073        url: Option<&str>,
1074        options: Option<HashMap<&str, i32>>,
1075    ) -> Result<reqwest::Response, reqwest::Error> {
1076        let mut params = HashMap::new();
1077
1078        if let Some(url) = url {
1079            params.insert("url".to_string(), url.to_string());
1080        }
1081
1082        if let Some(options) = options {
1083            for (key, value) in options {
1084                params.insert(key.to_string(), value.to_string());
1085            }
1086        }
1087
1088        let url = format!("{API_URL}/v1/data/download");
1089        let request = self
1090            .client
1091            .get(&url)
1092            .header(
1093                "User-Agent",
1094                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
1095            )
1096            .header("Content-Type", "application/octet-stream")
1097            .header("Authorization", format!("Bearer {}", self.api_key))
1098            .query(&params);
1099
1100        let res = request.send().await?;
1101
1102        Ok(res)
1103    }
1104
1105    /// Creates a signed URL of a file from storage.
1106    ///
1107    /// # Arguments
1108    ///
1109    /// * `url` - Optional exact url of the file in storage.
1110    /// * `options` - Optional options.
1111    /// * `stream` - Whether streaming is enabled.
1112    ///
1113    /// # Returns
1114    ///
1115    /// The response from the API.
1116    pub async fn create_signed_url(
1117        &self,
1118        url: Option<&str>,
1119        options: Option<HashMap<&str, i32>>,
1120    ) -> Result<serde_json::Value, reqwest::Error> {
1121        let mut params = HashMap::new();
1122
1123        if let Some(options) = options {
1124            for (key, value) in options {
1125                params.insert(key.to_string(), value.to_string());
1126            }
1127        }
1128
1129        if let Some(url) = url {
1130            params.insert("url".to_string(), url.to_string());
1131        }
1132
1133        let url = format!("{API_URL}/v1/data/sign-url");
1134        let request = self
1135            .client
1136            .get(&url)
1137            .header(
1138                "User-Agent",
1139                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
1140            )
1141            .header("Authorization", format!("Bearer {}", self.api_key))
1142            .query(&params);
1143
1144        let res = request.send().await?;
1145
1146        res.json().await
1147    }
1148
1149    /// Gets the crawl state of a URL.
1150    ///
1151    /// # Arguments
1152    ///
1153    /// * `url` - The URL to get the crawl state of.
1154    /// * `params` - Optional request parameters.
1155    /// * `stream` - Whether streaming is enabled.
1156    /// * `content_type` - The content type of the request.
1157    ///
1158    /// # Returns
1159    ///
1160    pub async fn get_crawl_state(
1161        &self,
1162        url: &str,
1163        params: Option<RequestParams>,
1164        content_type: &str,
1165    ) -> Result<serde_json::Value, reqwest::Error> {
1166        let mut payload = HashMap::new();
1167        payload.insert("url".into(), serde_json::Value::String(url.to_string()));
1168        payload.insert(
1169            "contentType".into(),
1170            serde_json::Value::String(content_type.to_string()),
1171        );
1172
1173        if let Ok(params) = serde_json::to_value(params) {
1174            if let Ok(params) = serde_json::to_value(params) {
1175                if let Some(ref p) = params.as_object() {
1176                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1177                }
1178            }
1179        }
1180
1181        let res = self
1182            .api_post("data/crawl_state", payload, content_type)
1183            .await?;
1184        res.json().await
1185    }
1186
1187    /// Get the account credits left.
1188    pub async fn get_credits(&self) -> Result<serde_json::Value, reqwest::Error> {
1189        self.api_get::<serde_json::Value>("data/credits", None)
1190            .await
1191    }
1192
1193    /// Send a request for a data record.
1194    pub async fn data_post(
1195        &self,
1196        table: &str,
1197        data: Option<RequestParams>,
1198    ) -> Result<serde_json::Value, reqwest::Error> {
1199        let res = self
1200            .api_post(&format!("data/{}", table), data, "application/json")
1201            .await?;
1202        res.json().await
1203    }
1204
1205    /// Query a record from the global DB.
1206    pub async fn query(&self, params: &QueryRequest) -> Result<serde_json::Value, reqwest::Error> {
1207        let res = self
1208            .api_get::<QueryRequest>(&"data/query", Some(params))
1209            .await?;
1210
1211        Ok(res)
1212    }
1213
1214    /// Get a table record.
1215    pub async fn data_get(
1216        &self,
1217        table: &str,
1218        params: Option<RequestParams>,
1219    ) -> Result<serde_json::Value, reqwest::Error> {
1220        let mut payload = HashMap::new();
1221
1222        if let Some(params) = params {
1223            if let Ok(p) = serde_json::to_value(params) {
1224                if let Some(o) = p.as_object() {
1225                    payload.extend(o.iter().map(|(k, v)| (k.as_str(), v.clone())));
1226                }
1227            }
1228        }
1229
1230        let res = self
1231            .api_get::<serde_json::Value>(&format!("data/{}", table), None)
1232            .await?;
1233        Ok(res)
1234    }
1235
1236    /// Delete a record.
1237    pub async fn data_delete(
1238        &self,
1239        table: &str,
1240        params: Option<RequestParams>,
1241    ) -> Result<serde_json::Value, reqwest::Error> {
1242        let mut payload = HashMap::new();
1243
1244        if let Ok(params) = serde_json::to_value(params) {
1245            if let Ok(params) = serde_json::to_value(params) {
1246                if let Some(ref p) = params.as_object() {
1247                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1248                }
1249            }
1250        }
1251
1252        let res = self
1253            .api_delete(&format!("data/{}", table), Some(payload))
1254            .await?;
1255        res.json().await
1256    }
1257}
1258
#[cfg(test)]
mod tests {
    use super::*;
    use dotenv::dotenv;
    use lazy_static::lazy_static;
    use reqwest::ClientBuilder;

    lazy_static! {
        // One client shared by every test. `dotenv` is loaded before the client is
        // built and no API key is passed explicitly (presumably it is then read
        // from the environment; verify against `Spider::new_with_client`).
        static ref SPIDER_CLIENT: Spider = {
            dotenv().ok();
            let http = ClientBuilder::new()
                .user_agent("SpiderBot")
                .build()
                .unwrap();

            Spider::new_with_client(None, http).expect("client to build")
        };
    }

    #[tokio::test]
    #[ignore]
    async fn test_scrape_url() {
        let outcome = SPIDER_CLIENT
            .scrape_url("https://example.com", None, "application/json")
            .await;

        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_crawl_url() {
        // No streaming and no callback: the call is only checked for success.
        let outcome = SPIDER_CLIENT
            .crawl_url(
                "https://example.com",
                None,
                false,
                "application/json",
                None::<fn(serde_json::Value)>,
            )
            .await;

        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_links() {
        let outcome: Result<serde_json::Value, Error> = SPIDER_CLIENT
            .links("https://example.com", None, false, "application/json")
            .await;

        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_screenshot() {
        let params = RequestParams {
            limit: Some(1),
            ..Default::default()
        };

        let outcome = SPIDER_CLIENT
            .screenshot(
                "https://example.com",
                Some(params),
                false,
                "application/json",
            )
            .await;

        assert!(outcome.is_ok());
    }

    // Disabled search test, kept for reference.
    //
    // #[tokio::test(flavor = "multi_thread")]
    // async fn test_search() {
    //     let mut params = SearchRequestParams::default();

    //     params.search_limit = Some(1);
    //     params.num = Some(1);
    //     params.fetch_page_content = Some(false);

    //     let response = SPIDER_CLIENT
    //         .search("a sports website", Some(params), false, "application/json")
    //         .await;

    //     assert!(response.is_ok());
    // }

    #[tokio::test]
    #[ignore]
    async fn test_transform() {
        // A single record whose key is the HTML snippet and whose value is empty.
        let record = HashMap::from([("<html><body><h1>Transformation</h1></body></html>", "")]);

        let outcome = SPIDER_CLIENT
            .transform(vec![record], None, false, "application/json")
            .await;

        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_extract_contacts() {
        let outcome = SPIDER_CLIENT
            .extract_contacts("https://example.com", None, false, "application/json")
            .await;

        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_label() {
        let outcome = SPIDER_CLIENT
            .label("https://example.com", None, false, "application/json")
            .await;

        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_create_signed_url() {
        let outcome = SPIDER_CLIENT
            .create_signed_url(Some("example.com"), None)
            .await;

        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_get_crawl_state() {
        let outcome = SPIDER_CLIENT
            .get_crawl_state("https://example.com", None, "application/json")
            .await;

        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_query() {
        let mut request = QueryRequest::default();
        request.domain = Some("spider.cloud".into());

        let outcome = SPIDER_CLIENT.query(&request).await;

        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_get_credits() {
        let outcome = SPIDER_CLIENT.get_credits().await;

        assert!(outcome.is_ok());
    }
}
spider_client/lib.rs

spider_client/
lib.rs