spider_client/
lib.rs

1//! The `spider-client` module provides the primary interface and
2//! functionalities for the Spider web crawler library, which is
3//! designed for rapid and efficient crawling of web pages to gather
4//! links using isolated contexts.
5//!
6//! ### Features
7//!
8//! - **Multi-threaded Crawling:** Spider can utilize multiple
9//!   threads to parallelize the crawling process, drastically
10//!   improving performance and allowing the ability to gather
11//!   millions of pages in a short time.
12//!
13//! - **Configurable:** The library provides various options to
14//!   configure the crawling behavior, such as setting the depth
15//!   of crawling, user-agent strings, delays between requests,
16//!   and more.
17//!
18//! - **Link Gathering:** One of the primary objectives of Spider is to
19//!   gather and manage links from the web pages it crawls,
20//!   compiling them into a structured format for further use.
21//!
22//! ### Examples
23//!
24//! Basic usage of the Spider client might look like this:
25//!
26//! ```rust
27//! use spider_client::{Spider, RequestType, RequestParams};
28//! use tokio;
29//!
30//!  # #[ignore]
31//! #[tokio::main]
32//! async fn main() {
33//!     let spider = Spider::new(Some("myspiderapikey".into())).expect("API key must be provided");
34//!
35//!     let url = "https://spider.cloud";
36//!
37//!     // Scrape a single URL
38//!     let scraped_data = spider.scrape_url(url, None, "application/json").await.expect("Failed to scrape the URL");
39//!
40//!     println!("Scraped Data: {:?}", scraped_data);
41//!
42//!     // Crawl a website
43//!     let crawler_params = RequestParams {
44//!         limit: Some(1),
45//!         proxy_enabled: Some(true),
46//!         store_data: Some(false),
47//!         metadata: Some(false),
48//!         request: Some(RequestType::Http),
49//!         ..Default::default()
50//!     };
51//!
52//!     let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL");
53//!
54//!     println!("Crawl Result: {:?}", crawl_result);
55//! }
56//! ```
57//!
58//! ### Modules
59//!
60//! - `config`: Contains the configuration options for the Spider client.
61//! - `utils`: Utility functions used by the Spider client.
62//!
63
64use backon::ExponentialBuilder;
65use backon::Retryable;
66use reqwest::Client;
67use reqwest::{Error, Response};
68use serde::{Deserialize, Serialize};
69use std::collections::HashMap;
70use tokio_stream::StreamExt;
71
72/// Structure representing the Chunking algorithm dictionary.
73#[derive(Debug, Deserialize, Serialize, Clone)]
74pub struct ChunkingAlgDict {
75    /// The chunking algorithm to use, defined as a specific type.
76    r#type: ChunkingType,
77    /// The amount to chunk by.
78    value: i32,
79}
80
81// The nested structures
82#[derive(Serialize, Deserialize, Debug, Clone)]
83pub struct Timeout {
84    /// The seconds up to 60.
85    pub secs: u64,
86    /// The nanoseconds.
87    pub nanos: u32,
88}
89
90#[derive(Serialize, Deserialize, Debug, Clone)]
91pub struct IdleNetwork {
92    /// The timeout to wait until.
93    pub timeout: Timeout,
94}
95
96#[derive(Serialize, Deserialize, Debug, Clone)]
97#[serde(tag = "type", rename_all = "PascalCase")]
98pub enum WebAutomation {
99    Evaluate { code: String },
100    Click { selector: String },
101    Wait { duration: u64 },
102    WaitForNavigation,
103    WaitFor { selector: String },
104    WaitForAndClick { selector: String },
105    ScrollX { pixels: i32 },
106    ScrollY { pixels: i32 },
107    Fill { selector: String, value: String },
108    InfiniteScroll { times: u32 },
109}
110
111#[derive(Default, Serialize, Deserialize, Debug, Clone)]
112#[serde(tag = "type", rename_all = "PascalCase")]
113pub enum RedirectPolicy {
114    Loose,
115    #[default]
116    Strict,
117}
118
119pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
120pub type ExecutionScriptsMap = std::collections::HashMap<String, String>;
121
122#[derive(Serialize, Deserialize, Debug, Clone)]
123pub struct Selector {
124    /// The timeout to wait until.
125    pub timeout: Timeout,
126    /// The selector to wait for.
127    pub selector: String,
128}
129
130#[derive(Serialize, Deserialize, Debug, Clone)]
131pub struct Delay {
132    /// The timeout to wait until.
133    pub timeout: Timeout,
134}
135
136#[derive(Serialize, Deserialize, Debug, Clone)]
137pub struct WaitFor {
138    /// Wait until idle networks with a timeout of idleness.
139    pub idle_network: Option<IdleNetwork>,
140    /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
141    pub selector: Option<Selector>,
142    /// Wait until a hard delay.
143    pub delay: Option<Delay>,
144    /// Wait until page navigation happen. Default is true.
145    pub page_navigations: Option<bool>,
146}
147
148/// Query request to get a document.
149#[derive(Serialize, Deserialize, Debug, Clone, Default)]
150pub struct QueryRequest {
151    /// The exact website url.
152    pub url: Option<String>,
153    /// The website domain.
154    pub domain: Option<String>,
155    /// The path of the resource.
156    pub pathname: Option<String>,
157}
158
159/// Enum representing different types of Chunking.
160#[derive(Default, Debug, Deserialize, Serialize, Clone)]
161#[serde(rename_all = "lowercase")]
162pub enum ChunkingType {
163    #[default]
164    /// By the word count.
165    ByWords,
166    /// By the line count.
167    ByLines,
168    /// By the char length.
169    ByCharacterLength,
170    /// By sentence.
171    BySentence,
172}
173
174#[derive(Default, Debug, Deserialize, Serialize, Clone)]
175/// View port handling for chrome.
176pub struct Viewport {
177    /// Device screen Width
178    pub width: u32,
179    /// Device screen size
180    pub height: u32,
181    /// Device scale factor
182    pub device_scale_factor: Option<f64>,
183    /// Emulating Mobile?
184    pub emulating_mobile: bool,
185    /// Use landscape mode instead of portrait.
186    pub is_landscape: bool,
187    /// Touch screen device?
188    pub has_touch: bool,
189}
190
/// Base URL that every API endpoint path is appended to.
const API_URL: &str = "https://api.spider.cloud";
193
194// Define the CSSSelector struct
195#[derive(Debug, Clone, Default, Deserialize, Serialize)]
196pub struct CSSSelector {
197    /// The name of the selector group
198    pub name: String,
199    /// A vector of CSS selectors
200    pub selectors: Vec<String>,
201}
202
203// Define the CSSExtractionMap type
204pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
205
206/// Represents the settings for a webhook configuration
207#[derive(Debug, Default, Deserialize, Serialize, Clone)]
208pub struct WebhookSettings {
209    /// The destination where the webhook information will be sent
210    destination: String,
211    /// Trigger an action when all credits are depleted
212    on_credits_depleted: bool,
213    /// Trigger an action when half of the credits are depleted
214    on_credits_half_depleted: bool,
215    /// Trigger an action on a website status update event
216    on_website_status: bool,
217    /// Send information about a new page find (such as links and bytes)
218    on_find: bool,
219    /// Handle the metadata of a found page
220    on_find_metadata: bool,
221}
222
223/// Send multiple return formats.
224#[derive(Debug, Deserialize, Serialize, Clone)]
225#[serde(untagged)]
226pub enum ReturnFormatHandling {
227    /// A single return item.
228    Single(ReturnFormat),
229    /// Multiple return formats.
230    Multi(std::collections::HashSet<ReturnFormat>),
231}
232
233impl Default for ReturnFormatHandling {
234    fn default() -> ReturnFormatHandling {
235        ReturnFormatHandling::Single(ReturnFormat::Raw)
236    }
237}
238
239/// Structure representing request parameters.
240#[derive(Debug, Default, Deserialize, Serialize, Clone)]
241pub struct RequestParams {
242    #[serde(default)]
243    /// The URL to be crawled.
244    pub url: Option<String>,
245    #[serde(default)]
246    /// The type of request to be made.
247    pub request: Option<RequestType>,
248    #[serde(default)]
249    /// The maximum number of pages the crawler should visit.
250    pub limit: Option<u32>,
251    #[serde(default)]
252    /// The format in which the result should be returned.
253    pub return_format: Option<ReturnFormatHandling>,
254    #[serde(default)]
255    /// Specifies whether to only visit the top-level domain.
256    pub tld: Option<bool>,
257    #[serde(default)]
258    /// The depth of the crawl.
259    pub depth: Option<u32>,
260    #[serde(default)]
261    /// Specifies whether the request should be cached.
262    pub cache: Option<bool>,
263    #[serde(default)]
264    /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
265    pub scroll: Option<u32>,
266    #[serde(default)]
267    /// The budget for various resources.
268    pub budget: Option<HashMap<String, u32>>,
269    #[serde(default)]
270    /// The blacklist routes to ignore. This can be a Regex string pattern.
271    pub blacklist: Option<Vec<String>>,
272    #[serde(default)]
273    /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
274    pub whitelist: Option<Vec<String>>,
275    #[serde(default)]
276    /// The locale to be used during the crawl.
277    pub locale: Option<String>,
278    #[serde(default)]
279    /// The cookies to be set for the request, formatted as a single string.
280    pub cookies: Option<String>,
281    #[serde(default)]
282    /// Specifies whether to use stealth techniques to avoid detection.
283    pub stealth: Option<bool>,
284    #[serde(default)]
285    /// The headers to be used for the request.
286    pub headers: Option<HashMap<String, String>>,
287    #[serde(default)]
288    /// Specifies whether anti-bot measures should be used.
289    pub anti_bot: Option<bool>,
290    #[serde(default)]
291    /// Specifies whether to send data via webhooks.
292    pub webhooks: Option<WebhookSettings>,
293    #[serde(default)]
294    /// Specifies whether to include metadata in the response.
295    pub metadata: Option<bool>,
296    #[serde(default)]
297    /// The dimensions of the viewport.
298    pub viewport: Option<Viewport>,
299    #[serde(default)]
300    /// The encoding to be used for the request.
301    pub encoding: Option<String>,
302    #[serde(default)]
303    /// Specifies whether to include subdomains in the crawl.
304    pub subdomains: Option<bool>,
305    #[serde(default)]
306    /// The user agent string to be used for the request.
307    pub user_agent: Option<String>,
308    #[serde(default)]
309    /// Specifies whether the response data should be stored.
310    pub store_data: Option<bool>,
311    #[serde(default)]
312    /// Configuration settings for GPT (general purpose texture mappings).
313    pub gpt_config: Option<HashMap<String, String>>,
314    #[serde(default)]
315    /// Specifies whether to use fingerprinting protection.
316    pub fingerprint: Option<bool>,
317    #[serde(default)]
318    /// Specifies whether to perform the request without using storage.
319    pub storageless: Option<bool>,
320    #[serde(default)]
321    /// Specifies whether readability optimizations should be applied.
322    pub readability: Option<bool>,
323    #[serde(default)]
324    /// Specifies whether to use a proxy for the request.
325    pub proxy_enabled: Option<bool>,
326    #[serde(default)]
327    /// Specifies whether to respect the site's robots.txt file.
328    pub respect_robots: Option<bool>,
329    #[serde(default)]
330    /// CSS selector to be used to filter the content.
331    pub root_selector: Option<String>,
332    #[serde(default)]
333    /// Specifies whether to load all resources of the crawl target.
334    pub full_resources: Option<bool>,
335    #[serde(default)]
336    /// The text string to extract data from.
337    pub text: Option<String>,
338    #[serde(default)]
339    /// Specifies whether to use the sitemap links.
340    pub sitemap: Option<bool>,
341    #[serde(default)]
342    /// External domains to include the crawl.
343    pub external_domains: Option<Vec<String>>,
344    #[serde(default)]
345    /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
346    pub return_embeddings: Option<bool>,
347    #[serde(default)]
348    /// Returns the HTTP response headers.
349    pub return_headers: Option<bool>,
350    #[serde(default)]
351    /// Returns the link(s) found on the page that match the crawler query.
352    pub return_page_links: Option<bool>,
353    #[serde(default)]
354    /// Returns the HTTP response cookies.
355    pub return_cookies: Option<bool>,
356    #[serde(default)]
357    /// The timeout for the request, in milliseconds.
358    pub request_timeout: Option<u8>,
359    #[serde(default)]
360    /// Specifies whether to run the request in the background.
361    pub run_in_background: Option<bool>,
362    #[serde(default)]
363    /// Specifies whether to skip configuration checks.
364    pub skip_config_checks: Option<bool>,
365    #[serde(default)]
366    /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
367    pub css_extraction_map: Option<CSSExtractionMap>,
368    #[serde(default)]
369    /// The chunking algorithm to use.
370    pub chunking_alg: Option<ChunkingAlgDict>,
371    /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
372    pub disable_intercept: Option<bool>,
373    /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
374    pub wait_for: Option<WaitFor>,
375    /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
376    pub execution_scripts: Option<ExecutionScriptsMap>,
377    /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
378    pub automation_scripts: Option<WebAutomationMap>,
379    /// The redirect policy for HTTP request. Set the value to Loose to allow all.
380    pub redirect_policy: Option<RedirectPolicy>,
381}
382
383/// The structure representing request parameters for a search request.
384#[derive(Debug, Default, Deserialize, Serialize, Clone)]
385pub struct SearchRequestParams {
386    /// The base request parameters.
387    #[serde(default, flatten)]
388    pub base: RequestParams,
389    // The search request.
390    pub search: String,
391    /// The search limit.
392    pub search_limit: Option<u32>,
393    // Fetch the page content. Defaults to true.
394    pub fetch_page_content: Option<bool>,
395    /// The search location of the request
396    pub location: Option<String>,
397    /// The country code of the request
398    pub country: Option<String>,
399    /// The language code of the request.
400    pub language: Option<String>,
401    /// The number of search results
402    pub num: Option<u32>,
403    /// The page of the search results.
404    pub page: Option<u32>,
405    #[serde(default)]
406    /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
407    pub website_limit: Option<u32>,
408}
409
410/// Structure representing request parameters for transforming files.
411#[derive(Debug, Default, Deserialize, Serialize, Clone)]
412pub struct TransformParams {
413    #[serde(default)]
414    /// The format in which the result should be returned.
415    pub return_format: Option<ReturnFormat>,
416    #[serde(default)]
417    /// Specifies whether readability optimizations should be applied.
418    pub readability: Option<bool>,
419    #[serde(default)]
420    /// Clean the markdown or text for AI.
421    pub clean: Option<bool>,
422    #[serde(default)]
423    /// Clean the markdown or text for AI removing footers, navigation, and more.
424    pub clean_full: Option<bool>,
425    /// The data being transformed.
426    pub data: Vec<DataParam>,
427}
428
429#[derive(Serialize, Deserialize, Debug, Clone)]
430pub struct DataParam {
431    /// The HTML resource.
432    pub html: String,
433    /// The website url.
434    pub url: Option<String>,
435}
436
437/// the request type to perform
438#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
439#[serde(rename_all = "lowercase")]
440pub enum RequestType {
441    /// Default HTTP request
442    Http,
443    /// Chrome browser rendering
444    Chrome,
445    #[default]
446    /// Smart mode defaulting to HTTP and using Chrome when needed.
447    SmartMode,
448}
449
450/// Enum representing different return formats.
451#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
452#[serde(rename_all = "lowercase")]
453pub enum ReturnFormat {
454    #[default]
455    /// The default return format of the resource.
456    Raw,
457    /// Return the response as Markdown.
458    Markdown,
459    /// Return the response as Commonmark.
460    Commonmark,
461    /// Return the response as Html2text.
462    Html2text,
463    /// Return the response as Text.
464    Text,
465    /// Return the response as XML.
466    Xml,
467    /// Return the response as Bytes.
468    Bytes,
469}
470
471/// Represents a Spider with API key and HTTP client.
472#[derive(Debug, Default)]
473pub struct Spider {
474    /// The Spider API key.
475    pub api_key: String,
476    /// The Spider Client to re-use.
477    pub client: Client,
478}
479
480impl Spider {
481    /// Creates a new instance of Spider.
482    ///
483    /// # Arguments
484    ///
485    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
486    ///
487    /// # Returns
488    ///
489    /// A new instance of Spider or an error string if no API key is provided.
490    pub fn new(api_key: Option<String>) -> Result<Self, &'static str> {
491        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
492
493        match api_key {
494            Some(key) => Ok(Self {
495                api_key: key,
496                client: Client::new(),
497            }),
498            None => Err("No API key provided"),
499        }
500    }
501
502    /// Creates a new instance of Spider.
503    ///
504    /// # Arguments
505    ///
506    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
507    /// * `client` - A custom client to pass in.
508    ///
509    /// # Returns
510    ///
511    /// A new instance of Spider or an error string if no API key is provided.
512    pub fn new_with_client(api_key: Option<String>, client: Client) -> Result<Self, &'static str> {
513        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
514
515        match api_key {
516            Some(key) => Ok(Self {
517                api_key: key,
518                client,
519            }),
520            None => Err("No API key provided"),
521        }
522    }
523
524    /// Sends a POST request to the API.
525    ///
526    /// # Arguments
527    ///
528    /// * `endpoint` - The API endpoint.
529    /// * `data` - The request data as a HashMap.
530    /// * `stream` - Whether streaming is enabled.
531    /// * `content_type` - The content type of the request.
532    ///
533    /// # Returns
534    ///
535    /// The response from the API.
536    async fn api_post_base(
537        &self,
538        endpoint: &str,
539        data: impl Serialize + Sized + std::fmt::Debug,
540        content_type: &str,
541    ) -> Result<Response, Error> {
542        let url: String = format!("{API_URL}/{}", endpoint);
543
544        self.client
545            .post(&url)
546            .header(
547                "User-Agent",
548                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
549            )
550            .header("Content-Type", content_type)
551            .header("Authorization", format!("Bearer {}", self.api_key))
552            .json(&data)
553            .send()
554            .await
555    }
556
557    /// Sends a POST request to the API.
558    ///
559    /// # Arguments
560    ///
561    /// * `endpoint` - The API endpoint.
562    /// * `data` - The request data as a HashMap.
563    /// * `stream` - Whether streaming is enabled.
564    /// * `content_type` - The content type of the request.
565    ///
566    /// # Returns
567    ///
568    /// The response from the API.
569    async fn api_post(
570        &self,
571        endpoint: &str,
572        data: impl Serialize + std::fmt::Debug + Clone + Send + Sync,
573        content_type: &str,
574    ) -> Result<Response, Error> {
575        let fetch = || async {
576            self.api_post_base(endpoint, data.to_owned(), content_type)
577                .await
578        };
579
580        fetch
581            .retry(ExponentialBuilder::default().with_max_times(5))
582            .when(|err: &reqwest::Error| {
583                if let Some(status) = err.status() {
584                    status.is_server_error()
585                } else {
586                    err.is_timeout()
587                }
588            })
589            .await
590    }
591
592    /// Sends a GET request to the API.
593    ///
594    /// # Arguments
595    ///
596    /// * `endpoint` - The API endpoint.
597    ///
598    /// # Returns
599    ///
600    /// The response from the API as a JSON value.
601    async fn api_get_base<T: Serialize>(
602        &self,
603        endpoint: &str,
604        query_params: Option<&T>,
605    ) -> Result<serde_json::Value, reqwest::Error> {
606        let url = format!("{API_URL}/{}", endpoint);
607        let res = self
608            .client
609            .get(&url)
610            .query(&query_params)
611            .header(
612                "User-Agent",
613                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
614            )
615            .header("Content-Type", "application/json")
616            .header("Authorization", format!("Bearer {}", self.api_key))
617            .send()
618            .await?;
619        res.json().await
620    }
621
622    /// Sends a GET request to the API.
623    ///
624    /// # Arguments
625    ///
626    /// * `endpoint` - The API endpoint.
627    ///
628    /// # Returns
629    ///
630    /// The response from the API as a JSON value.
631    async fn api_get<T: Serialize>(
632        &self,
633        endpoint: &str,
634        query_params: Option<&T>,
635    ) -> Result<serde_json::Value, reqwest::Error> {
636        let fetch = || async { self.api_get_base(endpoint, query_params.to_owned()).await };
637
638        fetch
639            .retry(ExponentialBuilder::default().with_max_times(5))
640            .when(|err: &reqwest::Error| {
641                if let Some(status) = err.status() {
642                    status.is_server_error()
643                } else {
644                    err.is_timeout()
645                }
646            })
647            .await
648    }
649
650    /// Sends a DELETE request to the API.
651    ///
652    /// # Arguments
653    ///
654    /// * `endpoint` - The API endpoint.
655    /// * `params` - Optional request parameters.
656    /// * `stream` - Whether streaming is enabled.
657    /// * `content_type` - The content type of the request.
658    ///
659    /// # Returns
660    ///
661    /// The response from the API.
662    async fn api_delete_base(
663        &self,
664        endpoint: &str,
665        params: Option<HashMap<String, serde_json::Value>>,
666    ) -> Result<Response, Error> {
667        let url = format!("{API_URL}/v1/{}", endpoint);
668        let request_builder = self
669            .client
670            .delete(&url)
671            .header(
672                "User-Agent",
673                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
674            )
675            .header("Content-Type", "application/json")
676            .header("Authorization", format!("Bearer {}", self.api_key));
677
678        let request_builder = if let Some(params) = params {
679            request_builder.json(&params)
680        } else {
681            request_builder
682        };
683
684        request_builder.send().await
685    }
686
687    /// Sends a DELETE request to the API.
688    ///
689    /// # Arguments
690    ///
691    /// * `endpoint` - The API endpoint.
692    /// * `params` - Optional request parameters.
693    /// * `stream` - Whether streaming is enabled.
694    /// * `content_type` - The content type of the request.
695    ///
696    /// # Returns
697    ///
698    /// The response from the API.
699    async fn api_delete(
700        &self,
701        endpoint: &str,
702        params: Option<HashMap<String, serde_json::Value>>,
703    ) -> Result<Response, Error> {
704        let fetch = || async { self.api_delete_base(endpoint, params.to_owned()).await };
705
706        fetch
707            .retry(ExponentialBuilder::default().with_max_times(5))
708            .when(|err: &reqwest::Error| {
709                if let Some(status) = err.status() {
710                    status.is_server_error()
711                } else {
712                    err.is_timeout()
713                }
714            })
715            .await
716    }
717
718    /// Scrapes a URL.
719    ///
720    /// # Arguments
721    ///
722    /// * `url` - The URL to scrape.
723    /// * `params` - Optional request parameters.
724    /// * `stream` - Whether streaming is enabled.
725    /// * `content_type` - The content type of the request.
726    ///
727    /// # Returns
728    ///
729    /// The response from the API as a JSON value.
730    pub async fn scrape_url(
731        &self,
732        url: &str,
733        params: Option<RequestParams>,
734        content_type: &str,
735    ) -> Result<serde_json::Value, reqwest::Error> {
736        let mut data = HashMap::new();
737
738        data.insert(
739            "url".to_string(),
740            serde_json::Value::String(url.to_string()),
741        );
742        data.insert("limit".to_string(), serde_json::Value::Number(1.into()));
743
744        if let Ok(params) = serde_json::to_value(params) {
745            if let Some(ref p) = params.as_object() {
746                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
747            }
748        }
749
750        let res = self.api_post("crawl", data, content_type).await?;
751        res.json().await
752    }
753
754    /// Crawls a URL.
755    ///
756    /// # Arguments
757    ///
758    /// * `url` - The URL to crawl.
759    /// * `params` - Optional request parameters.
760    /// * `stream` - Whether streaming is enabled.
761    /// * `content_type` - The content type of the request.
762    /// * `callback` - Optional callback function to handle each streamed chunk.
763    ///
764    /// # Returns
765    ///
766    /// The response from the API as a JSON value.
767    pub async fn crawl_url(
768        &self,
769        url: &str,
770        params: Option<RequestParams>,
771        stream: bool,
772        content_type: &str,
773        callback: Option<impl Fn(serde_json::Value) + Send>,
774    ) -> Result<serde_json::Value, reqwest::Error> {
775        use tokio_util::codec::{FramedRead, LinesCodec};
776
777        let mut data = HashMap::new();
778
779        if let Ok(params) = serde_json::to_value(params) {
780            if let Some(ref p) = params.as_object() {
781                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
782            }
783        }
784
785        data.insert("url".into(), serde_json::Value::String(url.to_string()));
786
787        let res = self.api_post("crawl", data, content_type).await?;
788
789        if stream {
790            if let Some(callback) = callback {
791                let stream = res.bytes_stream();
792
793                let stream_reader = tokio_util::io::StreamReader::new(
794                    stream.map(|r| r.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))),
795                );
796
797                let mut lines = FramedRead::new(stream_reader, LinesCodec::new());
798
799                while let Some(line_result) = lines.next().await {
800                    match line_result {
801                        Ok(line) => {
802                            match serde_json::from_str::<serde_json::Value>(&line) {
803                                Ok(value) => {
804                                    callback(value);
805                                }
806                                Err(_e) => {
807                                    continue;
808                                }
809                            }
810                        }
811                        Err(_e) => {
812                            return Ok(serde_json::Value::Null)
813                        }
814                    }
815                }
816
817                Ok(serde_json::Value::Null)
818            } else {
819                Ok(serde_json::Value::Null)
820            }
821        } else {
822            res.json().await
823        }
824    }
825
826    /// Fetches links from a URL.
827    ///
828    /// # Arguments
829    ///
830    /// * `url` - The URL to fetch links from.
831    /// * `params` - Optional request parameters.
832    /// * `stream` - Whether streaming is enabled.
833    /// * `content_type` - The content type of the request.
834    ///
835    /// # Returns
836    ///
837    /// The response from the API as a JSON value.
838    pub async fn links(
839        &self,
840        url: &str,
841        params: Option<RequestParams>,
842        _stream: bool,
843        content_type: &str,
844    ) -> Result<serde_json::Value, reqwest::Error> {
845        let mut data = HashMap::new();
846
847        if let Ok(params) = serde_json::to_value(params) {
848            if let Some(ref p) = params.as_object() {
849                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
850            }
851        }
852
853        data.insert("url".into(), serde_json::Value::String(url.to_string()));
854
855        let res = self.api_post("links", data, content_type).await?;
856        res.json().await
857    }
858
859    /// Takes a screenshot of a URL.
860    ///
861    /// # Arguments
862    ///
863    /// * `url` - The URL to take a screenshot of.
864    /// * `params` - Optional request parameters.
865    /// * `stream` - Whether streaming is enabled.
866    /// * `content_type` - The content type of the request.
867    ///
868    /// # Returns
869    ///
870    /// The response from the API as a JSON value.
871    pub async fn screenshot(
872        &self,
873        url: &str,
874        params: Option<RequestParams>,
875        _stream: bool,
876        content_type: &str,
877    ) -> Result<serde_json::Value, reqwest::Error> {
878        let mut data = HashMap::new();
879
880        if let Ok(params) = serde_json::to_value(params) {
881            if let Some(ref p) = params.as_object() {
882                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
883            }
884        }
885
886        data.insert("url".into(), serde_json::Value::String(url.to_string()));
887
888        let res = self.api_post("screenshot", data, content_type).await?;
889        res.json().await
890    }
891
892    /// Searches for a query.
893    ///
894    /// # Arguments
895    ///
896    /// * `q` - The query to search for.
897    /// * `params` - Optional request parameters.
898    /// * `stream` - Whether streaming is enabled.
899    /// * `content_type` - The content type of the request.
900    ///
901    /// # Returns
902    ///
903    /// The response from the API as a JSON value.
904    pub async fn search(
905        &self,
906        q: &str,
907        params: Option<SearchRequestParams>,
908        _stream: bool,
909        content_type: &str,
910    ) -> Result<serde_json::Value, reqwest::Error> {
911        let body = match params {
912            Some(mut params) => {
913                params.search = q.to_string();
914                params
915            }
916            _ => {
917                let mut params = SearchRequestParams::default();
918                params.search = q.to_string();
919                params
920            }
921        };
922
923        let res = self.api_post("search", body, content_type).await?;
924
925        res.json().await
926    }
927
928    /// Transforms data.
929    ///
930    /// # Arguments
931    ///
932    /// * `data` - The data to transform.
933    /// * `params` - Optional request parameters.
934    /// * `stream` - Whether streaming is enabled.
935    /// * `content_type` - The content type of the request.
936    ///
937    /// # Returns
938    ///
939    /// The response from the API as a JSON value.
940    pub async fn transform(
941        &self,
942        data: Vec<HashMap<&str, &str>>,
943        params: Option<TransformParams>,
944        _stream: bool,
945        content_type: &str,
946    ) -> Result<serde_json::Value, reqwest::Error> {
947        let mut payload = HashMap::new();
948
949        if let Ok(params) = serde_json::to_value(params) {
950            if let Some(ref p) = params.as_object() {
951                payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
952            }
953        }
954
955        if let Ok(d) = serde_json::to_value(data) {
956            payload.insert("data".into(), d);
957        }
958
959        let res = self.api_post("transform", payload, content_type).await?;
960
961        res.json().await
962    }
963
964    /// Extracts contacts from a URL.
965    ///
966    /// # Arguments
967    ///
968    /// * `url` - The URL to extract contacts from.
969    /// * `params` - Optional request parameters.
970    /// * `stream` - Whether streaming is enabled.
971    /// * `content_type` - The content type of the request.
972    ///
973    /// # Returns
974    ///
975    /// The response from the API as a JSON value.
976    pub async fn extract_contacts(
977        &self,
978        url: &str,
979        params: Option<RequestParams>,
980        _stream: bool,
981        content_type: &str,
982    ) -> Result<serde_json::Value, reqwest::Error> {
983        let mut data = HashMap::new();
984
985        if let Ok(params) = serde_json::to_value(params) {
986            if let Ok(params) = serde_json::to_value(params) {
987                if let Some(ref p) = params.as_object() {
988                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
989                }
990            }
991        }
992
993        match serde_json::to_value(url) {
994            Ok(u) => {
995                data.insert("url".into(), u);
996            }
997            _ => (),
998        }
999
1000        let res = self
1001            .api_post("pipeline/extract-contacts", data, content_type)
1002            .await?;
1003        res.json().await
1004    }
1005
1006    /// Labels data from a URL.
1007    ///
1008    /// # Arguments
1009    ///
1010    /// * `url` - The URL to label data from.
1011    /// * `params` - Optional request parameters.
1012    /// * `stream` - Whether streaming is enabled.
1013    /// * `content_type` - The content type of the request.
1014    ///
1015    /// # Returns
1016    ///
1017    /// The response from the API as a JSON value.
1018    pub async fn label(
1019        &self,
1020        url: &str,
1021        params: Option<RequestParams>,
1022        _stream: bool,
1023        content_type: &str,
1024    ) -> Result<serde_json::Value, reqwest::Error> {
1025        let mut data = HashMap::new();
1026
1027        if let Ok(params) = serde_json::to_value(params) {
1028            if let Ok(params) = serde_json::to_value(params) {
1029                if let Some(ref p) = params.as_object() {
1030                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1031                }
1032            }
1033        }
1034
1035        data.insert("url".into(), serde_json::Value::String(url.to_string()));
1036
1037        let res = self.api_post("pipeline/label", data, content_type).await?;
1038        res.json().await
1039    }
1040
1041    /// Download a record from storage.
1042    ///
1043    /// # Arguments
1044    ///
1045    /// * `url` - Optional exact url of the file in storage.
1046    /// * `options` - Optional options.
1047    /// * `stream` - Whether streaming is enabled.
1048    ///
1049    /// # Returns
1050    ///
1051    /// The response from the API.
1052    pub async fn download(
1053        &self,
1054        url: Option<&str>,
1055        options: Option<HashMap<&str, i32>>,
1056    ) -> Result<reqwest::Response, reqwest::Error> {
1057        let mut params = HashMap::new();
1058
1059        if let Some(url) = url {
1060            params.insert("url".to_string(), url.to_string());
1061        }
1062
1063        if let Some(options) = options {
1064            for (key, value) in options {
1065                params.insert(key.to_string(), value.to_string());
1066            }
1067        }
1068
1069        let url = format!("{API_URL}/v1/data/download");
1070        let request = self
1071            .client
1072            .get(&url)
1073            .header(
1074                "User-Agent",
1075                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
1076            )
1077            .header("Content-Type", "application/octet-stream")
1078            .header("Authorization", format!("Bearer {}", self.api_key))
1079            .query(&params);
1080
1081        let res = request.send().await?;
1082
1083        Ok(res)
1084    }
1085
1086    /// Creates a signed URL of a file from storage.
1087    ///
1088    /// # Arguments
1089    ///
1090    /// * `url` - Optional exact url of the file in storage.
1091    /// * `options` - Optional options.
1092    /// * `stream` - Whether streaming is enabled.
1093    ///
1094    /// # Returns
1095    ///
1096    /// The response from the API.
1097    pub async fn create_signed_url(
1098        &self,
1099        url: Option<&str>,
1100        options: Option<HashMap<&str, i32>>,
1101    ) -> Result<serde_json::Value, reqwest::Error> {
1102        let mut params = HashMap::new();
1103
1104        if let Some(options) = options {
1105            for (key, value) in options {
1106                params.insert(key.to_string(), value.to_string());
1107            }
1108        }
1109
1110        if let Some(url) = url {
1111            params.insert("url".to_string(), url.to_string());
1112        }
1113
1114        let url = format!("{API_URL}/v1/data/sign-url");
1115        let request = self
1116            .client
1117            .get(&url)
1118            .header(
1119                "User-Agent",
1120                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
1121            )
1122            .header("Authorization", format!("Bearer {}", self.api_key))
1123            .query(&params);
1124
1125        let res = request.send().await?;
1126
1127        res.json().await
1128    }
1129
1130    /// Gets the crawl state of a URL.
1131    ///
1132    /// # Arguments
1133    ///
1134    /// * `url` - The URL to get the crawl state of.
1135    /// * `params` - Optional request parameters.
1136    /// * `stream` - Whether streaming is enabled.
1137    /// * `content_type` - The content type of the request.
1138    ///
1139    /// # Returns
1140    ///
1141    pub async fn get_crawl_state(
1142        &self,
1143        url: &str,
1144        params: Option<RequestParams>,
1145        content_type: &str,
1146    ) -> Result<serde_json::Value, reqwest::Error> {
1147        let mut payload = HashMap::new();
1148        payload.insert("url".into(), serde_json::Value::String(url.to_string()));
1149        payload.insert(
1150            "contentType".into(),
1151            serde_json::Value::String(content_type.to_string()),
1152        );
1153
1154        if let Ok(params) = serde_json::to_value(params) {
1155            if let Ok(params) = serde_json::to_value(params) {
1156                if let Some(ref p) = params.as_object() {
1157                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1158                }
1159            }
1160        }
1161
1162        let res = self
1163            .api_post("data/crawl_state", payload, content_type)
1164            .await?;
1165        res.json().await
1166    }
1167
1168    /// Get the account credits left.
1169    pub async fn get_credits(&self) -> Result<serde_json::Value, reqwest::Error> {
1170        self.api_get::<serde_json::Value>("data/credits", None)
1171            .await
1172    }
1173
1174    /// Send a request for a data record.
1175    pub async fn data_post(
1176        &self,
1177        table: &str,
1178        data: Option<RequestParams>,
1179    ) -> Result<serde_json::Value, reqwest::Error> {
1180        let res = self
1181            .api_post(&format!("data/{}", table), data, "application/json")
1182            .await?;
1183        res.json().await
1184    }
1185
1186    /// Query a record from the global DB.
1187    pub async fn query(&self, params: &QueryRequest) -> Result<serde_json::Value, reqwest::Error> {
1188        let res = self
1189            .api_get::<QueryRequest>(&"data/query", Some(params))
1190            .await?;
1191
1192        Ok(res)
1193    }
1194
1195    /// Get a table record.
1196    pub async fn data_get(
1197        &self,
1198        table: &str,
1199        params: Option<RequestParams>,
1200    ) -> Result<serde_json::Value, reqwest::Error> {
1201        let mut payload = HashMap::new();
1202
1203        if let Some(params) = params {
1204            if let Ok(p) = serde_json::to_value(params) {
1205                if let Some(o) = p.as_object() {
1206                    payload.extend(o.iter().map(|(k, v)| (k.as_str(), v.clone())));
1207                }
1208            }
1209        }
1210
1211        let res = self
1212            .api_get::<serde_json::Value>(&format!("data/{}", table), None)
1213            .await?;
1214        Ok(res)
1215    }
1216
1217    /// Delete a record.
1218    pub async fn data_delete(
1219        &self,
1220        table: &str,
1221        params: Option<RequestParams>,
1222    ) -> Result<serde_json::Value, reqwest::Error> {
1223        let mut payload = HashMap::new();
1224
1225        if let Ok(params) = serde_json::to_value(params) {
1226            if let Ok(params) = serde_json::to_value(params) {
1227                if let Some(ref p) = params.as_object() {
1228                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1229                }
1230            }
1231        }
1232
1233        let res = self
1234            .api_delete(&format!("data/{}", table), Some(payload))
1235            .await?;
1236        res.json().await
1237    }
1238}
1239
#[cfg(test)]
mod tests {
    // Each test calls one client method on a shared `Spider` instance and only
    // checks that the call returned `Ok`. Tests marked `#[ignore]` are skipped
    // unless explicitly requested.
    use super::*;
    use dotenv::dotenv;
    use lazy_static::lazy_static;
    use reqwest::ClientBuilder;

    /// Page used by every URL-based test.
    const TEST_URL: &str = "https://example.com";
    /// Content type passed to every call that takes one.
    const JSON: &str = "application/json";

    lazy_static! {
        // Shared client: loads `.env` (if any) first, then builds a `Spider`
        // with no explicit API key on top of a custom `reqwest` client.
        static ref SPIDER_CLIENT: Spider = {
            dotenv().ok();
            let http_client = ClientBuilder::new()
                .user_agent("SpiderBot")
                .build()
                .unwrap();

            Spider::new_with_client(None, http_client).expect("client to build")
        };
    }

    #[tokio::test]
    #[ignore]
    async fn test_scrape_url() {
        let outcome = SPIDER_CLIENT.scrape_url(TEST_URL, None, JSON).await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_crawl_url() {
        let outcome = SPIDER_CLIENT
            .crawl_url(TEST_URL, None, false, JSON, None::<fn(serde_json::Value)>)
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_links() {
        let outcome: Result<serde_json::Value, Error> =
            SPIDER_CLIENT.links(TEST_URL, None, false, JSON).await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_screenshot() {
        let params = RequestParams {
            limit: Some(1),
            ..Default::default()
        };

        let outcome = SPIDER_CLIENT
            .screenshot(TEST_URL, Some(params), false, JSON)
            .await;
        assert!(outcome.is_ok());
    }

    // The search test is currently disabled; kept here for reference.
    //
    // #[tokio::test(flavor = "multi_thread")]
    // async fn test_search() {
    //     let mut params = SearchRequestParams::default();
    //
    //     params.search_limit = Some(1);
    //     params.num = Some(1);
    //     params.fetch_page_content = Some(false);
    //
    //     let response = SPIDER_CLIENT
    //         .search("a sports website", Some(params), false, "application/json")
    //         .await;
    //
    //     assert!(response.is_ok());
    // }

    #[tokio::test]
    #[ignore]
    async fn test_transform() {
        // A single record: the HTML snippet as the key, mapped to an empty string.
        let records = vec![HashMap::from([(
            "<html><body><h1>Transformation</h1></body></html>",
            "",
        )])];

        let outcome = SPIDER_CLIENT.transform(records, None, false, JSON).await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_extract_contacts() {
        let outcome = SPIDER_CLIENT
            .extract_contacts(TEST_URL, None, false, JSON)
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_label() {
        let outcome = SPIDER_CLIENT.label(TEST_URL, None, false, JSON).await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_create_signed_url() {
        let outcome = SPIDER_CLIENT
            .create_signed_url(Some("example.com"), None)
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_get_crawl_state() {
        let outcome = SPIDER_CLIENT.get_crawl_state(TEST_URL, None, JSON).await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_query() {
        let request = QueryRequest {
            domain: Some("spider.cloud".into()),
            ..Default::default()
        };

        let outcome = SPIDER_CLIENT.query(&request).await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_get_credits() {
        let outcome = SPIDER_CLIENT.get_credits().await;
        assert!(outcome.is_ok());
    }
}
spider_client/lib.rs

spider_client/
lib.rs