spider_client/
lib.rs

1//! The `spider-client` module provides the primary interface and
2//! functionalities for the Spider web crawler library, which is
3//! designed for rapid and efficient crawling of web pages to gather
4//! links using isolated contexts.
5//!
6//! ### Features
7//!
8//! - **Multi-threaded Crawling:** Spider can utilize multiple
9//!   threads to parallelize the crawling process, drastically
10//!   improving performance and allowing the ability to gather
11//!   millions of pages in a short time.
12//!
13//! - **Configurable:** The library provides various options to
14//!   configure the crawling behavior, such as setting the depth
15//!   of crawling, user-agent strings, delays between requests,
16//!   and more.
17//!
18//! - **Link Gathering:** One of the primary objectives of Spider is to
19//!   gather and manage links from the web pages it crawls,
20//!   compiling them into a structured format for further use.
21//!
22//! ### Examples
23//!
24//! Basic usage of the Spider client might look like this:
25//!
26//! ```rust
27//! use spider_client::{Spider, RequestType, RequestParams};
28//! use tokio;
29//!
30//!  # #[ignore]
31//! #[tokio::main]
32//! async fn main() {
33//!     let spider = Spider::new(Some("myspiderapikey".into())).expect("API key must be provided");
34//!
35//!     let url = "https://spider.cloud";
36//!
37//!     // Scrape a single URL
38//!     let scraped_data = spider.scrape_url(url, None, "application/json").await.expect("Failed to scrape the URL");
39//!
40//!     println!("Scraped Data: {:?}", scraped_data);
41//!
42//!     // Crawl a website
43//!     let crawler_params = RequestParams {
44//!         limit: Some(1),
45//!         proxy_enabled: Some(true),
46//!         store_data: Some(false),
47//!         metadata: Some(false),
48//!         request: Some(RequestType::Http),
49//!         ..Default::default()
50//!     };
51//!
52//!     let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL");
53//!
54//!     println!("Crawl Result: {:?}", crawl_result);
55//! }
56//! ```
57//!
58//! ### Modules
59//!
60//! - `config`: Contains the configuration options for the Spider client.
61//! - `utils`: Utility functions used by the Spider client.
62//!
63
64use backon::ExponentialBuilder;
65use backon::Retryable;
66use reqwest::Client;
67use reqwest::{Error, Response};
68use serde::{Deserialize, Serialize};
69use std::collections::HashMap;
70use tokio_stream::StreamExt;
71
72/// Structure representing the Chunking algorithm dictionary.
73#[derive(Debug, Deserialize, Serialize, Clone)]
74pub struct ChunkingAlgDict {
75    /// The chunking algorithm to use, defined as a specific type.
76    r#type: ChunkingType,
77    /// The amount to chunk by.
78    value: i32,
79}
80
81// The nested structures
82#[derive(Serialize, Deserialize, Debug, Clone)]
83pub struct Timeout {
84    /// The seconds up to 60.
85    pub secs: u64,
86    /// The nanoseconds.
87    pub nanos: u32,
88}
89
90#[derive(Serialize, Deserialize, Debug, Clone)]
91pub struct IdleNetwork {
92    /// The timeout to wait until.
93    pub timeout: Timeout,
94}
95
96#[derive(Serialize, Deserialize, Debug, Clone)]
97#[serde(tag = "type", rename_all = "PascalCase")]
98pub enum WebAutomation {
99    Evaluate { code: String },
100    Click { selector: String },
101    Wait { duration: u64 },
102    WaitForNavigation,
103    WaitFor { selector: String },
104    WaitForAndClick { selector: String },
105    ScrollX { pixels: i32 },
106    ScrollY { pixels: i32 },
107    Fill { selector: String, value: String },
108    InfiniteScroll { times: u32 },
109}
110
111#[derive(Default, Serialize, Deserialize, Debug, Clone)]
112#[serde(tag = "type", rename_all = "PascalCase")]
113pub enum RedirectPolicy {
114    Loose,
115    #[default]
116    Strict,
117}
118
119pub type WebAutomationMap = std::collections::HashMap<String, Vec<WebAutomation>>;
/// Scripts keyed by a string (a url or url path, per the `execution_scripts` docs).
pub type ExecutionScriptsMap = HashMap<String, String>;
121
122#[derive(Serialize, Deserialize, Debug, Clone)]
123pub struct Selector {
124    /// The timeout to wait until.
125    pub timeout: Timeout,
126    /// The selector to wait for.
127    pub selector: String,
128}
129
130#[derive(Serialize, Deserialize, Debug, Clone)]
131pub struct Delay {
132    /// The timeout to wait until.
133    pub timeout: Timeout,
134}
135
136#[derive(Serialize, Deserialize, Debug, Clone)]
137pub struct WaitFor {
138    /// Wait until idle networks with a timeout of idleness.
139    pub idle_network: Option<IdleNetwork>,
140    /// Wait until a selector exist. Can determine if a selector exist after executing all js and network events.
141    pub selector: Option<Selector>,
142    /// Wait until a hard delay.
143    pub delay: Option<Delay>,
144    /// Wait until page navigation happen. Default is true.
145    pub page_navigations: Option<bool>,
146}
147
148/// Query request to get a document.
149#[derive(Serialize, Deserialize, Debug, Clone, Default)]
150pub struct QueryRequest {
151    /// The exact website url.
152    pub url: Option<String>,
153    /// The website domain.
154    pub domain: Option<String>,
155    /// The path of the resource.
156    pub pathname: Option<String>,
157}
158
159/// Enum representing different types of Chunking.
160#[derive(Default, Debug, Deserialize, Serialize, Clone)]
161#[serde(rename_all = "lowercase")]
162pub enum ChunkingType {
163    #[default]
164    /// By the word count.
165    ByWords,
166    /// By the line count.
167    ByLines,
168    /// By the char length.
169    ByCharacterLength,
170    /// By sentence.
171    BySentence,
172}
173
174#[derive(Default, Debug, Deserialize, Serialize, Clone)]
175/// View port handling for chrome.
176pub struct Viewport {
177    /// Device screen Width
178    pub width: u32,
179    /// Device screen size
180    pub height: u32,
181    /// Device scale factor
182    pub device_scale_factor: Option<f64>,
183    /// Emulating Mobile?
184    pub emulating_mobile: bool,
185    /// Use landscape mode instead of portrait.
186    pub is_landscape: bool,
187    /// Touch screen device?
188    pub has_touch: bool,
189}
190
/// Base URL of the Spider API; endpoint paths are appended after a `/`.
const API_URL: &str = "https://api.spider.cloud";
193
194// Define the CSSSelector struct
195#[derive(Debug, Clone, Default, Deserialize, Serialize)]
196pub struct CSSSelector {
197    /// The name of the selector group
198    pub name: String,
199    /// A vector of CSS selectors
200    pub selectors: Vec<String>,
201}
202
203// Define the CSSExtractionMap type
204pub type CSSExtractionMap = HashMap<String, Vec<CSSSelector>>;
205
206/// Represents the settings for a webhook configuration
207#[derive(Debug, Default, Deserialize, Serialize, Clone)]
208pub struct WebhookSettings {
209    /// The destination where the webhook information will be sent
210    destination: String,
211    /// Trigger an action when all credits are depleted
212    on_credits_depleted: bool,
213    /// Trigger an action when half of the credits are depleted
214    on_credits_half_depleted: bool,
215    /// Trigger an action on a website status update event
216    on_website_status: bool,
217    /// Send information about a new page find (such as links and bytes)
218    on_find: bool,
219    /// Handle the metadata of a found page
220    on_find_metadata: bool,
221}
222
223/// Send multiple return formats.
224#[derive(Debug, Deserialize, Serialize, Clone)]
225#[serde(untagged)]
226pub enum ReturnFormatHandling {
227    /// A single return item.
228    Single(ReturnFormat),
229    /// Multiple return formats.
230    Multi(std::collections::HashSet<ReturnFormat>),
231}
232
233impl Default for ReturnFormatHandling {
234    fn default() -> ReturnFormatHandling {
235        ReturnFormatHandling::Single(ReturnFormat::Raw)
236    }
237}
238
239#[derive(Debug, Default, Deserialize, Serialize, Clone)]
240pub struct EventTracker {
241    /// The responses received.
242    responses: Option<bool>,
243    ///The request sent.
244    requests: Option<bool>
245}
246
247/// Structure representing request parameters.
248#[derive(Debug, Default, Deserialize, Serialize, Clone)]
249pub struct RequestParams {
250    #[serde(default)]
251    /// The URL to be crawled.
252    pub url: Option<String>,
253    #[serde(default)]
254    /// The type of request to be made.
255    pub request: Option<RequestType>,
256    #[serde(default)]
257    /// The maximum number of pages the crawler should visit.
258    pub limit: Option<u32>,
259    #[serde(default)]
260    /// The format in which the result should be returned.
261    pub return_format: Option<ReturnFormatHandling>,
262    #[serde(default)]
263    /// Specifies whether to only visit the top-level domain.
264    pub tld: Option<bool>,
265    #[serde(default)]
266    /// The depth of the crawl.
267    pub depth: Option<u32>,
268    #[serde(default)]
269    /// Specifies whether the request should be cached.
270    pub cache: Option<bool>,
271    #[serde(default)]
272    /// Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'.
273    pub scroll: Option<u32>,
274    #[serde(default)]
275    /// The budget for various resources.
276    pub budget: Option<HashMap<String, u32>>,
277    #[serde(default)]
278    /// The blacklist routes to ignore. This can be a Regex string pattern.
279    pub blacklist: Option<Vec<String>>,
280    #[serde(default)]
281    /// The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing.
282    pub whitelist: Option<Vec<String>>,
283    #[serde(default)]
284    /// The locale to be used during the crawl.
285    pub locale: Option<String>,
286    #[serde(default)]
287    /// The cookies to be set for the request, formatted as a single string.
288    pub cookies: Option<String>,
289    #[serde(default)]
290    /// Specifies whether to use stealth techniques to avoid detection.
291    pub stealth: Option<bool>,
292    #[serde(default)]
293    /// The headers to be used for the request.
294    pub headers: Option<HashMap<String, String>>,
295    #[serde(default)]
296    /// Specifies whether anti-bot measures should be used.
297    pub anti_bot: Option<bool>,
298    #[serde(default)]
299    /// Specifies whether to send data via webhooks.
300    pub webhooks: Option<WebhookSettings>,
301    #[serde(default)]
302    /// Specifies whether to include metadata in the response.
303    pub metadata: Option<bool>,
304    #[serde(default)]
305    /// The dimensions of the viewport.
306    pub viewport: Option<Viewport>,
307    #[serde(default)]
308    /// The encoding to be used for the request.
309    pub encoding: Option<String>,
310    #[serde(default)]
311    /// Specifies whether to include subdomains in the crawl.
312    pub subdomains: Option<bool>,
313    #[serde(default)]
314    /// The user agent string to be used for the request.
315    pub user_agent: Option<String>,
316    #[serde(default)]
317    /// Specifies whether the response data should be stored.
318    pub store_data: Option<bool>,
319    #[serde(default)]
320    /// Configuration settings for GPT (general purpose texture mappings).
321    pub gpt_config: Option<HashMap<String, String>>,
322    #[serde(default)]
323    /// Specifies whether to use fingerprinting protection.
324    pub fingerprint: Option<bool>,
325    #[serde(default)]
326    /// Specifies whether to perform the request without using storage.
327    pub storageless: Option<bool>,
328    #[serde(default)]
329    /// Specifies whether readability optimizations should be applied.
330    pub readability: Option<bool>,
331    #[serde(default)]
332    /// Specifies whether to use a proxy for the request.
333    pub proxy_enabled: Option<bool>,
334    #[serde(default)]
335    /// Specifies whether to respect the site's robots.txt file.
336    pub respect_robots: Option<bool>,
337    #[serde(default)]
338    /// CSS selector to be used to filter the content.
339    pub root_selector: Option<String>,
340    #[serde(default)]
341    /// Specifies whether to load all resources of the crawl target.
342    pub full_resources: Option<bool>,
343    #[serde(default)]
344    /// The text string to extract data from.
345    pub text: Option<String>,
346    #[serde(default)]
347    /// Specifies whether to use the sitemap links.
348    pub sitemap: Option<bool>,
349    #[serde(default)]
350    /// External domains to include the crawl.
351    pub external_domains: Option<Vec<String>>,
352    #[serde(default)]
353    /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`.
354    pub return_embeddings: Option<bool>,
355    #[serde(default)]
356    /// Returns the HTTP response headers.
357    pub return_headers: Option<bool>,
358    #[serde(default)]
359    /// Returns the link(s) found on the page that match the crawler query.
360    pub return_page_links: Option<bool>,
361    #[serde(default)]
362    /// Returns the HTTP response cookies.
363    pub return_cookies: Option<bool>,
364    #[serde(default)]
365    /// The timeout for the request, in milliseconds.
366    pub request_timeout: Option<u8>,
367    #[serde(default)]
368    /// Specifies whether to run the request in the background.
369    pub run_in_background: Option<bool>,
370    #[serde(default)]
371    /// Specifies whether to skip configuration checks.
372    pub skip_config_checks: Option<bool>,
373    #[serde(default)]
374    /// Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page.
375    pub css_extraction_map: Option<CSSExtractionMap>,
376    #[serde(default)]
377    /// The chunking algorithm to use.
378    pub chunking_alg: Option<ChunkingAlgDict>,
379    #[serde(default)]
380    /// Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content.
381    pub disable_intercept: Option<bool>,
382    #[serde(default)]
383    /// The wait for events on the page. You need to make your `request` `chrome` or `smart`.
384    pub wait_for: Option<WaitFor>,
385    #[serde(default)]
386    /// Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`
387    pub execution_scripts: Option<ExecutionScriptsMap>,
388    #[serde(default)]
389    /// Perform web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`
390    pub automation_scripts: Option<WebAutomationMap>,
391    #[serde(default)]
392    /// The redirect policy for HTTP request. Set the value to Loose to allow all.
393    pub redirect_policy: Option<RedirectPolicy>,
394    #[serde(default)]
395    /// Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent.
396    pub event_tracker: Option<EventTracker>
397}
398
399/// The structure representing request parameters for a search request.
400#[derive(Debug, Default, Deserialize, Serialize, Clone)]
401pub struct SearchRequestParams {
402    /// The base request parameters.
403    #[serde(default, flatten)]
404    pub base: RequestParams,
405    // The search request.
406    pub search: String,
407    /// The search limit.
408    pub search_limit: Option<u32>,
409    // Fetch the page content. Defaults to true.
410    pub fetch_page_content: Option<bool>,
411    /// The search location of the request
412    pub location: Option<String>,
413    /// The country code of the request
414    pub country: Option<String>,
415    /// The language code of the request.
416    pub language: Option<String>,
417    /// The number of search results
418    pub num: Option<u32>,
419    /// The page of the search results.
420    pub page: Option<u32>,
421    #[serde(default)]
422    /// The websites limit if a list is sent from text or urls comma split. This helps automatic configuration of the system.
423    pub website_limit: Option<u32>,
424}
425
426/// Structure representing request parameters for transforming files.
427#[derive(Debug, Default, Deserialize, Serialize, Clone)]
428pub struct TransformParams {
429    #[serde(default)]
430    /// The format in which the result should be returned.
431    pub return_format: Option<ReturnFormat>,
432    #[serde(default)]
433    /// Specifies whether readability optimizations should be applied.
434    pub readability: Option<bool>,
435    #[serde(default)]
436    /// Clean the markdown or text for AI.
437    pub clean: Option<bool>,
438    #[serde(default)]
439    /// Clean the markdown or text for AI removing footers, navigation, and more.
440    pub clean_full: Option<bool>,
441    /// The data being transformed.
442    pub data: Vec<DataParam>,
443}
444
445#[derive(Serialize, Deserialize, Debug, Clone)]
446pub struct DataParam {
447    /// The HTML resource.
448    pub html: String,
449    /// The website url.
450    pub url: Option<String>,
451}
452
453/// the request type to perform
454#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
455#[serde(rename_all = "lowercase")]
456pub enum RequestType {
457    /// Default HTTP request
458    Http,
459    /// Chrome browser rendering
460    Chrome,
461    #[default]
462    /// Smart mode defaulting to HTTP and using Chrome when needed.
463    SmartMode,
464}
465
466/// Enum representing different return formats.
467#[derive(Default, Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Hash)]
468#[serde(rename_all = "lowercase")]
469pub enum ReturnFormat {
470    #[default]
471    /// The default return format of the resource.
472    Raw,
473    /// Return the response as Markdown.
474    Markdown,
475    /// Return the response as Commonmark.
476    Commonmark,
477    /// Return the response as Html2text.
478    Html2text,
479    /// Return the response as Text.
480    Text,
481    /// Return the response as XML.
482    Xml,
483    /// Return the response as Bytes.
484    Bytes,
485}
486
487/// Represents a Spider with API key and HTTP client.
488#[derive(Debug, Default)]
489pub struct Spider {
490    /// The Spider API key.
491    pub api_key: String,
492    /// The Spider Client to re-use.
493    pub client: Client,
494}
495
496impl Spider {
497    /// Creates a new instance of Spider.
498    ///
499    /// # Arguments
500    ///
501    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
502    ///
503    /// # Returns
504    ///
505    /// A new instance of Spider or an error string if no API key is provided.
506    pub fn new(api_key: Option<String>) -> Result<Self, &'static str> {
507        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
508
509        match api_key {
510            Some(key) => Ok(Self {
511                api_key: key,
512                client: Client::new(),
513            }),
514            None => Err("No API key provided"),
515        }
516    }
517
518    /// Creates a new instance of Spider.
519    ///
520    /// # Arguments
521    ///
522    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
523    /// * `client` - A custom client to pass in.
524    ///
525    /// # Returns
526    ///
527    /// A new instance of Spider or an error string if no API key is provided.
528    pub fn new_with_client(api_key: Option<String>, client: Client) -> Result<Self, &'static str> {
529        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
530
531        match api_key {
532            Some(key) => Ok(Self {
533                api_key: key,
534                client,
535            }),
536            None => Err("No API key provided"),
537        }
538    }
539
540    /// Sends a POST request to the API.
541    ///
542    /// # Arguments
543    ///
544    /// * `endpoint` - The API endpoint.
545    /// * `data` - The request data as a HashMap.
546    /// * `stream` - Whether streaming is enabled.
547    /// * `content_type` - The content type of the request.
548    ///
549    /// # Returns
550    ///
551    /// The response from the API.
552    async fn api_post_base(
553        &self,
554        endpoint: &str,
555        data: impl Serialize + Sized + std::fmt::Debug,
556        content_type: &str,
557    ) -> Result<Response, Error> {
558        let url: String = format!("{API_URL}/{}", endpoint);
559
560        self.client
561            .post(&url)
562            .header(
563                "User-Agent",
564                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
565            )
566            .header("Content-Type", content_type)
567            .header("Authorization", format!("Bearer {}", self.api_key))
568            .json(&data)
569            .send()
570            .await
571    }
572
573    /// Sends a POST request to the API.
574    ///
575    /// # Arguments
576    ///
577    /// * `endpoint` - The API endpoint.
578    /// * `data` - The request data as a HashMap.
579    /// * `stream` - Whether streaming is enabled.
580    /// * `content_type` - The content type of the request.
581    ///
582    /// # Returns
583    ///
584    /// The response from the API.
585    async fn api_post(
586        &self,
587        endpoint: &str,
588        data: impl Serialize + std::fmt::Debug + Clone + Send + Sync,
589        content_type: &str,
590    ) -> Result<Response, Error> {
591        let fetch = || async {
592            self.api_post_base(endpoint, data.to_owned(), content_type)
593                .await
594        };
595
596        fetch
597            .retry(ExponentialBuilder::default().with_max_times(5))
598            .when(|err: &reqwest::Error| {
599                if let Some(status) = err.status() {
600                    status.is_server_error()
601                } else {
602                    err.is_timeout()
603                }
604            })
605            .await
606    }
607
608    /// Sends a GET request to the API.
609    ///
610    /// # Arguments
611    ///
612    /// * `endpoint` - The API endpoint.
613    ///
614    /// # Returns
615    ///
616    /// The response from the API as a JSON value.
617    async fn api_get_base<T: Serialize>(
618        &self,
619        endpoint: &str,
620        query_params: Option<&T>,
621    ) -> Result<serde_json::Value, reqwest::Error> {
622        let url = format!("{API_URL}/{}", endpoint);
623        let res = self
624            .client
625            .get(&url)
626            .query(&query_params)
627            .header(
628                "User-Agent",
629                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
630            )
631            .header("Content-Type", "application/json")
632            .header("Authorization", format!("Bearer {}", self.api_key))
633            .send()
634            .await?;
635        res.json().await
636    }
637
638    /// Sends a GET request to the API.
639    ///
640    /// # Arguments
641    ///
642    /// * `endpoint` - The API endpoint.
643    ///
644    /// # Returns
645    ///
646    /// The response from the API as a JSON value.
647    async fn api_get<T: Serialize>(
648        &self,
649        endpoint: &str,
650        query_params: Option<&T>,
651    ) -> Result<serde_json::Value, reqwest::Error> {
652        let fetch = || async { self.api_get_base(endpoint, query_params.to_owned()).await };
653
654        fetch
655            .retry(ExponentialBuilder::default().with_max_times(5))
656            .when(|err: &reqwest::Error| {
657                if let Some(status) = err.status() {
658                    status.is_server_error()
659                } else {
660                    err.is_timeout()
661                }
662            })
663            .await
664    }
665
666    /// Sends a DELETE request to the API.
667    ///
668    /// # Arguments
669    ///
670    /// * `endpoint` - The API endpoint.
671    /// * `params` - Optional request parameters.
672    /// * `stream` - Whether streaming is enabled.
673    /// * `content_type` - The content type of the request.
674    ///
675    /// # Returns
676    ///
677    /// The response from the API.
678    async fn api_delete_base(
679        &self,
680        endpoint: &str,
681        params: Option<HashMap<String, serde_json::Value>>,
682    ) -> Result<Response, Error> {
683        let url = format!("{API_URL}/v1/{}", endpoint);
684        let request_builder = self
685            .client
686            .delete(&url)
687            .header(
688                "User-Agent",
689                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
690            )
691            .header("Content-Type", "application/json")
692            .header("Authorization", format!("Bearer {}", self.api_key));
693
694        let request_builder = if let Some(params) = params {
695            request_builder.json(&params)
696        } else {
697            request_builder
698        };
699
700        request_builder.send().await
701    }
702
703    /// Sends a DELETE request to the API.
704    ///
705    /// # Arguments
706    ///
707    /// * `endpoint` - The API endpoint.
708    /// * `params` - Optional request parameters.
709    /// * `stream` - Whether streaming is enabled.
710    /// * `content_type` - The content type of the request.
711    ///
712    /// # Returns
713    ///
714    /// The response from the API.
715    async fn api_delete(
716        &self,
717        endpoint: &str,
718        params: Option<HashMap<String, serde_json::Value>>,
719    ) -> Result<Response, Error> {
720        let fetch = || async { self.api_delete_base(endpoint, params.to_owned()).await };
721
722        fetch
723            .retry(ExponentialBuilder::default().with_max_times(5))
724            .when(|err: &reqwest::Error| {
725                if let Some(status) = err.status() {
726                    status.is_server_error()
727                } else {
728                    err.is_timeout()
729                }
730            })
731            .await
732    }
733
734    /// Scrapes a URL.
735    ///
736    /// # Arguments
737    ///
738    /// * `url` - The URL to scrape.
739    /// * `params` - Optional request parameters.
740    /// * `stream` - Whether streaming is enabled.
741    /// * `content_type` - The content type of the request.
742    ///
743    /// # Returns
744    ///
745    /// The response from the API as a JSON value.
746    pub async fn scrape_url(
747        &self,
748        url: &str,
749        params: Option<RequestParams>,
750        content_type: &str,
751    ) -> Result<serde_json::Value, reqwest::Error> {
752        let mut data = HashMap::new();
753
754        data.insert(
755            "url".to_string(),
756            serde_json::Value::String(url.to_string()),
757        );
758        data.insert("limit".to_string(), serde_json::Value::Number(1.into()));
759
760        if let Ok(params) = serde_json::to_value(params) {
761            if let Some(ref p) = params.as_object() {
762                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
763            }
764        }
765
766        let res = self.api_post("crawl", data, content_type).await?;
767        res.json().await
768    }
769
770    /// Crawls a URL.
771    ///
772    /// # Arguments
773    ///
774    /// * `url` - The URL to crawl.
775    /// * `params` - Optional request parameters.
776    /// * `stream` - Whether streaming is enabled.
777    /// * `content_type` - The content type of the request.
778    /// * `callback` - Optional callback function to handle each streamed chunk.
779    ///
780    /// # Returns
781    ///
782    /// The response from the API as a JSON value.
783    pub async fn crawl_url(
784        &self,
785        url: &str,
786        params: Option<RequestParams>,
787        stream: bool,
788        content_type: &str,
789        callback: Option<impl Fn(serde_json::Value) + Send>,
790    ) -> Result<serde_json::Value, reqwest::Error> {
791        use tokio_util::codec::{FramedRead, LinesCodec};
792
793        let mut data = HashMap::new();
794
795        if let Ok(params) = serde_json::to_value(params) {
796            if let Some(ref p) = params.as_object() {
797                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
798            }
799        }
800
801        data.insert("url".into(), serde_json::Value::String(url.to_string()));
802
803        let res = self.api_post("crawl", data, content_type).await?;
804
805        if stream {
806            if let Some(callback) = callback {
807                let stream = res.bytes_stream();
808
809                let stream_reader = tokio_util::io::StreamReader::new(
810                    stream.map(|r| r.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))),
811                );
812
813                let mut lines = FramedRead::new(stream_reader, LinesCodec::new());
814
815                while let Some(line_result) = lines.next().await {
816                    match line_result {
817                        Ok(line) => {
818                            match serde_json::from_str::<serde_json::Value>(&line) {
819                                Ok(value) => {
820                                    callback(value);
821                                }
822                                Err(_e) => {
823                                    continue;
824                                }
825                            }
826                        }
827                        Err(_e) => {
828                            return Ok(serde_json::Value::Null)
829                        }
830                    }
831                }
832
833                Ok(serde_json::Value::Null)
834            } else {
835                Ok(serde_json::Value::Null)
836            }
837        } else {
838            res.json().await
839        }
840    }
841
842    /// Fetches links from a URL.
843    ///
844    /// # Arguments
845    ///
846    /// * `url` - The URL to fetch links from.
847    /// * `params` - Optional request parameters.
848    /// * `stream` - Whether streaming is enabled.
849    /// * `content_type` - The content type of the request.
850    ///
851    /// # Returns
852    ///
853    /// The response from the API as a JSON value.
854    pub async fn links(
855        &self,
856        url: &str,
857        params: Option<RequestParams>,
858        _stream: bool,
859        content_type: &str,
860    ) -> Result<serde_json::Value, reqwest::Error> {
861        let mut data = HashMap::new();
862
863        if let Ok(params) = serde_json::to_value(params) {
864            if let Some(ref p) = params.as_object() {
865                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
866            }
867        }
868
869        data.insert("url".into(), serde_json::Value::String(url.to_string()));
870
871        let res = self.api_post("links", data, content_type).await?;
872        res.json().await
873    }
874
875    /// Takes a screenshot of a URL.
876    ///
877    /// # Arguments
878    ///
879    /// * `url` - The URL to take a screenshot of.
880    /// * `params` - Optional request parameters.
881    /// * `stream` - Whether streaming is enabled.
882    /// * `content_type` - The content type of the request.
883    ///
884    /// # Returns
885    ///
886    /// The response from the API as a JSON value.
887    pub async fn screenshot(
888        &self,
889        url: &str,
890        params: Option<RequestParams>,
891        _stream: bool,
892        content_type: &str,
893    ) -> Result<serde_json::Value, reqwest::Error> {
894        let mut data = HashMap::new();
895
896        if let Ok(params) = serde_json::to_value(params) {
897            if let Some(ref p) = params.as_object() {
898                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
899            }
900        }
901
902        data.insert("url".into(), serde_json::Value::String(url.to_string()));
903
904        let res = self.api_post("screenshot", data, content_type).await?;
905        res.json().await
906    }
907
908    /// Searches for a query.
909    ///
910    /// # Arguments
911    ///
912    /// * `q` - The query to search for.
913    /// * `params` - Optional request parameters.
914    /// * `stream` - Whether streaming is enabled.
915    /// * `content_type` - The content type of the request.
916    ///
917    /// # Returns
918    ///
919    /// The response from the API as a JSON value.
920    pub async fn search(
921        &self,
922        q: &str,
923        params: Option<SearchRequestParams>,
924        _stream: bool,
925        content_type: &str,
926    ) -> Result<serde_json::Value, reqwest::Error> {
927        let body = match params {
928            Some(mut params) => {
929                params.search = q.to_string();
930                params
931            }
932            _ => {
933                let mut params = SearchRequestParams::default();
934                params.search = q.to_string();
935                params
936            }
937        };
938
939        let res = self.api_post("search", body, content_type).await?;
940
941        res.json().await
942    }
943
944    /// Transforms data.
945    ///
946    /// # Arguments
947    ///
948    /// * `data` - The data to transform.
949    /// * `params` - Optional request parameters.
950    /// * `stream` - Whether streaming is enabled.
951    /// * `content_type` - The content type of the request.
952    ///
953    /// # Returns
954    ///
955    /// The response from the API as a JSON value.
956    pub async fn transform(
957        &self,
958        data: Vec<HashMap<&str, &str>>,
959        params: Option<TransformParams>,
960        _stream: bool,
961        content_type: &str,
962    ) -> Result<serde_json::Value, reqwest::Error> {
963        let mut payload = HashMap::new();
964
965        if let Ok(params) = serde_json::to_value(params) {
966            if let Some(ref p) = params.as_object() {
967                payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
968            }
969        }
970
971        if let Ok(d) = serde_json::to_value(data) {
972            payload.insert("data".into(), d);
973        }
974
975        let res = self.api_post("transform", payload, content_type).await?;
976
977        res.json().await
978    }
979
980    /// Extracts contacts from a URL.
981    ///
982    /// # Arguments
983    ///
984    /// * `url` - The URL to extract contacts from.
985    /// * `params` - Optional request parameters.
986    /// * `stream` - Whether streaming is enabled.
987    /// * `content_type` - The content type of the request.
988    ///
989    /// # Returns
990    ///
991    /// The response from the API as a JSON value.
992    pub async fn extract_contacts(
993        &self,
994        url: &str,
995        params: Option<RequestParams>,
996        _stream: bool,
997        content_type: &str,
998    ) -> Result<serde_json::Value, reqwest::Error> {
999        let mut data = HashMap::new();
1000
1001        if let Ok(params) = serde_json::to_value(params) {
1002            if let Ok(params) = serde_json::to_value(params) {
1003                if let Some(ref p) = params.as_object() {
1004                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1005                }
1006            }
1007        }
1008
1009        match serde_json::to_value(url) {
1010            Ok(u) => {
1011                data.insert("url".into(), u);
1012            }
1013            _ => (),
1014        }
1015
1016        let res = self
1017            .api_post("pipeline/extract-contacts", data, content_type)
1018            .await?;
1019        res.json().await
1020    }
1021
1022    /// Labels data from a URL.
1023    ///
1024    /// # Arguments
1025    ///
1026    /// * `url` - The URL to label data from.
1027    /// * `params` - Optional request parameters.
1028    /// * `stream` - Whether streaming is enabled.
1029    /// * `content_type` - The content type of the request.
1030    ///
1031    /// # Returns
1032    ///
1033    /// The response from the API as a JSON value.
1034    pub async fn label(
1035        &self,
1036        url: &str,
1037        params: Option<RequestParams>,
1038        _stream: bool,
1039        content_type: &str,
1040    ) -> Result<serde_json::Value, reqwest::Error> {
1041        let mut data = HashMap::new();
1042
1043        if let Ok(params) = serde_json::to_value(params) {
1044            if let Ok(params) = serde_json::to_value(params) {
1045                if let Some(ref p) = params.as_object() {
1046                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1047                }
1048            }
1049        }
1050
1051        data.insert("url".into(), serde_json::Value::String(url.to_string()));
1052
1053        let res = self.api_post("pipeline/label", data, content_type).await?;
1054        res.json().await
1055    }
1056
1057    /// Download a record from storage.
1058    ///
1059    /// # Arguments
1060    ///
1061    /// * `url` - Optional exact url of the file in storage.
1062    /// * `options` - Optional options.
1063    /// * `stream` - Whether streaming is enabled.
1064    ///
1065    /// # Returns
1066    ///
1067    /// The response from the API.
1068    pub async fn download(
1069        &self,
1070        url: Option<&str>,
1071        options: Option<HashMap<&str, i32>>,
1072    ) -> Result<reqwest::Response, reqwest::Error> {
1073        let mut params = HashMap::new();
1074
1075        if let Some(url) = url {
1076            params.insert("url".to_string(), url.to_string());
1077        }
1078
1079        if let Some(options) = options {
1080            for (key, value) in options {
1081                params.insert(key.to_string(), value.to_string());
1082            }
1083        }
1084
1085        let url = format!("{API_URL}/v1/data/download");
1086        let request = self
1087            .client
1088            .get(&url)
1089            .header(
1090                "User-Agent",
1091                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
1092            )
1093            .header("Content-Type", "application/octet-stream")
1094            .header("Authorization", format!("Bearer {}", self.api_key))
1095            .query(&params);
1096
1097        let res = request.send().await?;
1098
1099        Ok(res)
1100    }
1101
1102    /// Creates a signed URL of a file from storage.
1103    ///
1104    /// # Arguments
1105    ///
1106    /// * `url` - Optional exact url of the file in storage.
1107    /// * `options` - Optional options.
1108    /// * `stream` - Whether streaming is enabled.
1109    ///
1110    /// # Returns
1111    ///
1112    /// The response from the API.
1113    pub async fn create_signed_url(
1114        &self,
1115        url: Option<&str>,
1116        options: Option<HashMap<&str, i32>>,
1117    ) -> Result<serde_json::Value, reqwest::Error> {
1118        let mut params = HashMap::new();
1119
1120        if let Some(options) = options {
1121            for (key, value) in options {
1122                params.insert(key.to_string(), value.to_string());
1123            }
1124        }
1125
1126        if let Some(url) = url {
1127            params.insert("url".to_string(), url.to_string());
1128        }
1129
1130        let url = format!("{API_URL}/v1/data/sign-url");
1131        let request = self
1132            .client
1133            .get(&url)
1134            .header(
1135                "User-Agent",
1136                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
1137            )
1138            .header("Authorization", format!("Bearer {}", self.api_key))
1139            .query(&params);
1140
1141        let res = request.send().await?;
1142
1143        res.json().await
1144    }
1145
1146    /// Gets the crawl state of a URL.
1147    ///
1148    /// # Arguments
1149    ///
1150    /// * `url` - The URL to get the crawl state of.
1151    /// * `params` - Optional request parameters.
1152    /// * `stream` - Whether streaming is enabled.
1153    /// * `content_type` - The content type of the request.
1154    ///
1155    /// # Returns
1156    ///
1157    pub async fn get_crawl_state(
1158        &self,
1159        url: &str,
1160        params: Option<RequestParams>,
1161        content_type: &str,
1162    ) -> Result<serde_json::Value, reqwest::Error> {
1163        let mut payload = HashMap::new();
1164        payload.insert("url".into(), serde_json::Value::String(url.to_string()));
1165        payload.insert(
1166            "contentType".into(),
1167            serde_json::Value::String(content_type.to_string()),
1168        );
1169
1170        if let Ok(params) = serde_json::to_value(params) {
1171            if let Ok(params) = serde_json::to_value(params) {
1172                if let Some(ref p) = params.as_object() {
1173                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1174                }
1175            }
1176        }
1177
1178        let res = self
1179            .api_post("data/crawl_state", payload, content_type)
1180            .await?;
1181        res.json().await
1182    }
1183
1184    /// Get the account credits left.
1185    pub async fn get_credits(&self) -> Result<serde_json::Value, reqwest::Error> {
1186        self.api_get::<serde_json::Value>("data/credits", None)
1187            .await
1188    }
1189
1190    /// Send a request for a data record.
1191    pub async fn data_post(
1192        &self,
1193        table: &str,
1194        data: Option<RequestParams>,
1195    ) -> Result<serde_json::Value, reqwest::Error> {
1196        let res = self
1197            .api_post(&format!("data/{}", table), data, "application/json")
1198            .await?;
1199        res.json().await
1200    }
1201
1202    /// Query a record from the global DB.
1203    pub async fn query(&self, params: &QueryRequest) -> Result<serde_json::Value, reqwest::Error> {
1204        let res = self
1205            .api_get::<QueryRequest>(&"data/query", Some(params))
1206            .await?;
1207
1208        Ok(res)
1209    }
1210
1211    /// Get a table record.
1212    pub async fn data_get(
1213        &self,
1214        table: &str,
1215        params: Option<RequestParams>,
1216    ) -> Result<serde_json::Value, reqwest::Error> {
1217        let mut payload = HashMap::new();
1218
1219        if let Some(params) = params {
1220            if let Ok(p) = serde_json::to_value(params) {
1221                if let Some(o) = p.as_object() {
1222                    payload.extend(o.iter().map(|(k, v)| (k.as_str(), v.clone())));
1223                }
1224            }
1225        }
1226
1227        let res = self
1228            .api_get::<serde_json::Value>(&format!("data/{}", table), None)
1229            .await?;
1230        Ok(res)
1231    }
1232
1233    /// Delete a record.
1234    pub async fn data_delete(
1235        &self,
1236        table: &str,
1237        params: Option<RequestParams>,
1238    ) -> Result<serde_json::Value, reqwest::Error> {
1239        let mut payload = HashMap::new();
1240
1241        if let Ok(params) = serde_json::to_value(params) {
1242            if let Ok(params) = serde_json::to_value(params) {
1243                if let Some(ref p) = params.as_object() {
1244                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1245                }
1246            }
1247        }
1248
1249        let res = self
1250            .api_delete(&format!("data/{}", table), Some(payload))
1251            .await?;
1252        res.json().await
1253    }
1254}
1255
#[cfg(test)]
mod tests {
    use super::*;
    use dotenv::dotenv;
    use lazy_static::lazy_static;
    use reqwest::ClientBuilder;

    lazy_static! {
        // One client shared by every test. `dotenv` is attempted first and its failure is
        // ignored; no API key is passed explicitly.
        // NOTE(review): presumably the key is then read from the environment — confirm
        // against `Spider::new_with_client`.
        static ref SPIDER_CLIENT: Spider = {
            dotenv().ok();
            let http = ClientBuilder::new()
                .user_agent("SpiderBot")
                .build()
                .unwrap();

            Spider::new_with_client(None, http).expect("client to build")
        };
    }

    /// Scraping a single page succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_scrape_url() {
        let outcome = SPIDER_CLIENT
            .scrape_url("https://example.com", None, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    /// Crawling without streaming or a callback succeeds.
    #[tokio::test]
    async fn test_crawl_url() {
        let outcome = SPIDER_CLIENT
            .crawl_url(
                "https://example.com",
                None,
                false,
                "application/json",
                None::<fn(serde_json::Value)>,
            )
            .await;
        assert!(outcome.is_ok());
    }

    /// Fetching links with default parameters succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_links() {
        let outcome: Result<serde_json::Value, Error> = SPIDER_CLIENT
            .links("https://example.com", None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    /// Taking a screenshot with a page limit of one succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_screenshot() {
        let params = RequestParams {
            limit: Some(1),
            ..Default::default()
        };

        let outcome = SPIDER_CLIENT
            .screenshot(
                "https://example.com",
                Some(params),
                false,
                "application/json",
            )
            .await;
        assert!(outcome.is_ok());
    }

    // Disabled search test, kept for reference.
    //
    // #[tokio::test(flavor = "multi_thread")]
    // async fn test_search() {
    //     let mut params = SearchRequestParams::default();

    //     params.search_limit = Some(1);
    //     params.num = Some(1);
    //     params.fetch_page_content = Some(false);

    //     let response = SPIDER_CLIENT
    //         .search("a sports website", Some(params), false, "application/json")
    //         .await;

    //     assert!(response.is_ok());
    // }

    /// Transforming a single HTML snippet succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_transform() {
        let snippet = HashMap::from([("<html><body><h1>Transformation</h1></body></html>", "")]);
        let outcome = SPIDER_CLIENT
            .transform(vec![snippet], None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    /// Extracting contacts with default parameters succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_extract_contacts() {
        let outcome = SPIDER_CLIENT
            .extract_contacts("https://example.com", None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    /// Labelling a page with default parameters succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_label() {
        let outcome = SPIDER_CLIENT
            .label("https://example.com", None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    /// Creating a signed URL without extra options succeeds.
    #[tokio::test]
    async fn test_create_signed_url() {
        let outcome = SPIDER_CLIENT
            .create_signed_url(Some("example.com"), None)
            .await;
        assert!(outcome.is_ok());
    }

    /// Reading the crawl state with default parameters succeeds.
    #[tokio::test]
    async fn test_get_crawl_state() {
        let outcome = SPIDER_CLIENT
            .get_crawl_state("https://example.com", None, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    /// Querying by domain succeeds.
    #[tokio::test]
    async fn test_query() {
        let request = QueryRequest {
            domain: Some("spider.cloud".into()),
            ..Default::default()
        };

        let outcome = SPIDER_CLIENT.query(&request).await;
        assert!(outcome.is_ok());
    }

    /// Reading the remaining credits succeeds.
    #[tokio::test]
    async fn test_get_credits() {
        let outcome = SPIDER_CLIENT.get_credits().await;
        assert!(outcome.is_ok());
    }
}
spider_client/lib.rs

spider_client/
lib.rs