spider_client/
lib.rs

1//! The `spider-client` module provides the primary interface and
2//! functionalities for the Spider web crawler library, which is
3//! designed for rapid and efficient crawling of web pages to gather
4//! links using isolated contexts.
5//!
6//! ### Features
7//!
//! - **Multi-threaded Crawling:** Spider can utilize multiple
//!   threads to parallelize the crawling process, drastically
//!   improving performance and making it possible to gather
//!   millions of pages in a short time.
12//!
13//! - **Configurable:** The library provides various options to
14//!   configure the crawling behavior, such as setting the depth
15//!   of crawling, user-agent strings, delays between requests,
16//!   and more.
17//!
18//! - **Link Gathering:** One of the primary objectives of Spider is to
19//!   gather and manage links from the web pages it crawls,
20//!   compiling them into a structured format for further use.
21//!
22//! ### Examples
23//!
24//! Basic usage of the Spider client might look like this:
25//!
26//! ```rust
27//! use spider_client::{Spider, RequestType, RequestParams};
28//! use tokio;
29//!
30//!  # #[ignore]
31//! #[tokio::main]
32//! async fn main() {
33//!     let spider = Spider::new(Some("myspiderapikey".into())).expect("API key must be provided");
34//!
35//!     let url = "https://spider.cloud";
36//!
37//!     // Scrape a single URL
38//!     let scraped_data = spider.scrape_url(url, None, "application/json").await.expect("Failed to scrape the URL");
39//!
40//!     println!("Scraped Data: {:?}", scraped_data);
41//!
42//!     // Crawl a website
43//!     let crawler_params = RequestParams {
44//!         limit: Some(1),
45//!         proxy_enabled: Some(true),
46//!         store_data: Some(false),
47//!         metadata: Some(false),
48//!         request: Some(RequestType::Http),
49//!         ..Default::default()
50//!     };
51//!
52//!     let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL");
53//!
54//!     println!("Crawl Result: {:?}", crawl_result);
55//! }
56//! ```
57//!
58//! ### Modules
59//!
//! - `shapes`: Request and response types used by the Spider client
//!   (re-exported at the crate root).
62//!
63
64pub mod shapes;
65
66use backon::ExponentialBuilder;
67use backon::Retryable;
68use reqwest::Client;
69use reqwest::{Error, Response};
70use serde::Serialize;
71use std::collections::HashMap;
72use tokio_stream::StreamExt;
73
74pub use shapes::{request::*, response::*};
75
/// Base URL of the Spider API; endpoint paths are appended to it.
const API_URL: &str = "https://api.spider.cloud";
78
79/// Represents a Spider with API key and HTTP client.
80#[derive(Debug, Default)]
81pub struct Spider {
82    /// The Spider API key.
83    pub api_key: String,
84    /// The Spider Client to re-use.
85    pub client: Client,
86}
87
88/// Handle the json response.
89pub async fn handle_json(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
90    res.json().await
91}
92
93/// Handle the jsonl response.
94pub async fn handle_jsonl(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
95    let text = res.text().await?;
96    let lines = text
97        .lines()
98        .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
99        .collect::<Vec<_>>();
100    Ok(serde_json::Value::Array(lines))
101}
102
/// Decodes a CSV body into a JSON array of records.
///
/// Each record is deserialized into a `HashMap<String, String>`; records that
/// fail to deserialize are dropped. If the collected records cannot be turned
/// into JSON, the raw body text is returned as a JSON string instead.
///
/// # Errors
///
/// Returns the `reqwest` error when the body cannot be read as text.
#[cfg(feature = "csv")]
pub async fn handle_csv(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
    use std::collections::HashMap;

    let body = res.text().await?;
    let mut reader = csv::Reader::from_reader(body.as_bytes());
    let rows: Vec<HashMap<String, String>> =
        reader.deserialize().filter_map(Result::ok).collect();

    match serde_json::to_value(rows) {
        Ok(json) => Ok(json),
        Err(_) => Ok(serde_json::Value::String(body)),
    }
}
117
118#[cfg(not(feature = "csv"))]
119pub async fn handle_csv(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
120    handle_text(res).await
121}
122
123/// Basic handle response to text
124pub async fn handle_text(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
125    Ok(serde_json::Value::String(
126        res.text().await.unwrap_or_default(),
127    ))
128}
129
/// Decodes an XML body into a JSON value via `quick_xml`.
///
/// When the body cannot be deserialized, the raw text is returned as a JSON
/// string instead of an error.
///
/// NOTE(review): this is gated on the `csv` feature even though it relies on
/// `quick_xml` — confirm the feature name is intentional.
///
/// # Errors
///
/// Returns the `reqwest` error when the body cannot be read as text.
#[cfg(feature = "csv")]
pub async fn handle_xml(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
    let body = res.text().await?;
    let parsed = quick_xml::de::from_str::<serde_json::Value>(&body);

    Ok(parsed.unwrap_or_else(|_| serde_json::Value::String(body)))
}
139
140#[cfg(not(feature = "csv"))]
141/// Handle the XML response.
142pub async fn handle_xml(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
143    handle_text(res).await
144}
145
146pub async fn parse_response(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
147    let content_type = res
148        .headers()
149        .get(reqwest::header::CONTENT_TYPE)
150        .and_then(|v| v.to_str().ok())
151        .unwrap_or_default()
152        .to_ascii_lowercase();
153
154    if content_type.contains("json") && !content_type.contains("jsonl") {
155        handle_json(res).await
156    } else if content_type.contains("jsonl") || content_type.contains("ndjson") {
157        handle_jsonl(res).await
158    } else if content_type.contains("csv") {
159        handle_csv(res).await
160    } else if content_type.contains("xml") {
161        handle_xml(res).await
162    } else {
163        handle_text(res).await
164    }
165}
166
167impl Spider {
168    /// Creates a new instance of Spider.
169    ///
170    /// # Arguments
171    ///
172    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
173    ///
174    /// # Returns
175    ///
176    /// A new instance of Spider or an error string if no API key is provided.
177    pub fn new(api_key: Option<String>) -> Result<Self, &'static str> {
178        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
179
180        match api_key {
181            Some(key) => Ok(Self {
182                api_key: key,
183                client: Client::new(),
184            }),
185            None => Err("No API key provided"),
186        }
187    }
188
189    /// Creates a new instance of Spider.
190    ///
191    /// # Arguments
192    ///
193    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
194    /// * `client` - A custom client to pass in.
195    ///
196    /// # Returns
197    ///
198    /// A new instance of Spider or an error string if no API key is provided.
199    pub fn new_with_client(api_key: Option<String>, client: Client) -> Result<Self, &'static str> {
200        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
201
202        match api_key {
203            Some(key) => Ok(Self {
204                api_key: key,
205                client,
206            }),
207            None => Err("No API key provided"),
208        }
209    }
210
211    /// Sends a POST request to the API.
212    ///
213    /// # Arguments
214    ///
215    /// * `endpoint` - The API endpoint.
216    /// * `data` - The request data as a HashMap.
217    /// * `stream` - Whether streaming is enabled.
218    /// * `content_type` - The content type of the request.
219    ///
220    /// # Returns
221    ///
222    /// The response from the API.
223    async fn api_post_base(
224        &self,
225        endpoint: &str,
226        data: impl Serialize + Sized + std::fmt::Debug,
227        content_type: &str,
228    ) -> Result<Response, Error> {
229        let url: String = format!("{API_URL}/{}", endpoint);
230
231        self.client
232            .post(&url)
233            .header(
234                "User-Agent",
235                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
236            )
237            .header("Content-Type", content_type)
238            .header("Authorization", format!("Bearer {}", self.api_key))
239            .json(&data)
240            .send()
241            .await
242    }
243
244    /// Sends a POST request to the API.
245    ///
246    /// # Arguments
247    ///
248    /// * `endpoint` - The API endpoint.
249    /// * `data` - The request data as a HashMap.
250    /// * `stream` - Whether streaming is enabled.
251    /// * `content_type` - The content type of the request.
252    ///
253    /// # Returns
254    ///
255    /// The response from the API.
256    async fn api_post(
257        &self,
258        endpoint: &str,
259        data: impl Serialize + std::fmt::Debug + Clone + Send + Sync,
260        content_type: &str,
261    ) -> Result<Response, Error> {
262        let fetch = || async {
263            self.api_post_base(endpoint, data.to_owned(), content_type)
264                .await
265        };
266
267        fetch
268            .retry(ExponentialBuilder::default().with_max_times(5))
269            .when(|err: &reqwest::Error| {
270                if let Some(status) = err.status() {
271                    status.is_server_error()
272                } else {
273                    err.is_timeout()
274                }
275            })
276            .await
277    }
278
279    /// Sends a GET request to the API.
280    ///
281    /// # Arguments
282    ///
283    /// * `endpoint` - The API endpoint.
284    ///
285    /// # Returns
286    ///
287    /// The response from the API as a JSON value.
288    async fn api_get_base<T: Serialize>(
289        &self,
290        endpoint: &str,
291        query_params: Option<&T>,
292    ) -> Result<serde_json::Value, reqwest::Error> {
293        let url = format!("{API_URL}/{}", endpoint);
294        let res = self
295            .client
296            .get(&url)
297            .query(&query_params)
298            .header(
299                "User-Agent",
300                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
301            )
302            .header("Content-Type", "application/json")
303            .header("Authorization", format!("Bearer {}", self.api_key))
304            .send()
305            .await?;
306        parse_response(res).await
307    }
308
309    /// Sends a GET request to the API.
310    ///
311    /// # Arguments
312    ///
313    /// * `endpoint` - The API endpoint.
314    ///
315    /// # Returns
316    ///
317    /// The response from the API as a JSON value.
318    async fn api_get<T: Serialize>(
319        &self,
320        endpoint: &str,
321        query_params: Option<&T>,
322    ) -> Result<serde_json::Value, reqwest::Error> {
323        let fetch = || async { self.api_get_base(endpoint, query_params.to_owned()).await };
324
325        fetch
326            .retry(ExponentialBuilder::default().with_max_times(5))
327            .when(|err: &reqwest::Error| {
328                if let Some(status) = err.status() {
329                    status.is_server_error()
330                } else {
331                    err.is_timeout()
332                }
333            })
334            .await
335    }
336
337    /// Sends a DELETE request to the API.
338    ///
339    /// # Arguments
340    ///
341    /// * `endpoint` - The API endpoint.
342    /// * `params` - Optional request parameters.
343    /// * `stream` - Whether streaming is enabled.
344    /// * `content_type` - The content type of the request.
345    ///
346    /// # Returns
347    ///
348    /// The response from the API.
349    async fn api_delete_base(
350        &self,
351        endpoint: &str,
352        params: Option<HashMap<String, serde_json::Value>>,
353    ) -> Result<Response, Error> {
354        let url = format!("{API_URL}/v1/{}", endpoint);
355        let request_builder = self
356            .client
357            .delete(&url)
358            .header(
359                "User-Agent",
360                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
361            )
362            .header("Content-Type", "application/json")
363            .header("Authorization", format!("Bearer {}", self.api_key));
364
365        let request_builder = if let Some(params) = params {
366            request_builder.json(&params)
367        } else {
368            request_builder
369        };
370
371        request_builder.send().await
372    }
373
374    /// Sends a DELETE request to the API.
375    ///
376    /// # Arguments
377    ///
378    /// * `endpoint` - The API endpoint.
379    /// * `params` - Optional request parameters.
380    /// * `stream` - Whether streaming is enabled.
381    /// * `content_type` - The content type of the request.
382    ///
383    /// # Returns
384    ///
385    /// The response from the API.
386    async fn api_delete(
387        &self,
388        endpoint: &str,
389        params: Option<HashMap<String, serde_json::Value>>,
390    ) -> Result<Response, Error> {
391        let fetch = || async { self.api_delete_base(endpoint, params.to_owned()).await };
392
393        fetch
394            .retry(ExponentialBuilder::default().with_max_times(5))
395            .when(|err: &reqwest::Error| {
396                if let Some(status) = err.status() {
397                    status.is_server_error()
398                } else {
399                    err.is_timeout()
400                }
401            })
402            .await
403    }
404
405    /// Scrapes a URL.
406    ///
407    /// # Arguments
408    ///
409    /// * `url` - The URL to scrape.
410    /// * `params` - Optional request parameters.
411    /// * `stream` - Whether streaming is enabled.
412    /// * `content_type` - The content type of the request.
413    ///
414    /// # Returns
415    ///
416    /// The response from the API as a JSON value.
417    pub async fn scrape_url(
418        &self,
419        url: &str,
420        params: Option<RequestParams>,
421        content_type: &str,
422    ) -> Result<serde_json::Value, reqwest::Error> {
423        let mut data = HashMap::new();
424
425        if let Ok(params) = serde_json::to_value(params) {
426            if let Some(ref p) = params.as_object() {
427                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
428            }
429        }
430
431        if !url.is_empty() {
432            data.insert(
433                "url".to_string(),
434                serde_json::Value::String(url.to_string()),
435            );
436        }
437
438        data.insert("limit".to_string(), serde_json::Value::Number(1.into()));
439
440        let res = self.api_post("crawl", data, content_type).await?;
441        parse_response(res).await
442    }
443
444    /// Crawls a URL.
445    ///
446    /// # Arguments
447    ///
448    /// * `url` - The URL to crawl.
449    /// * `params` - Optional request parameters.
450    /// * `stream` - Whether streaming is enabled.
451    /// * `content_type` - The content type of the request.
452    /// * `callback` - Optional callback function to handle each streamed chunk.
453    ///
454    /// # Returns
455    ///
456    /// The response from the API as a JSON value.
457    pub async fn crawl_url(
458        &self,
459        url: &str,
460        params: Option<RequestParams>,
461        stream: bool,
462        content_type: &str,
463        callback: Option<impl Fn(serde_json::Value) + Send>,
464    ) -> Result<serde_json::Value, reqwest::Error> {
465        use tokio_util::codec::{FramedRead, LinesCodec};
466
467        let mut data = HashMap::new();
468
469        if let Ok(params) = serde_json::to_value(params) {
470            if let Some(ref p) = params.as_object() {
471                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
472            }
473        }
474
475        data.insert("url".into(), serde_json::Value::String(url.to_string()));
476
477        let res = self.api_post("crawl", data, content_type).await?;
478
479        if stream {
480            if let Some(callback) = callback {
481                let stream = res.bytes_stream();
482
483                let stream_reader = tokio_util::io::StreamReader::new(
484                    stream
485                        .map(|r| r.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))),
486                );
487
488                let mut lines = FramedRead::new(stream_reader, LinesCodec::new());
489
490                while let Some(line_result) = lines.next().await {
491                    match line_result {
492                        Ok(line) => match serde_json::from_str::<serde_json::Value>(&line) {
493                            Ok(value) => {
494                                callback(value);
495                            }
496                            Err(_e) => {
497                                continue;
498                            }
499                        },
500                        Err(_e) => return Ok(serde_json::Value::Null),
501                    }
502                }
503
504                Ok(serde_json::Value::Null)
505            } else {
506                Ok(serde_json::Value::Null)
507            }
508        } else {
509            parse_response(res).await
510        }
511    }
512
513    /// Fetches links from a URL.
514    ///
515    /// # Arguments
516    ///
517    /// * `url` - The URL to fetch links from.
518    /// * `params` - Optional request parameters.
519    /// * `stream` - Whether streaming is enabled.
520    /// * `content_type` - The content type of the request.
521    ///
522    /// # Returns
523    ///
524    /// The response from the API as a JSON value.
525    pub async fn links(
526        &self,
527        url: &str,
528        params: Option<RequestParams>,
529        _stream: bool,
530        content_type: &str,
531    ) -> Result<serde_json::Value, reqwest::Error> {
532        let mut data = HashMap::new();
533
534        if let Ok(params) = serde_json::to_value(params) {
535            if let Some(ref p) = params.as_object() {
536                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
537            }
538        }
539
540        data.insert("url".into(), serde_json::Value::String(url.to_string()));
541
542        let res = self.api_post("links", data, content_type).await?;
543        parse_response(res).await
544    }
545
546    /// Takes a screenshot of a URL.
547    ///
548    /// # Arguments
549    ///
550    /// * `url` - The URL to take a screenshot of.
551    /// * `params` - Optional request parameters.
552    /// * `stream` - Whether streaming is enabled.
553    /// * `content_type` - The content type of the request.
554    ///
555    /// # Returns
556    ///
557    /// The response from the API as a JSON value.
558    pub async fn screenshot(
559        &self,
560        url: &str,
561        params: Option<RequestParams>,
562        _stream: bool,
563        content_type: &str,
564    ) -> Result<serde_json::Value, reqwest::Error> {
565        let mut data = HashMap::new();
566
567        if let Ok(params) = serde_json::to_value(params) {
568            if let Some(ref p) = params.as_object() {
569                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
570            }
571        }
572
573        data.insert("url".into(), serde_json::Value::String(url.to_string()));
574
575        let res = self.api_post("screenshot", data, content_type).await?;
576        parse_response(res).await
577    }
578
579    /// Searches for a query.
580    ///
581    /// # Arguments
582    ///
583    /// * `q` - The query to search for.
584    /// * `params` - Optional request parameters.
585    /// * `stream` - Whether streaming is enabled.
586    /// * `content_type` - The content type of the request.
587    ///
588    /// # Returns
589    ///
590    /// The response from the API as a JSON value.
591    pub async fn search(
592        &self,
593        q: &str,
594        params: Option<SearchRequestParams>,
595        _stream: bool,
596        content_type: &str,
597    ) -> Result<serde_json::Value, reqwest::Error> {
598        let body = match params {
599            Some(mut params) => {
600                params.search = q.to_string();
601                params
602            }
603            _ => {
604                let mut params = SearchRequestParams::default();
605                params.search = q.to_string();
606                params
607            }
608        };
609
610        let res = self.api_post("search", body, content_type).await?;
611
612        parse_response(res).await
613    }
614
615    /// Transforms data.
616    ///
617    /// # Arguments
618    ///
619    /// * `data` - The data to transform.
620    /// * `params` - Optional request parameters.
621    /// * `stream` - Whether streaming is enabled.
622    /// * `content_type` - The content type of the request.
623    ///
624    /// # Returns
625    ///
626    /// The response from the API as a JSON value.
627    pub async fn transform(
628        &self,
629        data: Vec<HashMap<&str, &str>>,
630        params: Option<TransformParams>,
631        _stream: bool,
632        content_type: &str,
633    ) -> Result<serde_json::Value, reqwest::Error> {
634        let mut payload = HashMap::new();
635
636        if let Ok(params) = serde_json::to_value(params) {
637            if let Some(ref p) = params.as_object() {
638                payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
639            }
640        }
641
642        if let Ok(d) = serde_json::to_value(data) {
643            payload.insert("data".into(), d);
644        }
645
646        let res = self.api_post("transform", payload, content_type).await?;
647
648        parse_response(res).await
649    }
650
651    /// Extracts contacts from a URL.
652    ///
653    /// # Arguments
654    ///
655    /// * `url` - The URL to extract contacts from.
656    /// * `params` - Optional request parameters.
657    /// * `stream` - Whether streaming is enabled.
658    /// * `content_type` - The content type of the request.
659    ///
660    /// # Returns
661    ///
662    /// The response from the API as a JSON value.
663    pub async fn extract_contacts(
664        &self,
665        url: &str,
666        params: Option<RequestParams>,
667        _stream: bool,
668        content_type: &str,
669    ) -> Result<serde_json::Value, reqwest::Error> {
670        let mut data = HashMap::new();
671
672        if let Ok(params) = serde_json::to_value(params) {
673            if let Ok(params) = serde_json::to_value(params) {
674                if let Some(ref p) = params.as_object() {
675                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
676                }
677            }
678        }
679
680        match serde_json::to_value(url) {
681            Ok(u) => {
682                data.insert("url".into(), u);
683            }
684            _ => (),
685        }
686
687        let res = self
688            .api_post("pipeline/extract-contacts", data, content_type)
689            .await?;
690
691        parse_response(res).await
692    }
693
694    /// Labels data from a URL.
695    ///
696    /// # Arguments
697    ///
698    /// * `url` - The URL to label data from.
699    /// * `params` - Optional request parameters.
700    /// * `stream` - Whether streaming is enabled.
701    /// * `content_type` - The content type of the request.
702    ///
703    /// # Returns
704    ///
705    /// The response from the API as a JSON value.
706    pub async fn label(
707        &self,
708        url: &str,
709        params: Option<RequestParams>,
710        _stream: bool,
711        content_type: &str,
712    ) -> Result<serde_json::Value, reqwest::Error> {
713        let mut data = HashMap::new();
714
715        if let Ok(params) = serde_json::to_value(params) {
716            if let Ok(params) = serde_json::to_value(params) {
717                if let Some(ref p) = params.as_object() {
718                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
719                }
720            }
721        }
722
723        data.insert("url".into(), serde_json::Value::String(url.to_string()));
724
725        let res = self.api_post("pipeline/label", data, content_type).await?;
726        parse_response(res).await
727    }
728
729    /// Download a record from storage.
730    ///
731    /// # Arguments
732    ///
733    /// * `url` - Optional exact url of the file in storage.
734    /// * `options` - Optional options.
735    /// * `stream` - Whether streaming is enabled.
736    ///
737    /// # Returns
738    ///
739    /// The response from the API.
740    pub async fn download(
741        &self,
742        url: Option<&str>,
743        options: Option<HashMap<&str, i32>>,
744    ) -> Result<reqwest::Response, reqwest::Error> {
745        let mut params = HashMap::new();
746
747        if let Some(url) = url {
748            params.insert("url".to_string(), url.to_string());
749        }
750
751        if let Some(options) = options {
752            for (key, value) in options {
753                params.insert(key.to_string(), value.to_string());
754            }
755        }
756
757        let url = format!("{API_URL}/v1/data/download");
758        let request = self
759            .client
760            .get(&url)
761            .header(
762                "User-Agent",
763                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
764            )
765            .header("Content-Type", "application/octet-stream")
766            .header("Authorization", format!("Bearer {}", self.api_key))
767            .query(&params);
768
769        let res = request.send().await?;
770
771        Ok(res)
772    }
773
774    /// Creates a signed URL of a file from storage.
775    ///
776    /// # Arguments
777    ///
778    /// * `url` - Optional exact url of the file in storage.
779    /// * `options` - Optional options.
780    /// * `stream` - Whether streaming is enabled.
781    ///
782    /// # Returns
783    ///
784    /// The response from the API.
785    pub async fn create_signed_url(
786        &self,
787        url: Option<&str>,
788        options: Option<HashMap<&str, i32>>,
789    ) -> Result<serde_json::Value, reqwest::Error> {
790        let mut params = HashMap::new();
791
792        if let Some(options) = options {
793            for (key, value) in options {
794                params.insert(key.to_string(), value.to_string());
795            }
796        }
797
798        if let Some(url) = url {
799            params.insert("url".to_string(), url.to_string());
800        }
801
802        let url = format!("{API_URL}/v1/data/sign-url");
803        let request = self
804            .client
805            .get(&url)
806            .header(
807                "User-Agent",
808                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
809            )
810            .header("Authorization", format!("Bearer {}", self.api_key))
811            .query(&params);
812
813        let res = request.send().await?;
814
815        parse_response(res).await
816    }
817
818    /// Gets the crawl state of a URL.
819    ///
820    /// # Arguments
821    ///
822    /// * `url` - The URL to get the crawl state of.
823    /// * `params` - Optional request parameters.
824    /// * `stream` - Whether streaming is enabled.
825    /// * `content_type` - The content type of the request.
826    ///
827    /// # Returns
828    ///
829    pub async fn get_crawl_state(
830        &self,
831        url: &str,
832        params: Option<RequestParams>,
833        content_type: &str,
834    ) -> Result<serde_json::Value, reqwest::Error> {
835        let mut payload = HashMap::new();
836        payload.insert("url".into(), serde_json::Value::String(url.to_string()));
837        payload.insert(
838            "contentType".into(),
839            serde_json::Value::String(content_type.to_string()),
840        );
841
842        if let Ok(params) = serde_json::to_value(params) {
843            if let Ok(params) = serde_json::to_value(params) {
844                if let Some(ref p) = params.as_object() {
845                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
846                }
847            }
848        }
849
850        let res = self
851            .api_post("data/crawl_state", payload, content_type)
852            .await?;
853        parse_response(res).await
854    }
855
856    /// Get the account credits left.
857    pub async fn get_credits(&self) -> Result<serde_json::Value, reqwest::Error> {
858        self.api_get::<serde_json::Value>("data/credits", None)
859            .await
860    }
861
862    /// Send a request for a data record.
863    pub async fn data_post(
864        &self,
865        table: &str,
866        data: Option<RequestParams>,
867    ) -> Result<serde_json::Value, reqwest::Error> {
868        let res = self
869            .api_post(&format!("data/{}", table), data, "application/json")
870            .await?;
871        parse_response(res).await
872    }
873
874    /// Query a record from the global DB.
875    pub async fn query(&self, params: &QueryRequest) -> Result<serde_json::Value, reqwest::Error> {
876        let res = self
877            .api_get::<QueryRequest>(&"data/query", Some(params))
878            .await?;
879
880        Ok(res)
881    }
882
883    /// Get a table record.
884    pub async fn data_get(
885        &self,
886        table: &str,
887        params: Option<RequestParams>,
888    ) -> Result<serde_json::Value, reqwest::Error> {
889        let mut payload = HashMap::new();
890
891        if let Some(params) = params {
892            if let Ok(p) = serde_json::to_value(params) {
893                if let Some(o) = p.as_object() {
894                    payload.extend(o.iter().map(|(k, v)| (k.as_str(), v.clone())));
895                }
896            }
897        }
898
899        let res = self
900            .api_get::<serde_json::Value>(&format!("data/{}", table), None)
901            .await?;
902        Ok(res)
903    }
904
905    /// Delete a record.
906    pub async fn data_delete(
907        &self,
908        table: &str,
909        params: Option<RequestParams>,
910    ) -> Result<serde_json::Value, reqwest::Error> {
911        let mut payload = HashMap::new();
912
913        if let Ok(params) = serde_json::to_value(params) {
914            if let Ok(params) = serde_json::to_value(params) {
915                if let Some(ref p) = params.as_object() {
916                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
917                }
918            }
919        }
920
921        let res = self
922            .api_delete(&format!("data/{}", table), Some(payload))
923            .await?;
924        parse_response(res).await
925    }
926}
927
#[cfg(test)]
mod tests {
    use super::*;
    use dotenv::dotenv;
    use lazy_static::lazy_static;
    use reqwest::ClientBuilder;

    // Inputs shared by the URL-based tests below.
    const TEST_URL: &str = "https://example.com";
    const JSON: &str = "application/json";

    lazy_static! {
        // One client for all tests. `.env` is loaded first (if present) and the
        // API key is then resolved from the environment by `new_with_client`.
        static ref SPIDER_CLIENT: Spider = {
            dotenv().ok();
            let http = ClientBuilder::new()
                .user_agent("SpiderBot")
                .build()
                .unwrap();

            Spider::new_with_client(None, http).expect("client to build")
        };
    }

    #[tokio::test]
    #[ignore]
    async fn test_scrape_url() {
        let outcome = SPIDER_CLIENT.scrape_url(TEST_URL, None, JSON).await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_crawl_url() {
        let outcome = SPIDER_CLIENT
            .crawl_url(TEST_URL, None, false, JSON, None::<fn(serde_json::Value)>)
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_links() {
        let outcome: Result<serde_json::Value, Error> =
            SPIDER_CLIENT.links(TEST_URL, None, false, JSON).await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_screenshot() {
        let params = RequestParams {
            limit: Some(1),
            ..Default::default()
        };

        let outcome = SPIDER_CLIENT
            .screenshot(TEST_URL, Some(params), false, JSON)
            .await;
        assert!(outcome.is_ok());
    }

    // Disabled search test, kept for reference.
    // #[tokio::test(flavor = "multi_thread")]
    // async fn test_search() {
    //     let mut params = SearchRequestParams::default();

    //     params.search_limit = Some(1);
    //     params.num = Some(1);
    //     params.fetch_page_content = Some(false);

    //     let response = SPIDER_CLIENT
    //         .search("a sports website", Some(params), false, "application/json")
    //         .await;

    //     assert!(response.is_ok());
    // }

    #[tokio::test]
    #[ignore]
    async fn test_transform() {
        let mut document = HashMap::new();
        document.insert("<html><body><h1>Transformation</h1></body></html>", "");

        let outcome = SPIDER_CLIENT
            .transform(vec![document], None, false, JSON)
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_extract_contacts() {
        let outcome = SPIDER_CLIENT
            .extract_contacts(TEST_URL, None, false, JSON)
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_label() {
        let outcome = SPIDER_CLIENT.label(TEST_URL, None, false, JSON).await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_create_signed_url() {
        let outcome = SPIDER_CLIENT
            .create_signed_url(Some("example.com"), None)
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_get_crawl_state() {
        let outcome = SPIDER_CLIENT.get_crawl_state(TEST_URL, None, JSON).await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_query() {
        let request = QueryRequest {
            domain: Some("spider.cloud".into()),
            ..Default::default()
        };

        let outcome = SPIDER_CLIENT.query(&request).await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_get_credits() {
        let outcome = SPIDER_CLIENT.get_credits().await;
        assert!(outcome.is_ok());
    }
}