spider_client/
lib.rs

1//! The `spider-client` module provides the primary interface and
2//! functionalities for the Spider web crawler library, which is
3//! designed for rapid and efficient crawling of web pages to gather
4//! links using isolated contexts.
5//!
6//! ### Features
7//!
8//! - **Multi-threaded Crawling:** Spider can utilize multiple
9//!   threads to parallelize the crawling process, drastically
10//!   improving performance and allowing the ability to gather
11//!   millions of pages in a short time.
12//!
13//! - **Configurable:** The library provides various options to
14//!   configure the crawling behavior, such as setting the depth
15//!   of crawling, user-agent strings, delays between requests,
16//!   and more.
17//!
18//! - **Link Gathering:** One of the primary objectives of Spider is to
19//!   gather and manage links from the web pages it crawls,
20//!   compiling them into a structured format for further use.
21//!
22//! ### Examples
23//!
24//! Basic usage of the Spider client might look like this:
25//!
26//! ```rust
27//! use spider_client::{Spider, RequestType, RequestParams};
28//! use tokio;
29//!
30//!  # #[ignore]
31//! #[tokio::main]
32//! async fn main() {
33//!     let spider = Spider::new(Some("myspiderapikey".into())).expect("API key must be provided");
34//!
35//!     let url = "https://spider.cloud";
36//!
37//!     // Scrape a single URL
38//!     let scraped_data = spider.scrape_url(url, None, "application/json").await.expect("Failed to scrape the URL");
39//!
40//!     println!("Scraped Data: {:?}", scraped_data);
41//!
42//!     // Crawl a website
43//!     let crawler_params = RequestParams {
44//!         limit: Some(1),
45//!         proxy_enabled: Some(true),
46//!         store_data: Some(false),
47//!         metadata: Some(false),
48//!         request: Some(RequestType::Http),
49//!         ..Default::default()
50//!     };
51//!
52//!     let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL");
53//!
54//!     println!("Crawl Result: {:?}", crawl_result);
55//! }
56//! ```
57//!
58//! ### Modules
59//!
//! - `shapes`: Request and response types for the Spider API; the contents of
//!   `shapes::request` and `shapes::response` are re-exported at the crate root.
62//!
63
64pub mod shapes;
65
66use backon::ExponentialBuilder;
67use backon::Retryable;
68use reqwest::Client;
69use reqwest::{Error, Response};
70use serde::Serialize;
71use std::collections::HashMap;
72use tokio_stream::StreamExt;
73pub use shapes::{request::*, response::*};
74use std::sync::OnceLock;
75
static API_URL: OnceLock<String> = OnceLock::new();

/// Returns the base URL of the Spider API.
///
/// The value is resolved once, on first call, from the `SPIDER_API_URL`
/// environment variable; when that variable cannot be read the default
/// `https://api.spider.cloud` is used. Later calls return the cached value.
pub fn get_api_url() -> &'static str {
    API_URL
        .get_or_init(|| match std::env::var("SPIDER_API_URL") {
            Ok(custom) => custom,
            Err(_) => String::from("https://api.spider.cloud"),
        })
        .as_str()
}
84
/// Client handle for the Spider API, pairing an API key with a reusable
/// HTTP client.
#[derive(Debug, Default)]
pub struct Spider {
    /// The Spider API key, sent as a `Bearer` token in the `Authorization`
    /// header of each request.
    pub api_key: String,
    /// The HTTP client re-used for every request made through this value.
    pub client: Client,
}
93
94/// Handle the json response.
95pub async fn handle_json(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
96    res.json().await
97}
98
99/// Handle the jsonl response.
100pub async fn handle_jsonl(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
101    let text = res.text().await?;
102    let lines = text
103        .lines()
104        .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
105        .collect::<Vec<_>>();
106    Ok(serde_json::Value::Array(lines))
107}
108
/// Decodes a CSV body into a JSON array of row objects.
///
/// Every row that deserializes into a string-to-string map is kept; rows that
/// fail to deserialize are dropped. If the collected rows cannot be converted
/// to JSON, the raw body text is returned as a JSON string instead.
///
/// # Errors
///
/// Returns the `reqwest` error if the body text cannot be read.
#[cfg(feature = "csv")]
pub async fn handle_csv(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
    use std::collections::HashMap;

    let text = res.text().await?;
    let mut reader = csv::Reader::from_reader(text.as_bytes());
    let rows: Vec<HashMap<String, String>> =
        reader.deserialize().filter_map(Result::ok).collect();

    match serde_json::to_value(rows) {
        Ok(json_rows) => Ok(json_rows),
        Err(_) => Ok(serde_json::Value::String(text)),
    }
}
123
/// Handle the CSV response when the `csv` feature is disabled.
///
/// Without CSV support compiled in, the body is passed to [`handle_text`]
/// and returned as a JSON string.
#[cfg(not(feature = "csv"))]
pub async fn handle_csv(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
    handle_text(res).await
}
128
129/// Basic handle response to text
130pub async fn handle_text(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
131    Ok(serde_json::Value::String(
132        res.text().await.unwrap_or_default(),
133    ))
134}
135
/// Decodes an XML body into a JSON value.
///
/// The body is deserialized with `quick_xml`; when that fails, the raw body
/// text is returned as a JSON string instead.
///
/// NOTE(review): this XML handler is compiled under the `csv` feature flag
/// although it uses `quick_xml` — confirm the gate is intentional.
///
/// # Errors
///
/// Returns the `reqwest` error if the body text cannot be read.
#[cfg(feature = "csv")]
pub async fn handle_xml(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
    let text = res.text().await?;
    let parsed = quick_xml::de::from_str::<serde_json::Value>(&text);

    Ok(parsed.unwrap_or_else(|_| serde_json::Value::String(text)))
}
145
#[cfg(not(feature = "csv"))]
/// Handle the XML response when the `csv` feature is disabled.
///
/// In this configuration the body is not parsed as XML; it is passed to
/// [`handle_text`] and returned as a JSON string.
pub async fn handle_xml(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
    handle_text(res).await
}
151
152pub async fn parse_response(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
153    let content_type = res
154        .headers()
155        .get(reqwest::header::CONTENT_TYPE)
156        .and_then(|v| v.to_str().ok())
157        .unwrap_or_default()
158        .to_ascii_lowercase();
159
160    if content_type.contains("json") && !content_type.contains("jsonl") {
161        handle_json(res).await
162    } else if content_type.contains("jsonl") || content_type.contains("ndjson") {
163        handle_jsonl(res).await
164    } else if content_type.contains("csv") {
165        handle_csv(res).await
166    } else if content_type.contains("xml") {
167        handle_xml(res).await
168    } else {
169        handle_text(res).await
170    }
171}
172
173impl Spider {
174    /// Creates a new instance of Spider.
175    ///
176    /// # Arguments
177    ///
178    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
179    ///
180    /// # Returns
181    ///
182    /// A new instance of Spider or an error string if no API key is provided.
183    pub fn new(api_key: Option<String>) -> Result<Self, &'static str> {
184        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
185
186        match api_key {
187            Some(key) => Ok(Self {
188                api_key: key,
189                client: Client::new(),
190            }),
191            None => Err("No API key provided"),
192        }
193    }
194
195    /// Creates a new instance of Spider.
196    ///
197    /// # Arguments
198    ///
199    /// * `api_key` - An optional API key. Defaults to using the 'SPIDER_API_KEY' env variable.
200    /// * `client` - A custom client to pass in.
201    ///
202    /// # Returns
203    ///
204    /// A new instance of Spider or an error string if no API key is provided.
205    pub fn new_with_client(api_key: Option<String>, client: Client) -> Result<Self, &'static str> {
206        let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
207
208        match api_key {
209            Some(key) => Ok(Self {
210                api_key: key,
211                client,
212            }),
213            None => Err("No API key provided"),
214        }
215    }
216
217    /// Sends a POST request to the API.
218    ///
219    /// # Arguments
220    ///
221    /// * `endpoint` - The API endpoint.
222    /// * `data` - The request data as a HashMap.
223    /// * `stream` - Whether streaming is enabled.
224    /// * `content_type` - The content type of the request.
225    ///
226    /// # Returns
227    ///
228    /// The response from the API.
229    async fn api_post_base(
230        &self,
231        endpoint: &str,
232        data: impl Serialize + Sized + std::fmt::Debug,
233        content_type: &str,
234    ) -> Result<Response, Error> {
235        let url: String = format!("{}/{}", get_api_url(), endpoint);
236
237        self.client
238            .post(&url)
239            .header(
240                "User-Agent",
241                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
242            )
243            .header("Content-Type", content_type)
244            .header("Authorization", format!("Bearer {}", self.api_key))
245            .json(&data)
246            .send()
247            .await
248    }
249
250    /// Sends a POST request to the API.
251    ///
252    /// # Arguments
253    ///
254    /// * `endpoint` - The API endpoint.
255    /// * `data` - The request data as a HashMap.
256    /// * `stream` - Whether streaming is enabled.
257    /// * `content_type` - The content type of the request.
258    ///
259    /// # Returns
260    ///
261    /// The response from the API.
262    pub async fn api_post(
263        &self,
264        endpoint: &str,
265        data: impl Serialize + std::fmt::Debug + Clone + Send + Sync,
266        content_type: &str,
267    ) -> Result<Response, Error> {
268        let fetch = || async {
269            self.api_post_base(endpoint, data.to_owned(), content_type)
270                .await
271        };
272
273        fetch
274            .retry(ExponentialBuilder::default().with_max_times(5))
275            .when(|err: &reqwest::Error| {
276                if let Some(status) = err.status() {
277                    status.is_server_error()
278                } else {
279                    err.is_timeout()
280                }
281            })
282            .await
283    }
284
285    /// Sends a GET request to the API.
286    ///
287    /// # Arguments
288    ///
289    /// * `endpoint` - The API endpoint.
290    ///
291    /// # Returns
292    ///
293    /// The response from the API as a JSON value.
294    async fn api_get_base<T: Serialize>(
295        &self,
296        endpoint: &str,
297        query_params: Option<&T>,
298    ) -> Result<serde_json::Value, reqwest::Error> {
299        let url = format!("{}/{}", get_api_url(), endpoint);
300        let res = self
301            .client
302            .get(&url)
303            .query(&query_params)
304            .header(
305                "User-Agent",
306                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
307            )
308            .header("Content-Type", "application/json")
309            .header("Authorization", format!("Bearer {}", self.api_key))
310            .send()
311            .await?;
312        parse_response(res).await
313    }
314
315    /// Sends a GET request to the API.
316    ///
317    /// # Arguments
318    ///
319    /// * `endpoint` - The API endpoint.
320    ///
321    /// # Returns
322    ///
323    /// The response from the API as a JSON value.
324    pub async fn api_get<T: Serialize>(
325        &self,
326        endpoint: &str,
327        query_params: Option<&T>,
328    ) -> Result<serde_json::Value, reqwest::Error> {
329        let fetch = || async { self.api_get_base(endpoint, query_params.to_owned()).await };
330
331        fetch
332            .retry(ExponentialBuilder::default().with_max_times(5))
333            .when(|err: &reqwest::Error| {
334                if let Some(status) = err.status() {
335                    status.is_server_error()
336                } else {
337                    err.is_timeout()
338                }
339            })
340            .await
341    }
342
343    /// Sends a DELETE request to the API.
344    ///
345    /// # Arguments
346    ///
347    /// * `endpoint` - The API endpoint.
348    /// * `params` - Optional request parameters.
349    /// * `stream` - Whether streaming is enabled.
350    /// * `content_type` - The content type of the request.
351    ///
352    /// # Returns
353    ///
354    /// The response from the API.
355    async fn api_delete_base(
356        &self,
357        endpoint: &str,
358        params: Option<HashMap<String, serde_json::Value>>,
359    ) -> Result<Response, Error> {
360        let url = format!("{}/v1/{}", get_api_url(), endpoint);
361        let request_builder = self
362            .client
363            .delete(&url)
364            .header(
365                "User-Agent",
366                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
367            )
368            .header("Content-Type", "application/json")
369            .header("Authorization", format!("Bearer {}", self.api_key));
370
371        let request_builder = if let Some(params) = params {
372            request_builder.json(&params)
373        } else {
374            request_builder
375        };
376
377        request_builder.send().await
378    }
379
380    /// Sends a DELETE request to the API.
381    ///
382    /// # Arguments
383    ///
384    /// * `endpoint` - The API endpoint.
385    /// * `params` - Optional request parameters.
386    /// * `stream` - Whether streaming is enabled.
387    /// * `content_type` - The content type of the request.
388    ///
389    /// # Returns
390    ///
391    /// The response from the API.
392    pub async fn api_delete(
393        &self,
394        endpoint: &str,
395        params: Option<HashMap<String, serde_json::Value>>,
396    ) -> Result<Response, Error> {
397        let fetch = || async { self.api_delete_base(endpoint, params.to_owned()).await };
398
399        fetch
400            .retry(ExponentialBuilder::default().with_max_times(5))
401            .when(|err: &reqwest::Error| {
402                if let Some(status) = err.status() {
403                    status.is_server_error()
404                } else {
405                    err.is_timeout()
406                }
407            })
408            .await
409    }
410
411    /// Scrapes a URL.
412    ///
413    /// # Arguments
414    ///
415    /// * `url` - The URL to scrape.
416    /// * `params` - Optional request parameters.
417    /// * `stream` - Whether streaming is enabled.
418    /// * `content_type` - The content type of the request.
419    ///
420    /// # Returns
421    ///
422    /// The response from the API as a JSON value.
423    pub async fn scrape_url(
424        &self,
425        url: &str,
426        params: Option<RequestParams>,
427        content_type: &str,
428    ) -> Result<serde_json::Value, reqwest::Error> {
429        let mut data = HashMap::new();
430
431        if let Ok(params) = serde_json::to_value(params) {
432            if let Some(ref p) = params.as_object() {
433                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
434            }
435        }
436
437        if !url.is_empty() {
438            data.insert(
439                "url".to_string(),
440                serde_json::Value::String(url.to_string()),
441            );
442        }
443
444        data.insert("limit".to_string(), serde_json::Value::Number(1.into()));
445
446        let res = self.api_post("crawl", data, content_type).await?;
447        parse_response(res).await
448    }
449
450    /// Scrapes multi URLs.
451    ///
452    /// # Arguments
453    ///
454    /// * `url` - The URL to scrape.
455    /// * `params` - Optional request parameters.
456    /// * `stream` - Whether streaming is enabled.
457    /// * `content_type` - The content type of the request.
458    ///
459    /// # Returns
460    ///
461    /// The response from the API as a JSON value.
462    pub async fn multi_scrape_url(
463        &self,
464        params: Option<Vec<RequestParams>>,
465        content_type: &str,
466    ) -> Result<serde_json::Value, reqwest::Error> {
467        let mut data = HashMap::new();
468
469if let Ok(mut params) = serde_json::to_value(params) {
470    if let Some(obj) = params.as_object_mut() {
471        obj.insert("limit".to_string(), serde_json::Value::Number(1.into()));
472        data.extend(obj.iter().map(|(k, v)| (k.clone(), v.clone())));
473    }
474}
475        let res = self.api_post("crawl", data, content_type).await?;
476        parse_response(res).await
477    }
478
479
480    /// Crawls a URL.
481    ///
482    /// # Arguments
483    ///
484    /// * `url` - The URL to crawl.
485    /// * `params` - Optional request parameters.
486    /// * `stream` - Whether streaming is enabled.
487    /// * `content_type` - The content type of the request.
488    /// * `callback` - Optional callback function to handle each streamed chunk.
489    ///
490    /// # Returns
491    ///
492    /// The response from the API as a JSON value.
493    pub async fn crawl_url(
494        &self,
495        url: &str,
496        params: Option<RequestParams>,
497        stream: bool,
498        content_type: &str,
499        callback: Option<impl Fn(serde_json::Value) + Send>,
500    ) -> Result<serde_json::Value, reqwest::Error> {
501        use tokio_util::codec::{FramedRead, LinesCodec};
502
503        let mut data = HashMap::new();
504
505        if let Ok(params) = serde_json::to_value(params) {
506            if let Some(ref p) = params.as_object() {
507                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
508            }
509        }
510
511        data.insert("url".into(), serde_json::Value::String(url.to_string()));
512
513        let res = self.api_post("crawl", data, content_type).await?;
514
515        if stream {
516            if let Some(callback) = callback {
517                let stream = res.bytes_stream();
518
519                let stream_reader = tokio_util::io::StreamReader::new(
520                    stream
521                        .map(|r| r.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))),
522                );
523
524                let mut lines = FramedRead::new(stream_reader, LinesCodec::new());
525
526                while let Some(line_result) = lines.next().await {
527                    match line_result {
528                        Ok(line) => match serde_json::from_str::<serde_json::Value>(&line) {
529                            Ok(value) => {
530                                callback(value);
531                            }
532                            Err(_e) => {
533                                continue;
534                            }
535                        },
536                        Err(_e) => return Ok(serde_json::Value::Null),
537                    }
538                }
539
540                Ok(serde_json::Value::Null)
541            } else {
542                Ok(serde_json::Value::Null)
543            }
544        } else {
545            parse_response(res).await
546        }
547    }
548
549    /// Crawls multiple URLs.
550    ///
551    /// # Arguments
552    ///
553    /// * `url` - The URL to crawl.
554    /// * `params` - Optional request parameters.
555    /// * `stream` - Whether streaming is enabled.
556    /// * `content_type` - The content type of the request.
557    /// * `callback` - Optional callback function to handle each streamed chunk.
558    ///
559    /// # Returns
560    ///
561    /// The response from the API as a JSON value.
562    pub async fn multi_crawl_url(
563        &self,
564        params: Option<Vec<RequestParams>>,
565        stream: bool,
566        content_type: &str,
567        callback: Option<impl Fn(serde_json::Value) + Send>,
568    ) -> Result<serde_json::Value, reqwest::Error> {
569        use tokio_util::codec::{FramedRead, LinesCodec};
570
571
572        let res = self.api_post("crawl", params, content_type).await?;
573
574        if stream {
575            if let Some(callback) = callback {
576                let stream = res.bytes_stream();
577
578                let stream_reader = tokio_util::io::StreamReader::new(
579                    stream
580                        .map(|r| r.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))),
581                );
582
583                let mut lines = FramedRead::new(stream_reader, LinesCodec::new());
584
585                while let Some(line_result) = lines.next().await {
586                    match line_result {
587                        Ok(line) => match serde_json::from_str::<serde_json::Value>(&line) {
588                            Ok(value) => {
589                                callback(value);
590                            }
591                            Err(_e) => {
592                                continue;
593                            }
594                        },
595                        Err(_e) => return Ok(serde_json::Value::Null),
596                    }
597                }
598
599                Ok(serde_json::Value::Null)
600            } else {
601                Ok(serde_json::Value::Null)
602            }
603        } else {
604            parse_response(res).await
605        }
606    }
607
608
609    /// Fetches links from a URL.
610    ///
611    /// # Arguments
612    ///
613    /// * `url` - The URL to fetch links from.
614    /// * `params` - Optional request parameters.
615    /// * `stream` - Whether streaming is enabled.
616    /// * `content_type` - The content type of the request.
617    ///
618    /// # Returns
619    ///
620    /// The response from the API as a JSON value.
621    pub async fn links(
622        &self,
623        url: &str,
624        params: Option<RequestParams>,
625        _stream: bool,
626        content_type: &str,
627    ) -> Result<serde_json::Value, reqwest::Error> {
628        let mut data = HashMap::new();
629
630        if let Ok(params) = serde_json::to_value(params) {
631            if let Some(ref p) = params.as_object() {
632                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
633            }
634        }
635
636        data.insert("url".into(), serde_json::Value::String(url.to_string()));
637
638        let res = self.api_post("links", data, content_type).await?;
639        parse_response(res).await
640    }
641
642
643    /// Fetches links from a URLs.
644    ///
645    /// # Arguments
646    ///
647    /// * `url` - The URL to fetch links from.
648    /// * `params` - Optional request parameters.
649    /// * `stream` - Whether streaming is enabled.
650    /// * `content_type` - The content type of the request.
651    ///
652    /// # Returns
653    ///
654    /// The response from the API as a JSON value.
655    pub async fn multi_links(
656        &self,
657        params: Option<Vec<RequestParams>>,
658        _stream: bool,
659        content_type: &str,
660    ) -> Result<serde_json::Value, reqwest::Error> {
661        let res = self.api_post("links", params, content_type).await?;
662        parse_response(res).await
663    }
664
665    
666    /// Takes a screenshot of a URL.
667    ///
668    /// # Arguments
669    ///
670    /// * `url` - The URL to take a screenshot of.
671    /// * `params` - Optional request parameters.
672    /// * `stream` - Whether streaming is enabled.
673    /// * `content_type` - The content type of the request.
674    ///
675    /// # Returns
676    ///
677    /// The response from the API as a JSON value.
678    pub async fn screenshot(
679        &self,
680        url: &str,
681        params: Option<RequestParams>,
682        _stream: bool,
683        content_type: &str,
684    ) -> Result<serde_json::Value, reqwest::Error> {
685        let mut data = HashMap::new();
686
687        if let Ok(params) = serde_json::to_value(params) {
688            if let Some(ref p) = params.as_object() {
689                data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
690            }
691        }
692
693        data.insert("url".into(), serde_json::Value::String(url.to_string()));
694
695        let res = self.api_post("screenshot", data, content_type).await?;
696        parse_response(res).await
697    }
698
699    /// Takes a screenshot of multiple URLs.
700    ///
701    /// # Arguments
702    ///
703    /// * `url` - The URL to take a screenshot of.
704    /// * `params` - Optional request parameters.
705    /// * `stream` - Whether streaming is enabled.
706    /// * `content_type` - The content type of the request.
707    ///
708    /// # Returns
709    ///
710    /// The response from the API as a JSON value.
711    pub async fn multi_screenshot(
712        &self,
713        params: Option<Vec<RequestParams>>,
714        _stream: bool,
715        content_type: &str,
716    ) -> Result<serde_json::Value, reqwest::Error> {
717        let res = self.api_post("screenshot", params, content_type).await?;
718        parse_response(res).await
719    }
720
721    /// Searches for a query.
722    ///
723    /// # Arguments
724    ///
725    /// * `q` - The query to search for.
726    /// * `params` - Optional request parameters.
727    /// * `stream` - Whether streaming is enabled.
728    /// * `content_type` - The content type of the request.
729    ///
730    /// # Returns
731    ///
732    /// The response from the API as a JSON value.
733    pub async fn search(
734        &self,
735        q: &str,
736        params: Option<SearchRequestParams>,
737        _stream: bool,
738        content_type: &str,
739    ) -> Result<serde_json::Value, reqwest::Error> {
740        let body = match params {
741            Some(mut params) => {
742                params.search = q.to_string();
743                params
744            }
745            _ => {
746                let mut params = SearchRequestParams::default();
747                params.search = q.to_string();
748                params
749            }
750        };
751
752        let res = self.api_post("search", body, content_type).await?;
753
754        parse_response(res).await
755    }
756
757    /// Searches for multiple querys.
758    ///
759    /// # Arguments
760    ///
761    /// * `q` - The query to search for.
762    /// * `params` - Optional request parameters.
763    /// * `stream` - Whether streaming is enabled.
764    /// * `content_type` - The content type of the request.
765    ///
766    /// # Returns
767    ///
768    /// The response from the API as a JSON value.
769    pub async fn multi_search(
770        &self,
771        params: Option<Vec<SearchRequestParams>>,
772        content_type: &str,
773    ) -> Result<serde_json::Value, reqwest::Error> {
774        let res = self.api_post("search", params, content_type).await?;
775        parse_response(res).await
776    }
777
778    /// Transforms data.
779    ///
780    /// # Arguments
781    ///
782    /// * `data` - The data to transform.
783    /// * `params` - Optional request parameters.
784    /// * `stream` - Whether streaming is enabled.
785    /// * `content_type` - The content type of the request.
786    ///
787    /// # Returns
788    ///
789    /// The response from the API as a JSON value.
790    pub async fn transform(
791        &self,
792        data: Vec<HashMap<&str, &str>>,
793        params: Option<TransformParams>,
794        _stream: bool,
795        content_type: &str,
796    ) -> Result<serde_json::Value, reqwest::Error> {
797        let mut payload = HashMap::new();
798
799        if let Ok(params) = serde_json::to_value(params) {
800            if let Some(ref p) = params.as_object() {
801                payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
802            }
803        }
804
805        if let Ok(d) = serde_json::to_value(data) {
806            payload.insert("data".into(), d);
807        }
808
809        let res = self.api_post("transform", payload, content_type).await?;
810
811        parse_response(res).await
812    }
813
814    /// Extracts contacts from a URL.
815    ///
816    /// # Arguments
817    ///
818    /// * `url` - The URL to extract contacts from.
819    /// * `params` - Optional request parameters.
820    /// * `stream` - Whether streaming is enabled.
821    /// * `content_type` - The content type of the request.
822    ///
823    /// # Returns
824    ///
825    /// The response from the API as a JSON value.
826    pub async fn extract_contacts(
827        &self,
828        url: &str,
829        params: Option<RequestParams>,
830        _stream: bool,
831        content_type: &str,
832    ) -> Result<serde_json::Value, reqwest::Error> {
833        let mut data = HashMap::new();
834
835        if let Ok(params) = serde_json::to_value(params) {
836            if let Ok(params) = serde_json::to_value(params) {
837                if let Some(ref p) = params.as_object() {
838                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
839                }
840            }
841        }
842
843        match serde_json::to_value(url) {
844            Ok(u) => {
845                data.insert("url".into(), u);
846            }
847            _ => (),
848        }
849
850        let res = self
851            .api_post("pipeline/extract-contacts", data, content_type)
852            .await?;
853
854        parse_response(res).await
855    }
856
857    /// Labels data from a URL.
858    ///
859    /// # Arguments
860    ///
861    /// * `url` - The URL to label data from.
862    /// * `params` - Optional request parameters.
863    /// * `stream` - Whether streaming is enabled.
864    /// * `content_type` - The content type of the request.
865    ///
866    /// # Returns
867    ///
868    /// The response from the API as a JSON value.
869    pub async fn label(
870        &self,
871        url: &str,
872        params: Option<RequestParams>,
873        _stream: bool,
874        content_type: &str,
875    ) -> Result<serde_json::Value, reqwest::Error> {
876        let mut data = HashMap::new();
877
878        if let Ok(params) = serde_json::to_value(params) {
879            if let Ok(params) = serde_json::to_value(params) {
880                if let Some(ref p) = params.as_object() {
881                    data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
882                }
883            }
884        }
885
886        data.insert("url".into(), serde_json::Value::String(url.to_string()));
887
888        let res = self.api_post("pipeline/label", data, content_type).await?;
889        parse_response(res).await
890    }
891
892    /// Download a record from storage.
893    ///
894    /// # Arguments
895    ///
896    /// * `url` - Optional exact url of the file in storage.
897    /// * `options` - Optional options.
898    /// * `stream` - Whether streaming is enabled.
899    ///
900    /// # Returns
901    ///
902    /// The response from the API.
903    pub async fn download(
904        &self,
905        url: Option<&str>,
906        options: Option<HashMap<&str, i32>>,
907    ) -> Result<reqwest::Response, reqwest::Error> {
908        let mut params = HashMap::new();
909
910        if let Some(url) = url {
911            params.insert("url".to_string(), url.to_string());
912        }
913
914        if let Some(options) = options {
915            for (key, value) in options {
916                params.insert(key.to_string(), value.to_string());
917            }
918        }
919
920        let url = format!("{}/v1/data/download", get_api_url());
921        let request = self
922            .client
923            .get(&url)
924            .header(
925                "User-Agent",
926                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
927            )
928            .header("Content-Type", "application/octet-stream")
929            .header("Authorization", format!("Bearer {}", self.api_key))
930            .query(&params);
931
932        let res = request.send().await?;
933
934        Ok(res)
935    }
936
937    /// Creates a signed URL of a file from storage.
938    ///
939    /// # Arguments
940    ///
941    /// * `url` - Optional exact url of the file in storage.
942    /// * `options` - Optional options.
943    /// * `stream` - Whether streaming is enabled.
944    ///
945    /// # Returns
946    ///
947    /// The response from the API.
948    pub async fn create_signed_url(
949        &self,
950        url: Option<&str>,
951        options: Option<HashMap<&str, i32>>,
952    ) -> Result<serde_json::Value, reqwest::Error> {
953        let mut params = HashMap::new();
954
955        if let Some(options) = options {
956            for (key, value) in options {
957                params.insert(key.to_string(), value.to_string());
958            }
959        }
960
961        if let Some(url) = url {
962            params.insert("url".to_string(), url.to_string());
963        }
964
965        let url = format!("{}/v1/data/sign-url", get_api_url());
966        let request = self
967            .client
968            .get(&url)
969            .header(
970                "User-Agent",
971                format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
972            )
973            .header("Authorization", format!("Bearer {}", self.api_key))
974            .query(&params);
975
976        let res = request.send().await?;
977
978        parse_response(res).await
979    }
980
981    /// Gets the crawl state of a URL.
982    ///
983    /// # Arguments
984    ///
985    /// * `url` - The URL to get the crawl state of.
986    /// * `params` - Optional request parameters.
987    /// * `stream` - Whether streaming is enabled.
988    /// * `content_type` - The content type of the request.
989    ///
990    /// # Returns
991    ///
992    pub async fn get_crawl_state(
993        &self,
994        url: &str,
995        params: Option<RequestParams>,
996        content_type: &str,
997    ) -> Result<serde_json::Value, reqwest::Error> {
998        let mut payload = HashMap::new();
999        payload.insert("url".into(), serde_json::Value::String(url.to_string()));
1000        payload.insert(
1001            "contentType".into(),
1002            serde_json::Value::String(content_type.to_string()),
1003        );
1004
1005        if let Ok(params) = serde_json::to_value(params) {
1006            if let Ok(params) = serde_json::to_value(params) {
1007                if let Some(ref p) = params.as_object() {
1008                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1009                }
1010            }
1011        }
1012
1013        let res = self
1014            .api_post("data/crawl_state", payload, content_type)
1015            .await?;
1016        parse_response(res).await
1017    }
1018
1019    /// Get the account credits left.
1020    pub async fn get_credits(&self) -> Result<serde_json::Value, reqwest::Error> {
1021        self.api_get::<serde_json::Value>("data/credits", None)
1022            .await
1023    }
1024
1025    /// Send a request for a data record.
1026    pub async fn data_post(
1027        &self,
1028        table: &str,
1029        data: Option<RequestParams>,
1030    ) -> Result<serde_json::Value, reqwest::Error> {
1031        let res = self
1032            .api_post(&format!("data/{}", table), data, "application/json")
1033            .await?;
1034        parse_response(res).await
1035    }
1036
1037    /// Query a record from the global DB.
1038    pub async fn query(&self, params: &QueryRequest) -> Result<serde_json::Value, reqwest::Error> {
1039        let res = self
1040            .api_get::<QueryRequest>(&"data/query", Some(params))
1041            .await?;
1042
1043        Ok(res)
1044    }
1045
1046    /// Get a table record.
1047    pub async fn data_get(
1048        &self,
1049        table: &str,
1050        params: Option<RequestParams>,
1051    ) -> Result<serde_json::Value, reqwest::Error> {
1052        let mut payload = HashMap::new();
1053
1054        if let Some(params) = params {
1055            if let Ok(p) = serde_json::to_value(params) {
1056                if let Some(o) = p.as_object() {
1057                    payload.extend(o.iter().map(|(k, v)| (k.as_str(), v.clone())));
1058                }
1059            }
1060        }
1061
1062        let res = self
1063            .api_get::<serde_json::Value>(&format!("data/{}", table), None)
1064            .await?;
1065        Ok(res)
1066    }
1067
1068    /// Delete a record.
1069    pub async fn data_delete(
1070        &self,
1071        table: &str,
1072        params: Option<RequestParams>,
1073    ) -> Result<serde_json::Value, reqwest::Error> {
1074        let mut payload = HashMap::new();
1075
1076        if let Ok(params) = serde_json::to_value(params) {
1077            if let Ok(params) = serde_json::to_value(params) {
1078                if let Some(ref p) = params.as_object() {
1079                    payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
1080                }
1081            }
1082        }
1083
1084        let res = self
1085            .api_delete(&format!("data/{}", table), Some(payload))
1086            .await?;
1087        parse_response(res).await
1088    }
1089}
1090
#[cfg(test)]
mod tests {
    use super::*;
    use dotenv::dotenv;
    use lazy_static::lazy_static;
    use reqwest::ClientBuilder;

    /// Page every URL-based endpoint test is pointed at.
    const TARGET_URL: &str = "https://example.com";
    /// Content type handed to every endpoint that asks for one.
    const JSON: &str = "application/json";

    lazy_static! {
        /// Client shared by every test in this module, built on first use.
        ///
        /// `.env` is loaded first; no API key is passed explicitly.
        /// NOTE(review): presumably `new_with_client` then resolves the key
        /// from the environment — confirm against its definition.
        static ref SPIDER_CLIENT: Spider = {
            dotenv().ok();
            let http_client = ClientBuilder::new()
                .user_agent("SpiderBot")
                .build()
                .unwrap();

            Spider::new_with_client(None, http_client).expect("client to build")
        };
    }

    /// `scrape_url` on the target page succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_scrape_url() {
        let outcome = SPIDER_CLIENT.scrape_url(TARGET_URL, None, JSON).await;
        assert!(outcome.is_ok());
    }

    /// `crawl_url` on the target page succeeds without streaming or callback.
    #[tokio::test]
    async fn test_crawl_url() {
        let outcome = SPIDER_CLIENT
            .crawl_url(TARGET_URL, None, false, JSON, None::<fn(serde_json::Value)>)
            .await;
        assert!(outcome.is_ok());
    }

    /// `links` on the target page succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_links() {
        let outcome: Result<serde_json::Value, Error> =
            SPIDER_CLIENT.links(TARGET_URL, None, false, JSON).await;
        assert!(outcome.is_ok());
    }

    /// `screenshot` with a limit of one page succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_screenshot() {
        let single_page = RequestParams {
            limit: Some(1),
            ..Default::default()
        };

        let outcome = SPIDER_CLIENT
            .screenshot(TARGET_URL, Some(single_page), false, JSON)
            .await;
        assert!(outcome.is_ok());
    }

    // #[tokio::test(flavor = "multi_thread")]
    // async fn test_search() {
    //     let mut params = SearchRequestParams::default();

    //     params.search_limit = Some(1);
    //     params.num = Some(1);
    //     params.fetch_page_content = Some(false);

    //     let response = SPIDER_CLIENT
    //         .search("a sports website", Some(params), false, "application/json")
    //         .await;

    //     assert!(response.is_ok());
    // }

    /// `transform` on a single inline HTML document succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_transform() {
        let documents = vec![HashMap::from([(
            "<html><body><h1>Transformation</h1></body></html>".into(),
            "".into(),
        )])];

        let outcome = SPIDER_CLIENT
            .transform(documents, None, false, JSON)
            .await;
        assert!(outcome.is_ok());
    }

    /// `extract_contacts` on the target page succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_extract_contacts() {
        let outcome = SPIDER_CLIENT
            .extract_contacts(TARGET_URL, None, false, JSON)
            .await;
        assert!(outcome.is_ok());
    }

    /// `label` on the target page succeeds.
    #[tokio::test]
    #[ignore]
    async fn test_label() {
        let outcome = SPIDER_CLIENT.label(TARGET_URL, None, false, JSON).await;
        assert!(outcome.is_ok());
    }

    /// `create_signed_url` for a bare host name and no options succeeds.
    #[tokio::test]
    async fn test_create_signed_url() {
        let outcome = SPIDER_CLIENT
            .create_signed_url(Some("example.com"), None)
            .await;
        assert!(outcome.is_ok());
    }

    /// `get_crawl_state` for the target page succeeds.
    #[tokio::test]
    async fn test_get_crawl_state() {
        let outcome = SPIDER_CLIENT
            .get_crawl_state(TARGET_URL, None, JSON)
            .await;
        assert!(outcome.is_ok());
    }

    /// `query` filtered by the `spider.cloud` domain succeeds.
    #[tokio::test]
    async fn test_query() {
        let by_domain = QueryRequest {
            domain: Some("spider.cloud".into()),
            ..Default::default()
        };

        let outcome = SPIDER_CLIENT.query(&by_domain).await;
        assert!(outcome.is_ok());
    }

    /// `get_credits` succeeds.
    #[tokio::test]
    async fn test_get_credits() {
        let outcome = SPIDER_CLIENT.get_credits().await;
        assert!(outcome.is_ok());
    }
}