1pub mod shapes;
65
66use backon::ExponentialBuilder;
67use backon::Retryable;
68use reqwest::Client;
69use reqwest::{Error, Response};
70use serde::Serialize;
71use std::collections::HashMap;
72use tokio_stream::StreamExt;
73pub use shapes::{request::*, response::*};
74use std::sync::OnceLock;
75
/// Process-wide cache of the resolved API base URL (filled on first use).
static API_URL: OnceLock<String> = OnceLock::new();

/// Returns the base URL every request is sent to.
///
/// The value is resolved once, on the first call, and then reused for the
/// lifetime of the process:
///
/// * `SPIDER_API_URL` is used when it is set to a non-blank value; surrounding
///   whitespace and trailing `/` characters are removed so that
///   `format!("{}/{}", base, endpoint)` never yields `//` or a host-less path.
/// * Otherwise the default `https://api.spider.cloud` is used.
///
/// Later changes to the environment variable have no effect.
pub fn get_api_url() -> &'static str {
    const DEFAULT_API_URL: &str = "https://api.spider.cloud";

    API_URL.get_or_init(|| {
        std::env::var("SPIDER_API_URL")
            .ok()
            .map(|raw| raw.trim().trim_end_matches('/').to_string())
            // A blank override would produce URLs such as "/crawl"; treat it as unset.
            .filter(|base| !base.is_empty())
            .unwrap_or_else(|| DEFAULT_API_URL.to_string())
    })
}
84
85#[derive(Debug, Default)]
87pub struct Spider {
88 pub api_key: String,
90 pub client: Client,
92}
93
94pub async fn handle_json(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
96 res.json().await
97}
98
99pub async fn handle_jsonl(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
101 let text = res.text().await?;
102 let lines = text
103 .lines()
104 .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
105 .collect::<Vec<_>>();
106 Ok(serde_json::Value::Array(lines))
107}
108
/// Decodes a CSV body into a JSON array of string-to-string records.
///
/// Rows that fail to deserialize are skipped. If the collected rows cannot be
/// converted to JSON, the raw body text is returned as a JSON string instead.
///
/// # Errors
/// Returns the `reqwest::Error` produced when the body cannot be read.
#[cfg(feature = "csv")]
pub async fn handle_csv(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
    let body = res.text().await?;

    let rows: Vec<std::collections::HashMap<String, String>> =
        csv::Reader::from_reader(body.as_bytes())
            .deserialize()
            .filter_map(Result::ok)
            .collect();

    match serde_json::to_value(rows) {
        Ok(json) => Ok(json),
        Err(_) => Ok(serde_json::Value::String(body)),
    }
}
123
124#[cfg(not(feature = "csv"))]
125pub async fn handle_csv(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
126 handle_text(res).await
127}
128
129pub async fn handle_text(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
131 Ok(serde_json::Value::String(
132 res.text().await.unwrap_or_default(),
133 ))
134}
135
/// Decodes an XML body into a JSON value, falling back to the raw text as a
/// JSON string when the XML cannot be deserialized.
///
/// NOTE(review): this XML handler is gated on the `csv` feature — confirm that
/// the feature is meant to cover XML parsing as well.
///
/// # Errors
/// Returns the `reqwest::Error` produced when the body cannot be read.
#[cfg(feature = "csv")]
pub async fn handle_xml(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
    let body = res.text().await?;
    let parsed = quick_xml::de::from_str::<serde_json::Value>(&body);

    Ok(parsed.unwrap_or_else(|_| serde_json::Value::String(body)))
}
145
146#[cfg(not(feature = "csv"))]
147pub async fn handle_xml(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
149 handle_text(res).await
150}
151
152pub async fn parse_response(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
153 let content_type = res
154 .headers()
155 .get(reqwest::header::CONTENT_TYPE)
156 .and_then(|v| v.to_str().ok())
157 .unwrap_or_default()
158 .to_ascii_lowercase();
159
160 if content_type.contains("json") && !content_type.contains("jsonl") {
161 handle_json(res).await
162 } else if content_type.contains("jsonl") || content_type.contains("ndjson") {
163 handle_jsonl(res).await
164 } else if content_type.contains("csv") {
165 handle_csv(res).await
166 } else if content_type.contains("xml") {
167 handle_xml(res).await
168 } else {
169 handle_text(res).await
170 }
171}
172
173impl Spider {
174 pub fn new(api_key: Option<String>) -> Result<Self, &'static str> {
184 let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
185
186 match api_key {
187 Some(key) => Ok(Self {
188 api_key: key,
189 client: Client::new(),
190 }),
191 None => Err("No API key provided"),
192 }
193 }
194
195 pub fn new_with_client(api_key: Option<String>, client: Client) -> Result<Self, &'static str> {
206 let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
207
208 match api_key {
209 Some(key) => Ok(Self {
210 api_key: key,
211 client,
212 }),
213 None => Err("No API key provided"),
214 }
215 }
216
217 async fn api_post_base(
230 &self,
231 endpoint: &str,
232 data: impl Serialize + Sized + std::fmt::Debug,
233 content_type: &str,
234 ) -> Result<Response, Error> {
235 let url: String = format!("{}/{}", get_api_url(), endpoint);
236
237 self.client
238 .post(&url)
239 .header(
240 "User-Agent",
241 format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
242 )
243 .header("Content-Type", content_type)
244 .header("Authorization", format!("Bearer {}", self.api_key))
245 .json(&data)
246 .send()
247 .await
248 }
249
250 async fn api_post(
263 &self,
264 endpoint: &str,
265 data: impl Serialize + std::fmt::Debug + Clone + Send + Sync,
266 content_type: &str,
267 ) -> Result<Response, Error> {
268 let fetch = || async {
269 self.api_post_base(endpoint, data.to_owned(), content_type)
270 .await
271 };
272
273 fetch
274 .retry(ExponentialBuilder::default().with_max_times(5))
275 .when(|err: &reqwest::Error| {
276 if let Some(status) = err.status() {
277 status.is_server_error()
278 } else {
279 err.is_timeout()
280 }
281 })
282 .await
283 }
284
285 async fn api_get_base<T: Serialize>(
295 &self,
296 endpoint: &str,
297 query_params: Option<&T>,
298 ) -> Result<serde_json::Value, reqwest::Error> {
299 let url = format!("{}/{}", get_api_url(), endpoint);
300 let res = self
301 .client
302 .get(&url)
303 .query(&query_params)
304 .header(
305 "User-Agent",
306 format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
307 )
308 .header("Content-Type", "application/json")
309 .header("Authorization", format!("Bearer {}", self.api_key))
310 .send()
311 .await?;
312 parse_response(res).await
313 }
314
315 async fn api_get<T: Serialize>(
325 &self,
326 endpoint: &str,
327 query_params: Option<&T>,
328 ) -> Result<serde_json::Value, reqwest::Error> {
329 let fetch = || async { self.api_get_base(endpoint, query_params.to_owned()).await };
330
331 fetch
332 .retry(ExponentialBuilder::default().with_max_times(5))
333 .when(|err: &reqwest::Error| {
334 if let Some(status) = err.status() {
335 status.is_server_error()
336 } else {
337 err.is_timeout()
338 }
339 })
340 .await
341 }
342
343 async fn api_delete_base(
356 &self,
357 endpoint: &str,
358 params: Option<HashMap<String, serde_json::Value>>,
359 ) -> Result<Response, Error> {
360 let url = format!("{}/v1/{}", get_api_url(), endpoint);
361 let request_builder = self
362 .client
363 .delete(&url)
364 .header(
365 "User-Agent",
366 format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
367 )
368 .header("Content-Type", "application/json")
369 .header("Authorization", format!("Bearer {}", self.api_key));
370
371 let request_builder = if let Some(params) = params {
372 request_builder.json(¶ms)
373 } else {
374 request_builder
375 };
376
377 request_builder.send().await
378 }
379
380 async fn api_delete(
393 &self,
394 endpoint: &str,
395 params: Option<HashMap<String, serde_json::Value>>,
396 ) -> Result<Response, Error> {
397 let fetch = || async { self.api_delete_base(endpoint, params.to_owned()).await };
398
399 fetch
400 .retry(ExponentialBuilder::default().with_max_times(5))
401 .when(|err: &reqwest::Error| {
402 if let Some(status) = err.status() {
403 status.is_server_error()
404 } else {
405 err.is_timeout()
406 }
407 })
408 .await
409 }
410
411 pub async fn scrape_url(
424 &self,
425 url: &str,
426 params: Option<RequestParams>,
427 content_type: &str,
428 ) -> Result<serde_json::Value, reqwest::Error> {
429 let mut data = HashMap::new();
430
431 if let Ok(params) = serde_json::to_value(params) {
432 if let Some(ref p) = params.as_object() {
433 data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
434 }
435 }
436
437 if !url.is_empty() {
438 data.insert(
439 "url".to_string(),
440 serde_json::Value::String(url.to_string()),
441 );
442 }
443
444 data.insert("limit".to_string(), serde_json::Value::Number(1.into()));
445
446 let res = self.api_post("crawl", data, content_type).await?;
447 parse_response(res).await
448 }
449
450 pub async fn crawl_url(
464 &self,
465 url: &str,
466 params: Option<RequestParams>,
467 stream: bool,
468 content_type: &str,
469 callback: Option<impl Fn(serde_json::Value) + Send>,
470 ) -> Result<serde_json::Value, reqwest::Error> {
471 use tokio_util::codec::{FramedRead, LinesCodec};
472
473 let mut data = HashMap::new();
474
475 if let Ok(params) = serde_json::to_value(params) {
476 if let Some(ref p) = params.as_object() {
477 data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
478 }
479 }
480
481 data.insert("url".into(), serde_json::Value::String(url.to_string()));
482
483 let res = self.api_post("crawl", data, content_type).await?;
484
485 if stream {
486 if let Some(callback) = callback {
487 let stream = res.bytes_stream();
488
489 let stream_reader = tokio_util::io::StreamReader::new(
490 stream
491 .map(|r| r.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))),
492 );
493
494 let mut lines = FramedRead::new(stream_reader, LinesCodec::new());
495
496 while let Some(line_result) = lines.next().await {
497 match line_result {
498 Ok(line) => match serde_json::from_str::<serde_json::Value>(&line) {
499 Ok(value) => {
500 callback(value);
501 }
502 Err(_e) => {
503 continue;
504 }
505 },
506 Err(_e) => return Ok(serde_json::Value::Null),
507 }
508 }
509
510 Ok(serde_json::Value::Null)
511 } else {
512 Ok(serde_json::Value::Null)
513 }
514 } else {
515 parse_response(res).await
516 }
517 }
518
519 pub async fn links(
532 &self,
533 url: &str,
534 params: Option<RequestParams>,
535 _stream: bool,
536 content_type: &str,
537 ) -> Result<serde_json::Value, reqwest::Error> {
538 let mut data = HashMap::new();
539
540 if let Ok(params) = serde_json::to_value(params) {
541 if let Some(ref p) = params.as_object() {
542 data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
543 }
544 }
545
546 data.insert("url".into(), serde_json::Value::String(url.to_string()));
547
548 let res = self.api_post("links", data, content_type).await?;
549 parse_response(res).await
550 }
551
552 pub async fn screenshot(
565 &self,
566 url: &str,
567 params: Option<RequestParams>,
568 _stream: bool,
569 content_type: &str,
570 ) -> Result<serde_json::Value, reqwest::Error> {
571 let mut data = HashMap::new();
572
573 if let Ok(params) = serde_json::to_value(params) {
574 if let Some(ref p) = params.as_object() {
575 data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
576 }
577 }
578
579 data.insert("url".into(), serde_json::Value::String(url.to_string()));
580
581 let res = self.api_post("screenshot", data, content_type).await?;
582 parse_response(res).await
583 }
584
585 pub async fn search(
598 &self,
599 q: &str,
600 params: Option<SearchRequestParams>,
601 _stream: bool,
602 content_type: &str,
603 ) -> Result<serde_json::Value, reqwest::Error> {
604 let body = match params {
605 Some(mut params) => {
606 params.search = q.to_string();
607 params
608 }
609 _ => {
610 let mut params = SearchRequestParams::default();
611 params.search = q.to_string();
612 params
613 }
614 };
615
616 let res = self.api_post("search", body, content_type).await?;
617
618 parse_response(res).await
619 }
620
621 pub async fn transform(
634 &self,
635 data: Vec<HashMap<&str, &str>>,
636 params: Option<TransformParams>,
637 _stream: bool,
638 content_type: &str,
639 ) -> Result<serde_json::Value, reqwest::Error> {
640 let mut payload = HashMap::new();
641
642 if let Ok(params) = serde_json::to_value(params) {
643 if let Some(ref p) = params.as_object() {
644 payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
645 }
646 }
647
648 if let Ok(d) = serde_json::to_value(data) {
649 payload.insert("data".into(), d);
650 }
651
652 let res = self.api_post("transform", payload, content_type).await?;
653
654 parse_response(res).await
655 }
656
657 pub async fn extract_contacts(
670 &self,
671 url: &str,
672 params: Option<RequestParams>,
673 _stream: bool,
674 content_type: &str,
675 ) -> Result<serde_json::Value, reqwest::Error> {
676 let mut data = HashMap::new();
677
678 if let Ok(params) = serde_json::to_value(params) {
679 if let Ok(params) = serde_json::to_value(params) {
680 if let Some(ref p) = params.as_object() {
681 data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
682 }
683 }
684 }
685
686 match serde_json::to_value(url) {
687 Ok(u) => {
688 data.insert("url".into(), u);
689 }
690 _ => (),
691 }
692
693 let res = self
694 .api_post("pipeline/extract-contacts", data, content_type)
695 .await?;
696
697 parse_response(res).await
698 }
699
700 pub async fn label(
713 &self,
714 url: &str,
715 params: Option<RequestParams>,
716 _stream: bool,
717 content_type: &str,
718 ) -> Result<serde_json::Value, reqwest::Error> {
719 let mut data = HashMap::new();
720
721 if let Ok(params) = serde_json::to_value(params) {
722 if let Ok(params) = serde_json::to_value(params) {
723 if let Some(ref p) = params.as_object() {
724 data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
725 }
726 }
727 }
728
729 data.insert("url".into(), serde_json::Value::String(url.to_string()));
730
731 let res = self.api_post("pipeline/label", data, content_type).await?;
732 parse_response(res).await
733 }
734
735 pub async fn download(
747 &self,
748 url: Option<&str>,
749 options: Option<HashMap<&str, i32>>,
750 ) -> Result<reqwest::Response, reqwest::Error> {
751 let mut params = HashMap::new();
752
753 if let Some(url) = url {
754 params.insert("url".to_string(), url.to_string());
755 }
756
757 if let Some(options) = options {
758 for (key, value) in options {
759 params.insert(key.to_string(), value.to_string());
760 }
761 }
762
763 let url = format!("{}/v1/data/download", get_api_url());
764 let request = self
765 .client
766 .get(&url)
767 .header(
768 "User-Agent",
769 format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
770 )
771 .header("Content-Type", "application/octet-stream")
772 .header("Authorization", format!("Bearer {}", self.api_key))
773 .query(¶ms);
774
775 let res = request.send().await?;
776
777 Ok(res)
778 }
779
780 pub async fn create_signed_url(
792 &self,
793 url: Option<&str>,
794 options: Option<HashMap<&str, i32>>,
795 ) -> Result<serde_json::Value, reqwest::Error> {
796 let mut params = HashMap::new();
797
798 if let Some(options) = options {
799 for (key, value) in options {
800 params.insert(key.to_string(), value.to_string());
801 }
802 }
803
804 if let Some(url) = url {
805 params.insert("url".to_string(), url.to_string());
806 }
807
808 let url = format!("{}/v1/data/sign-url", get_api_url());
809 let request = self
810 .client
811 .get(&url)
812 .header(
813 "User-Agent",
814 format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
815 )
816 .header("Authorization", format!("Bearer {}", self.api_key))
817 .query(¶ms);
818
819 let res = request.send().await?;
820
821 parse_response(res).await
822 }
823
824 pub async fn get_crawl_state(
836 &self,
837 url: &str,
838 params: Option<RequestParams>,
839 content_type: &str,
840 ) -> Result<serde_json::Value, reqwest::Error> {
841 let mut payload = HashMap::new();
842 payload.insert("url".into(), serde_json::Value::String(url.to_string()));
843 payload.insert(
844 "contentType".into(),
845 serde_json::Value::String(content_type.to_string()),
846 );
847
848 if let Ok(params) = serde_json::to_value(params) {
849 if let Ok(params) = serde_json::to_value(params) {
850 if let Some(ref p) = params.as_object() {
851 payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
852 }
853 }
854 }
855
856 let res = self
857 .api_post("data/crawl_state", payload, content_type)
858 .await?;
859 parse_response(res).await
860 }
861
862 pub async fn get_credits(&self) -> Result<serde_json::Value, reqwest::Error> {
864 self.api_get::<serde_json::Value>("data/credits", None)
865 .await
866 }
867
868 pub async fn data_post(
870 &self,
871 table: &str,
872 data: Option<RequestParams>,
873 ) -> Result<serde_json::Value, reqwest::Error> {
874 let res = self
875 .api_post(&format!("data/{}", table), data, "application/json")
876 .await?;
877 parse_response(res).await
878 }
879
880 pub async fn query(&self, params: &QueryRequest) -> Result<serde_json::Value, reqwest::Error> {
882 let res = self
883 .api_get::<QueryRequest>(&"data/query", Some(params))
884 .await?;
885
886 Ok(res)
887 }
888
889 pub async fn data_get(
891 &self,
892 table: &str,
893 params: Option<RequestParams>,
894 ) -> Result<serde_json::Value, reqwest::Error> {
895 let mut payload = HashMap::new();
896
897 if let Some(params) = params {
898 if let Ok(p) = serde_json::to_value(params) {
899 if let Some(o) = p.as_object() {
900 payload.extend(o.iter().map(|(k, v)| (k.as_str(), v.clone())));
901 }
902 }
903 }
904
905 let res = self
906 .api_get::<serde_json::Value>(&format!("data/{}", table), None)
907 .await?;
908 Ok(res)
909 }
910
911 pub async fn data_delete(
913 &self,
914 table: &str,
915 params: Option<RequestParams>,
916 ) -> Result<serde_json::Value, reqwest::Error> {
917 let mut payload = HashMap::new();
918
919 if let Ok(params) = serde_json::to_value(params) {
920 if let Ok(params) = serde_json::to_value(params) {
921 if let Some(ref p) = params.as_object() {
922 payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
923 }
924 }
925 }
926
927 let res = self
928 .api_delete(&format!("data/{}", table), Some(payload))
929 .await?;
930 parse_response(res).await
931 }
932}
933
/// Integration tests that call the API through a shared client.
///
/// The shared client is built with no explicit key, so a key must be available
/// through `SPIDER_API_KEY` (a `.env` file is loaded first). Tests marked
/// `#[ignore]` only run when requested explicitly.
#[cfg(test)]
mod tests {
    use super::*;
    use dotenv::dotenv;
    use lazy_static::lazy_static;
    use reqwest::ClientBuilder;

    lazy_static! {
        // Single client shared by every test in this module.
        static ref SPIDER_CLIENT: Spider = {
            dotenv().ok();
            let http = ClientBuilder::new()
                .user_agent("SpiderBot")
                .build()
                .unwrap();

            Spider::new_with_client(None, http).expect("client to build")
        };
    }

    #[tokio::test]
    #[ignore]
    async fn test_scrape_url() {
        let outcome = SPIDER_CLIENT
            .scrape_url("https://example.com", None, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_crawl_url() {
        let no_callback = None::<fn(serde_json::Value)>;
        let outcome = SPIDER_CLIENT
            .crawl_url(
                "https://example.com",
                None,
                false,
                "application/json",
                no_callback,
            )
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_links() {
        let outcome: Result<serde_json::Value, Error> = SPIDER_CLIENT
            .links("https://example.com", None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_screenshot() {
        let mut options = RequestParams::default();
        options.limit = Some(1);

        let outcome = SPIDER_CLIENT
            .screenshot(
                "https://example.com",
                Some(options),
                false,
                "application/json",
            )
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_transform() {
        let documents = vec![HashMap::from([(
            "<html><body><h1>Transformation</h1></body></html>",
            "",
        )])];
        let outcome = SPIDER_CLIENT
            .transform(documents, None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_extract_contacts() {
        let outcome = SPIDER_CLIENT
            .extract_contacts("https://example.com", None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_label() {
        let outcome = SPIDER_CLIENT
            .label("https://example.com", None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_create_signed_url() {
        let outcome = SPIDER_CLIENT
            .create_signed_url(Some("example.com"), None)
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_get_crawl_state() {
        let outcome = SPIDER_CLIENT
            .get_crawl_state("https://example.com", None, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_query() {
        let mut request = QueryRequest::default();
        request.domain = Some("spider.cloud".into());

        let outcome = SPIDER_CLIENT.query(&request).await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_get_credits() {
        let outcome = SPIDER_CLIENT.get_credits().await;
        assert!(outcome.is_ok());
    }
}