1pub mod shapes;
64
65use backon::ExponentialBuilder;
66use backon::Retryable;
67use reqwest::Client;
68use reqwest::{Error, Response};
69use serde::Serialize;
70pub use shapes::{request::*, response::*};
71use std::collections::HashMap;
72use std::sync::atomic::{AtomicU32, Ordering};
73use std::sync::OnceLock;
74use tokio_stream::StreamExt;
75
/// Rate-limit counters stored as atomics so they can be written through `&self`.
///
/// Every counter starts at zero (the derived `Default`).
#[derive(Debug, Default)]
pub struct RateLimitInfo {
    /// Most recently stored request limit.
    pub limit: AtomicU32,
    /// Most recently stored count of remaining requests.
    pub remaining: AtomicU32,
    /// Most recently stored reset value (named as a number of seconds).
    pub reset_seconds: AtomicU32,
}

impl RateLimitInfo {
    /// Returns the current counters as `(limit, remaining, reset_seconds)`.
    ///
    /// The three values are loaded one after another with relaxed ordering, so the
    /// tuple is not guaranteed to come from a single update if a writer runs concurrently.
    pub fn snapshot(&self) -> (u32, u32, u32) {
        let read = |counter: &AtomicU32| counter.load(Ordering::Relaxed);

        let limit = read(&self.limit);
        let remaining = read(&self.remaining);
        let reset_seconds = read(&self.reset_seconds);

        (limit, remaining, reset_seconds)
    }
}
98
/// Process-wide cache for the API base URL; filled on the first call to [`get_api_url`].
static API_URL: OnceLock<String> = OnceLock::new();

/// Returns the base URL used to build request URLs.
///
/// On the first call the `SPIDER_API_URL` environment variable is read; if it cannot be
/// read, `https://api.spider.cloud` is used instead. The result is cached, so later
/// changes to the environment variable have no effect.
pub fn get_api_url() -> &'static str {
    let cached = API_URL.get_or_init(|| match std::env::var("SPIDER_API_URL") {
        Ok(custom) => custom,
        Err(_) => String::from("https://api.spider.cloud"),
    });

    cached.as_str()
}
107
108#[derive(Debug, Default)]
110pub struct Spider {
111 pub api_key: String,
113 pub client: Client,
115 pub rate_limit: RateLimitInfo,
117}
118
119pub async fn handle_json(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
121 res.json().await
122}
123
124pub async fn handle_jsonl(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
126 let text = res.text().await?;
127 let lines = text
128 .lines()
129 .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
130 .collect::<Vec<_>>();
131 Ok(serde_json::Value::Array(lines))
132}
133
/// Reads the response body as CSV and returns the rows as a JSON array of objects.
///
/// Every row is deserialized into a `String -> String` map; rows that fail to
/// deserialize are dropped. If the collected rows cannot be converted to JSON, the
/// raw body is returned as a JSON string instead.
///
/// # Errors
/// Returns the `reqwest::Error` produced when the body cannot be read.
#[cfg(feature = "csv")]
pub async fn handle_csv(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
    let body = res.text().await?;

    let rows: Vec<std::collections::HashMap<String, String>> = {
        let mut reader = csv::Reader::from_reader(body.as_bytes());
        reader.deserialize().filter_map(Result::ok).collect()
    };

    match serde_json::to_value(rows) {
        Ok(json_rows) => Ok(json_rows),
        Err(_) => Ok(serde_json::Value::String(body)),
    }
}
148
149#[cfg(not(feature = "csv"))]
150pub async fn handle_csv(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
151 handle_text(res).await
152}
153
154pub async fn handle_text(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
156 Ok(serde_json::Value::String(
157 res.text().await.unwrap_or_default(),
158 ))
159}
160
/// Reads the response body as XML and converts it to a JSON value.
///
/// When the body cannot be deserialized as XML, the raw text is returned as a JSON
/// string instead of an error.
///
/// NOTE(review): this variant is compiled under the `csv` feature although it relies on
/// `quick_xml`; confirm that the `csv` feature also enables that dependency.
///
/// # Errors
/// Returns the `reqwest::Error` produced when the body cannot be read.
#[cfg(feature = "csv")]
pub async fn handle_xml(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
    let body = res.text().await?;
    let parsed = quick_xml::de::from_str::<serde_json::Value>(&body);

    Ok(parsed.unwrap_or_else(|_| serde_json::Value::String(body)))
}
170
171#[cfg(not(feature = "csv"))]
172pub async fn handle_xml(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
174 handle_text(res).await
175}
176
177pub async fn parse_response(res: reqwest::Response) -> Result<serde_json::Value, reqwest::Error> {
178 let content_type = res
179 .headers()
180 .get(reqwest::header::CONTENT_TYPE)
181 .and_then(|v| v.to_str().ok())
182 .unwrap_or_default()
183 .to_ascii_lowercase();
184
185 if content_type.contains("json") && !content_type.contains("jsonl") {
186 handle_json(res).await
187 } else if content_type.contains("jsonl") || content_type.contains("ndjson") {
188 handle_jsonl(res).await
189 } else if content_type.contains("csv") {
190 handle_csv(res).await
191 } else if content_type.contains("xml") {
192 handle_xml(res).await
193 } else {
194 handle_text(res).await
195 }
196}
197
198impl Spider {
199 pub fn new(api_key: Option<String>) -> Result<Self, &'static str> {
209 let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
210
211 match api_key {
212 Some(key) => Ok(Self {
213 api_key: key,
214 client: Client::new(),
215 rate_limit: RateLimitInfo::default(),
216 }),
217 None => Err("No API key provided"),
218 }
219 }
220
221 pub fn new_with_client(api_key: Option<String>, client: Client) -> Result<Self, &'static str> {
232 let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok());
233
234 match api_key {
235 Some(key) => Ok(Self {
236 api_key: key,
237 client,
238 rate_limit: RateLimitInfo::default(),
239 }),
240 None => Err("No API key provided"),
241 }
242 }
243
244 fn update_rate_limit(&self, headers: &reqwest::header::HeaderMap) {
246 if let Some(v) = headers.get("RateLimit-Limit").and_then(|v| v.to_str().ok()) {
247 if let Ok(n) = v.parse::<u32>() {
248 self.rate_limit.limit.store(n, Ordering::Relaxed);
249 }
250 }
251 if let Some(v) = headers
252 .get("RateLimit-Remaining")
253 .and_then(|v| v.to_str().ok())
254 {
255 if let Ok(n) = v.parse::<u32>() {
256 self.rate_limit.remaining.store(n, Ordering::Relaxed);
257 }
258 }
259 if let Some(v) = headers
260 .get("RateLimit-Reset")
261 .and_then(|v| v.to_str().ok())
262 {
263 if let Ok(n) = v.parse::<u32>() {
264 self.rate_limit.reset_seconds.store(n, Ordering::Relaxed);
265 }
266 }
267 }
268
269 async fn api_post_base(
282 &self,
283 endpoint: &str,
284 data: impl Serialize + Sized + std::fmt::Debug,
285 content_type: &str,
286 ) -> Result<Response, Error> {
287 let url: String = format!("{}/{}", get_api_url(), endpoint);
288
289 let resp = self
290 .client
291 .post(&url)
292 .header(
293 "User-Agent",
294 format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
295 )
296 .header("Content-Type", content_type)
297 .header("Authorization", format!("Bearer {}", self.api_key))
298 .json(&data)
299 .send()
300 .await?;
301
302 self.update_rate_limit(resp.headers());
303
304 if resp.status() == reqwest::StatusCode::TOO_MANY_REQUESTS {
305 let retry_after = resp
306 .headers()
307 .get("Retry-After")
308 .and_then(|v| v.to_str().ok())
309 .and_then(|v| v.parse::<u64>().ok())
310 .unwrap_or(1);
311 tokio::time::sleep(std::time::Duration::from_secs(retry_after)).await;
312 return resp.error_for_status();
313 }
314
315 Ok(resp)
316 }
317
318 pub async fn api_post(
331 &self,
332 endpoint: &str,
333 data: impl Serialize + std::fmt::Debug + Clone + Send + Sync,
334 content_type: &str,
335 ) -> Result<Response, Error> {
336 let fetch = || async {
337 self.api_post_base(endpoint, data.to_owned(), content_type)
338 .await
339 };
340
341 fetch
342 .retry(ExponentialBuilder::default().with_max_times(5))
343 .when(|err: &reqwest::Error| {
344 if let Some(status) = err.status() {
345 status.is_server_error()
346 || status == reqwest::StatusCode::TOO_MANY_REQUESTS
347 } else {
348 err.is_timeout()
349 }
350 })
351 .await
352 }
353
354 async fn api_get_base<T: Serialize>(
364 &self,
365 endpoint: &str,
366 query_params: Option<&T>,
367 ) -> Result<serde_json::Value, reqwest::Error> {
368 let url = format!("{}/{}", get_api_url(), endpoint);
369 let res = self
370 .client
371 .get(&url)
372 .query(&query_params)
373 .header(
374 "User-Agent",
375 format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
376 )
377 .header("Content-Type", "application/json")
378 .header("Authorization", format!("Bearer {}", self.api_key))
379 .send()
380 .await?;
381
382 self.update_rate_limit(res.headers());
383
384 if res.status() == reqwest::StatusCode::TOO_MANY_REQUESTS {
385 let retry_after = res
386 .headers()
387 .get("Retry-After")
388 .and_then(|v| v.to_str().ok())
389 .and_then(|v| v.parse::<u64>().ok())
390 .unwrap_or(1);
391 tokio::time::sleep(std::time::Duration::from_secs(retry_after)).await;
392 return Err(res.error_for_status().unwrap_err());
393 }
394
395 parse_response(res).await
396 }
397
398 pub async fn api_get<T: Serialize>(
408 &self,
409 endpoint: &str,
410 query_params: Option<&T>,
411 ) -> Result<serde_json::Value, reqwest::Error> {
412 let fetch = || async { self.api_get_base(endpoint, query_params.to_owned()).await };
413
414 fetch
415 .retry(ExponentialBuilder::default().with_max_times(5))
416 .when(|err: &reqwest::Error| {
417 if let Some(status) = err.status() {
418 status.is_server_error()
419 || status == reqwest::StatusCode::TOO_MANY_REQUESTS
420 } else {
421 err.is_timeout()
422 }
423 })
424 .await
425 }
426
427 async fn api_delete_base(
440 &self,
441 endpoint: &str,
442 params: Option<HashMap<String, serde_json::Value>>,
443 ) -> Result<Response, Error> {
444 let url = format!("{}/v1/{}", get_api_url(), endpoint);
445 let request_builder = self
446 .client
447 .delete(&url)
448 .header(
449 "User-Agent",
450 format!("Spider-Client/{}", env!("CARGO_PKG_VERSION")),
451 )
452 .header("Content-Type", "application/json")
453 .header("Authorization", format!("Bearer {}", self.api_key));
454
455 let request_builder = if let Some(params) = params {
456 request_builder.json(¶ms)
457 } else {
458 request_builder
459 };
460
461 request_builder.send().await
462 }
463
464 pub async fn api_delete(
477 &self,
478 endpoint: &str,
479 params: Option<HashMap<String, serde_json::Value>>,
480 ) -> Result<Response, Error> {
481 let fetch = || async { self.api_delete_base(endpoint, params.to_owned()).await };
482
483 fetch
484 .retry(ExponentialBuilder::default().with_max_times(5))
485 .when(|err: &reqwest::Error| {
486 if let Some(status) = err.status() {
487 status.is_server_error()
488 } else {
489 err.is_timeout()
490 }
491 })
492 .await
493 }
494
495 pub async fn scrape_url(
508 &self,
509 url: &str,
510 params: Option<RequestParams>,
511 content_type: &str,
512 ) -> Result<serde_json::Value, reqwest::Error> {
513 let mut data = HashMap::new();
514
515 if let Ok(params) = serde_json::to_value(params) {
516 if let Some(ref p) = params.as_object() {
517 data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
518 }
519 }
520
521 if !url.is_empty() {
522 data.insert(
523 "url".to_string(),
524 serde_json::Value::String(url.to_string()),
525 );
526 }
527
528 let res = self.api_post("scrape", data, content_type).await?;
529 parse_response(res).await
530 }
531
532 pub async fn multi_scrape_url(
545 &self,
546 params: Option<Vec<RequestParams>>,
547 content_type: &str,
548 ) -> Result<serde_json::Value, reqwest::Error> {
549 let mut data = HashMap::new();
550
551 if let Ok(mut params) = serde_json::to_value(params) {
552 if let Some(obj) = params.as_object_mut() {
553 obj.insert("limit".to_string(), serde_json::Value::Number(1.into()));
554 data.extend(obj.iter().map(|(k, v)| (k.clone(), v.clone())));
555 }
556 }
557 let res = self.api_post("scrape", data, content_type).await?;
558 parse_response(res).await
559 }
560
561 pub async fn crawl_url(
575 &self,
576 url: &str,
577 params: Option<RequestParams>,
578 stream: bool,
579 content_type: &str,
580 callback: Option<impl Fn(serde_json::Value) + Send>,
581 ) -> Result<serde_json::Value, reqwest::Error> {
582 use tokio_util::codec::{FramedRead, LinesCodec};
583
584 let mut data = HashMap::new();
585
586 if let Ok(params) = serde_json::to_value(params) {
587 if let Some(ref p) = params.as_object() {
588 data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
589 }
590 }
591
592 data.insert("url".into(), serde_json::Value::String(url.to_string()));
593
594 let res = self.api_post("crawl", data, content_type).await?;
595
596 if stream {
597 if let Some(callback) = callback {
598 let stream = res.bytes_stream();
599
600 let stream_reader = tokio_util::io::StreamReader::new(
601 stream
602 .map(|r| r.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))),
603 );
604
605 let mut lines = FramedRead::new(stream_reader, LinesCodec::new());
606
607 while let Some(line_result) = lines.next().await {
608 match line_result {
609 Ok(line) => match serde_json::from_str::<serde_json::Value>(&line) {
610 Ok(value) => {
611 callback(value);
612 }
613 Err(_e) => {
614 continue;
615 }
616 },
617 Err(_e) => return Ok(serde_json::Value::Null),
618 }
619 }
620
621 Ok(serde_json::Value::Null)
622 } else {
623 Ok(serde_json::Value::Null)
624 }
625 } else {
626 parse_response(res).await
627 }
628 }
629
630 pub async fn multi_crawl_url(
644 &self,
645 params: Option<Vec<RequestParams>>,
646 stream: bool,
647 content_type: &str,
648 callback: Option<impl Fn(serde_json::Value) + Send>,
649 ) -> Result<serde_json::Value, reqwest::Error> {
650 use tokio_util::codec::{FramedRead, LinesCodec};
651
652 let res = self.api_post("crawl", params, content_type).await?;
653
654 if stream {
655 if let Some(callback) = callback {
656 let stream = res.bytes_stream();
657
658 let stream_reader = tokio_util::io::StreamReader::new(
659 stream
660 .map(|r| r.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))),
661 );
662
663 let mut lines = FramedRead::new(stream_reader, LinesCodec::new());
664
665 while let Some(line_result) = lines.next().await {
666 match line_result {
667 Ok(line) => match serde_json::from_str::<serde_json::Value>(&line) {
668 Ok(value) => {
669 callback(value);
670 }
671 Err(_e) => {
672 continue;
673 }
674 },
675 Err(_e) => return Ok(serde_json::Value::Null),
676 }
677 }
678
679 Ok(serde_json::Value::Null)
680 } else {
681 Ok(serde_json::Value::Null)
682 }
683 } else {
684 parse_response(res).await
685 }
686 }
687
688 pub async fn links(
701 &self,
702 url: &str,
703 params: Option<RequestParams>,
704 _stream: bool,
705 content_type: &str,
706 ) -> Result<serde_json::Value, reqwest::Error> {
707 let mut data = HashMap::new();
708
709 if let Ok(params) = serde_json::to_value(params) {
710 if let Some(ref p) = params.as_object() {
711 data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
712 }
713 }
714
715 data.insert("url".into(), serde_json::Value::String(url.to_string()));
716
717 let res = self.api_post("links", data, content_type).await?;
718 parse_response(res).await
719 }
720
721 pub async fn multi_links(
734 &self,
735 params: Option<Vec<RequestParams>>,
736 _stream: bool,
737 content_type: &str,
738 ) -> Result<serde_json::Value, reqwest::Error> {
739 let res = self.api_post("links", params, content_type).await?;
740 parse_response(res).await
741 }
742
743 pub async fn screenshot(
756 &self,
757 url: &str,
758 params: Option<RequestParams>,
759 _stream: bool,
760 content_type: &str,
761 ) -> Result<serde_json::Value, reqwest::Error> {
762 let mut data = HashMap::new();
763
764 if let Ok(params) = serde_json::to_value(params) {
765 if let Some(ref p) = params.as_object() {
766 data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
767 }
768 }
769
770 data.insert("url".into(), serde_json::Value::String(url.to_string()));
771
772 let res = self.api_post("screenshot", data, content_type).await?;
773 parse_response(res).await
774 }
775
776 pub async fn multi_screenshot(
789 &self,
790 params: Option<Vec<RequestParams>>,
791 _stream: bool,
792 content_type: &str,
793 ) -> Result<serde_json::Value, reqwest::Error> {
794 let res = self.api_post("screenshot", params, content_type).await?;
795 parse_response(res).await
796 }
797
798 pub async fn search(
811 &self,
812 q: &str,
813 params: Option<SearchRequestParams>,
814 _stream: bool,
815 content_type: &str,
816 ) -> Result<serde_json::Value, reqwest::Error> {
817 let body = match params {
818 Some(mut params) => {
819 params.search = q.to_string();
820 params
821 }
822 _ => {
823 let mut params = SearchRequestParams::default();
824 params.search = q.to_string();
825 params
826 }
827 };
828
829 let res = self.api_post("search", body, content_type).await?;
830
831 parse_response(res).await
832 }
833
834 pub async fn multi_search(
847 &self,
848 params: Option<Vec<SearchRequestParams>>,
849 content_type: &str,
850 ) -> Result<serde_json::Value, reqwest::Error> {
851 let res = self.api_post("search", params, content_type).await?;
852 parse_response(res).await
853 }
854
855 pub async fn unblock_url(
868 &self,
869 url: &str,
870 params: Option<RequestParams>,
871 content_type: &str,
872 ) -> Result<serde_json::Value, reqwest::Error> {
873 let mut data = HashMap::new();
874
875 if let Ok(params) = serde_json::to_value(params) {
876 if let Some(ref p) = params.as_object() {
877 data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
878 }
879 }
880
881 if !url.is_empty() {
882 data.insert(
883 "url".to_string(),
884 serde_json::Value::String(url.to_string()),
885 );
886 }
887
888 let res = self.api_post("unblocker", data, content_type).await?;
889 parse_response(res).await
890 }
891
892 pub async fn multi_unblock_url(
905 &self,
906 params: Option<Vec<RequestParams>>,
907 content_type: &str,
908 ) -> Result<serde_json::Value, reqwest::Error> {
909 let mut data = HashMap::new();
910
911 if let Ok(mut params) = serde_json::to_value(params) {
912 if let Some(obj) = params.as_object_mut() {
913 obj.insert("limit".to_string(), serde_json::Value::Number(1.into()));
914 data.extend(obj.iter().map(|(k, v)| (k.clone(), v.clone())));
915 }
916 }
917 let res = self.api_post("unblocker", data, content_type).await?;
918 parse_response(res).await
919 }
920
921 pub async fn transform(
934 &self,
935 data: Vec<HashMap<&str, &str>>,
936 params: Option<TransformParams>,
937 _stream: bool,
938 content_type: &str,
939 ) -> Result<serde_json::Value, reqwest::Error> {
940 let mut payload = HashMap::new();
941
942 if let Ok(params) = serde_json::to_value(params) {
943 if let Some(ref p) = params.as_object() {
944 payload.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone())));
945 }
946 }
947
948 if let Ok(d) = serde_json::to_value(data) {
949 payload.insert("data".into(), d);
950 }
951
952 let res = self.api_post("transform", payload, content_type).await?;
953
954 parse_response(res).await
955 }
956
957 pub async fn get_credits(&self) -> Result<serde_json::Value, reqwest::Error> {
959 self.api_get::<serde_json::Value>("data/credits", None)
960 .await
961 }
962
963 pub async fn data_post(
965 &self,
966 table: &str,
967 data: Option<RequestParams>,
968 ) -> Result<serde_json::Value, reqwest::Error> {
969 let res = self
970 .api_post(&format!("data/{}", table), data, "application/json")
971 .await?;
972 parse_response(res).await
973 }
974
975 pub async fn data_get(
977 &self,
978 table: &str,
979 params: Option<RequestParams>,
980 ) -> Result<serde_json::Value, reqwest::Error> {
981 let mut payload = HashMap::new();
982
983 if let Some(params) = params {
984 if let Ok(p) = serde_json::to_value(params) {
985 if let Some(o) = p.as_object() {
986 payload.extend(o.iter().map(|(k, v)| (k.as_str(), v.clone())));
987 }
988 }
989 }
990
991 let res = self
992 .api_get::<serde_json::Value>(&format!("data/{}", table), None)
993 .await?;
994 Ok(res)
995 }
996}
997
#[cfg(test)]
mod tests {
    use super::*;
    use dotenv::dotenv;
    use lazy_static::lazy_static;
    use reqwest::ClientBuilder;

    lazy_static! {
        // Shared client for all tests. The key comes from `SPIDER_API_KEY`, which may be
        // supplied through a `.env` file loaded by `dotenv`.
        static ref SPIDER_CLIENT: Spider = {
            dotenv().ok();
            let http = ClientBuilder::new()
                .user_agent("SpiderBot")
                .build()
                .unwrap();

            Spider::new_with_client(None, http).expect("client to build")
        };
    }

    // Every test below sends a real request through `SPIDER_CLIENT`.

    #[tokio::test]
    #[ignore]
    async fn test_scrape_url() {
        let outcome = SPIDER_CLIENT
            .scrape_url("https://example.com", None, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_crawl_url() {
        let no_callback = None::<fn(serde_json::Value)>;
        let outcome = SPIDER_CLIENT
            .crawl_url(
                "https://example.com",
                None,
                false,
                "application/json",
                no_callback,
            )
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_links() {
        let outcome: Result<serde_json::Value, Error> = SPIDER_CLIENT
            .links("https://example.com", None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_screenshot() {
        let mut single_page = RequestParams::default();
        single_page.limit = Some(1);

        let outcome = SPIDER_CLIENT
            .screenshot(
                "https://example.com",
                Some(single_page),
                false,
                "application/json",
            )
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    #[ignore]
    async fn test_transform() {
        let documents = vec![HashMap::from([(
            "<html><body><h1>Transformation</h1></body></html>".into(),
            "".into(),
        )])];
        let outcome = SPIDER_CLIENT
            .transform(documents, None, false, "application/json")
            .await;
        assert!(outcome.is_ok());
    }

    #[tokio::test]
    async fn test_get_credits() {
        let outcome = SPIDER_CLIENT.get_credits().await;
        assert!(outcome.is_ok());
    }
}