spider_util/request.rs
1//! Data structures for representing HTTP requests in `spider-lib`.
2//!
3//! This module defines the [`Request`] struct, which is a central component
4//! for constructing and managing outgoing HTTP requests within the
5//! `spider-lib` framework. It encapsulates all necessary details of an
6//! HTTP request, including:
7//! - The target URL and HTTP method
8//! - Request headers and an optional request body (supporting JSON, form data, or raw bytes)
9//! - Metadata for tracking retry attempts or other custom information
10//!
11//! Additionally, the module provides methods for building requests,
12//! incrementing retry counters, and generating unique fingerprints
13//! for request deduplication and caching.
14//!
15//! ## Example
16//!
17//! ```rust
18//! use spider_util::request::{Request, Body};
19//! use url::Url;
20//! use serde_json::json;
21//!
22//! // Create a simple GET request
23//! let url = Url::parse("https://example.com").unwrap();
24//! let request = Request::new(url);
25//!
26//! // Create a POST request with JSON body
27//! let post_request = Request::new(Url::parse("https://api.example.com/data").unwrap())
28//! .with_method(reqwest::Method::POST)
29//! .with_json(json!({"key": "value"}));
30//! ```
31
32use bytes::Bytes;
33use dashmap::DashMap;
34use http::header::HeaderMap;
35use reqwest::{Method, Url};
36use serde::{Deserialize, Serialize};
37use serde_json::Value;
38use std::collections::HashMap;
39use std::hash::Hasher;
40use std::str::FromStr;
41use std::sync::Arc;
42use twox_hash::XxHash64;
43
44use crate::error::SpiderError;
45
46/// The body of an HTTP request.
47///
48/// [`Body`] encapsulates the different types of request bodies that can be sent
49/// with an HTTP request. It supports JSON payloads, form data, and raw bytes.
50///
51/// ## Variants
52///
53/// - `Json`: A JSON value (typically an object or array)
54/// - `Form`: Key-value form data encoded as `application/x-www-form-urlencoded`
55/// - `Bytes`: Raw binary data
56///
57/// ## Example
58///
59/// ```rust
60/// use spider_util::request::Body;
61/// use serde_json::json;
62/// use dashmap::DashMap;
63/// use bytes::Bytes;
64///
65/// // JSON body
66/// let json_body = Body::Json(json!({"name": "test"}));
67///
68/// // Form data
69/// let mut form = DashMap::new();
70/// form.insert("key".to_string(), "value".to_string());
71/// let form_body = Body::Form(form);
72///
73/// // Raw bytes
74/// let bytes_body = Body::Bytes(Bytes::from("raw data"));
75/// ```
76#[derive(Debug, Clone)]
77pub enum Body {
78 /// JSON payload.
79 Json(serde_json::Value),
80 /// Form data (key-value pairs).
81 Form(DashMap<String, String>),
82 /// Raw binary data.
83 Bytes(Bytes),
84}
85
86// Custom serialization for Body enum
87impl Serialize for Body {
88 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
89 where
90 S: serde::Serializer,
91 {
92 use serde::ser::SerializeMap;
93 let mut map = serializer.serialize_map(Some(1))?;
94
95 match self {
96 Body::Json(value) => map.serialize_entry("Json", value)?,
97 Body::Form(dashmap) => {
98 let hmap: HashMap<String, String> = dashmap.clone().into_iter().collect();
99 map.serialize_entry("Form", &hmap)?
100 }
101 Body::Bytes(bytes) => map.serialize_entry("Bytes", bytes)?,
102 }
103
104 map.end()
105 }
106}
107
108impl<'de> Deserialize<'de> for Body {
109 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
110 where
111 D: serde::Deserializer<'de>,
112 {
113 use serde::de::{self, MapAccess, Visitor};
114 use std::fmt;
115
116 struct BodyVisitor;
117
118 impl<'de> Visitor<'de> for BodyVisitor {
119 type Value = Body;
120
121 fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
122 formatter.write_str("a body object")
123 }
124
125 fn visit_map<V>(self, mut map: V) -> Result<Body, V::Error>
126 where
127 V: MapAccess<'de>,
128 {
129 let entry = map.next_entry::<String, Value>()?;
130 let (key, value) = match entry {
131 Some((k, v)) => (k, v),
132 None => return Err(de::Error::custom("Expected a body variant")),
133 };
134
135 match key.as_str() {
136 "Json" => Ok(Body::Json(value)),
137 "Form" => {
138 let form_data: HashMap<String, String> =
139 serde_json::from_value(value).map_err(de::Error::custom)?;
140 let dashmap = DashMap::new();
141 for (k, v) in form_data {
142 dashmap.insert(k, v);
143 }
144 Ok(Body::Form(dashmap))
145 }
146 "Bytes" => {
147 let bytes: Bytes =
148 serde_json::from_value(value).map_err(de::Error::custom)?;
149 Ok(Body::Bytes(bytes))
150 }
151 _ => Err(de::Error::custom(format!("Unknown body variant: {}", key))),
152 }
153 }
154 }
155
156 deserializer.deserialize_map(BodyVisitor)
157 }
158}
159
160/// An HTTP request to be processed by the crawler.
161///
162/// [`Request`] is the primary data structure for representing outgoing HTTP
163/// requests in the spider framework. It contains all information needed to
164/// execute an HTTP request, including the URL, method, headers, body, and
165/// optional metadata.
166///
167/// ## Memory Efficiency
168///
169/// The `meta` field uses lazy initialization - the metadata map is only
170/// allocated when actually used. This reduces memory overhead for simple
171/// requests that don't need metadata.
172///
173/// ## Example
174///
175/// ```rust
176/// use spider_util::request::Request;
177/// use url::Url;
178///
179/// // Create a basic GET request
180/// let request = Request::new(Url::parse("https://example.com").unwrap());
181///
182/// // Build a request with headers and method
183/// let post_request = Request::new(Url::parse("https://api.example.com").unwrap())
184/// .with_method(reqwest::Method::POST)
185/// .with_header("Accept", "application/json")
186/// .unwrap();
187/// ```
188#[derive(Debug, Clone)]
189pub struct Request {
190 /// The target URL for this request.
191 pub url: Url,
192 /// The HTTP method (GET, POST, etc.).
193 pub method: reqwest::Method,
194 /// HTTP headers for the request.
195 pub headers: http::header::HeaderMap,
196 /// Optional request body.
197 pub body: Option<Body>,
198 /// Lazy-initialized metadata - only allocated when actually used.
199 /// This reduces memory allocation for simple requests without metadata.
200 meta: Option<Arc<DashMap<String, Value>>>,
201}
202
203// Custom serialization for Request struct
204impl Serialize for Request {
205 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
206 where
207 S: serde::Serializer,
208 {
209 use serde::ser::SerializeStruct;
210 // Convert HeaderMap to a serializable format
211 let headers_vec: Vec<(String, String)> = self
212 .headers
213 .iter()
214 .filter_map(|(name, value)| {
215 value
216 .to_str()
217 .ok()
218 .map(|val_str| (name.as_str().to_string(), val_str.to_string()))
219 })
220 .collect();
221
222 let mut s = serializer.serialize_struct("Request", 5)?;
223 s.serialize_field("url", &self.url.as_str())?;
224 s.serialize_field("method", &self.method.as_str())?;
225 s.serialize_field("headers", &headers_vec)?;
226 s.serialize_field("body", &self.body)?;
227 // Serialize meta as empty HashMap if None (for backward compatibility)
228 let meta_map: HashMap<String, Value> = self
229 .meta
230 .as_ref()
231 .map(|m| {
232 m.iter()
233 .map(|e| (e.key().clone(), e.value().clone()))
234 .collect()
235 })
236 .unwrap_or_default();
237 s.serialize_field("meta", &meta_map)?;
238 s.end()
239 }
240}
241
242impl<'de> Deserialize<'de> for Request {
243 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
244 where
245 D: serde::Deserializer<'de>,
246 {
247 use serde::de::{self, MapAccess, Visitor};
248 use std::fmt;
249
250 #[derive(Deserialize)]
251 #[serde(field_identifier, rename_all = "lowercase")]
252 enum Field {
253 Url,
254 Method,
255 Headers,
256 Body,
257 Meta,
258 }
259
260 struct RequestVisitor;
261
262 impl<'de> Visitor<'de> for RequestVisitor {
263 type Value = Request;
264
265 fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
266 formatter.write_str("struct Request")
267 }
268
269 fn visit_map<V>(self, mut map: V) -> Result<Request, V::Error>
270 where
271 V: MapAccess<'de>,
272 {
273 let mut url = None;
274 let mut method = None;
275 let mut headers = None;
276 let mut body = None;
277 let mut meta = None;
278
279 while let Some(key) = map.next_key()? {
280 match key {
281 Field::Url => {
282 if url.is_some() {
283 return Err(de::Error::duplicate_field("url"));
284 }
285 let url_str: String = map.next_value()?;
286 let parsed_url = Url::parse(&url_str).map_err(de::Error::custom)?;
287 url = Some(parsed_url);
288 }
289 Field::Method => {
290 if method.is_some() {
291 return Err(de::Error::duplicate_field("method"));
292 }
293 let method_str: String = map.next_value()?;
294 let parsed_method =
295 Method::from_str(&method_str).map_err(de::Error::custom)?;
296 method = Some(parsed_method);
297 }
298 Field::Headers => {
299 if headers.is_some() {
300 return Err(de::Error::duplicate_field("headers"));
301 }
302 // Deserialize headers vector and convert back to HeaderMap
303 let headers_vec: Vec<(String, String)> = map.next_value()?;
304 let mut header_map = HeaderMap::new();
305 for (name, value) in headers_vec {
306 if let Ok(header_name) =
307 http::header::HeaderName::from_bytes(name.as_bytes())
308 && let Ok(header_value) =
309 http::header::HeaderValue::from_str(&value)
310 {
311 header_map.insert(header_name, header_value);
312 }
313 }
314 headers = Some(header_map);
315 }
316 Field::Body => {
317 if body.is_some() {
318 return Err(de::Error::duplicate_field("body"));
319 }
320 body = Some(map.next_value()?);
321 }
322 Field::Meta => {
323 // Deserialize meta HashMap and convert to DashMap
324 let meta_map: HashMap<String, Value> = map.next_value()?;
325 if !meta_map.is_empty() {
326 let dashmap = DashMap::new();
327 for (k, v) in meta_map {
328 dashmap.insert(k, v);
329 }
330 meta = Some(Arc::new(dashmap));
331 }
332 }
333 }
334 }
335
336 let url = url.ok_or_else(|| de::Error::missing_field("url"))?;
337 let method = method.ok_or_else(|| de::Error::missing_field("method"))?;
338 let headers = headers.ok_or_else(|| de::Error::missing_field("headers"))?;
339 let body = body; // Optional field
340
341 Ok(Request {
342 url,
343 method,
344 headers,
345 body,
346 meta, // May be None if no meta was serialized
347 })
348 }
349 }
350
351 const FIELDS: &[&str] = &["url", "method", "headers", "body", "meta"];
352 deserializer.deserialize_struct("Request", FIELDS, RequestVisitor)
353 }
354}
355
356impl Default for Request {
357 fn default() -> Self {
358 let default_url = match Url::parse("http://default.invalid") {
359 Ok(url) => url,
360 Err(err) => panic!("invalid hardcoded default URL: {}", err),
361 };
362 Self {
363 url: default_url,
364 method: reqwest::Method::GET,
365 headers: http::header::HeaderMap::new(),
366 body: None,
367 meta: None, // Lazy initialization - no allocation until needed
368 }
369 }
370}
371
372impl Request {
373 /// Creates a new [`Request`] with the given URL.
374 ///
375 /// Does not allocate memory for metadata unless [`with_meta`](Request::with_meta) is called.
376 ///
377 /// ## Example
378 ///
379 /// ```rust
380 /// use spider_util::request::Request;
381 /// use url::Url;
382 ///
383 /// let request = Request::new(Url::parse("https://example.com").unwrap());
384 /// ```
385 pub fn new(url: Url) -> Self {
386 Request {
387 url,
388 method: reqwest::Method::GET,
389 headers: http::header::HeaderMap::new(),
390 body: None,
391 meta: None,
392 }
393 }
394
395 /// Sets the HTTP method for the request.
396 ///
397 /// ## Example
398 ///
399 /// ```rust
400 /// use spider_util::request::Request;
401 /// use url::Url;
402 ///
403 /// let request = Request::new(Url::parse("https://example.com").unwrap())
404 /// .with_method(reqwest::Method::POST);
405 /// ```
406 pub fn with_method(mut self, method: reqwest::Method) -> Self {
407 self.method = method;
408 self
409 }
410
411 /// Adds a header to the request.
412 ///
413 /// # Errors
414 ///
415 /// Returns a [`SpiderError::HeaderValueError`] if the header name or value is invalid.
416 ///
417 /// ## Example
418 ///
419 /// ```rust
420 /// use spider_util::request::Request;
421 /// use url::Url;
422 ///
423 /// let request = Request::new(Url::parse("https://example.com").unwrap())
424 /// .with_header("Accept", "application/json")
425 /// .unwrap();
426 /// ```
427 pub fn with_header(mut self, name: &str, value: &str) -> Result<Self, SpiderError> {
428 let header_name =
429 reqwest::header::HeaderName::from_bytes(name.as_bytes()).map_err(|e| {
430 SpiderError::HeaderValueError(format!("Invalid header name '{}': {}", name, e))
431 })?;
432 let header_value = reqwest::header::HeaderValue::from_str(value).map_err(|e| {
433 SpiderError::HeaderValueError(format!("Invalid header value '{}': {}", value, e))
434 })?;
435
436 self.headers.insert(header_name, header_value);
437 Ok(self)
438 }
439
440 /// Sets the body of the request and defaults the method to POST.
441 ///
442 /// ## Example
443 ///
444 /// ```rust
445 /// use spider_util::request::{Request, Body};
446 /// use url::Url;
447 /// use serde_json::json;
448 ///
449 /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
450 /// .with_body(Body::Json(json!({"key": "value"})));
451 /// ```
452 pub fn with_body(mut self, body: Body) -> Self {
453 self.body = Some(body);
454 self.with_method(reqwest::Method::POST)
455 }
456
457 /// Sets the body of the request to a JSON value and defaults the method to POST.
458 ///
459 /// ## Example
460 ///
461 /// ```rust
462 /// use spider_util::request::Request;
463 /// use url::Url;
464 /// use serde_json::json;
465 ///
466 /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
467 /// .with_json(json!({"name": "test"}));
468 /// ```
469 pub fn with_json(self, json: serde_json::Value) -> Self {
470 self.with_body(Body::Json(json))
471 }
472
473 /// Sets the body of the request to form data and defaults the method to POST.
474 ///
475 /// ## Example
476 ///
477 /// ```rust
478 /// use spider_util::request::Request;
479 /// use url::Url;
480 /// use dashmap::DashMap;
481 ///
482 /// let mut form = DashMap::new();
483 /// form.insert("key".to_string(), "value".to_string());
484 ///
485 /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
486 /// .with_form(form);
487 /// ```
488 pub fn with_form(self, form: DashMap<String, String>) -> Self {
489 self.with_body(Body::Form(form))
490 }
491
492 /// Sets the body of the request to raw bytes and defaults the method to POST.
493 ///
494 /// ## Example
495 ///
496 /// ```rust
497 /// use spider_util::request::Request;
498 /// use url::Url;
499 /// use bytes::Bytes;
500 ///
501 /// let data = Bytes::from("binary data");
502 /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
503 /// .with_bytes(data);
504 /// ```
505 pub fn with_bytes(self, bytes: bytes::Bytes) -> Self {
506 self.with_body(Body::Bytes(bytes))
507 }
508
509 /// Adds a value to the request's metadata.
510 ///
511 /// Lazily allocates the metadata map on first use.
512 ///
513 /// ## Example
514 ///
515 /// ```rust
516 /// use spider_util::request::Request;
517 /// use url::Url;
518 /// use serde_json::json;
519 ///
520 /// let request = Request::new(Url::parse("https://example.com").unwrap())
521 /// .with_meta("priority", json!(1))
522 /// .with_meta("source", json!("manual"));
523 /// ```
524 pub fn with_meta(mut self, key: &str, value: serde_json::Value) -> Self {
525 self.meta
526 .get_or_insert_with(|| Arc::new(DashMap::new()))
527 .insert(key.to_string(), value);
528 self
529 }
530
531 /// Gets a reference to a metadata value, if it exists.
532 ///
533 /// Returns `None` if the key doesn't exist or if metadata hasn't been set.
534 pub fn get_meta(&self, key: &str) -> Option<serde_json::Value> {
535 self.meta
536 .as_ref()
537 .and_then(|m| m.get(key).map(|e| e.value().clone()))
538 }
539
540 /// Returns `true` if the request has metadata.
541 pub fn has_meta(&self) -> bool {
542 self.meta.as_ref().is_some_and(|m| !m.is_empty())
543 }
544
545 /// Returns a reference to the internal metadata map, if it exists.
546 pub fn meta_map(&self) -> Option<&Arc<DashMap<String, serde_json::Value>>> {
547 self.meta.as_ref()
548 }
549
550 /// Inserts a value into metadata, creating the map if needed.
551 ///
552 /// This is intended for internal framework use.
553 pub fn insert_meta(&mut self, key: String, value: serde_json::Value) {
554 self.meta
555 .get_or_insert_with(|| Arc::new(DashMap::new()))
556 .insert(key, value);
557 }
558
559 /// Gets a value from metadata using DashMap's API.
560 ///
561 /// This is intended for internal framework use where direct access is needed.
562 pub fn get_meta_ref(
563 &self,
564 key: &str,
565 ) -> Option<dashmap::mapref::one::Ref<'_, String, serde_json::Value>> {
566 self.meta.as_ref().and_then(|m| m.get(key))
567 }
568
569 /// Sets the metadata map directly.
570 ///
571 /// Used for internal framework operations.
572 pub fn set_meta_from_option(&mut self, meta: Option<Arc<DashMap<String, serde_json::Value>>>) {
573 self.meta = meta;
574 }
575
576 /// Clones the metadata map.
577 ///
578 /// Used for internal framework operations where metadata needs to be copied.
579 pub fn clone_meta(&self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
580 self.meta.clone()
581 }
582
583 /// Takes the metadata map, leaving `None` in its place.
584 ///
585 /// Used for internal framework operations.
586 pub fn take_meta(&mut self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
587 self.meta.take()
588 }
589
590 /// Returns a reference to the metadata Arc for internal framework use.
591 pub fn meta_inner(&self) -> &Option<Arc<DashMap<String, serde_json::Value>>> {
592 &self.meta
593 }
594
595 const RETRY_ATTEMPTS_KEY: &str = "retry_attempts";
596
597 /// Gets the number of times the request has been retried.
598 ///
599 /// Returns `0` if no retry attempts have been recorded.
600 pub fn get_retry_attempts(&self) -> u32 {
601 self.meta
602 .as_ref()
603 .and_then(|m| m.get(Self::RETRY_ATTEMPTS_KEY))
604 .and_then(|v| v.value().as_u64())
605 .unwrap_or(0) as u32
606 }
607
608 /// Increments the retry count for the request.
609 ///
610 /// Lazily allocates the metadata map if not already present.
611 pub fn increment_retry_attempts(&mut self) {
612 let current_attempts = self.get_retry_attempts();
613 self.meta
614 .get_or_insert_with(|| Arc::new(DashMap::new()))
615 .insert(
616 Self::RETRY_ATTEMPTS_KEY.to_string(),
617 serde_json::Value::from(current_attempts + 1),
618 );
619 }
620
621 /// Generates a unique fingerprint for the request based on its URL, method, and body.
622 ///
623 /// The fingerprint is used for duplicate detection and caching. It combines:
624 /// - The request URL
625 /// - The HTTP method
626 /// - The request body (if present)
627 ///
628 /// ## Example
629 ///
630 /// ```rust
631 /// use spider_util::request::Request;
632 /// use url::Url;
633 ///
634 /// let request = Request::new(Url::parse("https://example.com").unwrap());
635 /// let fingerprint = request.fingerprint();
636 /// ```
637 pub fn fingerprint(&self) -> String {
638 let mut hasher = XxHash64::default();
639 hasher.write(self.url.as_str().as_bytes());
640 hasher.write(self.method.as_str().as_bytes());
641
642 if let Some(ref body) = self.body {
643 match body {
644 Body::Json(json_val) => {
645 if let Ok(serialized) = serde_json::to_string(json_val) {
646 hasher.write(serialized.as_bytes());
647 }
648 }
649 Body::Form(form_val) => {
650 // Optimized: hash components directly without building intermediate String
651 for r in form_val.iter() {
652 hasher.write(r.key().as_bytes());
653 hasher.write(r.value().as_bytes());
654 }
655 }
656 Body::Bytes(bytes_val) => {
657 hasher.write(bytes_val);
658 }
659 }
660 }
661 format!("{:x}", hasher.finish())
662 }
663}