Skip to main content

spider_downloader/
reqwest_client.rs

1//! Reqwest-based Downloader implementation for the `spider-lib` framework.
2//!
3//! This module provides `ReqwestClientDownloader`, a concrete implementation
4//! of the `Downloader` trait that leverages the `reqwest` HTTP client library.
5//! It is responsible for executing HTTP requests defined by `Request` objects
6//! and converting the received HTTP responses into `Response` objects suitable
7//! for further processing by the crawler.
8//!
9//! This downloader handles various HTTP methods, request bodies (JSON, form data, bytes),
10//! and integrates with the framework's error handling.
11
12use crate::{
13    Downloader,
14    SimpleHttpClient,
15};
16use spider_util::request::{Request, Body};
17use spider_util::response::Response;
18use spider_util::error::SpiderError;
19use async_trait::async_trait;
20use bytes::Bytes;
21use http::StatusCode;
22use reqwest::{Client, Proxy};
23use std::time::Duration;
24use tracing::info;
25
26#[async_trait]
27impl SimpleHttpClient for Client {
28    async fn get_text(
29        &self,
30        url: &str,
31        timeout: Duration,
32    ) -> Result<(StatusCode, Bytes), SpiderError> {
33        let resp = self.get(url).timeout(timeout).send().await?;
34        let status = resp.status();
35        let body = resp.bytes().await?;
36        Ok((status, body))
37    }
38}
39
40/// Concrete implementation of Downloader using reqwest client
41pub struct ReqwestClientDownloader {
42    client: Client,
43    timeout: Duration,
44}
45
46#[async_trait]
47impl Downloader for ReqwestClientDownloader {
48    type Client = Client;
49
50    /// Returns a reference to the underlying HTTP client.
51    fn client(&self) -> &Self::Client {
52        &self.client
53    }
54
55    async fn download(&self, request: Request) -> Result<Response, SpiderError> {
56        info!(
57            "Downloading {} (fingerprint: {})",
58            request.url,
59            request.fingerprint()
60        );
61
62        let Request {
63            url,
64            method,
65            headers,
66            body,
67            meta,
68            ..
69        } = request;
70
71        let mut client_to_use = self.client.clone();
72
73        if let Some(proxy_val) = meta.get("proxy") && let Some(proxy_str) = proxy_val.as_str() {
74            match Proxy::all(proxy_str) {
75                Ok(proxy) => {
76                    let new_client = Client::builder()
77                        .timeout(self.timeout)
78                        .proxy(proxy)
79                        .build()
80                        .map_err(|e| SpiderError::ReqwestError(e.into()))?;
81                    client_to_use = new_client;
82                }
83                Err(e) => {
84                    return Err(SpiderError::ReqwestError(e.into()));
85                }
86            }
87        }
88
89        let mut req_builder = client_to_use.request(method, url.clone());
90
91        if let Some(body_content) = body {
92            req_builder = match body_content {
93                Body::Json(json_val) => req_builder.json(&json_val),
94                Body::Form(form_val) => {
95                    let mut form_map = std::collections::HashMap::new();
96                    for entry in form_val.iter() {
97                        form_map.insert(entry.key().clone(), entry.value().clone());
98                    }
99                    req_builder.form(&form_map)
100                },
101                Body::Bytes(bytes_val) => req_builder.body(bytes_val),
102            };
103        }
104
105        let res = req_builder.headers(headers).send().await?;
106
107        let response_url = res.url().clone();
108        let status = res.status();
109        let response_headers = res.headers().clone();
110        let response_body = res.bytes().await?;
111
112        Ok(Response {
113            url: response_url,
114            status,
115            headers: response_headers,
116            body: response_body,
117            request_url: url,
118            meta,
119            cached: false,
120        })
121    }
122}
123
124impl ReqwestClientDownloader {
125    /// Creates a new `ReqwestClientDownloader` with a default timeout of 30 seconds.
126    pub fn new() -> Self {
127        Self::new_with_timeout(Duration::from_secs(30))
128    }
129
130    /// Creates a new `ReqwestClientDownloader` with a specified request timeout.
131    pub fn new_with_timeout(timeout: Duration) -> Self {
132        ReqwestClientDownloader {
133            client: Client::builder().timeout(timeout).build().unwrap(),
134            timeout,
135        }
136    }
137}
138
139impl Default for ReqwestClientDownloader {
140    fn default() -> Self {
141        Self::new()
142    }
143}