Skip to main content

spider_lib/downloaders/
reqwest_client.rs

1//! Reqwest-based Downloader implementation for the `spider-lib` framework.
2//!
3//! This module provides `ReqwestClientDownloader`, a concrete implementation
4//! of the `Downloader` trait that leverages the `reqwest` HTTP client library.
5//! It is responsible for executing HTTP requests defined by `Request` objects
6//! and converting the received HTTP responses into `Response` objects suitable
7//! for further processing by the crawler.
8//!
9//! This downloader handles various HTTP methods, request bodies (JSON, form data, bytes),
10//! and integrates with the framework's error handling.
11
12use crate::{
13    Downloader, Request, Response, SpiderError, downloader::SimpleHttpClient, request::Body,
14};
15use async_trait::async_trait;
16use bytes::Bytes;
17use http::StatusCode;
18use reqwest::{Client, Proxy};
19use std::time::Duration;
20use tracing::info;
21
22#[async_trait]
23impl SimpleHttpClient for Client {
24    async fn get_text(
25        &self,
26        url: &str,
27        timeout: Duration,
28    ) -> Result<(StatusCode, Bytes), SpiderError> {
29        let resp = self.get(url).timeout(timeout).send().await?;
30        let status = resp.status();
31        let body = resp.bytes().await?;
32        Ok((status, body))
33    }
34}
35
36/// Concrete implementation of Downloader using reqwest client
37pub struct ReqwestClientDownloader {
38    client: Client,
39    timeout: Duration,
40}
41
42#[async_trait]
43impl Downloader for ReqwestClientDownloader {
44    type Client = Client;
45
46    /// Returns a reference to the underlying HTTP client.
47    fn client(&self) -> &Self::Client {
48        &self.client
49    }
50
51    async fn download(&self, request: Request) -> Result<Response, SpiderError> {
52        info!(
53            "Downloading {} (fingerprint: {})",
54            request.url,
55            request.fingerprint()
56        );
57
58        let Request {
59            url,
60            method,
61            headers,
62            body,
63            meta,
64            ..
65        } = request;
66
67        let mut client_to_use = self.client.clone();
68
69        if let Some(proxy_val) = meta.get("proxy")
70            && let Some(proxy_str) = proxy_val.as_str()
71        {
72            match Proxy::all(proxy_str) {
73                Ok(proxy) => {
74                    let new_client = Client::builder()
75                        .timeout(self.timeout)
76                        .proxy(proxy)
77                        .build()
78                        .map_err(|e| SpiderError::ReqwestError(e.into()))?;
79                    client_to_use = new_client;
80                }
81                Err(e) => {
82                    return Err(SpiderError::ReqwestError(e.into()));
83                }
84            }
85        }
86
87        let mut req_builder = client_to_use.request(method, url.clone());
88
89        if let Some(body_content) = body {
90            req_builder = match body_content {
91                Body::Json(json_val) => req_builder.json(&json_val),
92                Body::Form(form_val) => req_builder.form(&form_val),
93                Body::Bytes(bytes_val) => req_builder.body(bytes_val),
94            };
95        }
96
97        let res = req_builder.headers(headers).send().await?;
98
99        let response_url = res.url().clone();
100        let status = res.status();
101        let response_headers = res.headers().clone();
102        let response_body = res.bytes().await?;
103
104        Ok(Response {
105            url: response_url,
106            status,
107            headers: response_headers,
108            body: response_body,
109            request_url: url,
110            meta,
111            cached: false,
112        })
113    }
114}
115
116impl ReqwestClientDownloader {
117    /// Creates a new `ReqwestClientDownloader` with a default timeout of 30 seconds.
118    pub fn new() -> Self {
119        Self::new_with_timeout(Duration::from_secs(30))
120    }
121
122    /// Creates a new `ReqwestClientDownloader` with a specified request timeout.
123    pub fn new_with_timeout(timeout: Duration) -> Self {
124        ReqwestClientDownloader {
125            client: Client::builder().timeout(timeout).build().unwrap(),
126            timeout,
127        }
128    }
129}
130
131impl Default for ReqwestClientDownloader {
132    fn default() -> Self {
133        Self::new()
134    }
135}