// spider_middleware/robots_txt.rs
//! Robots.txt Middleware for respecting website crawling policies.
//!
//! This module provides the `RobotsTxtMiddleware`, which automatically
//! fetches, caches, and interprets `robots.txt` files from websites.
//! Before each outgoing request, this middleware checks if the request's
//! URL and User-Agent are permitted by the target host's `robots.txt` rules.
//!
//! This ensures that the crawler adheres to the website's specified crawling
//! policies, preventing access to disallowed paths and promoting polite web scraping.
//! It uses a caching mechanism to avoid repeatedly fetching `robots.txt` files.

use std::sync::Arc;
use std::time::Duration;

use async_trait::async_trait;
use bytes::Bytes;
use http::header::USER_AGENT;
use moka::future::Cache;
use reqwest::{Client, StatusCode};
use robotstxt::DefaultMatcher;
use tracing::{debug, info, warn};

use crate::middleware::{Middleware, MiddlewareAction};
use spider_util::error::SpiderError;
use spider_util::request::Request;

/// A simple HTTP client trait for fetching web content.
///
/// Abstracting over the concrete HTTP client keeps the middleware decoupled
/// from `reqwest` and allows mock clients in tests.
#[async_trait]
pub trait SimpleHttpClient: Send + Sync {
    /// Fetches the content of a URL as text.
    ///
    /// NOTE(review): despite the name, the body is returned as raw `Bytes`
    /// alongside the response `StatusCode`; callers decode it themselves
    /// (see `fetch_robots_content`, which runs it through `String::from_utf8`).
    async fn get_text(
        &self,
        url: &str,
        timeout: Duration,
    ) -> Result<(StatusCode, Bytes), SpiderError>;
}

39// Implement the trait for reqwest::Client
40#[async_trait]
41impl SimpleHttpClient for Client {
42    async fn get_text(
43        &self,
44        url: &str,
45        timeout: Duration,
46    ) -> Result<(StatusCode, Bytes), SpiderError> {
47        let request_builder = self.get(url).timeout(timeout);
48        let response = request_builder.send().await?;
49        let status = response.status();
50        let body = response.bytes().await?;
51        Ok((status, body))
52    }
53}
54
/// Robots.txt middleware.
///
/// Caches fetched `robots.txt` bodies per origin so each host is queried at
/// most once per TTL window.
#[derive(Debug)]
pub struct RobotsTxtMiddleware {
    // How long a cached robots.txt body stays valid.
    cache_ttl: Duration,
    // Maximum number of origins kept in the cache.
    cache_capacity: u64,
    // Timeout applied to each robots.txt fetch.
    request_timeout: Duration,
    // Origin string -> robots.txt body. An empty body means "allow all"
    // (used when the fetch failed or returned a non-success status).
    cache: Cache<String, Arc<String>>,
}

64impl Default for RobotsTxtMiddleware {
65    fn default() -> Self {
66        let cache_ttl = Duration::from_secs(60 * 60 * 24);
67        let cache_capacity = 10_000;
68        let cache = Cache::builder()
69            .time_to_live(cache_ttl)
70            .max_capacity(cache_capacity)
71            .build();
72
73        let middleware = Self {
74            cache_ttl,
75            cache_capacity,
76            request_timeout: Duration::from_secs(5),
77            cache,
78        };
79        info!(
80            "Initializing RobotsTxtMiddleware with config: {:?}",
81            middleware
82        );
83        middleware
84    }
85}
86
87impl RobotsTxtMiddleware {
88    /// Creates a new `RobotsTxtMiddleware` with default settings.
89    pub fn new() -> Self {
90        Self::default()
91    }
92
93    /// Set the time-to-live for the cache.
94    pub fn cache_ttl(mut self, cache_ttl: Duration) -> Self {
95        self.cache_ttl = cache_ttl;
96        self.rebuild_cache();
97        self
98    }
99
100    /// Set the max capacity for the cache.
101    pub fn cache_capacity(mut self, cache_capacity: u64) -> Self {
102        self.cache_capacity = cache_capacity;
103        self.rebuild_cache();
104        self
105    }
106
107    /// Set the timeout for fetching robots.txt files.
108    pub fn request_timeout(mut self, request_timeout: Duration) -> Self {
109        self.request_timeout = request_timeout;
110        self
111    }
112
113    /// Rebuilds the cache with the current settings.
114    fn rebuild_cache(&mut self) {
115        self.cache = Cache::builder()
116            .time_to_live(self.cache_ttl)
117            .max_capacity(self.cache_capacity)
118            .build();
119    }
120
121    async fn fetch_robots_content<C: SimpleHttpClient>(
122        &self,
123        client: &C,
124        origin: &str,
125    ) -> Arc<String> {
126        let robots_url = format!("{}/robots.txt", origin);
127        debug!("Fetching robots.txt from: {}", robots_url);
128
129        let permissive = || Arc::new(String::new());
130
131        match client.get_text(&robots_url, self.request_timeout).await {
132            Ok((status, body)) if status.is_success() => match String::from_utf8(body.into()) {
133                Ok(text) => Arc::new(text),
134                Err(e) => {
135                    warn!("Failed to read robots.txt {}: {}", robots_url, e);
136                    permissive()
137                }
138            },
139            Ok((status, _)) => {
140                debug!(
141                    "robots.txt {} returned {} — allowing all",
142                    robots_url, status
143                );
144                permissive()
145            }
146            Err(e) => {
147                warn!("Failed to fetch robots.txt {}: {}", robots_url, e);
148                permissive()
149            }
150        }
151    }
152}
153
154#[async_trait]
155impl<C: SimpleHttpClient> Middleware<C> for RobotsTxtMiddleware {
156    fn name(&self) -> &str {
157        "RobotsTxtMiddleware"
158    }
159
160    async fn process_request(
161        &mut self,
162        client: &C,
163        request: Request,
164    ) -> Result<MiddlewareAction<Request>, SpiderError> {
165        let url = request.url.clone();
166        let origin = match url.origin().unicode_serialization() {
167            s if s == "null" => return Ok(MiddlewareAction::Continue(request)),
168            s => s,
169        };
170
171        let robots_body = match self.cache.get(&origin).await {
172            Some(body) => body,
173            None => {
174                let body = self.fetch_robots_content(client, &origin).await;
175                self.cache.insert(origin.clone(), body.clone()).await;
176                body
177            }
178        };
179
180        if let Some(user_agent) = request.headers.get(USER_AGENT) {
181            let ua = user_agent
182                .to_str()
183                .map_err(|e| SpiderError::HeaderValueError(e.to_string()))?;
184
185            let mut matcher = DefaultMatcher::default();
186            if matcher.one_agent_allowed_by_robots(robots_body.as_str(), ua, url.as_str()) {
187                return Ok(MiddlewareAction::Continue(request));
188            }
189        }
190
191        debug!("Blocked by robots.txt: {}", url);
192        Err(SpiderError::BlockedByRobotsTxt)
193    }
194}
195