spider_lib/middlewares/robots_txt.rs

//! Robots.txt Middleware for respecting website crawling policies.
//!
//! This module provides the `RobotsTxtMiddleware`, which automatically
//! fetches, caches, and interprets `robots.txt` files from websites.
//! Before each outgoing request, this middleware checks if the request's
//! URL and User-Agent are permitted by the target host's `robots.txt` rules.
//!
//! This ensures that the crawler adheres to the website's specified crawling
//! policies, preventing access to disallowed paths and promoting polite web scraping.
//! It uses a caching mechanism to avoid repeatedly fetching `robots.txt` files.
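//!
//! # Example
//!
//! A minimal configuration sketch using only the builder methods defined in
//! this module. The import path is omitted because it depends on how the crate
//! exposes this module, and attaching the middleware to a crawler is not shown
//! here because it depends on the rest of the crate:
//!
//! ```ignore
//! use std::time::Duration;
//!
//! let middleware = RobotsTxtMiddleware::new()
//!     .cache_ttl(Duration::from_secs(60 * 60))   // re-fetch robots.txt hourly
//!     .cache_capacity(5_000)                     // cache rules for up to 5,000 origins
//!     .request_timeout(Duration::from_secs(10)); // allow slow hosts more time
//! ```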

use std::sync::Arc;
use std::time::Duration;

use async_trait::async_trait;
use http::header::USER_AGENT;
use moka::future::Cache;
use robotstxt::DefaultMatcher;
use tracing::{debug, info, warn};

use crate::downloader::SimpleHttpClient;
use crate::error::SpiderError;
use crate::middleware::{Middleware, MiddlewareAction};
use crate::request::Request;

/// Robots.txt middleware that blocks requests disallowed by the target host's
/// `robots.txt` rules.
#[derive(Debug)]
pub struct RobotsTxtMiddleware {
    /// How long a fetched `robots.txt` body stays in the cache.
    cache_ttl: Duration,
    /// Maximum number of origins whose `robots.txt` is cached at once.
    cache_capacity: u64,
    /// Timeout applied when fetching a `robots.txt` file.
    request_timeout: Duration,
    /// Cached `robots.txt` bodies, keyed by origin.
    cache: Cache<String, Arc<String>>,
}

impl Default for RobotsTxtMiddleware {
    fn default() -> Self {
        // Cache robots.txt bodies for 24 hours, for up to 10,000 origins.
        let cache_ttl = Duration::from_secs(60 * 60 * 24);
        let cache_capacity = 10_000;
        let cache = Cache::builder()
            .time_to_live(cache_ttl)
            .max_capacity(cache_capacity)
            .build();

        let middleware = Self {
            cache_ttl,
            cache_capacity,
            request_timeout: Duration::from_secs(5),
            cache,
        };
        info!(
            "Initializing RobotsTxtMiddleware with config: {:?}",
            middleware
        );
        middleware
    }
}

impl RobotsTxtMiddleware {
    /// Creates a new `RobotsTxtMiddleware` with default settings.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the time-to-live for the cache. Rebuilds the cache, discarding any
    /// entries cached so far.
    pub fn cache_ttl(mut self, cache_ttl: Duration) -> Self {
        self.cache_ttl = cache_ttl;
        self.rebuild_cache();
        self
    }

    /// Sets the max capacity for the cache. Rebuilds the cache, discarding any
    /// entries cached so far.
    pub fn cache_capacity(mut self, cache_capacity: u64) -> Self {
        self.cache_capacity = cache_capacity;
        self.rebuild_cache();
        self
    }

    /// Sets the timeout for fetching robots.txt files.
    pub fn request_timeout(mut self, request_timeout: Duration) -> Self {
        self.request_timeout = request_timeout;
        self
    }

    /// Rebuilds the cache with the current settings.
    fn rebuild_cache(&mut self) {
        self.cache = Cache::builder()
            .time_to_live(self.cache_ttl)
            .max_capacity(self.cache_capacity)
            .build();
    }

    /// Fetches the `robots.txt` body for the given origin.
    ///
    /// Any failure (network error, non-success status, or a body that is not
    /// valid UTF-8) falls back to an empty body, which the matcher treats as
    /// "allow everything".
    async fn fetch_robots_content<C: SimpleHttpClient>(
        &self,
        client: &C,
        origin: &str,
    ) -> Arc<String> {
        let robots_url = format!("{}/robots.txt", origin);
        debug!("Fetching robots.txt from: {}", robots_url);

        // An empty robots.txt places no restrictions, so failures fail open.
        let permissive = || Arc::new(String::new());

        match client.get_text(&robots_url, self.request_timeout).await {
            Ok((status, body)) if status.is_success() => match String::from_utf8(body.into()) {
                Ok(text) => Arc::new(text),
                Err(e) => {
                    warn!("robots.txt {} is not valid UTF-8: {}", robots_url, e);
                    permissive()
                }
            },
            Ok((status, _)) => {
                debug!(
                    "robots.txt {} returned {} — allowing all",
                    robots_url, status
                );
                permissive()
            }
            Err(e) => {
                warn!("Failed to fetch robots.txt {}: {}", robots_url, e);
                permissive()
            }
        }
    }
}

#[async_trait]
impl<C: SimpleHttpClient> Middleware<C> for RobotsTxtMiddleware {
    fn name(&self) -> &str {
        "RobotsTxtMiddleware"
    }

    async fn process_request(
        &mut self,
        client: &C,
        request: Request,
    ) -> Result<MiddlewareAction<Request>, SpiderError> {
        let url = request.url.clone();
        // Opaque origins (e.g. `data:` URLs) serialize as "null"; they have no
        // robots.txt to consult, so such requests pass through unchecked.
        let origin = match url.origin().unicode_serialization() {
            s if s == "null" => return Ok(MiddlewareAction::Continue(request)),
            s => s,
        };

        // Reuse a cached robots.txt body for this origin, fetching it on a miss.
        let robots_body = match self.cache.get(&origin).await {
            Some(body) => body,
            None => {
                let body = self.fetch_robots_content(client, &origin).await;
                self.cache.insert(origin.clone(), body.clone()).await;
                body
            }
        };

        if let Some(user_agent) = request.headers.get(USER_AGENT) {
            let ua = user_agent
                .to_str()
                .map_err(|e| SpiderError::HeaderValueError(e.to_string()))?;

            let mut matcher = DefaultMatcher::default();
            if matcher.one_agent_allowed_by_robots(robots_body.as_str(), ua, url.as_str()) {
                return Ok(MiddlewareAction::Continue(request));
            }
        }

        // Reached either when robots.txt disallows the URL for this User-Agent
        // or when the request carries no User-Agent header at all.
        debug!("Blocked by robots.txt: {}", url);
        Err(SpiderError::BlockedByRobotsTxt)
    }
}
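
// A minimal test sketch (not part of the original file) illustrating how the
// `robotstxt` matcher used above evaluates rules. It exercises the matcher
// directly rather than the middleware, which would require a `SimpleHttpClient`
// stub that is not shown here.
#[cfg(test)]
mod tests {
    use robotstxt::DefaultMatcher;

    #[test]
    fn matcher_distinguishes_allowed_and_disallowed_paths() {
        let robots_body = "User-agent: *\nDisallow: /private/\n";
        let mut matcher = DefaultMatcher::default();

        // A path outside the disallowed prefix should be permitted.
        assert!(matcher.one_agent_allowed_by_robots(
            robots_body,
            "spider-bot",
            "https://example.com/public/page",
        ));

        // A path under `/private/` should be rejected for this user agent.
        assert!(!matcher.one_agent_allowed_by_robots(
            robots_body,
            "spider-bot",
            "https://example.com/private/secret",
        ));
    }
}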