spider_lib/middlewares/robots_txt.rs

use std::sync::Arc;
use std::time::Duration;

use async_trait::async_trait;
use http::header::USER_AGENT;
use moka::future::Cache;
use robotstxt::DefaultMatcher;
use tracing::{debug, info, warn};

use crate::downloader::SimpleHttpClient;
use crate::error::SpiderError;
use crate::middleware::{Middleware, MiddlewareAction};
use crate::request::Request;

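/// Spider middleware that enforces robots.txt rules.
///
/// For every request it resolves the URL's origin, fetches `{origin}/robots.txt`
/// (caching one body per origin), and checks the request's `User-Agent` header
/// against the rules with [`robotstxt::DefaultMatcher`]. Disallowed requests are
/// rejected with `SpiderError::BlockedByRobotsTxt`; fetch or decode failures are
/// treated permissively (as an empty robots.txt). Defaults: 24-hour cache TTL,
/// 10_000 cached origins, 5-second fetch timeout.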
#[derive(Debug)]
pub struct RobotsTxtMiddleware {
    cache_ttl: Duration,
    cache_capacity: u64,
    request_timeout: Duration,
    cache: Cache<String, Arc<String>>,
}

impl Default for RobotsTxtMiddleware {
    fn default() -> Self {
        let cache_ttl = Duration::from_secs(60 * 60 * 24);
        let cache_capacity = 10_000;
        let cache = Cache::builder()
            .time_to_live(cache_ttl)
            .max_capacity(cache_capacity)
            .build();

        let middleware = Self {
            cache_ttl,
            cache_capacity,
            request_timeout: Duration::from_secs(5),
            cache,
        };
        info!(
            "Initializing RobotsTxtMiddleware with config: {:?}",
            middleware
        );
        middleware
    }
}

impl RobotsTxtMiddleware {
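    /// Creates a middleware with the default configuration.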
    pub fn new() -> Self {
        Self::default()
    }

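    /// Sets how long a fetched robots.txt body is cached per origin, rebuilding the cache.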
    pub fn cache_ttl(mut self, cache_ttl: Duration) -> Self {
        self.cache_ttl = cache_ttl;
        self.rebuild_cache();
        self
    }

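    /// Sets the maximum number of origins whose robots.txt is cached, rebuilding the cache.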
    pub fn cache_capacity(mut self, cache_capacity: u64) -> Self {
        self.cache_capacity = cache_capacity;
        self.rebuild_cache();
        self
    }

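    /// Sets the timeout used when fetching robots.txt files.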
    pub fn request_timeout(mut self, request_timeout: Duration) -> Self {
        self.request_timeout = request_timeout;
        self
    }

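    // moka caches fix their TTL and capacity at build time, so changing either setting
    // means constructing a fresh cache; any entries cached so far are dropped.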
    fn rebuild_cache(&mut self) {
        self.cache = Cache::builder()
            .time_to_live(self.cache_ttl)
            .max_capacity(self.cache_capacity)
            .build();
    }

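    /// Fetches `{origin}/robots.txt` and returns its body.
    ///
    /// Any failure (network error, non-success status, or a body that is not valid
    /// UTF-8) is treated permissively by returning an empty string, which the matcher
    /// interprets as "allow everything".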
    async fn fetch_robots_content<C: SimpleHttpClient>(
        &self,
        client: &C,
        origin: &str,
    ) -> Arc<String> {
        let robots_url = format!("{}/robots.txt", origin);
        debug!("Fetching robots.txt from: {}", robots_url);

        let permissive = || Arc::new(String::new());

        match client.get_text(&robots_url, self.request_timeout).await {
            Ok((status, body)) if status.is_success() => match String::from_utf8(body.into()) {
                Ok(text) => Arc::new(text),
                Err(e) => {
                    warn!("robots.txt {} is not valid UTF-8: {}", robots_url, e);
                    permissive()
                }
            },
            Ok((status, _)) => {
                debug!(
                    "robots.txt {} returned {} - allowing all",
                    robots_url, status
                );
                permissive()
            }
            Err(e) => {
                warn!("Failed to fetch robots.txt {}: {}", robots_url, e);
                permissive()
            }
        }
    }
}

#[async_trait]
impl<C: SimpleHttpClient> Middleware<C> for RobotsTxtMiddleware {
    fn name(&self) -> &str {
        "RobotsTxtMiddleware"
    }

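    /// Checks `request` against the robots.txt rules of its origin.
    ///
    /// Requests with an opaque ("null") origin pass through unchecked. Otherwise the
    /// cached (or freshly fetched) robots.txt is matched against the request's
    /// `User-Agent` header; requests that are disallowed, or that carry no
    /// `User-Agent` header at all, are rejected with `SpiderError::BlockedByRobotsTxt`.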
    async fn process_request(
        &mut self,
        client: &C,
        request: Request,
    ) -> Result<MiddlewareAction<Request>, SpiderError> {
        let url = request.url.clone();
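        // Opaque origins (e.g. data: URLs) serialize to "null"; robots.txt does not
        // apply to them, so let the request through unchecked.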
        let origin = match url.origin().unicode_serialization() {
            s if s == "null" => return Ok(MiddlewareAction::Continue(request)),
            s => s,
        };

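        // Reuse the cached robots.txt body for this origin; on a miss, fetch it and
        // populate the cache.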
        let robots_body = match self.cache.get(&origin).await {
            Some(body) => body,
            None => {
                let body = self.fetch_robots_content(client, &origin).await;
                self.cache.insert(origin.clone(), body.clone()).await;
                body
            }
        };

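        // Only a request that carries a User-Agent header can be matched against the
        // rules; anything else falls through to the rejection below.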
        if let Some(user_agent) = request.headers.get(USER_AGENT) {
            let ua = user_agent
                .to_str()
                .map_err(|e| SpiderError::HeaderValueError(e.to_string()))?;

            let mut matcher = DefaultMatcher::default();
            if matcher.one_agent_allowed_by_robots(robots_body.as_str(), ua, url.as_str()) {
                return Ok(MiddlewareAction::Continue(request));
            }
        }

        debug!("Blocked by robots.txt: {}", url);
        Err(SpiderError::BlockedByRobotsTxt)
    }
}
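
// A minimal sketch of exercising the builder API above; it touches only what is defined
// in this file. End-to-end behaviour (fetching, caching, blocking) would need a
// SimpleHttpClient test double, which is out of scope here.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn builder_overrides_defaults() {
        let middleware = RobotsTxtMiddleware::new()
            .cache_ttl(Duration::from_secs(60 * 60))
            .cache_capacity(100)
            .request_timeout(Duration::from_secs(10));

        assert_eq!(middleware.cache_ttl, Duration::from_secs(60 * 60));
        assert_eq!(middleware.cache_capacity, 100);
        assert_eq!(middleware.request_timeout, Duration::from_secs(10));
    }
}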