spider_middleware/
robots_txt.rs1use std::sync::Arc;
13use std::time::Duration;
14
15use async_trait::async_trait;
16use http::header::USER_AGENT;
17use moka::future::Cache;
18use robotstxt::DefaultMatcher;
19use tracing::{debug, info, warn};
20
21use spider_util::error::SpiderError;
22use crate::middleware::{Middleware, MiddlewareAction};
23use spider_util::request::Request;
24use reqwest::StatusCode;
25use bytes::Bytes;
26use reqwest::Client;
27
/// Minimal async HTTP abstraction used by [`RobotsTxtMiddleware`] so that
/// robots.txt fetching can be mocked in tests; implemented for
/// `reqwest::Client` below.
#[async_trait]
pub trait SimpleHttpClient: Send + Sync {
    /// Performs a GET on `url` with the given per-request `timeout`,
    /// returning the response status and raw body bytes.
    async fn get_text(
        &self,
        url: &str,
        timeout: Duration,
    ) -> Result<(StatusCode, Bytes), SpiderError>;
}
38
39#[async_trait]
41impl SimpleHttpClient for Client {
42 async fn get_text(
43 &self,
44 url: &str,
45 timeout: Duration,
46 ) -> Result<(StatusCode, Bytes), SpiderError> {
47 let request_builder = self.get(url).timeout(timeout);
48 let response = request_builder.send().await?;
49 let status = response.status();
50 let body = response.bytes().await?;
51 Ok((status, body))
52 }
53}
54
/// Middleware that enforces robots.txt rules, caching one fetched
/// robots.txt body per origin in a TTL-bounded moka cache.
#[derive(Debug)]
pub struct RobotsTxtMiddleware {
    // How long a fetched robots.txt body stays cached.
    cache_ttl: Duration,
    // Maximum number of origins kept in the cache.
    cache_capacity: u64,
    // Timeout applied to each robots.txt fetch.
    request_timeout: Duration,
    // origin -> robots.txt body; an empty body means "allow everything"
    // (see the permissive fallback in `fetch_robots_content`).
    cache: Cache<String, Arc<String>>,
}
63
64impl Default for RobotsTxtMiddleware {
65 fn default() -> Self {
66 let cache_ttl = Duration::from_secs(60 * 60 * 24);
67 let cache_capacity = 10_000;
68 let cache = Cache::builder()
69 .time_to_live(cache_ttl)
70 .max_capacity(cache_capacity)
71 .build();
72
73 let middleware = Self {
74 cache_ttl,
75 cache_capacity,
76 request_timeout: Duration::from_secs(5),
77 cache,
78 };
79 info!(
80 "Initializing RobotsTxtMiddleware with config: {:?}",
81 middleware
82 );
83 middleware
84 }
85}
86
87impl RobotsTxtMiddleware {
88 pub fn new() -> Self {
90 Self::default()
91 }
92
93 pub fn cache_ttl(mut self, cache_ttl: Duration) -> Self {
95 self.cache_ttl = cache_ttl;
96 self.rebuild_cache();
97 self
98 }
99
100 pub fn cache_capacity(mut self, cache_capacity: u64) -> Self {
102 self.cache_capacity = cache_capacity;
103 self.rebuild_cache();
104 self
105 }
106
107 pub fn request_timeout(mut self, request_timeout: Duration) -> Self {
109 self.request_timeout = request_timeout;
110 self
111 }
112
113 fn rebuild_cache(&mut self) {
115 self.cache = Cache::builder()
116 .time_to_live(self.cache_ttl)
117 .max_capacity(self.cache_capacity)
118 .build();
119 }
120
121 async fn fetch_robots_content<C: SimpleHttpClient>(
122 &self,
123 client: &C,
124 origin: &str,
125 ) -> Arc<String> {
126 let robots_url = format!("{}/robots.txt", origin);
127 debug!("Fetching robots.txt from: {}", robots_url);
128
129 let permissive = || Arc::new(String::new());
130
131 match client.get_text(&robots_url, self.request_timeout).await {
132 Ok((status, body)) if status.is_success() => match String::from_utf8(body.into()) {
133 Ok(text) => Arc::new(text),
134 Err(e) => {
135 warn!("Failed to read robots.txt {}: {}", robots_url, e);
136 permissive()
137 }
138 },
139 Ok((status, _)) => {
140 debug!(
141 "robots.txt {} returned {} — allowing all",
142 robots_url, status
143 );
144 permissive()
145 }
146 Err(e) => {
147 warn!("Failed to fetch robots.txt {}: {}", robots_url, e);
148 permissive()
149 }
150 }
151 }
152}
153
154#[async_trait]
155impl<C: SimpleHttpClient> Middleware<C> for RobotsTxtMiddleware {
156 fn name(&self) -> &str {
157 "RobotsTxtMiddleware"
158 }
159
160 async fn process_request(
161 &mut self,
162 client: &C,
163 request: Request,
164 ) -> Result<MiddlewareAction<Request>, SpiderError> {
165 let url = request.url.clone();
166 let origin = match url.origin().unicode_serialization() {
167 s if s == "null" => return Ok(MiddlewareAction::Continue(request)),
168 s => s,
169 };
170
171 let robots_body = match self.cache.get(&origin).await {
172 Some(body) => body,
173 None => {
174 let body = self.fetch_robots_content(client, &origin).await;
175 self.cache.insert(origin.clone(), body.clone()).await;
176 body
177 }
178 };
179
180 if let Some(user_agent) = request.headers.get(USER_AGENT) {
181 let ua = user_agent
182 .to_str()
183 .map_err(|e| SpiderError::HeaderValueError(e.to_string()))?;
184
185 let mut matcher = DefaultMatcher::default();
186 if matcher.one_agent_allowed_by_robots(robots_body.as_str(), ua, url.as_str()) {
187 return Ok(MiddlewareAction::Continue(request));
188 }
189 }
190
191 debug!("Blocked by robots.txt: {}", url);
192 Err(SpiderError::BlockedByRobotsTxt)
193 }
194}
195