1use crate::cache::{parse_cache_control, DEFAULT_CACHE_TTL, MAX_CACHE_TTL};
9use crate::parser::RobotsParser;
10use crate::types::{RobotsCacheKey, RobotsConfig, RobotsPolicy};
11use reqwest::{Client, Response};
12use std::sync::atomic::{AtomicU64, Ordering};
13use std::sync::Arc;
14use std::time::{Duration, Instant};
15use tracing::{debug, info, warn};
16use url::Url;
17
/// Counters describing robots.txt fetch activity.
///
/// All fields are atomics so the stats can be shared across tasks via
/// `Arc<FetchStats>` and updated without locking. Read a point-in-time
/// view through [`FetchStats::snapshot`].
#[derive(Debug)]
pub struct FetchStats {
    /// Total number of fetch attempts.
    pub total: AtomicU64,
    /// Fetches that returned a 2xx response.
    pub success: AtomicU64,
    /// Fetches classified as "robots unavailable" (4xx -> allow all).
    pub unavailable: AtomicU64,
    /// Fetches classified as "robots unreachable" (5xx / network error -> deny all).
    pub unreachable: AtomicU64,
    /// 401/403 responses treated as protected (safe mode only).
    pub protected: AtomicU64,
    /// Conditional requests answered with 304 Not Modified.
    pub conditional_hits: AtomicU64,
    /// Total body bytes downloaded.
    pub bytes_fetched: AtomicU64,
    /// Cumulative fetch wall time in milliseconds (used for averaging).
    pub fetch_time_ms: AtomicU64,
    /// Fastest observed fetch in ms. Internally `u64::MAX` means
    /// "no sample yet"; `snapshot()` reports that state as 0.
    pub min_fetch_time_ms: AtomicU64,
    /// Slowest observed fetch in ms (0 until the first sample).
    pub max_fetch_time_ms: AtomicU64,
}

impl Default for FetchStats {
    // Manual impl instead of `#[derive(Default)]`: the minimum must start
    // at `u64::MAX` so the very first sample always wins `fetch_min`,
    // including a legitimate 0 ms sample. The previous 0 sentinel let a
    // recorded 0 ms minimum be overwritten by later, slower samples.
    fn default() -> Self {
        Self {
            total: AtomicU64::new(0),
            success: AtomicU64::new(0),
            unavailable: AtomicU64::new(0),
            unreachable: AtomicU64::new(0),
            protected: AtomicU64::new(0),
            conditional_hits: AtomicU64::new(0),
            bytes_fetched: AtomicU64::new(0),
            fetch_time_ms: AtomicU64::new(0),
            min_fetch_time_ms: AtomicU64::new(u64::MAX),
            max_fetch_time_ms: AtomicU64::new(0),
        }
    }
}

impl FetchStats {
    /// Capture a point-in-time copy of all counters.
    ///
    /// Loads are `Relaxed` and independent, so the snapshot is not a
    /// single atomic transaction — fields may be mutually inconsistent
    /// under concurrent updates, which is acceptable for metrics.
    pub fn snapshot(&self) -> FetchStatsSnapshot {
        let min = self.min_fetch_time_ms.load(Ordering::Relaxed);
        FetchStatsSnapshot {
            total: self.total.load(Ordering::Relaxed),
            success: self.success.load(Ordering::Relaxed),
            unavailable: self.unavailable.load(Ordering::Relaxed),
            unreachable: self.unreachable.load(Ordering::Relaxed),
            protected: self.protected.load(Ordering::Relaxed),
            conditional_hits: self.conditional_hits.load(Ordering::Relaxed),
            bytes_fetched: self.bytes_fetched.load(Ordering::Relaxed),
            fetch_time_ms: self.fetch_time_ms.load(Ordering::Relaxed),
            // Translate the internal "no sample yet" sentinel back to 0
            // so callers keep seeing 0 for an untouched stats object.
            min_fetch_time_ms: if min == u64::MAX { 0 } else { min },
            max_fetch_time_ms: self.max_fetch_time_ms.load(Ordering::Relaxed),
        }
    }

    /// Fold one fetch duration (in ms) into the min/max trackers.
    ///
    /// Uses the atomic `fetch_min`/`fetch_max` primitives instead of
    /// hand-rolled CAS loops; behavior under contention is the same,
    /// and a 0 ms sample is now recorded correctly (the old code reused
    /// 0 as the "unset" sentinel, so a stored 0 ms minimum could later
    /// be overwritten by a slower sample).
    fn update_min_max(&self, elapsed_ms: u64) {
        self.min_fetch_time_ms.fetch_min(elapsed_ms, Ordering::Relaxed);
        self.max_fetch_time_ms.fetch_max(elapsed_ms, Ordering::Relaxed);
    }
}

/// Plain-value copy of [`FetchStats`] taken at a point in time.
#[derive(Debug, Clone, Default)]
pub struct FetchStatsSnapshot {
    /// Total number of fetch attempts.
    pub total: u64,
    /// Fetches that returned a 2xx response.
    pub success: u64,
    /// Fetches classified as unavailable (allow all).
    pub unavailable: u64,
    /// Fetches classified as unreachable (deny all).
    pub unreachable: u64,
    /// 401/403 responses treated as protected.
    pub protected: u64,
    /// Conditional requests answered with 304.
    pub conditional_hits: u64,
    /// Total body bytes downloaded.
    pub bytes_fetched: u64,
    /// Cumulative fetch wall time in milliseconds.
    pub fetch_time_ms: u64,
    /// Fastest observed fetch in ms (0 if no samples).
    pub min_fetch_time_ms: u64,
    /// Slowest observed fetch in ms (0 if no samples).
    pub max_fetch_time_ms: u64,
}

impl FetchStatsSnapshot {
    /// Mean fetch latency in milliseconds, or 0.0 when no fetches occurred.
    pub fn avg_fetch_time_ms(&self) -> f64 {
        if self.total == 0 {
            0.0
        } else {
            self.fetch_time_ms as f64 / self.total as f64
        }
    }

    /// Fraction of fetches that returned 2xx (0.0 when no fetches occurred).
    pub fn success_rate(&self) -> f64 {
        if self.total == 0 {
            0.0
        } else {
            self.success as f64 / self.total as f64
        }
    }

    /// Fraction of fetches answered by a 304 conditional hit
    /// (0.0 when no fetches occurred).
    pub fn conditional_hit_rate(&self) -> f64 {
        if self.total == 0 {
            0.0
        } else {
            self.conditional_hits as f64 / self.total as f64
        }
    }
}
145
/// Fetches robots.txt files over HTTP and classifies the outcome
/// into a `RobotsPolicy`.
pub struct RobotsFetcher {
    // reqwest client configured in `new` with the user agent, request
    // timeout, and redirect limit from `config`.
    client: Client,
    // Fetch behavior settings; fields read here include user_agent,
    // fetch_timeout_secs, max_redirects, max_robots_size,
    // cache_ttl_secs, and safe_mode.
    config: RobotsConfig,
    // Parses downloaded robots.txt bodies into policies
    // (size-limited via `RobotsParser::with_max_size`).
    parser: RobotsParser,
    // Shared counters updated on every fetch; exposed via `stats()`.
    stats: Arc<FetchStats>,
}
157
158impl RobotsFetcher {
159 pub fn new(config: RobotsConfig) -> Self {
161 let max_robots_size = config.max_robots_size;
162 let client = Client::builder()
163 .user_agent(&config.user_agent)
164 .timeout(Duration::from_secs(config.fetch_timeout_secs))
165 .redirect(reqwest::redirect::Policy::limited(config.max_redirects as usize))
166 .build()
167 .expect("Failed to create HTTP client");
168
169 Self {
170 client,
171 config,
172 parser: RobotsParser::with_max_size(max_robots_size),
173 stats: Arc::new(FetchStats::default()),
174 }
175 }
176
177 pub fn stats(&self) -> Arc<FetchStats> {
179 self.stats.clone()
180 }
181
182 pub async fn fetch(&self, url: &Url) -> RobotsPolicy {
184 let key = match RobotsCacheKey::from_url(url) {
185 Some(k) => k,
186 None => {
187 return RobotsPolicy::unreachable(
188 "Invalid URL: no host".to_string(),
189 DEFAULT_CACHE_TTL,
190 );
191 }
192 };
193
194 self.fetch_for_key(&key).await
195 }
196
197 pub async fn fetch_for_key(&self, key: &RobotsCacheKey) -> RobotsPolicy {
199 self.fetch_for_key_conditional(key, None, None).await
200 }
201
202 pub async fn fetch_for_key_conditional(
208 &self,
209 key: &RobotsCacheKey,
210 etag: Option<&str>,
211 last_modified: Option<&str>,
212 ) -> RobotsPolicy {
213 let robots_url = key.robots_url();
214 let start = Instant::now();
215
216 self.stats.total.fetch_add(1, Ordering::Relaxed);
217 debug!("Fetching robots.txt from {}", robots_url);
218
219 let result = self.do_fetch_conditional(&robots_url, etag, last_modified).await;
220 let elapsed = start.elapsed();
221 let elapsed_ms = elapsed.as_millis() as u64;
222 self.stats.fetch_time_ms.fetch_add(elapsed_ms, Ordering::Relaxed);
223 self.stats.update_min_max(elapsed_ms);
224
225 match result {
226 Ok((response, ttl)) => {
227 self.handle_response(response, ttl).await
228 }
229 Err(e) => {
230 self.handle_network_error(e)
231 }
232 }
233 }
234
235 async fn do_fetch_conditional(
237 &self,
238 url: &str,
239 etag: Option<&str>,
240 last_modified: Option<&str>,
241 ) -> Result<(Response, Duration), reqwest::Error> {
242 let mut request = self.client.get(url);
243
244 if let Some(etag) = etag {
246 request = request.header("If-None-Match", etag);
247 }
248 if let Some(lm) = last_modified {
249 request = request.header("If-Modified-Since", lm);
250 }
251
252 let response = request.send().await?;
253
254 let ttl = response
256 .headers()
257 .get("cache-control")
258 .and_then(|v| v.to_str().ok())
259 .and_then(parse_cache_control)
260 .unwrap_or(Duration::from_secs(self.config.cache_ttl_secs))
261 .min(MAX_CACHE_TTL);
262
263 Ok((response, ttl))
264 }
265
266 async fn handle_response(&self, response: Response, ttl: Duration) -> RobotsPolicy {
268 let status = response.status();
269 let status_code = status.as_u16();
270
271 let etag = response
273 .headers()
274 .get("etag")
275 .and_then(|v| v.to_str().ok())
276 .map(|s| s.to_string());
277 let last_modified = response
278 .headers()
279 .get("last-modified")
280 .and_then(|v| v.to_str().ok())
281 .map(|s| s.to_string());
282
283 info!(
284 "Robots.txt response: {} ({})",
285 response.url(),
286 status_code
287 );
288
289 match status_code {
291 304 => {
293 self.stats.conditional_hits.fetch_add(1, Ordering::Relaxed);
294 debug!("Robots.txt not modified (304), use cached version");
295 RobotsPolicy::not_modified(ttl)
298 }
299
300 200..=299 => {
302 self.stats.success.fetch_add(1, Ordering::Relaxed);
303 self.parse_successful_response(response, ttl, etag, last_modified).await
304 }
305
306 400 | 404 | 405 | 410 | 451 => {
308 self.stats.unavailable.fetch_add(1, Ordering::Relaxed);
309 debug!("Robots.txt unavailable ({}), allowing all", status_code);
310 RobotsPolicy::unavailable(status_code, ttl)
311 }
312
313 401 | 403 => {
315 if self.config.safe_mode {
316 self.stats.protected.fetch_add(1, Ordering::Relaxed);
317 warn!("Robots.txt protected ({}), denying in safe mode", status_code);
318 RobotsPolicy::protected(status_code, ttl)
319 } else {
320 self.stats.unavailable.fetch_add(1, Ordering::Relaxed);
321 debug!("Robots.txt protected ({}), allowing per RFC", status_code);
322 RobotsPolicy::unavailable(status_code, ttl)
323 }
324 }
325
326 400..=499 => {
328 self.stats.unavailable.fetch_add(1, Ordering::Relaxed);
329 debug!("Robots.txt unavailable ({}), allowing all", status_code);
330 RobotsPolicy::unavailable(status_code, ttl)
331 }
332
333 500..=599 => {
335 self.stats.unreachable.fetch_add(1, Ordering::Relaxed);
336 warn!("Robots.txt unreachable ({}), denying all", status_code);
337 RobotsPolicy::unreachable(
338 format!("Server error: {}", status_code),
339 ttl,
340 )
341 }
342
343 _ => {
345 self.stats.unreachable.fetch_add(1, Ordering::Relaxed);
346 warn!("Unexpected robots.txt status ({}), denying all", status_code);
347 RobotsPolicy::unreachable(
348 format!("Unexpected status: {}", status_code),
349 ttl,
350 )
351 }
352 }
353 }
354
355 async fn parse_successful_response(
357 &self,
358 response: Response,
359 ttl: Duration,
360 etag: Option<String>,
361 last_modified: Option<String>,
362 ) -> RobotsPolicy {
363 let content_type = response
365 .headers()
366 .get("content-type")
367 .and_then(|v| v.to_str().ok())
368 .unwrap_or("text/plain");
369
370 if !content_type.contains("text/plain") && !content_type.contains("text/html") {
371 warn!(
372 "Unexpected Content-Type for robots.txt: {}, treating as empty",
373 content_type
374 );
375 return RobotsPolicy::unavailable(200, ttl);
376 }
377
378 match response.text().await {
380 Ok(content) => {
381 let bytes = content.len();
382 self.stats.bytes_fetched.fetch_add(bytes as u64, Ordering::Relaxed);
383
384 if content.is_empty() {
386 debug!("Empty robots.txt, allowing all");
387 return RobotsPolicy::unavailable(200, ttl);
388 }
389
390 let mut policy = self.parser.parse(&content, ttl);
392 policy.etag = etag;
393 policy.last_modified = last_modified;
394 policy
395 }
396 Err(e) => {
397 warn!("Failed to read robots.txt body: {}", e);
398 RobotsPolicy::unreachable(
399 format!("Body read error: {}", e),
400 DEFAULT_CACHE_TTL,
401 )
402 }
403 }
404 }
405
406 fn handle_network_error(&self, error: reqwest::Error) -> RobotsPolicy {
408 self.stats.unreachable.fetch_add(1, Ordering::Relaxed);
409
410 let reason = if error.is_timeout() {
411 "Network timeout".to_string()
412 } else if error.is_connect() {
413 "Connection failed".to_string()
414 } else if error.is_redirect() {
415 "Too many redirects".to_string()
416 } else {
417 format!("Network error: {}", error)
418 };
419
420 warn!("Robots.txt fetch failed: {}", reason);
421 RobotsPolicy::unreachable(reason, DEFAULT_CACHE_TTL)
422 }
423}
424
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed `FetchStats` reports zeroed counters.
    #[test]
    fn test_fetch_stats_default() {
        let snapshot = FetchStats::default().snapshot();
        assert_eq!(snapshot.total, 0);
        assert_eq!(snapshot.success, 0);
    }

    /// Building a fetcher from the default config must not panic
    /// (client construction uses `expect`).
    #[tokio::test]
    async fn test_fetcher_creation() {
        let _fetcher = RobotsFetcher::new(RobotsConfig::default());
    }
}