halldyll_robots/
fetcher.rs

1//! Fetcher - RFC 9309 compliant robots.txt fetching
2//!
3//! This module provides HTTP fetching with:
4//! - Conditional GET (If-None-Match, If-Modified-Since)
5//! - Proper status code handling per RFC 9309
6//! - Comprehensive statistics
7
8use crate::cache::{parse_cache_control, DEFAULT_CACHE_TTL, MAX_CACHE_TTL};
9use crate::parser::RobotsParser;
10use crate::types::{RobotsCacheKey, RobotsConfig, RobotsPolicy};
11use reqwest::{Client, Response};
12use std::sync::atomic::{AtomicU64, Ordering};
13use std::sync::Arc;
14use std::time::{Duration, Instant};
15use tracing::{debug, info, warn};
16use url::Url;
17
/// Fetch statistics with detailed metrics.
///
/// All counters are atomics so a single instance can be shared (via `Arc`)
/// across concurrent fetch tasks. Updates use relaxed ordering, so the
/// counters are individually accurate but not mutually consistent at any
/// given instant; use [`FetchStats::snapshot`] for a best-effort copy.
#[derive(Debug, Default)]
pub struct FetchStats {
    /// Total fetches attempted (incremented before the request is sent)
    pub total: AtomicU64,
    /// Successful fetches (2xx with parseable body)
    pub success: AtomicU64,
    /// Unavailable (4xx — robots.txt treated as absent, crawling allowed)
    pub unavailable: AtomicU64,
    /// Unreachable (5xx or network failure — crawling denied)
    pub unreachable: AtomicU64,
    /// Protected (401/403 counted here only when safe mode denies)
    pub protected: AtomicU64,
    /// Conditional GET hits (304 Not Modified)
    pub conditional_hits: AtomicU64,
    /// Total response-body bytes fetched
    pub bytes_fetched: AtomicU64,
    /// Total fetch time in milliseconds (sum across all fetches)
    pub fetch_time_ms: AtomicU64,
    /// Minimum fetch time in milliseconds (0 until the first sample)
    pub min_fetch_time_ms: AtomicU64,
    /// Maximum fetch time in milliseconds (0 until the first sample)
    pub max_fetch_time_ms: AtomicU64,
}
42
43impl FetchStats {
44    /// Get snapshot
45    pub fn snapshot(&self) -> FetchStatsSnapshot {
46        FetchStatsSnapshot {
47            total: self.total.load(Ordering::Relaxed),
48            success: self.success.load(Ordering::Relaxed),
49            unavailable: self.unavailable.load(Ordering::Relaxed),
50            unreachable: self.unreachable.load(Ordering::Relaxed),
51            protected: self.protected.load(Ordering::Relaxed),
52            conditional_hits: self.conditional_hits.load(Ordering::Relaxed),
53            bytes_fetched: self.bytes_fetched.load(Ordering::Relaxed),
54            fetch_time_ms: self.fetch_time_ms.load(Ordering::Relaxed),
55            min_fetch_time_ms: self.min_fetch_time_ms.load(Ordering::Relaxed),
56            max_fetch_time_ms: self.max_fetch_time_ms.load(Ordering::Relaxed),
57        }
58    }
59
60    /// Update min/max fetch time
61    fn update_min_max(&self, elapsed_ms: u64) {
62        // Update min
63        let mut current_min = self.min_fetch_time_ms.load(Ordering::Relaxed);
64        while current_min == 0 || elapsed_ms < current_min {
65            match self.min_fetch_time_ms.compare_exchange_weak(
66                current_min,
67                elapsed_ms,
68                Ordering::Relaxed,
69                Ordering::Relaxed,
70            ) {
71                Ok(_) => break,
72                Err(x) => current_min = x,
73            }
74        }
75
76        // Update max
77        let mut current_max = self.max_fetch_time_ms.load(Ordering::Relaxed);
78        while elapsed_ms > current_max {
79            match self.max_fetch_time_ms.compare_exchange_weak(
80                current_max,
81                elapsed_ms,
82                Ordering::Relaxed,
83                Ordering::Relaxed,
84            ) {
85                Ok(_) => break,
86                Err(x) => current_max = x,
87            }
88        }
89    }
90}
91
/// Snapshot of fetch statistics.
///
/// A plain-data copy of [`FetchStats`] taken at one moment; each field was
/// loaded independently with relaxed ordering, so values may be mutually
/// inconsistent under concurrent updates. Derived rates are available via
/// the methods on this type.
#[derive(Debug, Clone)]
pub struct FetchStatsSnapshot {
    /// Total fetches attempted
    pub total: u64,
    /// Successful fetches (2xx)
    pub success: u64,
    /// Unavailable (4xx)
    pub unavailable: u64,
    /// Unreachable (5xx/network)
    pub unreachable: u64,
    /// Protected (401/403)
    pub protected: u64,
    /// Conditional GET hits (304)
    pub conditional_hits: u64,
    /// Total response-body bytes fetched
    pub bytes_fetched: u64,
    /// Total fetch time across all fetches, in milliseconds
    pub fetch_time_ms: u64,
    /// Minimum fetch time in milliseconds (0 if no samples)
    pub min_fetch_time_ms: u64,
    /// Maximum fetch time in milliseconds (0 if no samples)
    pub max_fetch_time_ms: u64,
}
116
117impl FetchStatsSnapshot {
118    /// Calculate average fetch time in milliseconds
119    pub fn avg_fetch_time_ms(&self) -> f64 {
120        if self.total == 0 {
121            0.0
122        } else {
123            self.fetch_time_ms as f64 / self.total as f64
124        }
125    }
126
127    /// Calculate success rate (0.0 - 1.0)
128    pub fn success_rate(&self) -> f64 {
129        if self.total == 0 {
130            0.0
131        } else {
132            self.success as f64 / self.total as f64
133        }
134    }
135
136    /// Calculate conditional hit rate (bandwidth savings)
137    pub fn conditional_hit_rate(&self) -> f64 {
138        if self.total == 0 {
139            0.0
140        } else {
141            self.conditional_hits as f64 / self.total as f64
142        }
143    }
144}
145
/// Robots.txt fetcher with RFC 9309 compliance.
///
/// Owns a configured HTTP client and a parser; exposes shared statistics
/// via [`RobotsFetcher::stats`]. All fetch methods take `&self`, so one
/// instance can serve concurrent callers.
pub struct RobotsFetcher {
    /// HTTP client (user agent, timeout, redirect policy set in `new`)
    client: Client,
    /// Configuration (TTL defaults, safe mode, size limits)
    config: RobotsConfig,
    /// Parser, capped at `config.max_robots_size`
    parser: RobotsParser,
    /// Shared statistics, cloneable as an `Arc` handle
    stats: Arc<FetchStats>,
}
157
158impl RobotsFetcher {
159    /// Create a new fetcher
160    pub fn new(config: RobotsConfig) -> Self {
161        let max_robots_size = config.max_robots_size;
162        let client = Client::builder()
163            .user_agent(&config.user_agent)
164            .timeout(Duration::from_secs(config.fetch_timeout_secs))
165            .redirect(reqwest::redirect::Policy::limited(config.max_redirects as usize))
166            .build()
167            .expect("Failed to create HTTP client");
168
169        Self {
170            client,
171            config,
172            parser: RobotsParser::with_max_size(max_robots_size),
173            stats: Arc::new(FetchStats::default()),
174        }
175    }
176
177    /// Get fetch statistics
178    pub fn stats(&self) -> Arc<FetchStats> {
179        self.stats.clone()
180    }
181
182    /// Fetch and parse robots.txt for a URL
183    pub async fn fetch(&self, url: &Url) -> RobotsPolicy {
184        let key = match RobotsCacheKey::from_url(url) {
185            Some(k) => k,
186            None => {
187                return RobotsPolicy::unreachable(
188                    "Invalid URL: no host".to_string(),
189                    DEFAULT_CACHE_TTL,
190                );
191            }
192        };
193
194        self.fetch_for_key(&key).await
195    }
196
197    /// Fetch robots.txt for a cache key
198    pub async fn fetch_for_key(&self, key: &RobotsCacheKey) -> RobotsPolicy {
199        self.fetch_for_key_conditional(key, None, None).await
200    }
201
202    /// Fetch robots.txt with conditional GET support
203    ///
204    /// If `etag` or `last_modified` is provided, the request will include
205    /// the appropriate conditional headers. Returns the previous policy unchanged
206    /// if 304 Not Modified is received.
207    pub async fn fetch_for_key_conditional(
208        &self,
209        key: &RobotsCacheKey,
210        etag: Option<&str>,
211        last_modified: Option<&str>,
212    ) -> RobotsPolicy {
213        let robots_url = key.robots_url();
214        let start = Instant::now();
215        
216        self.stats.total.fetch_add(1, Ordering::Relaxed);
217        debug!("Fetching robots.txt from {}", robots_url);
218
219        let result = self.do_fetch_conditional(&robots_url, etag, last_modified).await;
220        let elapsed = start.elapsed();
221        let elapsed_ms = elapsed.as_millis() as u64;
222        self.stats.fetch_time_ms.fetch_add(elapsed_ms, Ordering::Relaxed);
223        self.stats.update_min_max(elapsed_ms);
224
225        match result {
226            Ok((response, ttl)) => {
227                self.handle_response(response, ttl).await
228            }
229            Err(e) => {
230                self.handle_network_error(e)
231            }
232        }
233    }
234
235    /// Perform the HTTP fetch with optional conditional headers
236    async fn do_fetch_conditional(
237        &self,
238        url: &str,
239        etag: Option<&str>,
240        last_modified: Option<&str>,
241    ) -> Result<(Response, Duration), reqwest::Error> {
242        let mut request = self.client.get(url);
243        
244        // Add conditional headers if provided
245        if let Some(etag) = etag {
246            request = request.header("If-None-Match", etag);
247        }
248        if let Some(lm) = last_modified {
249            request = request.header("If-Modified-Since", lm);
250        }
251        
252        let response = request.send().await?;
253        
254        // Extract TTL from Cache-Control
255        let ttl = response
256            .headers()
257            .get("cache-control")
258            .and_then(|v| v.to_str().ok())
259            .and_then(parse_cache_control)
260            .unwrap_or(Duration::from_secs(self.config.cache_ttl_secs))
261            .min(MAX_CACHE_TTL);
262
263        Ok((response, ttl))
264    }
265
266    /// Handle successful HTTP response
267    async fn handle_response(&self, response: Response, ttl: Duration) -> RobotsPolicy {
268        let status = response.status();
269        let status_code = status.as_u16();
270
271        // Extract ETag and Last-Modified for future conditional requests
272        let etag = response
273            .headers()
274            .get("etag")
275            .and_then(|v| v.to_str().ok())
276            .map(|s| s.to_string());
277        let last_modified = response
278            .headers()
279            .get("last-modified")
280            .and_then(|v| v.to_str().ok())
281            .map(|s| s.to_string());
282
283        info!(
284            "Robots.txt response: {} ({})",
285            response.url(),
286            status_code
287        );
288
289        // RFC 9309: Distinguish Unavailable vs Unreachable
290        match status_code {
291            // 304 Not Modified - conditional GET hit
292            304 => {
293                self.stats.conditional_hits.fetch_add(1, Ordering::Relaxed);
294                debug!("Robots.txt not modified (304), use cached version");
295                // Return a special policy indicating cache should be reused
296                // Caller should handle this by extending the existing policy's TTL
297                RobotsPolicy::not_modified(ttl)
298            }
299            
300            // Success - parse the content
301            200..=299 => {
302                self.stats.success.fetch_add(1, Ordering::Relaxed);
303                self.parse_successful_response(response, ttl, etag, last_modified).await
304            }
305            
306            // Unavailable (most 4xx) - allow everything
307            400 | 404 | 405 | 410 | 451 => {
308                self.stats.unavailable.fetch_add(1, Ordering::Relaxed);
309                debug!("Robots.txt unavailable ({}), allowing all", status_code);
310                RobotsPolicy::unavailable(status_code, ttl)
311            }
312            
313            // Protected (401/403) - in safe mode, deny; in strict RFC mode, allow
314            401 | 403 => {
315                if self.config.safe_mode {
316                    self.stats.protected.fetch_add(1, Ordering::Relaxed);
317                    warn!("Robots.txt protected ({}), denying in safe mode", status_code);
318                    RobotsPolicy::protected(status_code, ttl)
319                } else {
320                    self.stats.unavailable.fetch_add(1, Ordering::Relaxed);
321                    debug!("Robots.txt protected ({}), allowing per RFC", status_code);
322                    RobotsPolicy::unavailable(status_code, ttl)
323                }
324            }
325            
326            // Other 4xx - treat as unavailable
327            400..=499 => {
328                self.stats.unavailable.fetch_add(1, Ordering::Relaxed);
329                debug!("Robots.txt unavailable ({}), allowing all", status_code);
330                RobotsPolicy::unavailable(status_code, ttl)
331            }
332            
333            // 5xx - Unreachable, deny all
334            500..=599 => {
335                self.stats.unreachable.fetch_add(1, Ordering::Relaxed);
336                warn!("Robots.txt unreachable ({}), denying all", status_code);
337                RobotsPolicy::unreachable(
338                    format!("Server error: {}", status_code),
339                    ttl,
340                )
341            }
342            
343            // Other status codes - treat as unreachable
344            _ => {
345                self.stats.unreachable.fetch_add(1, Ordering::Relaxed);
346                warn!("Unexpected robots.txt status ({}), denying all", status_code);
347                RobotsPolicy::unreachable(
348                    format!("Unexpected status: {}", status_code),
349                    ttl,
350                )
351            }
352        }
353    }
354
355    /// Parse successful response content
356    async fn parse_successful_response(
357        &self,
358        response: Response,
359        ttl: Duration,
360        etag: Option<String>,
361        last_modified: Option<String>,
362    ) -> RobotsPolicy {
363        // Check content type
364        let content_type = response
365            .headers()
366            .get("content-type")
367            .and_then(|v| v.to_str().ok())
368            .unwrap_or("text/plain");
369
370        if !content_type.contains("text/plain") && !content_type.contains("text/html") {
371            warn!(
372                "Unexpected Content-Type for robots.txt: {}, treating as empty",
373                content_type
374            );
375            return RobotsPolicy::unavailable(200, ttl);
376        }
377
378        // Get content
379        match response.text().await {
380            Ok(content) => {
381                let bytes = content.len();
382                self.stats.bytes_fetched.fetch_add(bytes as u64, Ordering::Relaxed);
383                
384                // Validate UTF-8 (should be, but check anyway)
385                if content.is_empty() {
386                    debug!("Empty robots.txt, allowing all");
387                    return RobotsPolicy::unavailable(200, ttl);
388                }
389
390                // Parse content and set conditional headers
391                let mut policy = self.parser.parse(&content, ttl);
392                policy.etag = etag;
393                policy.last_modified = last_modified;
394                policy
395            }
396            Err(e) => {
397                warn!("Failed to read robots.txt body: {}", e);
398                RobotsPolicy::unreachable(
399                    format!("Body read error: {}", e),
400                    DEFAULT_CACHE_TTL,
401                )
402            }
403        }
404    }
405
406    /// Handle network error
407    fn handle_network_error(&self, error: reqwest::Error) -> RobotsPolicy {
408        self.stats.unreachable.fetch_add(1, Ordering::Relaxed);
409        
410        let reason = if error.is_timeout() {
411            "Network timeout".to_string()
412        } else if error.is_connect() {
413            "Connection failed".to_string()
414        } else if error.is_redirect() {
415            "Too many redirects".to_string()
416        } else {
417            format!("Network error: {}", error)
418        };
419
420        warn!("Robots.txt fetch failed: {}", reason);
421        RobotsPolicy::unreachable(reason, DEFAULT_CACHE_TTL)
422    }
423}
424
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh `FetchStats` must snapshot to all-zero counters.
    #[test]
    fn test_fetch_stats_default() {
        let snapshot = FetchStats::default().snapshot();
        assert_eq!(snapshot.total, 0);
        assert_eq!(snapshot.success, 0);
    }

    /// Constructing a fetcher from the default config must not panic.
    #[tokio::test]
    async fn test_fetcher_creation() {
        let _fetcher = RobotsFetcher::new(RobotsConfig::default());
    }

    // Note: Integration tests with real HTTP would go in tests/ directory
    // or use wiremock for mocking
}