halldyll_robots/
checker.rs

1//! Main robots.txt checker with caching and fetching.
2//!
3//! The `RobotsChecker` is the primary entry point for checking URLs against
4//! robots.txt policies. It handles fetching, caching, parsing, and matching.
5
6use std::collections::HashSet;
7use std::time::Duration;
8
9use tracing::debug;
10use url::Url;
11
12use crate::cache::{CacheStatsSnapshot, RobotsCache};
13use crate::fetcher::{FetchStatsSnapshot, RobotsFetcher};
14use crate::matcher::RobotsMatcher;
15use crate::types::{
16    Decision, DecisionReason, EffectiveRules, FetchStatus, RobotsCacheKey,
17    RobotsConfig, RobotsPolicy, Rule,
18};
19
20// ============================================================================
21// RobotsChecker
22// ============================================================================
23
24/// Main robots.txt checker with caching and fetching.
25///
26/// This is the primary entry point for checking URLs against robots.txt.
27/// It combines fetching, caching, parsing, and matching into a single API.
28///
29/// # Example
30///
31/// ```rust,no_run
32/// use halldyll_robots::{RobotsChecker, RobotsConfig};
33/// use url::Url;
34///
35/// #[tokio::main]
36/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
37///     let config = RobotsConfig::default();
38///     let checker = RobotsChecker::new(config);
39///     
40///     let url = Url::parse("https://example.com/some/path")?;
41///     let decision = checker.is_allowed(&url).await;
42///     
43///     if decision.allowed {
44///         println!("URL is allowed");
45///     } else {
46///         println!("URL is blocked: {:?}", decision.reason);
47///     }
48///     
49///     Ok(())
50/// }
51/// ```
pub struct RobotsChecker {
    // Behavior settings (user-agent string, TTL, respect_robots flag, …).
    config: RobotsConfig,
    // Performs HTTP fetches of robots.txt, including conditional GETs.
    fetcher: RobotsFetcher,
    // Per-origin policy cache with TTL expiry; optionally disk-persisted.
    cache: RobotsCache,
    // Matches URL paths against parsed rules for the configured user-agent.
    matcher: RobotsMatcher,
}
58
59impl RobotsChecker {
60    /// Create a new robots.txt checker with the given configuration.
61    pub fn new(config: RobotsConfig) -> Self {
62        let matcher = RobotsMatcher::new(&config.user_agent);
63        let fetcher = RobotsFetcher::new(config.clone());
64        let cache = RobotsCache::new(Duration::from_secs(config.cache_ttl_secs));
65
66        Self {
67            config,
68            fetcher,
69            cache,
70            matcher,
71        }
72    }
73
74    /// Create a checker with file-based cache persistence.
75    ///
76    /// The cache will be saved to and loaded from the specified directory.
77    pub fn with_persistence(config: RobotsConfig, persist_dir: &str) -> Self {
78        let matcher = RobotsMatcher::new(&config.user_agent);
79        let fetcher = RobotsFetcher::new(config.clone());
80        let cache = RobotsCache::with_persistence(
81            Duration::from_secs(config.cache_ttl_secs),
82            persist_dir,
83        );
84
85        Self {
86            config,
87            fetcher,
88            cache,
89            matcher,
90        }
91    }
92
93    // ------------------------------------------------------------------------
94    // Core API
95    // ------------------------------------------------------------------------
96
97    /// Check if a URL is allowed by robots.txt.
98    ///
99    /// This is the main method for checking crawl permissions. It will:
100    /// 1. Extract the origin (scheme + authority) from the URL
101    /// 2. Check the cache for an existing policy
102    /// 3. Fetch robots.txt if not cached
103    /// 4. Match the URL path against the rules
104    ///
105    /// # Returns
106    ///
107    /// A `Decision` with `allowed` status and the reason for the decision.
108    pub async fn is_allowed(&self, url: &Url) -> Decision {
109        // Skip if robots.txt is disabled
110        if !self.config.respect_robots {
111            return Decision::allow(DecisionReason::RobotsDisabled);
112        }
113
114        // Get the policy (from cache or fetch)
115        let policy = self.get_policy(url).await;
116
117        // Build full path with query string
118        let full_path = Self::build_full_path(url);
119
120        self.matcher.is_allowed(&policy, &full_path)
121    }
122
123    /// Check if a path is allowed for a given origin.
124    ///
125    /// Use this when you already have the origin URL and want to check
126    /// multiple paths without re-fetching robots.txt.
127    pub async fn is_path_allowed(&self, origin: &Url, path: &str) -> Decision {
128        if !self.config.respect_robots {
129            return Decision::allow(DecisionReason::RobotsDisabled);
130        }
131
132        let policy = self.get_policy(origin).await;
133        self.matcher.is_allowed(&policy, path)
134    }
135
    /// Get the robots.txt policy for a URL (cached or fetched).
    ///
    /// This fetches and parses the robots.txt if not already cached,
    /// and returns the parsed policy. Uses conditional GET (If-None-Match,
    /// If-Modified-Since) for cache revalidation to save bandwidth.
    pub async fn get_policy(&self, url: &Url) -> RobotsPolicy {
        // Derive the cache key (origin) from the URL. URLs that cannot be
        // keyed have no robots.txt location, so treat them as unreachable
        // for a short (60-second) window.
        let key = match RobotsCacheKey::from_url(url) {
            Some(k) => k,
            None => {
                return RobotsPolicy::unreachable(
                    "Invalid URL".to_string(),
                    Duration::from_secs(60),
                );
            }
        };

        // Check cache first
        if let Some(cached_policy) = self.cache.get(&key) {
            // If not expired, return cached
            if !cached_policy.is_expired() {
                return cached_policy;
            }

            // Expired: try conditional GET if we have ETag or Last-Modified
            if cached_policy.etag.is_some() || cached_policy.last_modified.is_some() {
                let new_policy = self.fetcher.fetch_for_key_conditional(
                    &key,
                    cached_policy.etag.as_deref(),
                    cached_policy.last_modified.as_deref(),
                ).await;

                // If 304 Not Modified, extend the cached policy's TTL
                if new_policy.fetch_status.is_not_modified() {
                    let mut refreshed = cached_policy.clone();
                    refreshed.extend_ttl(new_policy.ttl());
                    self.cache.insert(key, refreshed.clone());
                    return refreshed;
                }

                // Otherwise use the new policy
                // NOTE(review): this also replaces the cached rules when the
                // conditional fetch fails (e.g. network error), not only when
                // a changed robots.txt is returned — confirm that
                // `fetch_for_key_conditional` yields a sensible fallback
                // policy on failure, otherwise stale-but-valid rules are
                // discarded here.
                self.cache.insert(key, new_policy.clone());
                return new_policy;
            }
        }

        // No cache or no conditional headers: full fetch
        let policy = self.fetcher.fetch_for_key(&key).await;
        self.cache.insert(key, policy.clone());
        policy
    }
186
187    // ------------------------------------------------------------------------
188    // Additional API
189    // ------------------------------------------------------------------------
190
191    /// Get crawl delay for a URL.
192    ///
193    /// Returns the effective delay between requests, using crawl-delay if set,
194    /// otherwise request-rate, otherwise `None`.
195    pub async fn get_crawl_delay(&self, url: &Url) -> Option<Duration> {
196        let policy = self.get_policy(url).await;
197        let effective = self.matcher.effective_rules(&policy);
198        effective.effective_delay()
199    }
200
201    /// Get request rate for a URL.
202    ///
203    /// Returns the request-rate directive if specified in robots.txt.
204    pub async fn get_request_rate(&self, url: &Url) -> Option<crate::types::RequestRate> {
205        let policy = self.get_policy(url).await;
206        let effective = self.matcher.effective_rules(&policy);
207        effective.request_rate
208    }
209
210    /// Get sitemaps listed in robots.txt.
211    ///
212    /// Returns all Sitemap directives found in the robots.txt.
213    pub async fn get_sitemaps(&self, url: &Url) -> Vec<String> {
214        let policy = self.get_policy(url).await;
215        policy.sitemaps
216    }
217
218    /// Get effective rules for a URL.
219    ///
220    /// Returns the merged rules that apply to this user-agent.
221    pub async fn get_effective_rules(&self, url: &Url) -> EffectiveRules {
222        let policy = self.get_policy(url).await;
223        self.matcher.effective_rules(&policy)
224    }
225
226    /// Get detailed diagnostics for a URL check.
227    ///
228    /// Useful for debugging why a URL was allowed or blocked.
229    pub async fn diagnose(&self, url: &Url) -> RobotsDiagnostics {
230        let policy = self.get_policy(url).await;
231        let effective = self.matcher.effective_rules(&policy);
232        let full_path = Self::build_full_path(url);
233        let decision = self.matcher.is_allowed(&policy, &full_path);
234
235        RobotsDiagnostics {
236            url: url.to_string(),
237            robots_url: RobotsCacheKey::from_url(url)
238                .map(|k| k.robots_url())
239                .unwrap_or_default(),
240            fetch_status: policy.fetch_status.clone(),
241            groups_count: policy.groups.len(),
242            matched_agents: effective.matched_agents,
243            applicable_rules: effective.rules,
244            decision,
245        }
246    }
247
248    // ------------------------------------------------------------------------
249    // Cache Management
250    // ------------------------------------------------------------------------
251
252    /// Get cache statistics.
253    pub fn cache_stats(&self) -> CacheStatsSnapshot {
254        self.cache.stats().snapshot()
255    }
256
257    /// Get fetch statistics.
258    pub fn fetch_stats(&self) -> FetchStatsSnapshot {
259        self.fetcher.stats().snapshot()
260    }
261
262    /// Clear all cached policies.
263    pub fn clear_cache(&self) {
264        self.cache.clear();
265    }
266
267    /// Evict expired cache entries.
268    ///
269    /// Returns the number of entries evicted.
270    pub fn evict_expired(&self) -> usize {
271        self.cache.evict_expired()
272    }
273
274    /// Save cache to disk (if persistence enabled).
275    pub async fn save_cache(&self) -> std::io::Result<usize> {
276        self.cache.save_to_disk().await
277    }
278
279    /// Load cache from disk (if persistence enabled).
280    pub async fn load_cache(&self) -> std::io::Result<usize> {
281        self.cache.load_from_disk().await
282    }
283
284    /// Get list of cached domains.
285    pub fn cached_domains(&self) -> Vec<String> {
286        self.cache.domains()
287    }
288
289    // ------------------------------------------------------------------------
290    // Configuration
291    // ------------------------------------------------------------------------
292
293    /// Get the configuration.
294    pub fn config(&self) -> &RobotsConfig {
295        &self.config
296    }
297
298    // ------------------------------------------------------------------------
299    // Bulk Operations
300    // ------------------------------------------------------------------------
301
302    /// Prefetch robots.txt for a list of URLs.
303    ///
304    /// This is useful for warming the cache before crawling.
305    pub async fn prefetch(&self, urls: &[Url]) {
306        let mut seen = HashSet::new();
307
308        for url in urls {
309            if let Some(key) = RobotsCacheKey::from_url(url) {
310                if seen.insert(key.clone()) && self.cache.get(&key).is_none() {
311                    debug!("Prefetching robots.txt for {}", key.authority);
312                    let policy = self.fetcher.fetch_for_key(&key).await;
313                    self.cache.insert(key, policy);
314                }
315            }
316        }
317    }
318
319    // ------------------------------------------------------------------------
320    // Helpers
321    // ------------------------------------------------------------------------
322
323    fn build_full_path(url: &Url) -> String {
324        let path = url.path();
325        match url.query() {
326            Some(q) => format!("{}?{}", path, q),
327            None => path.to_string(),
328        }
329    }
330}
331
332// ============================================================================
333// RobotsDiagnostics
334// ============================================================================
335
/// Diagnostics for debugging robots.txt decisions.
///
/// Produced by [`RobotsChecker::diagnose`]; bundles everything that went
/// into a single allow/block decision so it can be inspected or logged.
#[derive(Debug, Clone)]
pub struct RobotsDiagnostics {
    /// The URL being checked.
    pub url: String,
    /// The robots.txt URL (empty when no cache key could be derived).
    pub robots_url: String,
    /// Fetch status of robots.txt.
    pub fetch_status: FetchStatus,
    /// Number of groups in the policy.
    pub groups_count: usize,
    /// User-agents that matched.
    pub matched_agents: Vec<String>,
    /// Rules that apply to this user-agent.
    pub applicable_rules: Vec<Rule>,
    /// Final decision.
    pub decision: Decision,
}
354
355// ============================================================================
356// Tests
357// ============================================================================
358
#[cfg(test)]
mod tests {
    use super::*;

    /// Default configuration must be conservative: robots respected,
    /// safe mode on, and a generous robots.txt size limit.
    #[test]
    fn test_config_default() {
        let defaults = RobotsConfig::default();
        assert!(defaults.respect_robots);
        assert!(defaults.safe_mode);
        assert!(defaults.max_robots_size >= 500 * 1024);
    }

    /// Constructing a checker from defaults must not panic.
    #[tokio::test]
    async fn test_checker_creation() {
        let _checker = RobotsChecker::new(RobotsConfig::default());
    }

    /// With `respect_robots` off, every URL is allowed without fetching.
    #[tokio::test]
    async fn test_disabled_robots() {
        let checker = RobotsChecker::new(RobotsConfig {
            respect_robots: false,
            ..Default::default()
        });

        let blocked = Url::parse("https://example.com/blocked").unwrap();
        let decision = checker.is_allowed(&blocked).await;

        assert!(decision.allowed);
        assert_eq!(decision.reason, DecisionReason::RobotsDisabled);
    }

    /// Query strings are appended to the path; bare paths pass through.
    #[test]
    fn test_build_full_path() {
        let with_query = Url::parse("https://example.com/path?query=1").unwrap();
        assert_eq!(RobotsChecker::build_full_path(&with_query), "/path?query=1");

        let bare = Url::parse("https://example.com/path").unwrap();
        assert_eq!(RobotsChecker::build_full_path(&bare), "/path");
    }
}