halldyll_robots/checker.rs
1//! Main robots.txt checker with caching and fetching.
2//!
3//! The `RobotsChecker` is the primary entry point for checking URLs against
4//! robots.txt policies. It handles fetching, caching, parsing, and matching.
5
6use std::collections::HashSet;
7use std::time::Duration;
8
9use tracing::debug;
10use url::Url;
11
12use crate::cache::{CacheStatsSnapshot, RobotsCache};
13use crate::fetcher::{FetchStatsSnapshot, RobotsFetcher};
14use crate::matcher::RobotsMatcher;
15use crate::types::{
16 Decision, DecisionReason, EffectiveRules, FetchStatus, RobotsCacheKey,
17 RobotsConfig, RobotsPolicy, Rule,
18};
19
20// ============================================================================
21// RobotsChecker
22// ============================================================================
23
24/// Main robots.txt checker with caching and fetching.
25///
26/// This is the primary entry point for checking URLs against robots.txt.
27/// It combines fetching, caching, parsing, and matching into a single API.
28///
29/// # Example
30///
31/// ```rust,no_run
32/// use halldyll_robots::{RobotsChecker, RobotsConfig};
33/// use url::Url;
34///
35/// #[tokio::main]
36/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
37/// let config = RobotsConfig::default();
38/// let checker = RobotsChecker::new(config);
39///
40/// let url = Url::parse("https://example.com/some/path")?;
41/// let decision = checker.is_allowed(&url).await;
42///
43/// if decision.allowed {
44/// println!("URL is allowed");
45/// } else {
46/// println!("URL is blocked: {:?}", decision.reason);
47/// }
48///
49/// Ok(())
50/// }
51/// ```
pub struct RobotsChecker {
    /// Behavior flags and tunables (user-agent, cache TTL, respect_robots, ...).
    config: RobotsConfig,
    /// Retrieves robots.txt over the network (plain and conditional GET).
    fetcher: RobotsFetcher,
    /// TTL-based policy cache, optionally persisted to disk.
    cache: RobotsCache,
    /// Matches URL paths against the parsed robots.txt rules.
    matcher: RobotsMatcher,
}
58
59impl RobotsChecker {
60 /// Create a new robots.txt checker with the given configuration.
61 pub fn new(config: RobotsConfig) -> Self {
62 let matcher = RobotsMatcher::new(&config.user_agent);
63 let fetcher = RobotsFetcher::new(config.clone());
64 let cache = RobotsCache::new(Duration::from_secs(config.cache_ttl_secs));
65
66 Self {
67 config,
68 fetcher,
69 cache,
70 matcher,
71 }
72 }
73
74 /// Create a checker with file-based cache persistence.
75 ///
76 /// The cache will be saved to and loaded from the specified directory.
77 pub fn with_persistence(config: RobotsConfig, persist_dir: &str) -> Self {
78 let matcher = RobotsMatcher::new(&config.user_agent);
79 let fetcher = RobotsFetcher::new(config.clone());
80 let cache = RobotsCache::with_persistence(
81 Duration::from_secs(config.cache_ttl_secs),
82 persist_dir,
83 );
84
85 Self {
86 config,
87 fetcher,
88 cache,
89 matcher,
90 }
91 }
92
93 // ------------------------------------------------------------------------
94 // Core API
95 // ------------------------------------------------------------------------
96
97 /// Check if a URL is allowed by robots.txt.
98 ///
99 /// This is the main method for checking crawl permissions. It will:
100 /// 1. Extract the origin (scheme + authority) from the URL
101 /// 2. Check the cache for an existing policy
102 /// 3. Fetch robots.txt if not cached
103 /// 4. Match the URL path against the rules
104 ///
105 /// # Returns
106 ///
107 /// A `Decision` with `allowed` status and the reason for the decision.
108 pub async fn is_allowed(&self, url: &Url) -> Decision {
109 // Skip if robots.txt is disabled
110 if !self.config.respect_robots {
111 return Decision::allow(DecisionReason::RobotsDisabled);
112 }
113
114 // Get the policy (from cache or fetch)
115 let policy = self.get_policy(url).await;
116
117 // Build full path with query string
118 let full_path = Self::build_full_path(url);
119
120 self.matcher.is_allowed(&policy, &full_path)
121 }
122
123 /// Check if a path is allowed for a given origin.
124 ///
125 /// Use this when you already have the origin URL and want to check
126 /// multiple paths without re-fetching robots.txt.
127 pub async fn is_path_allowed(&self, origin: &Url, path: &str) -> Decision {
128 if !self.config.respect_robots {
129 return Decision::allow(DecisionReason::RobotsDisabled);
130 }
131
132 let policy = self.get_policy(origin).await;
133 self.matcher.is_allowed(&policy, path)
134 }
135
    /// Get the robots.txt policy for a URL (cached or fetched).
    ///
    /// This fetches and parses the robots.txt if not already cached,
    /// and returns the parsed policy. Uses conditional GET (If-None-Match,
    /// If-Modified-Since) for cache revalidation to save bandwidth.
    ///
    /// Resolution order:
    /// 1. Fresh cache entry -> returned as-is.
    /// 2. Expired entry that carries validators (ETag / Last-Modified)
    ///    -> conditional GET; a 304 response just extends the entry's TTL.
    /// 3. Otherwise -> full fetch, result inserted into the cache.
    pub async fn get_policy(&self, url: &Url) -> RobotsPolicy {
        let key = match RobotsCacheKey::from_url(url) {
            Some(k) => k,
            None => {
                // URL has no usable origin; report unreachable with a short
                // (60s) TTL. Note this policy is NOT inserted into the cache.
                return RobotsPolicy::unreachable(
                    "Invalid URL".to_string(),
                    Duration::from_secs(60),
                );
            }
        };

        // Check cache first
        if let Some(cached_policy) = self.cache.get(&key) {
            // If not expired, return cached
            if !cached_policy.is_expired() {
                return cached_policy;
            }

            // Expired: try conditional GET if we have ETag or Last-Modified
            if cached_policy.etag.is_some() || cached_policy.last_modified.is_some() {
                let new_policy = self.fetcher.fetch_for_key_conditional(
                    &key,
                    cached_policy.etag.as_deref(),
                    cached_policy.last_modified.as_deref(),
                ).await;

                // If 304 Not Modified, extend the cached policy's TTL.
                // The body is unchanged, so the previously parsed rules
                // stay valid; only the expiry clock is reset.
                if new_policy.fetch_status.is_not_modified() {
                    let mut refreshed = cached_policy.clone();
                    refreshed.extend_ttl(new_policy.ttl());
                    self.cache.insert(key, refreshed.clone());
                    return refreshed;
                }

                // Otherwise use the new policy
                self.cache.insert(key, new_policy.clone());
                return new_policy;
            }
        }

        // No cache or no conditional headers: full fetch
        let policy = self.fetcher.fetch_for_key(&key).await;
        self.cache.insert(key, policy.clone());
        policy
    }
186
187 // ------------------------------------------------------------------------
188 // Additional API
189 // ------------------------------------------------------------------------
190
191 /// Get crawl delay for a URL.
192 ///
193 /// Returns the effective delay between requests, using crawl-delay if set,
194 /// otherwise request-rate, otherwise `None`.
195 pub async fn get_crawl_delay(&self, url: &Url) -> Option<Duration> {
196 let policy = self.get_policy(url).await;
197 let effective = self.matcher.effective_rules(&policy);
198 effective.effective_delay()
199 }
200
201 /// Get request rate for a URL.
202 ///
203 /// Returns the request-rate directive if specified in robots.txt.
204 pub async fn get_request_rate(&self, url: &Url) -> Option<crate::types::RequestRate> {
205 let policy = self.get_policy(url).await;
206 let effective = self.matcher.effective_rules(&policy);
207 effective.request_rate
208 }
209
210 /// Get sitemaps listed in robots.txt.
211 ///
212 /// Returns all Sitemap directives found in the robots.txt.
213 pub async fn get_sitemaps(&self, url: &Url) -> Vec<String> {
214 let policy = self.get_policy(url).await;
215 policy.sitemaps
216 }
217
218 /// Get effective rules for a URL.
219 ///
220 /// Returns the merged rules that apply to this user-agent.
221 pub async fn get_effective_rules(&self, url: &Url) -> EffectiveRules {
222 let policy = self.get_policy(url).await;
223 self.matcher.effective_rules(&policy)
224 }
225
226 /// Get detailed diagnostics for a URL check.
227 ///
228 /// Useful for debugging why a URL was allowed or blocked.
229 pub async fn diagnose(&self, url: &Url) -> RobotsDiagnostics {
230 let policy = self.get_policy(url).await;
231 let effective = self.matcher.effective_rules(&policy);
232 let full_path = Self::build_full_path(url);
233 let decision = self.matcher.is_allowed(&policy, &full_path);
234
235 RobotsDiagnostics {
236 url: url.to_string(),
237 robots_url: RobotsCacheKey::from_url(url)
238 .map(|k| k.robots_url())
239 .unwrap_or_default(),
240 fetch_status: policy.fetch_status.clone(),
241 groups_count: policy.groups.len(),
242 matched_agents: effective.matched_agents,
243 applicable_rules: effective.rules,
244 decision,
245 }
246 }
247
248 // ------------------------------------------------------------------------
249 // Cache Management
250 // ------------------------------------------------------------------------
251
    /// Get cache statistics.
    ///
    /// Returns a point-in-time snapshot of the cache's statistics; the
    /// snapshot does not update after this call.
    pub fn cache_stats(&self) -> CacheStatsSnapshot {
        self.cache.stats().snapshot()
    }
256
    /// Get fetch statistics.
    ///
    /// Returns a point-in-time snapshot of the fetcher's statistics; the
    /// snapshot does not update after this call.
    pub fn fetch_stats(&self) -> FetchStatsSnapshot {
        self.fetcher.stats().snapshot()
    }
261
    /// Clear all cached policies.
    ///
    /// Subsequent checks will re-fetch robots.txt for every origin.
    pub fn clear_cache(&self) {
        self.cache.clear();
    }
266
    /// Evict expired cache entries.
    ///
    /// Returns the number of entries evicted. Note that expired entries
    /// may still be useful for conditional revalidation in `get_policy`,
    /// so eviction trades bandwidth for memory.
    pub fn evict_expired(&self) -> usize {
        self.cache.evict_expired()
    }
273
    /// Save cache to disk (if persistence enabled).
    ///
    /// Delegates to the underlying cache; the returned `usize` is the
    /// count reported by the cache (presumably entries written — see
    /// `RobotsCache::save_to_disk`). I/O errors are propagated.
    pub async fn save_cache(&self) -> std::io::Result<usize> {
        self.cache.save_to_disk().await
    }
278
    /// Load cache from disk (if persistence enabled).
    ///
    /// Delegates to the underlying cache; the returned `usize` is the
    /// count reported by the cache (presumably entries loaded — see
    /// `RobotsCache::load_from_disk`). I/O errors are propagated.
    pub async fn load_cache(&self) -> std::io::Result<usize> {
        self.cache.load_from_disk().await
    }
283
    /// Get list of cached domains.
    ///
    /// Returns an owned snapshot of the domains currently present in the
    /// cache (ordering is whatever the cache reports).
    pub fn cached_domains(&self) -> Vec<String> {
        self.cache.domains()
    }
288
289 // ------------------------------------------------------------------------
290 // Configuration
291 // ------------------------------------------------------------------------
292
    /// Get the configuration.
    ///
    /// Read-only view; this type exposes no way to change the
    /// configuration after construction.
    pub fn config(&self) -> &RobotsConfig {
        &self.config
    }
297
298 // ------------------------------------------------------------------------
299 // Bulk Operations
300 // ------------------------------------------------------------------------
301
302 /// Prefetch robots.txt for a list of URLs.
303 ///
304 /// This is useful for warming the cache before crawling.
305 pub async fn prefetch(&self, urls: &[Url]) {
306 let mut seen = HashSet::new();
307
308 for url in urls {
309 if let Some(key) = RobotsCacheKey::from_url(url) {
310 if seen.insert(key.clone()) && self.cache.get(&key).is_none() {
311 debug!("Prefetching robots.txt for {}", key.authority);
312 let policy = self.fetcher.fetch_for_key(&key).await;
313 self.cache.insert(key, policy);
314 }
315 }
316 }
317 }
318
319 // ------------------------------------------------------------------------
320 // Helpers
321 // ------------------------------------------------------------------------
322
323 fn build_full_path(url: &Url) -> String {
324 let path = url.path();
325 match url.query() {
326 Some(q) => format!("{}?{}", path, q),
327 None => path.to_string(),
328 }
329 }
330}
331
332// ============================================================================
333// RobotsDiagnostics
334// ============================================================================
335
/// Diagnostics for debugging robots.txt decisions.
///
/// Produced by `RobotsChecker::diagnose`; bundles the inputs, the
/// intermediate matching state, and the final decision for one URL.
#[derive(Debug, Clone)]
pub struct RobotsDiagnostics {
    /// The URL being checked.
    pub url: String,
    /// The robots.txt URL derived from the checked URL's origin
    /// (empty string when no cache key could be built from the URL).
    pub robots_url: String,
    /// Fetch status of robots.txt.
    pub fetch_status: FetchStatus,
    /// Number of groups in the policy.
    pub groups_count: usize,
    /// User-agents that matched.
    pub matched_agents: Vec<String>,
    /// Rules that apply to this user-agent.
    pub applicable_rules: Vec<Rule>,
    /// Final decision.
    pub decision: Decision,
}
354
355// ============================================================================
356// Tests
357// ============================================================================
358
#[cfg(test)]
mod tests {
    use super::*;

    // Sanity-check the safety-oriented defaults of RobotsConfig.
    #[test]
    fn test_config_default() {
        let config = RobotsConfig::default();
        assert!(config.respect_robots);
        assert!(config.safe_mode);
        assert!(config.max_robots_size >= 500 * 1024);
    }

    // Construction alone should succeed without performing any I/O.
    #[tokio::test]
    async fn test_checker_creation() {
        let config = RobotsConfig::default();
        let _checker = RobotsChecker::new(config);
    }

    // With respect_robots disabled, every URL is allowed and is_allowed
    // returns before any policy lookup or fetch.
    #[tokio::test]
    async fn test_disabled_robots() {
        let config = RobotsConfig {
            respect_robots: false,
            ..Default::default()
        };
        let checker = RobotsChecker::new(config);

        let url = Url::parse("https://example.com/blocked").unwrap();
        let decision = checker.is_allowed(&url).await;

        assert!(decision.allowed);
        assert_eq!(decision.reason, DecisionReason::RobotsDisabled);
    }

    // Path and query must be joined with '?'; bare paths pass through.
    #[test]
    fn test_build_full_path() {
        let url = Url::parse("https://example.com/path?query=1").unwrap();
        assert_eq!(RobotsChecker::build_full_path(&url), "/path?query=1");

        let url = Url::parse("https://example.com/path").unwrap();
        assert_eq!(RobotsChecker::build_full_path(&url), "/path");
    }
}