halldyll_core/politeness/robots.rs

//! Robots - Parsing and respecting robots.txt (RFC 9309)

use std::collections::HashMap;
use std::sync::RwLock;
use std::time::{Duration, Instant};
use texting_robots::Robot;
use url::Url;

/// Robots.txt cache
pub struct RobotsCache {
    /// Cached entries
    entries: RwLock<HashMap<String, RobotsEntry>>,
    /// Cache TTL (seconds)
    ttl_secs: u64,
}

/// Cached robots.txt entry
struct RobotsEntry {
    /// Raw robots.txt content
    content: String,
    /// Caching timestamp
    cached_at: Instant,
    /// Crawl-delay in seconds
    crawl_delay: Option<f64>,
}

impl Default for RobotsCache {
    fn default() -> Self {
        Self::new(3600) // 1 hour by default
    }
}

impl RobotsCache {
    /// New cache with TTL
    pub fn new(ttl_secs: u64) -> Self {
        Self {
            entries: RwLock::new(HashMap::new()),
            ttl_secs,
        }
    }

    /// Cache key (scheme + host, plus port when non-default)
    fn cache_key(url: &Url) -> String {
        match url.port() {
            Some(port) => format!("{}://{}:{}", url.scheme(), url.host_str().unwrap_or(""), port),
            None => format!("{}://{}", url.scheme(), url.host_str().unwrap_or("")),
        }
    }

    /// Retrieves an entry (if not expired)
    pub fn get(&self, url: &Url) -> Option<String> {
        let key = Self::cache_key(url);
        let entries = self.entries.read().unwrap();

        entries.get(&key).and_then(|entry| {
            if entry.cached_at.elapsed() < Duration::from_secs(self.ttl_secs) {
                Some(entry.content.clone())
            } else {
                None
            }
        })
    }

    /// Retrieves the crawl-delay
    pub fn get_crawl_delay(&self, url: &Url) -> Option<f64> {
        let key = Self::cache_key(url);
        let entries = self.entries.read().unwrap();

        entries.get(&key).and_then(|entry| {
            if entry.cached_at.elapsed() < Duration::from_secs(self.ttl_secs) {
                entry.crawl_delay
            } else {
                None
            }
        })
    }

    /// Caches an entry
    pub fn set(&self, url: &Url, content: String, crawl_delay: Option<f64>) {
        let key = Self::cache_key(url);
        let entry = RobotsEntry {
            content,
            cached_at: Instant::now(),
            crawl_delay,
        };

        self.entries.write().unwrap().insert(key, entry);
    }

    /// Removes an entry
    pub fn remove(&self, url: &Url) {
        let key = Self::cache_key(url);
        self.entries.write().unwrap().remove(&key);
    }

    /// Clears the cache
    pub fn clear(&self) {
        self.entries.write().unwrap().clear();
    }

    /// Cache size
    pub fn len(&self) -> usize {
        self.entries.read().unwrap().len()
    }

    /// Is cache empty?
    pub fn is_empty(&self) -> bool {
        self.entries.read().unwrap().is_empty()
    }
}
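
// A minimal usage sketch for RobotsCache, added as an illustration and not part of the
// original module. It assumes a hypothetical site (https://example.com) and shows that an
// entry is served while fresh and dropped once the TTL has elapsed (ttl_secs = 0 makes
// every entry expire immediately).
#[cfg(test)]
mod robots_cache_tests {
    use super::*;

    #[test]
    fn fresh_entry_is_returned() {
        let cache = RobotsCache::new(3600);
        let url = Url::parse("https://example.com/some/page").unwrap();

        cache.set(&url, "User-agent: *\nDisallow: /private/".to_string(), Some(1.5));
        assert_eq!(cache.get(&url).as_deref(), Some("User-agent: *\nDisallow: /private/"));
        assert_eq!(cache.get_crawl_delay(&url), Some(1.5));
        assert_eq!(cache.len(), 1);
    }

    #[test]
    fn expired_entry_is_not_returned() {
        // TTL of zero: entries are considered stale as soon as they are stored
        let cache = RobotsCache::new(0);
        let url = Url::parse("https://example.com/").unwrap();

        cache.set(&url, "User-agent: *\nDisallow:".to_string(), None);
        assert_eq!(cache.get(&url), None);
        assert_eq!(cache.get_crawl_delay(&url), None);
    }
}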

/// Robots.txt checker
pub struct RobotsChecker {
    /// User-Agent to use
    user_agent: String,
    /// Robots.txt cache
    cache: RobotsCache,
    /// Respect robots.txt?
    enabled: bool,
}

impl RobotsChecker {
    /// New checker
    pub fn new(user_agent: &str, cache_ttl_secs: u64) -> Self {
        Self {
            user_agent: user_agent.to_string(),
            cache: RobotsCache::new(cache_ttl_secs),
            enabled: true,
        }
    }

    /// Enable/disable robots.txt respect
    pub fn set_enabled(&mut self, enabled: bool) {
        self.enabled = enabled;
    }

    /// Is robots.txt respected?
    pub fn is_enabled(&self) -> bool {
        self.enabled
    }

    /// URL of the robots.txt for a site (same scheme, host, and port)
    pub fn robots_url(url: &Url) -> Option<Url> {
        let host = url.host_str()?;
        let base = match url.port() {
            Some(port) => format!("{}://{}:{}/robots.txt", url.scheme(), host, port),
            None => format!("{}://{}/robots.txt", url.scheme(), host),
        };
        Url::parse(&base).ok()
    }

    /// Caches a robots.txt
    pub fn cache_robots(&self, url: &Url, content: &str) {
        // Extract the crawl-delay before storing
        let crawl_delay = self.parse_crawl_delay(content);
        self.cache.set(url, content.to_string(), crawl_delay);
    }

    /// Parses the crawl-delay from robots.txt
    fn parse_crawl_delay(&self, content: &str) -> Option<f64> {
        let ua_lower = self.user_agent.to_lowercase();
        let mut in_matching_section = false;
        let mut in_wildcard_section = false;
        let mut found_delay: Option<f64> = None;
        let mut wildcard_delay: Option<f64> = None;

        for line in content.lines() {
            let line = line.trim();

            // Ignore comments and blank lines
            if line.starts_with('#') || line.is_empty() {
                continue;
            }

            let parts: Vec<&str> = line.splitn(2, ':').collect();
            if parts.len() != 2 {
                continue;
            }

            let directive = parts[0].trim().to_lowercase();
            let value = parts[1].trim();

            if directive == "user-agent" {
                let ua = value.to_lowercase();
                in_wildcard_section = ua == "*";
                in_matching_section =
                    in_wildcard_section || ua_lower.contains(&ua) || ua.contains(&ua_lower);
            } else if directive == "crawl-delay" && in_matching_section {
                if let Ok(delay) = value.parse::<f64>() {
                    if in_wildcard_section {
                        wildcard_delay = Some(delay);
                    } else {
                        found_delay = Some(delay);
                    }
                }
            }
        }

        // Prefer a delay from the section matching our User-Agent over the `*` section
        found_delay.or(wildcard_delay)
    }

    /// Checks if a URL is allowed
    pub fn is_allowed(&self, url: &Url, robots_content: Option<&str>) -> bool {
        if !self.enabled {
            return true;
        }

        // Retrieve robots.txt content
        let content = match robots_content {
            Some(c) => c.to_string(),
            None => match self.cache.get(url) {
                Some(c) => c,
                None => return true, // No robots.txt = everything allowed
            },
        };

        // Parse with texting_robots
        let robot = match Robot::new(&self.user_agent, content.as_bytes()) {
            Ok(r) => r,
            Err(_) => return true, // Parsing error = allowed
        };

        // Check the full URL so that query-string rules are also honoured
        robot.allowed(url.as_str())
    }

    /// Retrieves the crawl-delay
    pub fn get_crawl_delay(&self, url: &Url) -> Option<Duration> {
        if !self.enabled {
            return None;
        }

        self.cache
            .get_crawl_delay(url)
            .map(Duration::from_secs_f64)
    }

    /// Access to the cache
    pub fn cache(&self) -> &RobotsCache {
        &self.cache
    }
}
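
// An illustrative sketch, not part of the original module: it exercises RobotsChecker end to
// end under an assumed User-Agent ("halldyll") and a hand-written robots.txt body that the
// caller would normally have fetched over HTTP before handing it to cache_robots.
#[cfg(test)]
mod robots_checker_tests {
    use super::*;

    #[test]
    fn disallow_and_crawl_delay_are_honoured() {
        let robots = "User-agent: *\nDisallow: /private/\nCrawl-delay: 2\n";
        let checker = RobotsChecker::new("halldyll", 3600);
        let blocked = Url::parse("https://example.com/private/page").unwrap();
        let open = Url::parse("https://example.com/public").unwrap();

        // The caller fetches robots.txt itself, then hands the body to the checker
        checker.cache_robots(&blocked, robots);

        assert!(!checker.is_allowed(&blocked, None));
        assert!(checker.is_allowed(&open, None));
        assert_eq!(checker.get_crawl_delay(&blocked), Some(Duration::from_secs(2)));
    }
}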

/// Extracts sitemap URLs from robots.txt
pub fn extract_sitemaps(robots_content: &str) -> Vec<String> {
    robots_content
        .lines()
        .filter_map(|line| {
            let line = line.trim();
            if line.to_lowercase().starts_with("sitemap:") {
                // Skip the "sitemap:" prefix (8 characters) and trim the URL
                Some(line[8..].trim().to_string())
            } else {
                None
            }
        })
        .collect()
}
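
// A small sketch (illustration only, not original code) of extract_sitemaps on a robots.txt
// body containing a single Sitemap directive for a hypothetical site.
#[cfg(test)]
mod sitemap_tests {
    use super::*;

    #[test]
    fn sitemap_urls_are_extracted() {
        let robots = "User-agent: *\nDisallow:\nSitemap: https://example.com/sitemap.xml\n";
        assert_eq!(
            extract_sitemaps(robots),
            vec!["https://example.com/sitemap.xml".to_string()]
        );
    }
}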