// halldyll_core/politeness/robots.rs
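//! Robots.txt politeness support: an in-memory, TTL-based cache of fetched
//! robots.txt bodies (`RobotsCache`), a checker that evaluates allow/deny
//! rules and crawl delays for a configured user agent (`RobotsChecker`),
//! and a helper for pulling `Sitemap:` URLs out of a robots.txt body
//! (`extract_sitemaps`).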
use std::collections::HashMap;
use std::sync::RwLock;
use std::time::{Duration, Instant};
use texting_robots::Robot;
use url::Url;

/// In-memory cache of robots.txt bodies, keyed per site and expired by TTL.
pub struct RobotsCache {
    entries: RwLock<HashMap<String, RobotsEntry>>,
    ttl_secs: u64,
}

/// A cached robots.txt body together with its fetch time and parsed crawl delay.
struct RobotsEntry {
    content: String,
    cached_at: Instant,
    crawl_delay: Option<f64>,
}

impl Default for RobotsCache {
    fn default() -> Self {
        // One-hour TTL by default.
        Self::new(3600)
    }
}

impl RobotsCache {
    pub fn new(ttl_secs: u64) -> Self {
        Self {
            entries: RwLock::new(HashMap::new()),
            ttl_secs,
        }
    }

    /// Cache entries are keyed by scheme and host; the port and path are ignored.
    fn cache_key(url: &Url) -> String {
        format!("{}://{}", url.scheme(), url.host_str().unwrap_or(""))
    }

    /// Return the cached robots.txt body for this URL's site, if present and not expired.
    pub fn get(&self, url: &Url) -> Option<String> {
        let key = Self::cache_key(url);
        let entries = self.entries.read().unwrap();

        entries.get(&key).and_then(|entry| {
            if entry.cached_at.elapsed() < Duration::from_secs(self.ttl_secs) {
                Some(entry.content.clone())
            } else {
                None
            }
        })
    }

    /// Return the cached crawl delay in seconds for this URL's site, if present and not expired.
    pub fn get_crawl_delay(&self, url: &Url) -> Option<f64> {
        let key = Self::cache_key(url);
        let entries = self.entries.read().unwrap();

        entries.get(&key).and_then(|entry| {
            if entry.cached_at.elapsed() < Duration::from_secs(self.ttl_secs) {
                entry.crawl_delay
            } else {
                None
            }
        })
    }

    /// Insert or overwrite the cache entry for this URL's site.
    pub fn set(&self, url: &Url, content: String, crawl_delay: Option<f64>) {
        let key = Self::cache_key(url);
        let entry = RobotsEntry {
            content,
            cached_at: Instant::now(),
            crawl_delay,
        };

        self.entries.write().unwrap().insert(key, entry);
    }

    pub fn remove(&self, url: &Url) {
        let key = Self::cache_key(url);
        self.entries.write().unwrap().remove(&key);
    }

    pub fn clear(&self) {
        self.entries.write().unwrap().clear();
    }

    /// Number of cached entries; expired entries still count until overwritten or cleared.
    pub fn len(&self) -> usize {
        self.entries.read().unwrap().len()
    }

    pub fn is_empty(&self) -> bool {
        self.entries.read().unwrap().is_empty()
    }
}

/// Checks URLs against robots.txt rules for a single user agent, backed by a `RobotsCache`.
pub struct RobotsChecker {
    user_agent: String,
    cache: RobotsCache,
    enabled: bool,
}

impl RobotsChecker {
    pub fn new(user_agent: &str, cache_ttl_secs: u64) -> Self {
        Self {
            user_agent: user_agent.to_string(),
            cache: RobotsCache::new(cache_ttl_secs),
            enabled: true,
        }
    }

    pub fn set_enabled(&mut self, enabled: bool) {
        self.enabled = enabled;
    }

    pub fn is_enabled(&self) -> bool {
        self.enabled
    }

    /// Build the robots.txt URL for this URL's host (path, query, and explicit port are dropped).
    pub fn robots_url(url: &Url) -> Option<Url> {
        let base = format!("{}://{}/robots.txt", url.scheme(), url.host_str()?);
        Url::parse(&base).ok()
    }

    /// Store a fetched robots.txt body in the cache, parsing its crawl delay up front.
    pub fn cache_robots(&self, url: &Url, content: &str) {
        let crawl_delay = self.parse_crawl_delay(content);
        self.cache.set(url, content.to_string(), crawl_delay);
    }

    /// Parse the `Crawl-delay` value that applies to this checker's user agent,
    /// preferring a group that names the agent over the wildcard (`*`) group.
    fn parse_crawl_delay(&self, content: &str) -> Option<f64> {
        let ua_lower = self.user_agent.to_lowercase();
        let mut in_wildcard_section = false;
        let mut in_matching_section = false;
        let mut found_delay: Option<f64> = None;
        let mut wildcard_delay: Option<f64> = None;

        for line in content.lines() {
            let line = line.trim();

            if line.starts_with('#') || line.is_empty() {
                continue;
            }

            let parts: Vec<&str> = line.splitn(2, ':').collect();
            if parts.len() != 2 {
                continue;
            }

            let directive = parts[0].trim().to_lowercase();
            let value = parts[1].trim();

            if directive == "user-agent" {
                let ua = value.to_lowercase();
                in_wildcard_section = ua == "*";
                in_matching_section =
                    !in_wildcard_section && (ua_lower.contains(&ua) || ua.contains(&ua_lower));
            } else if directive == "crawl-delay" && (in_matching_section || in_wildcard_section) {
                if let Ok(delay) = value.parse::<f64>() {
                    // Reject negative or non-finite delays so Duration::from_secs_f64 cannot panic.
                    if delay.is_finite() && delay >= 0.0 {
                        if in_matching_section {
                            found_delay = Some(delay);
                        } else {
                            wildcard_delay = Some(delay);
                        }
                    }
                }
            }
        }

        found_delay.or(wildcard_delay)
    }

    /// Check whether `url` may be fetched. Fails open: if checking is disabled,
    /// no robots.txt is available, or the file cannot be parsed, the URL is allowed.
    pub fn is_allowed(&self, url: &Url, robots_content: Option<&str>) -> bool {
        if !self.enabled {
            return true;
        }

        // Prefer an explicitly supplied robots.txt body, then fall back to the cache.
        let content = match robots_content {
            Some(c) => c.to_string(),
            None => match self.cache.get(url) {
                Some(c) => c,
                None => return true,
            },
        };

        let robot = match Robot::new(&self.user_agent, content.as_bytes()) {
            Ok(r) => r,
            Err(_) => return true,
        };

        // Pass the full URL so query-string patterns in Disallow rules are honoured.
        robot.allowed(url.as_str())
    }

    /// The crawl delay cached for this URL's site, if any.
    pub fn get_crawl_delay(&self, url: &Url) -> Option<Duration> {
        if !self.enabled {
            return None;
        }

        self.cache
            .get_crawl_delay(url)
            .map(Duration::from_secs_f64)
    }

    pub fn cache(&self) -> &RobotsCache {
        &self.cache
    }
}

/// Collect every `Sitemap:` URL listed in a robots.txt body.
pub fn extract_sitemaps(robots_content: &str) -> Vec<String> {
    robots_content
        .lines()
        .filter_map(|line| {
            let line = line.trim();
            // The directive name is case-insensitive; "sitemap:" is 8 bytes long.
            if line.to_lowercase().starts_with("sitemap:") {
                Some(line[8..].trim().to_string())
            } else {
                None
            }
        })
        .collect()
}
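
// Illustrative tests: a minimal sketch of the expected behaviour of the types
// above. The robots.txt sample and the "HalldyllBot" user-agent string are
// invented here for demonstration; they are not taken from the crawler's real
// configuration.
#[cfg(test)]
mod tests {
    use super::*;

    const SAMPLE_ROBOTS: &str = concat!(
        "User-agent: *\n",
        "Crawl-delay: 10\n",
        "Disallow: /private/\n",
        "\n",
        "User-agent: HalldyllBot\n",
        "Crawl-delay: 2\n",
        "Disallow: /tmp/\n",
        "\n",
        "Sitemap: https://example.com/sitemap.xml\n",
    );

    #[test]
    fn cache_round_trip() {
        let cache = RobotsCache::new(3600);
        let url = Url::parse("https://example.com/some/page").unwrap();

        assert!(cache.is_empty());
        cache.set(&url, SAMPLE_ROBOTS.to_string(), Some(2.0));
        assert_eq!(cache.len(), 1);
        assert_eq!(cache.get(&url).as_deref(), Some(SAMPLE_ROBOTS));
        assert_eq!(cache.get_crawl_delay(&url), Some(2.0));

        cache.remove(&url);
        assert!(cache.is_empty());
    }

    #[test]
    fn specific_agent_delay_beats_wildcard() {
        let checker = RobotsChecker::new("HalldyllBot", 3600);
        let url = Url::parse("https://example.com/").unwrap();

        checker.cache_robots(&url, SAMPLE_ROBOTS);
        assert_eq!(checker.get_crawl_delay(&url), Some(Duration::from_secs(2)));
    }

    #[test]
    fn disallowed_path_is_rejected() {
        let checker = RobotsChecker::new("HalldyllBot", 3600);
        let blocked = Url::parse("https://example.com/tmp/file.html").unwrap();
        let open = Url::parse("https://example.com/index.html").unwrap();

        assert!(!checker.is_allowed(&blocked, Some(SAMPLE_ROBOTS)));
        assert!(checker.is_allowed(&open, Some(SAMPLE_ROBOTS)));
    }

    #[test]
    fn sitemaps_are_extracted() {
        let sitemaps = extract_sitemaps(SAMPLE_ROBOTS);
        assert_eq!(sitemaps, vec!["https://example.com/sitemap.xml".to_string()]);
    }
}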