1use serde::{Deserialize, Serialize};
4use std::time::{Duration, SystemTime, UNIX_EPOCH};
5use url::Url;
6
/// Cache key identifying the origin a robots.txt applies to:
/// one entry per scheme + authority pair, both stored lowercased
/// (see `RobotsCacheKey::from_url`).
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct RobotsCacheKey {
    /// URL scheme, e.g. "https" (lowercased).
    pub scheme: String,
    /// Host, plus ":port" when the URL carries an explicit port.
    pub authority: String,
}
15
/// A `Request-rate` style limit: at most `requests` requests per
/// `seconds` seconds.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct RequestRate {
    /// Number of requests permitted per window.
    pub requests: u32,
    /// Window length in seconds.
    pub seconds: u32,
}
25
26impl RequestRate {
27 pub fn new(requests: u32, seconds: u32) -> Self {
29 Self { requests, seconds }
30 }
31
32 pub fn delay_seconds(&self) -> f64 {
34 if self.requests == 0 {
35 f64::MAX
36 } else {
37 self.seconds as f64 / self.requests as f64
38 }
39 }
40
41 pub fn delay(&self) -> Duration {
43 Duration::from_secs_f64(self.delay_seconds())
44 }
45}
46
47impl RobotsCacheKey {
48 pub fn from_url(url: &Url) -> Option<Self> {
50 let host = url.host_str()?;
51 let authority = match url.port() {
52 Some(port) => format!("{}:{}", host, port),
53 None => host.to_string(),
54 };
55 Some(Self {
56 scheme: url.scheme().to_lowercase(),
57 authority: authority.to_lowercase(),
58 })
59 }
60
61 pub fn robots_url(&self) -> String {
63 format!("{}://{}/robots.txt", self.scheme, self.authority)
64 }
65}
66
/// Outcome of an attempt to fetch a robots.txt file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum FetchStatus {
    /// The file was fetched successfully.
    Success,
    /// The server indicated the cached copy is still current
    /// (presumably an HTTP 304 — see `RobotsPolicy::not_modified`).
    NotModified,
    /// The server responded but could not serve robots.txt; treated
    /// as "allow everything" (see `allows_all`).
    Unavailable {
        status_code: u16,
    },
    /// The server could not be contacted at all; treated as
    /// "deny everything" (see `denies_all`).
    Unreachable {
        reason: String,
    },
    /// The server refused access to robots.txt; treated as
    /// "deny everything" (see `denies_all`).
    Protected {
        status_code: u16,
    },
}
90
91impl FetchStatus {
92 pub fn allows_all(&self) -> bool {
94 matches!(self, FetchStatus::Unavailable { .. })
95 }
96
97 pub fn denies_all(&self) -> bool {
99 matches!(self, FetchStatus::Unreachable { .. } | FetchStatus::Protected { .. })
100 }
101
102 pub fn is_not_modified(&self) -> bool {
104 matches!(self, FetchStatus::NotModified)
105 }
106}
107
/// A single Allow/Disallow path rule from a robots.txt group.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Rule {
    /// Whether matching paths are allowed or disallowed.
    pub kind: RuleKind,
    /// The raw path pattern as written in robots.txt.
    pub pattern: String,
    /// Cached compiled form of `pattern`; skipped by serde, so a
    /// deserialized rule holds `None` until `matches` recompiles on
    /// the fly.
    /// NOTE(review): this field participates in the derived
    /// `PartialEq`, so a freshly built rule and a deserialized one
    /// with identical kind/pattern compare unequal — confirm intended.
    #[serde(skip)]
    pub compiled: Option<CompiledPattern>,
}
119
120impl Rule {
121 pub fn new(kind: RuleKind, pattern: String) -> Self {
123 let compiled = CompiledPattern::compile(&pattern);
124 Self {
125 kind,
126 pattern,
127 compiled: Some(compiled),
128 }
129 }
130
131 pub fn matches(&self, path: &str) -> bool {
133 match &self.compiled {
134 Some(compiled) => compiled.matches(path),
135 None => CompiledPattern::compile(&self.pattern).matches(path),
136 }
137 }
138
139 pub fn specificity(&self) -> usize {
141 self.pattern.chars().filter(|&c| c != '*').count()
143 }
144}
145
/// Polarity of a robots.txt rule line.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RuleKind {
    /// An `Allow:` line.
    Allow,
    /// A `Disallow:` line.
    Disallow,
}
154
/// A robots.txt path pattern pre-parsed into literal and wildcard
/// segments so matching never re-scans the pattern text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CompiledPattern {
    /// Alternating literal/wildcard segments; consecutive `*`s are
    /// collapsed into one wildcard at compile time.
    segments: Vec<PatternSegment>,
    /// True when the pattern ended with `$`, anchoring the match to
    /// the end of the path.
    anchored_end: bool,
}

#[derive(Debug, Clone, PartialEq, Eq)]
enum PatternSegment {
    /// A run of characters that must match verbatim.
    Literal(String),
    /// `*`: matches any (possibly empty) run of characters.
    Wildcard,
}

impl CompiledPattern {
    /// Parses `pattern`. A trailing `$` is stripped and recorded as an
    /// end anchor; runs of `*` collapse into a single wildcard.
    pub fn compile(pattern: &str) -> Self {
        let anchored_end = pattern.ends_with('$');
        let pattern = if anchored_end {
            &pattern[..pattern.len() - 1]
        } else {
            pattern
        };

        let mut segments = Vec::new();
        let mut current = String::new();

        for c in pattern.chars() {
            if c == '*' {
                if !current.is_empty() {
                    segments.push(PatternSegment::Literal(std::mem::take(&mut current)));
                }
                // Collapse consecutive wildcards: "a**b" == "a*b".
                if !matches!(segments.last(), Some(PatternSegment::Wildcard)) {
                    segments.push(PatternSegment::Wildcard);
                }
            } else {
                current.push(c);
            }
        }

        if !current.is_empty() {
            segments.push(PatternSegment::Literal(current));
        }

        Self {
            segments,
            anchored_end,
        }
    }

    /// Returns true when `path` matches this pattern (prefix match
    /// unless the pattern was anchored with `$`).
    pub fn matches(&self, path: &str) -> bool {
        self.matches_recursive(path, 0)
    }

    /// Backtracking matcher: tries to match `remaining` against the
    /// segments starting at `segment_idx`.
    fn matches_recursive(&self, remaining: &str, segment_idx: usize) -> bool {
        // All segments consumed: an unanchored pattern is a prefix
        // match; an anchored one must have consumed the whole path.
        if segment_idx >= self.segments.len() {
            return !self.anchored_end || remaining.is_empty();
        }

        match &self.segments[segment_idx] {
            PatternSegment::Literal(lit) => match remaining.strip_prefix(lit.as_str()) {
                Some(rest) => self.matches_recursive(rest, segment_idx + 1),
                None => false,
            },
            PatternSegment::Wildcard => {
                // Fix 1: a trailing wildcard consumes whatever is left,
                // so it matches regardless of the end anchor. The old
                // code required the remainder to be empty when anchored,
                // wrongly making "/foo*$" reject "/foobar".
                if segment_idx + 1 >= self.segments.len() {
                    return true;
                }
                // Fix 2: only try char boundaries (each char's byte
                // offset, plus end-of-string). The old code sliced at
                // every byte offset, which panics on multi-byte UTF-8
                // paths.
                let boundaries = remaining
                    .char_indices()
                    .map(|(i, _)| i)
                    .chain(std::iter::once(remaining.len()));
                for i in boundaries {
                    if self.matches_recursive(&remaining[i..], segment_idx + 1) {
                        return true;
                    }
                }
                false
            }
        }
    }
}
257
/// One robots.txt group: a set of user-agent lines followed by the
/// rules and rate limits that apply to those agents.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Group {
    /// User-agent tokens this group targets ("*" means any agent).
    pub user_agents: Vec<String>,
    /// Allow/Disallow rules in this group.
    pub rules: Vec<Rule>,
    /// `Crawl-delay` value in seconds, when present.
    pub crawl_delay: Option<f64>,
    /// `Request-rate` value, when present.
    pub request_rate: Option<RequestRate>,
}
270
271impl Group {
272 pub fn matches_user_agent(&self, token: &str) -> bool {
274 let token_lower = token.to_lowercase();
275 self.user_agents.iter().any(|ua| {
276 let ua_lower = ua.to_lowercase();
277 token_lower.starts_with(&ua_lower) || ua_lower == "*"
279 })
280 }
281
282 pub fn is_wildcard(&self) -> bool {
284 self.user_agents.iter().any(|ua| ua == "*")
285 }
286}
287
/// A fetched-and-parsed robots.txt together with its cache metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RobotsPolicy {
    /// When the policy was fetched, in milliseconds since the Unix epoch.
    pub fetched_at_ms: u64,
    /// When the cached policy expires, in milliseconds since the Unix epoch.
    pub expires_at_ms: u64,
    /// Outcome of the fetch that produced this policy.
    pub fetch_status: FetchStatus,
    /// Parsed robots.txt groups (empty for failed fetches).
    pub groups: Vec<Group>,
    /// Sitemap URLs listed in the file.
    pub sitemaps: Vec<String>,
    /// Size of the fetched robots.txt body in bytes.
    pub content_size: usize,
    /// `ETag` response header, kept for conditional revalidation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub etag: Option<String>,
    /// `Last-Modified` response header, kept for conditional revalidation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub last_modified: Option<String>,
}
310
311impl RobotsPolicy {
312 fn now_millis() -> u64 {
314 SystemTime::now()
315 .duration_since(UNIX_EPOCH)
316 .unwrap_or_default()
317 .as_millis() as u64
318 }
319
320 pub fn unavailable(status_code: u16, ttl: Duration) -> Self {
322 let now = Self::now_millis();
323 Self {
324 fetched_at_ms: now,
325 expires_at_ms: now + ttl.as_millis() as u64,
326 fetch_status: FetchStatus::Unavailable { status_code },
327 groups: Vec::new(),
328 sitemaps: Vec::new(),
329 content_size: 0,
330 etag: None,
331 last_modified: None,
332 }
333 }
334
335 pub fn unreachable(reason: String, ttl: Duration) -> Self {
337 let now = Self::now_millis();
338 Self {
339 fetched_at_ms: now,
340 expires_at_ms: now + ttl.as_millis() as u64,
341 fetch_status: FetchStatus::Unreachable { reason },
342 groups: Vec::new(),
343 sitemaps: Vec::new(),
344 content_size: 0,
345 etag: None,
346 last_modified: None,
347 }
348 }
349
350 pub fn protected(status_code: u16, ttl: Duration) -> Self {
352 let now = Self::now_millis();
353 Self {
354 fetched_at_ms: now,
355 expires_at_ms: now + ttl.as_millis() as u64,
356 fetch_status: FetchStatus::Protected { status_code },
357 groups: Vec::new(),
358 sitemaps: Vec::new(),
359 content_size: 0,
360 etag: None,
361 last_modified: None,
362 }
363 }
364
365 pub fn not_modified(ttl: Duration) -> Self {
368 let now = Self::now_millis();
369 Self {
370 fetched_at_ms: now,
371 expires_at_ms: now + ttl.as_millis() as u64,
372 fetch_status: FetchStatus::NotModified,
373 groups: Vec::new(),
374 sitemaps: Vec::new(),
375 content_size: 0,
376 etag: None,
377 last_modified: None,
378 }
379 }
380
381 pub fn extend_ttl(&mut self, ttl: Duration) {
383 self.expires_at_ms = Self::now_millis() + ttl.as_millis() as u64;
384 }
385
386 pub fn is_expired(&self) -> bool {
388 Self::now_millis() > self.expires_at_ms
389 }
390
391 pub fn ttl(&self) -> Duration {
393 let now = Self::now_millis();
394 if self.expires_at_ms > now {
395 Duration::from_millis(self.expires_at_ms - now)
396 } else {
397 Duration::ZERO
398 }
399 }
400}
401
/// The verdict for a single URL: allowed or not, which rule decided
/// it (if any), and why.
#[derive(Debug, Clone)]
pub struct Decision {
    /// Whether the URL may be fetched.
    pub allowed: bool,
    /// The rule that determined the verdict, when one matched.
    pub matched_rule: Option<Rule>,
    /// Why this verdict was reached.
    pub reason: DecisionReason,
}
412
413impl Decision {
414 pub fn allow(reason: DecisionReason) -> Self {
416 Self {
417 allowed: true,
418 matched_rule: None,
419 reason,
420 }
421 }
422
423 pub fn deny(reason: DecisionReason) -> Self {
425 Self {
426 allowed: false,
427 matched_rule: None,
428 reason,
429 }
430 }
431
432 pub fn with_rule(mut self, rule: Rule) -> Self {
434 self.matched_rule = Some(rule);
435 self
436 }
437}
438
/// Why a `Decision` came out the way it did.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DecisionReason {
    /// robots.txt was unavailable (e.g. 404) — crawl permitted.
    RobotsUnavailable,
    /// The robots.txt host was unreachable — crawl denied.
    RobotsUnreachable,
    /// robots.txt access was refused — crawl denied.
    RobotsProtected,
    /// No rule matched the path.
    NoMatchingRule,
    /// An Allow rule matched the path.
    AllowRuleMatched,
    /// A Disallow rule matched the path.
    DisallowRuleMatched,
    /// The request targeted /robots.txt itself.
    RobotsTxtPath,
    /// robots.txt checking is disabled in configuration.
    RobotsDisabled,
}
459
/// The subset of a robots.txt that applies to one crawler: merged
/// rules plus any rate limits from the matching group(s).
#[derive(Debug, Clone)]
pub struct EffectiveRules {
    /// Rules applicable to this crawler.
    pub rules: Vec<Rule>,
    /// Applicable `Crawl-delay`, in seconds, if any.
    pub crawl_delay: Option<f64>,
    /// Applicable `Request-rate`, if any.
    pub request_rate: Option<RequestRate>,
    /// User-agent tokens that matched when selecting these rules.
    pub matched_agents: Vec<String>,
}
472
473impl EffectiveRules {
474 pub fn empty() -> Self {
476 Self {
477 rules: Vec::new(),
478 crawl_delay: None,
479 request_rate: None,
480 matched_agents: Vec::new(),
481 }
482 }
483
484 pub fn effective_delay(&self) -> Option<Duration> {
487 if let Some(delay) = self.crawl_delay {
488 return Some(Duration::from_secs_f64(delay));
489 }
490 if let Some(rate) = self.request_rate {
491 return Some(rate.delay());
492 }
493 None
494 }
495}
496
/// Configuration for robots.txt fetching and enforcement.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RobotsConfig {
    /// User-agent string sent when fetching and matched against groups.
    pub user_agent: String,
    /// How long fetched policies stay cached, in seconds.
    pub cache_ttl_secs: u64,
    /// When false, robots.txt rules are not enforced.
    pub respect_robots: bool,
    /// Fallback delay between requests, in milliseconds.
    pub default_crawl_delay_ms: u64,
    /// Maximum robots.txt body size accepted, in bytes.
    pub max_robots_size: usize,
    /// Maximum redirects followed when fetching robots.txt.
    pub max_redirects: u32,
    /// Fetch timeout, in seconds.
    pub fetch_timeout_secs: u64,
    /// Extra-conservative mode flag (semantics defined by callers).
    pub safe_mode: bool,
}
517
518impl Default for RobotsConfig {
519 fn default() -> Self {
520 Self {
521 user_agent: "HalldyllBot/1.0".to_string(),
522 cache_ttl_secs: 3600, respect_robots: true,
524 default_crawl_delay_ms: 100,
525 max_robots_size: 512 * 1024, max_redirects: 5,
527 fetch_timeout_secs: 10,
528 safe_mode: true, }
530 }
531}
532
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cache_key_from_url() {
        let url = Url::parse("https://example.com:8080/path").unwrap();
        let key = RobotsCacheKey::from_url(&url).unwrap();
        assert_eq!(key.scheme, "https");
        assert_eq!(key.authority, "example.com:8080");
        assert_eq!(key.robots_url(), "https://example.com:8080/robots.txt");
    }

    #[test]
    fn test_compiled_pattern_literal() {
        let literal = CompiledPattern::compile("/admin/");
        assert!(literal.matches("/admin/"));
        assert!(literal.matches("/admin/users"));
        assert!(!literal.matches("/administrator"));
    }

    #[test]
    fn test_compiled_pattern_wildcard() {
        let wildcard = CompiledPattern::compile("/api/*/users");
        assert!(wildcard.matches("/api/v1/users"));
        assert!(wildcard.matches("/api/v2/users"));
        assert!(wildcard.matches("/api//users"));
        assert!(!wildcard.matches("/api/users"));
    }

    #[test]
    fn test_compiled_pattern_anchored() {
        let anchored = CompiledPattern::compile("/exact$");
        assert!(anchored.matches("/exact"));
        assert!(!anchored.matches("/exact/"));
        assert!(!anchored.matches("/exact/more"));
    }

    #[test]
    fn test_compiled_pattern_complex() {
        let php = CompiledPattern::compile("/*.php$");
        assert!(php.matches("/index.php"));
        assert!(php.matches("/admin/login.php"));
        assert!(!php.matches("/index.php5"));
        assert!(!php.matches("/index.php/extra"));
    }

    #[test]
    fn test_rule_specificity() {
        let broad = Rule::new(RuleKind::Disallow, "/admin".to_string());
        let narrow = Rule::new(RuleKind::Allow, "/admin/public".to_string());
        assert!(narrow.specificity() > broad.specificity());
    }

    #[test]
    fn test_group_matches_user_agent() {
        let group = Group {
            user_agents: vec!["Googlebot".to_string(), "Bingbot".to_string()],
            rules: vec![],
            crawl_delay: None,
            request_rate: None,
        };
        assert!(group.matches_user_agent("Googlebot"));
        // Matching is case-insensitive and prefix-based.
        assert!(group.matches_user_agent("googlebot"));
        assert!(group.matches_user_agent("Googlebot-Image"));
        assert!(!group.matches_user_agent("Yandexbot"));
    }

    #[test]
    fn test_request_rate() {
        let one_per_ten = RequestRate::new(1, 10);
        assert_eq!(one_per_ten.requests, 1);
        assert_eq!(one_per_ten.seconds, 10);
        assert!((one_per_ten.delay_seconds() - 10.0).abs() < 0.001);
        assert_eq!(one_per_ten.delay(), Duration::from_secs(10));

        let two_per_ten = RequestRate::new(2, 10);
        assert!((two_per_ten.delay_seconds() - 5.0).abs() < 0.001);
    }

    #[test]
    fn test_effective_rules_delay() {
        let mut effective = EffectiveRules::empty();
        assert!(effective.effective_delay().is_none());

        // Crawl-delay alone.
        effective.crawl_delay = Some(2.5);
        assert_eq!(
            effective.effective_delay(),
            Some(Duration::from_secs_f64(2.5))
        );

        // Crawl-delay wins over Request-rate.
        effective.request_rate = Some(RequestRate::new(1, 10));
        assert_eq!(
            effective.effective_delay(),
            Some(Duration::from_secs_f64(2.5))
        );

        // Request-rate is used once Crawl-delay is gone.
        effective.crawl_delay = None;
        assert_eq!(effective.effective_delay(), Some(Duration::from_secs(10)));
    }

    #[test]
    fn test_fetch_status_not_modified() {
        let status = FetchStatus::NotModified;
        assert!(status.is_not_modified());
        assert!(!status.allows_all());
        assert!(!status.denies_all());
    }

    #[test]
    fn test_policy_extend_ttl() {
        let mut policy = RobotsPolicy::unavailable(404, Duration::from_secs(60));
        let expires_before = policy.expires_at_ms;

        std::thread::sleep(Duration::from_millis(10));
        policy.extend_ttl(Duration::from_secs(3600));

        assert!(policy.expires_at_ms > expires_before);
    }
}