halldyll_robots/
types.rs

1//! Types - Core types for robots.txt handling (RFC 9309)
2
3use serde::{Deserialize, Serialize};
4use std::time::{Duration, SystemTime, UNIX_EPOCH};
5use url::Url;
6
/// Cache key for a robots.txt file (scheme + authority).
///
/// Per RFC 9309 a robots.txt applies to exactly one scheme + authority
/// pair, so that pair is the cache key. Both components are stored
/// lowercased (see `RobotsCacheKey::from_url`) so equivalent URLs share
/// one cache entry.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct RobotsCacheKey {
    /// URL scheme (http/https), lowercased
    pub scheme: String,
    /// Authority: host plus optional `:port`, lowercased
    pub authority: String,
}
15
/// Request-rate directive (non-standard but common).
///
/// Format in robots.txt: `"requests/seconds"`, e.g. `"1/10"` means
/// 1 request per 10 seconds.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct RequestRate {
    /// Number of requests allowed within the period
    pub requests: u32,
    /// Time period in seconds
    pub seconds: u32,
}
25
26impl RequestRate {
27    /// Create a new request rate
28    pub fn new(requests: u32, seconds: u32) -> Self {
29        Self { requests, seconds }
30    }
31
32    /// Get the minimum delay between requests in seconds
33    pub fn delay_seconds(&self) -> f64 {
34        if self.requests == 0 {
35            f64::MAX
36        } else {
37            self.seconds as f64 / self.requests as f64
38        }
39    }
40
41    /// Get the delay as Duration
42    pub fn delay(&self) -> Duration {
43        Duration::from_secs_f64(self.delay_seconds())
44    }
45}
46
47impl RobotsCacheKey {
48    /// Create a new cache key from a URL
49    pub fn from_url(url: &Url) -> Option<Self> {
50        let host = url.host_str()?;
51        let authority = match url.port() {
52            Some(port) => format!("{}:{}", host, port),
53            None => host.to_string(),
54        };
55        Some(Self {
56            scheme: url.scheme().to_lowercase(),
57            authority: authority.to_lowercase(),
58        })
59    }
60
61    /// Get the robots.txt URL for this key
62    pub fn robots_url(&self) -> String {
63        format!("{}://{}/robots.txt", self.scheme, self.authority)
64    }
65}
66
/// Fetch status distinguishing Unavailable vs Unreachable (RFC 9309).
///
/// RFC 9309 treats an *unavailable* robots.txt (4xx) as "crawl anything"
/// and an *unreachable* one (5xx / network failure) as "crawl nothing";
/// this enum keeps those cases distinct so `allows_all`/`denies_all`
/// can apply the right default.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum FetchStatus {
    /// Successfully fetched and parsed
    Success,
    /// Not modified (304) - use cached version
    NotModified,
    /// Unavailable (4xx) - crawler can access everything
    Unavailable {
        /// HTTP status code
        status_code: u16,
    },
    /// Unreachable (5xx, network error) - assume DISALLOW all
    Unreachable {
        /// Error description
        reason: String,
    },
    /// Protected (401/403) - safe mode treats as deny
    Protected {
        /// HTTP status code
        status_code: u16,
    },
}
90
91impl FetchStatus {
92    /// Check if this status allows all paths (RFC 9309: Unavailable)
93    pub fn allows_all(&self) -> bool {
94        matches!(self, FetchStatus::Unavailable { .. })
95    }
96
97    /// Check if this status denies all paths (RFC 9309: Unreachable)
98    pub fn denies_all(&self) -> bool {
99        matches!(self, FetchStatus::Unreachable { .. } | FetchStatus::Protected { .. })
100    }
101
102    /// Check if this is a not-modified response (304)
103    pub fn is_not_modified(&self) -> bool {
104        matches!(self, FetchStatus::NotModified)
105    }
106}
107
/// A single rule (Allow or Disallow).
///
/// `compiled` is a match-time cache; it is `#[serde(skip)]`, so after
/// deserialization it is `None` and `Rule::matches` recompiles the
/// pattern on the fly.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Rule {
    /// Rule kind (Allow / Disallow)
    pub kind: RuleKind,
    /// Original pattern from robots.txt
    pub pattern: String,
    /// Compiled pattern for matching (with wildcards expanded);
    /// `None` after deserialization
    #[serde(skip)]
    pub compiled: Option<CompiledPattern>,
}
119
120impl Rule {
121    /// Create a new rule
122    pub fn new(kind: RuleKind, pattern: String) -> Self {
123        let compiled = CompiledPattern::compile(&pattern);
124        Self {
125            kind,
126            pattern,
127            compiled: Some(compiled),
128        }
129    }
130
131    /// Check if this rule matches a path
132    pub fn matches(&self, path: &str) -> bool {
133        match &self.compiled {
134            Some(compiled) => compiled.matches(path),
135            None => CompiledPattern::compile(&self.pattern).matches(path),
136        }
137    }
138
139    /// Get the specificity (length) of this rule's pattern
140    pub fn specificity(&self) -> usize {
141        // Count actual characters, not wildcards
142        self.pattern.chars().filter(|&c| c != '*').count()
143    }
144}
145
/// Rule kind: whether a rule grants or denies access to matching paths.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RuleKind {
    /// Allow crawling
    Allow,
    /// Disallow crawling
    Disallow,
}
154
/// Compiled pattern for efficient matching.
///
/// A robots.txt path pattern is split into literal runs separated by `*`
/// wildcards, with an optional trailing `$` end anchor.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CompiledPattern {
    /// Pattern segments (literals between wildcards)
    segments: Vec<PatternSegment>,
    /// Whether the pattern ends with `$` (must match the end of the path)
    anchored_end: bool,
}

/// A segment of the pattern
#[derive(Debug, Clone, PartialEq, Eq)]
enum PatternSegment {
    /// Literal text to match
    Literal(String),
    /// Wildcard (matches 0+ characters)
    Wildcard,
}

impl CompiledPattern {
    /// Compile a pattern string.
    ///
    /// A trailing `$` anchors the pattern to the end of the path; runs of
    /// consecutive `*` are collapsed into a single wildcard.
    pub fn compile(pattern: &str) -> Self {
        let anchored_end = pattern.ends_with('$');
        let pattern = if anchored_end {
            &pattern[..pattern.len() - 1]
        } else {
            pattern
        };

        let mut segments = Vec::new();
        let mut current = String::new();

        for c in pattern.chars() {
            if c == '*' {
                if !current.is_empty() {
                    // take() moves the literal out without a clone.
                    segments.push(PatternSegment::Literal(std::mem::take(&mut current)));
                }
                // Collapse consecutive wildcards
                if !matches!(segments.last(), Some(PatternSegment::Wildcard)) {
                    segments.push(PatternSegment::Wildcard);
                }
            } else {
                current.push(c);
            }
        }

        if !current.is_empty() {
            segments.push(PatternSegment::Literal(current));
        }

        Self {
            segments,
            anchored_end,
        }
    }

    /// Check if this pattern matches a path.
    pub fn matches(&self, path: &str) -> bool {
        // Match on bytes, not &str: wildcard backtracking tries every
        // offset, and slicing a &str at a non-char boundary panics on
        // multi-byte UTF-8 paths. RFC 9309 specifies octet comparison
        // anyway, so byte-wise matching is also the correct semantics.
        self.matches_recursive(path.as_bytes(), 0)
    }

    fn matches_recursive(&self, remaining: &[u8], segment_idx: usize) -> bool {
        // Base case: all segments consumed. With a `$` anchor the whole
        // path must have been consumed too.
        if segment_idx >= self.segments.len() {
            return !self.anchored_end || remaining.is_empty();
        }

        match &self.segments[segment_idx] {
            PatternSegment::Literal(lit) => {
                let lit = lit.as_bytes();
                remaining.starts_with(lit)
                    && self.matches_recursive(&remaining[lit.len()..], segment_idx + 1)
            }
            PatternSegment::Wildcard => {
                if segment_idx + 1 >= self.segments.len() {
                    // Trailing wildcard absorbs everything that is left, so
                    // it matches even when the pattern is end-anchored
                    // (e.g. "/*$" matches any path). The previous code
                    // wrongly required `remaining` to be empty here.
                    true
                } else {
                    // Backtracking: hand off to the next segment at every
                    // byte offset until one succeeds.
                    (0..=remaining.len())
                        .any(|i| self.matches_recursive(&remaining[i..], segment_idx + 1))
                }
            }
        }
    }
}
257
/// A group of rules for specific user-agents.
///
/// Mirrors one robots.txt record: the `User-agent` lines followed by the
/// directives that apply to them.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Group {
    /// User-agent tokens this group applies to
    pub user_agents: Vec<String>,
    /// Rules in this group (in file order)
    pub rules: Vec<Rule>,
    /// Crawl-delay directive in seconds (optional, non-standard)
    pub crawl_delay: Option<f64>,
    /// Request-rate directive (optional, non-standard), e.g. "1/10"
    pub request_rate: Option<RequestRate>,
}
270
271impl Group {
272    /// Check if this group matches a user-agent token
273    pub fn matches_user_agent(&self, token: &str) -> bool {
274        let token_lower = token.to_lowercase();
275        self.user_agents.iter().any(|ua| {
276            let ua_lower = ua.to_lowercase();
277            // Prefix match: "Googlebot" matches "Googlebot-Image"
278            token_lower.starts_with(&ua_lower) || ua_lower == "*"
279        })
280    }
281
282    /// Check if this is the wildcard group
283    pub fn is_wildcard(&self) -> bool {
284        self.user_agents.iter().any(|ua| ua == "*")
285    }
286}
287
/// Complete robots.txt policy for one scheme + authority.
///
/// Holds the parse result plus caching metadata: fetch/expiry timestamps
/// (Unix millis) and the validators (`etag`, `last_modified`) needed to
/// issue a conditional GET when the policy expires.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RobotsPolicy {
    /// When the robots.txt was fetched (Unix timestamp millis)
    pub fetched_at_ms: u64,
    /// When this policy expires (Unix timestamp millis)
    pub expires_at_ms: u64,
    /// Fetch status (drives the allow-all / deny-all defaults)
    pub fetch_status: FetchStatus,
    /// Parsed groups
    pub groups: Vec<Group>,
    /// Sitemap URLs found
    pub sitemaps: Vec<String>,
    /// Original robots.txt size in bytes
    pub content_size: usize,
    /// ETag header for conditional GET (omitted from JSON when absent)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub etag: Option<String>,
    /// Last-Modified header for conditional GET (omitted from JSON when absent)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub last_modified: Option<String>,
}
310
311impl RobotsPolicy {
312    /// Get current timestamp in milliseconds
313    fn now_millis() -> u64 {
314        SystemTime::now()
315            .duration_since(UNIX_EPOCH)
316            .unwrap_or_default()
317            .as_millis() as u64
318    }
319
320    /// Create a policy for unavailable robots.txt (allow all)
321    pub fn unavailable(status_code: u16, ttl: Duration) -> Self {
322        let now = Self::now_millis();
323        Self {
324            fetched_at_ms: now,
325            expires_at_ms: now + ttl.as_millis() as u64,
326            fetch_status: FetchStatus::Unavailable { status_code },
327            groups: Vec::new(),
328            sitemaps: Vec::new(),
329            content_size: 0,
330            etag: None,
331            last_modified: None,
332        }
333    }
334
335    /// Create a policy for unreachable robots.txt (deny all)
336    pub fn unreachable(reason: String, ttl: Duration) -> Self {
337        let now = Self::now_millis();
338        Self {
339            fetched_at_ms: now,
340            expires_at_ms: now + ttl.as_millis() as u64,
341            fetch_status: FetchStatus::Unreachable { reason },
342            groups: Vec::new(),
343            sitemaps: Vec::new(),
344            content_size: 0,
345            etag: None,
346            last_modified: None,
347        }
348    }
349
350    /// Create a policy for protected robots.txt (deny all in safe mode)
351    pub fn protected(status_code: u16, ttl: Duration) -> Self {
352        let now = Self::now_millis();
353        Self {
354            fetched_at_ms: now,
355            expires_at_ms: now + ttl.as_millis() as u64,
356            fetch_status: FetchStatus::Protected { status_code },
357            groups: Vec::new(),
358            sitemaps: Vec::new(),
359            content_size: 0,
360            etag: None,
361            last_modified: None,
362        }
363    }
364
365    /// Create a policy indicating 304 Not Modified
366    /// The caller should extend the existing cached policy's TTL
367    pub fn not_modified(ttl: Duration) -> Self {
368        let now = Self::now_millis();
369        Self {
370            fetched_at_ms: now,
371            expires_at_ms: now + ttl.as_millis() as u64,
372            fetch_status: FetchStatus::NotModified,
373            groups: Vec::new(),
374            sitemaps: Vec::new(),
375            content_size: 0,
376            etag: None,
377            last_modified: None,
378        }
379    }
380
381    /// Extend the expiration time by the given TTL
382    pub fn extend_ttl(&mut self, ttl: Duration) {
383        self.expires_at_ms = Self::now_millis() + ttl.as_millis() as u64;
384    }
385
386    /// Check if this policy has expired
387    pub fn is_expired(&self) -> bool {
388        Self::now_millis() > self.expires_at_ms
389    }
390
391    /// Get time until expiration
392    pub fn ttl(&self) -> Duration {
393        let now = Self::now_millis();
394        if self.expires_at_ms > now {
395            Duration::from_millis(self.expires_at_ms - now)
396        } else {
397            Duration::ZERO
398        }
399    }
400}
401
/// Decision returned by is_allowed.
///
/// Bundles the verdict with the rule that produced it (when one matched)
/// and a machine-readable reason for logging/diagnostics.
#[derive(Debug, Clone)]
pub struct Decision {
    /// Whether access is allowed
    pub allowed: bool,
    /// The rule that matched (if any)
    pub matched_rule: Option<Rule>,
    /// Reason for the decision
    pub reason: DecisionReason,
}
412
413impl Decision {
414    /// Create an allowed decision
415    pub fn allow(reason: DecisionReason) -> Self {
416        Self {
417            allowed: true,
418            matched_rule: None,
419            reason,
420        }
421    }
422
423    /// Create a denied decision
424    pub fn deny(reason: DecisionReason) -> Self {
425        Self {
426            allowed: false,
427            matched_rule: None,
428            reason,
429        }
430    }
431
432    /// Create a decision with a matched rule
433    pub fn with_rule(mut self, rule: Rule) -> Self {
434        self.matched_rule = Some(rule);
435        self
436    }
437}
438
/// Reason for the decision, for logging and diagnostics.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DecisionReason {
    /// Robots.txt unavailable (4xx), allow all per RFC 9309
    RobotsUnavailable,
    /// Robots.txt unreachable (5xx/network), deny all per RFC 9309
    RobotsUnreachable,
    /// Robots.txt protected (401/403 in safe mode), deny all
    RobotsProtected,
    /// No matching rule found, default allow
    NoMatchingRule,
    /// Matched an allow rule
    AllowRuleMatched,
    /// Matched a disallow rule
    DisallowRuleMatched,
    /// Path is /robots.txt (always allowed)
    RobotsTxtPath,
    /// Robots.txt respect is disabled in configuration
    RobotsDisabled,
}
459
/// Effective rules for a specific user-agent (after merging groups).
#[derive(Debug, Clone)]
pub struct EffectiveRules {
    /// Merged rules from all matching groups
    pub rules: Vec<Rule>,
    /// Crawl delay in seconds (from first group that specifies it)
    pub crawl_delay: Option<f64>,
    /// Request rate (from first group that specifies it)
    pub request_rate: Option<RequestRate>,
    /// Which user-agents were matched during merging
    pub matched_agents: Vec<String>,
}
472
473impl EffectiveRules {
474    /// Create empty effective rules
475    pub fn empty() -> Self {
476        Self {
477            rules: Vec::new(),
478            crawl_delay: None,
479            request_rate: None,
480            matched_agents: Vec::new(),
481        }
482    }
483
484    /// Get the effective delay between requests
485    /// Uses crawl_delay if set, otherwise request_rate, otherwise None
486    pub fn effective_delay(&self) -> Option<Duration> {
487        if let Some(delay) = self.crawl_delay {
488            return Some(Duration::from_secs_f64(delay));
489        }
490        if let Some(rate) = self.request_rate {
491            return Some(rate.delay());
492        }
493        None
494    }
495}
496
/// Configuration for robots.txt handling.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RobotsConfig {
    /// User-agent token to identify as
    pub user_agent: String,
    /// Cache TTL in seconds (RFC 9309 allows caching up to 24 hours)
    pub cache_ttl_secs: u64,
    /// Whether to respect robots.txt at all
    pub respect_robots: bool,
    /// Default crawl delay in milliseconds (used when no directive applies)
    pub default_crawl_delay_ms: u64,
    /// Maximum robots.txt size in bytes
    /// (RFC 9309 requires parsing at least 500 KiB)
    pub max_robots_size: usize,
    /// Maximum redirects to follow when fetching robots.txt
    pub max_redirects: u32,
    /// Fetch timeout in seconds
    pub fetch_timeout_secs: u64,
    /// Use safe mode (treat 401/403 as deny)
    pub safe_mode: bool,
}
517
518impl Default for RobotsConfig {
519    fn default() -> Self {
520        Self {
521            user_agent: "HalldyllBot/1.0".to_string(),
522            cache_ttl_secs: 3600, // 1 hour
523            respect_robots: true,
524            default_crawl_delay_ms: 100,
525            max_robots_size: 512 * 1024, // 512 KiB (RFC minimum is 500 KiB)
526            max_redirects: 5,
527            fetch_timeout_secs: 10,
528            safe_mode: true, // Safer default for production
529        }
530    }
531}
532
#[cfg(test)]
mod tests {
    use super::*;

    // Cache key normalizes scheme/authority and builds the robots.txt URL.
    #[test]
    fn test_cache_key_from_url() {
        let url = Url::parse("https://example.com:8080/path").unwrap();
        let key = RobotsCacheKey::from_url(&url).unwrap();
        assert_eq!(key.scheme, "https");
        assert_eq!(key.authority, "example.com:8080");
        assert_eq!(key.robots_url(), "https://example.com:8080/robots.txt");
    }

    // A plain literal pattern is a prefix match, not a substring match.
    #[test]
    fn test_compiled_pattern_literal() {
        let pattern = CompiledPattern::compile("/admin/");
        assert!(pattern.matches("/admin/"));
        assert!(pattern.matches("/admin/users"));
        assert!(!pattern.matches("/administrator"));
    }

    // `*` matches zero or more characters between literal runs.
    #[test]
    fn test_compiled_pattern_wildcard() {
        let pattern = CompiledPattern::compile("/api/*/users");
        assert!(pattern.matches("/api/v1/users"));
        assert!(pattern.matches("/api/v2/users"));
        assert!(pattern.matches("/api//users"));
        assert!(!pattern.matches("/api/users"));
    }

    // Trailing `$` anchors the pattern to the end of the path.
    #[test]
    fn test_compiled_pattern_anchored() {
        let pattern = CompiledPattern::compile("/exact$");
        assert!(pattern.matches("/exact"));
        assert!(!pattern.matches("/exact/"));
        assert!(!pattern.matches("/exact/more"));
    }

    // Wildcard and end-anchor combined (classic "/*.php$" rule).
    #[test]
    fn test_compiled_pattern_complex() {
        let pattern = CompiledPattern::compile("/*.php$");
        assert!(pattern.matches("/index.php"));
        assert!(pattern.matches("/admin/login.php"));
        assert!(!pattern.matches("/index.php5"));
        assert!(!pattern.matches("/index.php/extra"));
    }

    // Longer (more specific) patterns win in longest-match precedence.
    #[test]
    fn test_rule_specificity() {
        let rule1 = Rule::new(RuleKind::Disallow, "/admin".to_string());
        let rule2 = Rule::new(RuleKind::Allow, "/admin/public".to_string());
        assert!(rule2.specificity() > rule1.specificity());
    }

    // Group matching is case-insensitive prefix matching on the token.
    #[test]
    fn test_group_matches_user_agent() {
        let group = Group {
            user_agents: vec!["Googlebot".to_string(), "Bingbot".to_string()],
            rules: vec![],
            crawl_delay: None,
            request_rate: None,
        };
        assert!(group.matches_user_agent("Googlebot"));
        assert!(group.matches_user_agent("googlebot")); // case insensitive
        assert!(group.matches_user_agent("Googlebot-Image")); // prefix match
        assert!(!group.matches_user_agent("Yandexbot"));
    }

    // "requests/seconds" converts to a per-request delay.
    #[test]
    fn test_request_rate() {
        let rate = RequestRate::new(1, 10);
        assert_eq!(rate.requests, 1);
        assert_eq!(rate.seconds, 10);
        assert!((rate.delay_seconds() - 10.0).abs() < 0.001);
        assert_eq!(rate.delay(), Duration::from_secs(10));

        let rate2 = RequestRate::new(2, 10);
        assert!((rate2.delay_seconds() - 5.0).abs() < 0.001);
    }

    // crawl_delay wins over request_rate; request_rate is the fallback.
    #[test]
    fn test_effective_rules_delay() {
        let mut rules = EffectiveRules::empty();
        assert!(rules.effective_delay().is_none());

        rules.crawl_delay = Some(2.5);
        assert_eq!(rules.effective_delay(), Some(Duration::from_secs_f64(2.5)));

        // crawl_delay takes precedence over request_rate
        rules.request_rate = Some(RequestRate::new(1, 10));
        assert_eq!(rules.effective_delay(), Some(Duration::from_secs_f64(2.5)));

        // Without crawl_delay, request_rate is used
        rules.crawl_delay = None;
        assert_eq!(rules.effective_delay(), Some(Duration::from_secs(10)));
    }

    // 304 is neither allow-all nor deny-all; it only signals "use cache".
    #[test]
    fn test_fetch_status_not_modified() {
        let status = FetchStatus::NotModified;
        assert!(status.is_not_modified());
        assert!(!status.allows_all());
        assert!(!status.denies_all());
    }

    // extend_ttl re-bases expiry on the current clock, so it must move
    // forward after the short sleep.
    #[test]
    fn test_policy_extend_ttl() {
        let mut policy = RobotsPolicy::unavailable(404, Duration::from_secs(60));
        let original_expires = policy.expires_at_ms;

        std::thread::sleep(Duration::from_millis(10));
        policy.extend_ttl(Duration::from_secs(3600));

        assert!(policy.expires_at_ms > original_expires);
    }
}