// cortex_runtime/cartography/robots.rs

//! Parse robots.txt files.

/// Parsed robots.txt rules.
///
/// Produced by [`parse_robots`] for a single user agent; query paths with
/// [`RobotsRules::is_allowed`].
#[derive(Debug, Clone, Default)]
pub struct RobotsRules {
    // `Allow:` path patterns collected from groups matching the user agent.
    pub allowed: Vec<String>,
    // `Disallow:` path patterns collected from groups matching the user agent.
    pub disallowed: Vec<String>,
    // `Crawl-delay:` value if present (conventionally seconds — the file's
    // unit is not enforced here).
    pub crawl_delay: Option<f32>,
    // `Sitemap:` URLs; these directives are global, not group-scoped.
    pub sitemaps: Vec<String>,
}

12impl RobotsRules {
13    /// Check if a path is allowed by the robots rules.
14    pub fn is_allowed(&self, path: &str) -> bool {
15        // Check disallowed first (more specific wins)
16        let mut longest_disallow = 0;
17        let mut is_disallowed = false;
18        for pattern in &self.disallowed {
19            if path_matches(path, pattern) && pattern.len() > longest_disallow {
20                longest_disallow = pattern.len();
21                is_disallowed = true;
22            }
23        }
24
25        let mut longest_allow = 0;
26        let mut is_allowed = false;
27        for pattern in &self.allowed {
28            if path_matches(path, pattern) && pattern.len() > longest_allow {
29                longest_allow = pattern.len();
30                is_allowed = true;
31            }
32        }
33
34        // Longer match wins
35        if is_allowed && is_disallowed {
36            return longest_allow >= longest_disallow;
37        }
38        if is_disallowed {
39            return false;
40        }
41        true
42    }
43}
44
45/// Parse a robots.txt string for a specific user agent.
46pub fn parse_robots(txt: &str, user_agent: &str) -> RobotsRules {
47    let mut rules = RobotsRules::default();
48    let mut in_matching_group = false;
49    let mut found_matching_group = false;
50    let ua_lower = user_agent.to_lowercase();
51
52    for line in txt.lines() {
53        let line = line.trim();
54
55        // Skip comments and empty lines
56        if line.is_empty() || line.starts_with('#') {
57            continue;
58        }
59
60        // Remove inline comments
61        let line = line.split('#').next().unwrap_or("").trim();
62
63        if let Some((key, value)) = line.split_once(':') {
64            let key = key.trim().to_lowercase();
65            let value = value.trim();
66
67            match key.as_str() {
68                "user-agent" => {
69                    let ua = value.to_lowercase();
70                    in_matching_group = ua == "*" || ua == ua_lower;
71                    if in_matching_group {
72                        found_matching_group = true;
73                    }
74                }
75                "allow" if in_matching_group || !found_matching_group => {
76                    if !value.is_empty() {
77                        rules.allowed.push(value.to_string());
78                    }
79                }
80                "disallow" if in_matching_group || !found_matching_group => {
81                    if !value.is_empty() {
82                        rules.disallowed.push(value.to_string());
83                    }
84                }
85                "crawl-delay" if in_matching_group || !found_matching_group => {
86                    if let Ok(delay) = value.parse::<f32>() {
87                        rules.crawl_delay = Some(delay);
88                    }
89                }
90                "sitemap" => {
91                    // Sitemap directives are global
92                    if !value.is_empty() {
93                        rules.sitemaps.push(value.to_string());
94                    }
95                }
96                _ => {}
97            }
98        }
99    }
100
101    rules
102}
103
/// Check if a path matches a robots.txt pattern.
///
/// Supports the two special characters of RFC 9309 / Google's syntax:
/// `*` matches any (possibly empty) run of characters ANYWHERE in the
/// pattern — not just at the end, which the previous version required —
/// and a trailing `$` anchors the pattern to the end of the path.
/// Without a trailing `$`, the pattern is a prefix match.
/// An empty pattern matches nothing.
fn path_matches(path: &str, pattern: &str) -> bool {
    if pattern.is_empty() {
        return false;
    }

    // A trailing `$` anchors the match to the end of the path.
    let (pattern, anchored) = match pattern.strip_suffix('$') {
        Some(stripped) => (stripped, true),
        None => (pattern, false),
    };

    // Split on `*`: the first segment is anchored at the start of the path;
    // the remaining segments must appear afterwards, in order.
    let segments: Vec<&str> = pattern.split('*').collect();

    if !path.starts_with(segments[0]) {
        return false;
    }
    if segments.len() == 1 {
        // No wildcard: plain prefix match, or exact match when anchored.
        return !anchored || path.len() == segments[0].len();
    }

    // Scan middle segments left-to-right. Leftmost matching is sufficient
    // here because the final segment is handled separately below.
    let mut pos = segments[0].len();
    for seg in &segments[1..segments.len() - 1] {
        match path[pos..].find(seg) {
            Some(offset) => pos += offset + seg.len(),
            None => return false,
        }
    }

    let tail = segments[segments.len() - 1];
    if anchored {
        // `…*tail$`: the remainder of the path must end with `tail`.
        path[pos..].ends_with(tail)
    } else {
        // `…*tail`: `tail` just needs to occur somewhere after `pos`
        // (an empty tail — trailing `*` — always matches).
        path[pos..].contains(tail)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_robots() {
        let robots_txt = r#"
User-agent: *
Allow: /
Disallow: /admin
Disallow: /private/
Crawl-delay: 1.5

Sitemap: https://example.com/sitemap.xml
Sitemap: https://example.com/sitemap-blog.xml
"#;

        let parsed = parse_robots(robots_txt, "cortex");

        // Directive counts collected from the `*` group.
        assert_eq!(parsed.allowed.len(), 1);
        assert_eq!(parsed.disallowed.len(), 2);
        assert_eq!(parsed.crawl_delay, Some(1.5));
        assert_eq!(parsed.sitemaps.len(), 2);

        // Path checks: disallowed prefixes block, everything else passes.
        assert!(parsed.is_allowed("/"));
        assert!(parsed.is_allowed("/about"));
        assert!(!parsed.is_allowed("/admin"));
        assert!(!parsed.is_allowed("/admin/settings"));
        assert!(!parsed.is_allowed("/private/data"));
    }

    #[test]
    fn test_allow_overrides_disallow() {
        let robots_txt = r#"
User-agent: *
Disallow: /api/
Allow: /api/public/
"#;
        let parsed = parse_robots(robots_txt, "cortex");

        // The longer, more specific Allow pattern wins inside /api/public/.
        assert!(!parsed.is_allowed("/api/secret"));
        assert!(parsed.is_allowed("/api/public/docs"));
    }
}