cortex_runtime/cartography/
robots.rs1#[derive(Debug, Clone, Default)]
/// Rules collected from a robots.txt document by `parse_robots`.
pub struct RobotsRules {
    /// Path patterns taken from `Allow` directives, in file order.
    pub allowed: Vec<String>,
    /// Path patterns taken from `Disallow` directives, in file order.
    pub disallowed: Vec<String>,
    /// Value of the most recent `Crawl-delay` directive that parsed as a
    /// number, if any (presumably seconds; confirm against callers).
    pub crawl_delay: Option<f32>,
    /// Values of all non-empty `Sitemap` directives, in file order.
    pub sitemaps: Vec<String>,
}
11
12impl RobotsRules {
13 pub fn is_allowed(&self, path: &str) -> bool {
15 let mut longest_disallow = 0;
17 let mut is_disallowed = false;
18 for pattern in &self.disallowed {
19 if path_matches(path, pattern) && pattern.len() > longest_disallow {
20 longest_disallow = pattern.len();
21 is_disallowed = true;
22 }
23 }
24
25 let mut longest_allow = 0;
26 let mut is_allowed = false;
27 for pattern in &self.allowed {
28 if path_matches(path, pattern) && pattern.len() > longest_allow {
29 longest_allow = pattern.len();
30 is_allowed = true;
31 }
32 }
33
34 if is_allowed && is_disallowed {
36 return longest_allow >= longest_disallow;
37 }
38 if is_disallowed {
39 return false;
40 }
41 true
42 }
43}
44
45pub fn parse_robots(txt: &str, user_agent: &str) -> RobotsRules {
47 let mut rules = RobotsRules::default();
48 let mut in_matching_group = false;
49 let mut found_matching_group = false;
50 let ua_lower = user_agent.to_lowercase();
51
52 for line in txt.lines() {
53 let line = line.trim();
54
55 if line.is_empty() || line.starts_with('#') {
57 continue;
58 }
59
60 let line = line.split('#').next().unwrap_or("").trim();
62
63 if let Some((key, value)) = line.split_once(':') {
64 let key = key.trim().to_lowercase();
65 let value = value.trim();
66
67 match key.as_str() {
68 "user-agent" => {
69 let ua = value.to_lowercase();
70 in_matching_group = ua == "*" || ua == ua_lower;
71 if in_matching_group {
72 found_matching_group = true;
73 }
74 }
75 "allow" if in_matching_group || !found_matching_group => {
76 if !value.is_empty() {
77 rules.allowed.push(value.to_string());
78 }
79 }
80 "disallow" if in_matching_group || !found_matching_group => {
81 if !value.is_empty() {
82 rules.disallowed.push(value.to_string());
83 }
84 }
85 "crawl-delay" if in_matching_group || !found_matching_group => {
86 if let Ok(delay) = value.parse::<f32>() {
87 rules.crawl_delay = Some(delay);
88 }
89 }
90 "sitemap" => {
91 if !value.is_empty() {
93 rules.sitemaps.push(value.to_string());
94 }
95 }
96 _ => {}
97 }
98 }
99 }
100
101 rules
102}
103
/// Tests whether `path` matches a robots.txt path `pattern`.
///
/// Pattern syntax (RFC 9309 special characters):
///
/// * `*` matches any run of characters, including none, and may appear
///   anywhere in the pattern.
/// * A trailing `$` anchors the pattern to the end of the path.
/// * Without a trailing `$` the pattern only has to match a prefix of the
///   path.
///
/// An empty pattern never matches.
fn path_matches(path: &str, pattern: &str) -> bool {
    if pattern.is_empty() {
        return false;
    }

    let (body, anchored) = match pattern.strip_suffix('$') {
        Some(rest) => (rest, true),
        None => (pattern, false),
    };

    // No wildcard at all: plain exact or prefix comparison.
    let (head, tail) = match body.split_once('*') {
        Some(parts) => parts,
        None => {
            return if anchored {
                path == body
            } else {
                path.starts_with(body)
            };
        }
    };

    // The literal text before the first `*` must open the path.
    let mut rest = match path.strip_prefix(head) {
        Some(rest) => rest,
        None => return false,
    };

    // Split what follows the first `*` into the literals between wildcards
    // and the final literal, which is the only one affected by anchoring.
    let (middle, last) = match tail.rsplit_once('*') {
        Some((middle, last)) => (middle, last),
        None => ("", tail),
    };

    // Taking the earliest occurrence of each literal leaves the most room
    // for the literals that follow, so greedy matching is sufficient.
    for segment in middle.split('*') {
        match rest.find(segment) {
            Some(at) => rest = &rest[at + segment.len()..],
            None => return false,
        }
    }

    if anchored {
        rest.ends_with(last)
    } else {
        rest.contains(last)
    }
}
121
#[cfg(test)]
mod tests {
    use super::*;

    /// A single wildcard group: checks the number of collected directives
    /// and the verdicts for a handful of paths.
    #[test]
    fn test_parse_robots() {
        let robots_txt = r#"
User-agent: *
Allow: /
Disallow: /admin
Disallow: /private/
Crawl-delay: 1.5

Sitemap: https://example.com/sitemap.xml
Sitemap: https://example.com/sitemap-blog.xml
"#;

        let parsed = parse_robots(robots_txt, "cortex");

        assert_eq!(
            (
                parsed.allowed.len(),
                parsed.disallowed.len(),
                parsed.sitemaps.len()
            ),
            (1, 2, 2)
        );
        assert_eq!(parsed.crawl_delay, Some(1.5));

        for open in &["/", "/about"] {
            assert!(parsed.is_allowed(open));
        }
        for blocked in &["/admin", "/admin/settings", "/private/data"] {
            assert!(!parsed.is_allowed(blocked));
        }
    }

    /// A longer `Allow` pattern wins over a shorter `Disallow` pattern.
    #[test]
    fn test_allow_overrides_disallow() {
        let robots_txt = r#"
User-agent: *
Disallow: /api/
Allow: /api/public/
"#;
        let parsed = parse_robots(robots_txt, "cortex");

        let secret_ok = parsed.is_allowed("/api/secret");
        let public_ok = parsed.is_allowed("/api/public/docs");
        assert!(!secret_ok);
        assert!(public_ok);
    }
}