// hpx_browser/net/robots.rs
use scc::HashMap as SccHashMap;
2
/// Directives collected from a `robots.txt` document.
///
/// `PartialEq`/`Eq` are derived so that parsed rule sets can be compared
/// directly (for example with `assert_eq!`).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RobotsRules {
    /// Value recorded from the document's `User-agent` lines.
    pub user_agent: String,
    /// Path patterns taken from `Allow` lines.
    pub allow: Vec<String>,
    /// Path patterns taken from `Disallow` lines.
    pub disallow: Vec<String>,
    /// URLs taken from `Sitemap` lines.
    pub sitemaps: Vec<String>,
}
10
11pub struct RobotsCache {
12 cache: SccHashMap<String, RobotsRules>,
13}
14
15impl RobotsCache {
16 pub fn new() -> Self {
17 Self {
18 cache: SccHashMap::new(),
19 }
20 }
21
22 pub fn parse(content: &str) -> RobotsRules {
24 let mut rules = RobotsRules {
25 user_agent: "*".to_string(),
26 ..RobotsRules::default()
27 };
28
29 for line in content.lines() {
30 let line = line.trim();
31 if line.is_empty() || line.starts_with('#') {
32 continue;
33 }
34
35 let Some((key, value)) = line.split_once(':') else {
36 continue;
37 };
38
39 let key = key.trim().to_ascii_lowercase();
40 let value = value.trim();
41
42 if value.is_empty() {
43 continue;
44 }
45
46 match key.as_str() {
47 "user-agent" => {
48 if rules.user_agent == "*" {
49 rules.user_agent = value.to_string();
50 }
51 }
52 "allow" => rules.allow.push(value.to_string()),
53 "disallow" => rules.disallow.push(value.to_string()),
54 "sitemap" => rules.sitemaps.push(value.to_string()),
55 _ => {}
56 }
57 }
58
59 rules.allow.sort_by_key(|b| std::cmp::Reverse(b.len()));
61 rules.disallow.sort_by_key(|b| std::cmp::Reverse(b.len()));
62
63 rules
64 }
65
66 pub fn is_allowed(rules: &RobotsRules, path: &str) -> bool {
70 let disallow_match = rules.disallow.iter().find(|d| path.starts_with(d.as_str()));
72
73 let allow_match = rules.allow.iter().find(|a| path.starts_with(a.as_str()));
75
76 match (disallow_match, allow_match) {
79 (Some(dis), Some(al)) => al.len() >= dis.len(),
80 (Some(_), None) => false,
81 _ => true,
82 }
83 }
84}
85
#[cfg(test)]
mod tests {
    use super::*;

    /// Comment lines and blank lines contribute no rules.
    #[test]
    fn parse_comments_and_blanks() {
        let rules =
            RobotsCache::parse("# comment\n\nUser-agent: *\nDisallow: /admin\n# another\n");
        assert_eq!(rules.disallow, ["/admin"]);
        assert_eq!(rules.allow, Vec::<String>::new());
    }

    /// Every `Allow` / `Disallow` line in the document is recorded.
    #[test]
    fn parse_multiple_disallow_and_allow() {
        let rules = RobotsCache::parse(
            "User-agent: *\nDisallow: /tmp\nDisallow: /private\nAllow: /tmp/pub\n",
        );
        for expected in ["/tmp", "/private"] {
            assert!(rules.disallow.iter().any(|entry| entry == expected));
        }
        assert!(rules.allow.iter().any(|entry| entry == "/tmp/pub"));
    }

    /// A sitemap URL keeps its scheme even though it contains `:`.
    #[test]
    fn parse_sitemap() {
        let rules =
            RobotsCache::parse("User-agent: *\nSitemap: https://example.com/sitemap.xml\n");
        assert_eq!(rules.sitemaps, ["https://example.com/sitemap.xml"]);
    }

    /// A `Disallow` rule blocks the path itself and everything beneath it.
    #[test]
    fn disallow_blocks_path_prefix() {
        let rules = RobotsCache::parse("User-agent: *\nDisallow: /admin\n");
        for blocked in ["/admin", "/admin/settings"] {
            assert!(!RobotsCache::is_allowed(&rules, blocked));
        }
        assert!(RobotsCache::is_allowed(&rules, "/public"));
    }

    /// A longer `Allow` rule wins over a shorter `Disallow` rule.
    #[test]
    fn allow_overrides_disallow_when_more_specific() {
        let rules = RobotsCache::parse("User-agent: *\nDisallow: /tmp\nAllow: /tmp/pub\n");
        assert!(!RobotsCache::is_allowed(&rules, "/tmp/secret"));
        for permitted in ["/tmp/pub", "/tmp/pub/file.txt"] {
            assert!(RobotsCache::is_allowed(&rules, permitted));
        }
    }

    /// A `Disallow` line with no value restricts nothing.
    #[test]
    fn disallow_empty_means_allow_all() {
        let rules = RobotsCache::parse("User-agent: *\nDisallow:\n");
        assert!(RobotsCache::is_allowed(&rules, "/anything"));
    }

    /// An empty document allows every path.
    #[test]
    fn default_allows_everything() {
        let rules = RobotsCache::parse("");
        assert!(RobotsCache::is_allowed(&rules, "/whatever"));
    }

    /// Rules stored in the cache can be read back under the same key.
    #[test]
    fn cache_store_and_get() {
        let store = RobotsCache::new();
        let parsed = RobotsCache::parse("User-agent: *\nDisallow: /secret\n");
        let _ = store
            .cache
            .insert_sync("example.com".to_string(), parsed.clone());
        let found = store.cache.get_sync("example.com").unwrap();
        assert_eq!(found.disallow, ["/secret"]);
    }
}