Skip to main content

hpx_browser/net/
robots.rs

1use scc::HashMap as SccHashMap;
2
/// Directives collected from a robots.txt file for a single user agent.
///
/// `PartialEq`/`Eq` are derived so whole rule sets can be compared directly
/// (for example with `assert_eq!`).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RobotsRules {
    /// User-agent token the rules were collected for.
    pub user_agent: String,
    /// Path patterns taken from `Allow` directives.
    pub allow: Vec<String>,
    /// Path patterns taken from `Disallow` directives.
    pub disallow: Vec<String>,
    /// Values of `Sitemap` directives.
    pub sitemaps: Vec<String>,
}
10
/// Cache of parsed [`RobotsRules`], stored in an `scc::HashMap` keyed by `String`.
///
/// NOTE(review): the key looks like a host name (the unit tests insert under
/// `"example.com"`) — confirm against the callers that populate the cache.
pub struct RobotsCache {
    // Parsed rules per key. The field is private, so only this module (and its
    // child `tests` module) can read or write entries directly.
    cache: SccHashMap<String, RobotsRules>,
}
14
15impl RobotsCache {
16    pub fn new() -> Self {
17        Self {
18            cache: SccHashMap::new(),
19        }
20    }
21
22    /// Parse robots.txt content into rules. Handles `User-agent: *` only.
23    pub fn parse(content: &str) -> RobotsRules {
24        let mut rules = RobotsRules {
25            user_agent: "*".to_string(),
26            ..RobotsRules::default()
27        };
28
29        for line in content.lines() {
30            let line = line.trim();
31            if line.is_empty() || line.starts_with('#') {
32                continue;
33            }
34
35            let Some((key, value)) = line.split_once(':') else {
36                continue;
37            };
38
39            let key = key.trim().to_ascii_lowercase();
40            let value = value.trim();
41
42            if value.is_empty() {
43                continue;
44            }
45
46            match key.as_str() {
47                "user-agent" => {
48                    if rules.user_agent == "*" {
49                        rules.user_agent = value.to_string();
50                    }
51                }
52                "allow" => rules.allow.push(value.to_string()),
53                "disallow" => rules.disallow.push(value.to_string()),
54                "sitemap" => rules.sitemaps.push(value.to_string()),
55                _ => {}
56            }
57        }
58
59        // Sort longer (more specific) paths first so matching is deterministic.
60        rules.allow.sort_by_key(|b| std::cmp::Reverse(b.len()));
61        rules.disallow.sort_by_key(|b| std::cmp::Reverse(b.len()));
62
63        rules
64    }
65
66    /// Check if `path` is allowed under the given rules.
67    /// Returns `true` (allowed) by default; explicit disallow blocks.
68    /// More-specific rules (longer prefix) win over less-specific ones.
69    pub fn is_allowed(rules: &RobotsRules, path: &str) -> bool {
70        // Find the longest matching disallow rule.
71        let disallow_match = rules.disallow.iter().find(|d| path.starts_with(d.as_str()));
72
73        // Find the longest matching allow rule.
74        let allow_match = rules.allow.iter().find(|a| path.starts_with(a.as_str()));
75
76        // Both lists are sorted longest-first, so the first match is the most
77        // specific. If allow is more specific (or equal), it wins.
78        match (disallow_match, allow_match) {
79            (Some(dis), Some(al)) => al.len() >= dis.len(),
80            (Some(_), None) => false,
81            _ => true,
82        }
83    }
84}
85
#[cfg(test)]
mod tests {
    use super::*;

    /// Comment lines and blank lines around directives are skipped.
    #[test]
    fn parse_comments_and_blanks() {
        let input = "# comment\n\nUser-agent: *\nDisallow: /admin\n# another\n";
        let rules = RobotsCache::parse(input);

        assert!(rules.allow.is_empty());
        assert_eq!(rules.disallow, ["/admin"]);
    }

    /// Every `Allow` / `Disallow` line of the group ends up in its list.
    #[test]
    fn parse_multiple_disallow_and_allow() {
        let input = "User-agent: *\nDisallow: /tmp\nDisallow: /private\nAllow: /tmp/pub\n";
        let rules = RobotsCache::parse(input);

        for expected in ["/tmp", "/private"] {
            assert!(rules.disallow.iter().any(|pattern| pattern == expected));
        }
        assert!(rules.allow.iter().any(|pattern| pattern == "/tmp/pub"));
    }

    /// The sitemap URL is kept whole even though it contains a colon.
    #[test]
    fn parse_sitemap() {
        let input = "User-agent: *\nSitemap: https://example.com/sitemap.xml\n";
        let rules = RobotsCache::parse(input);

        assert_eq!(rules.sitemaps, ["https://example.com/sitemap.xml"]);
    }

    /// A disallow pattern blocks the path itself and everything below it.
    #[test]
    fn disallow_blocks_path_prefix() {
        let rules = RobotsCache::parse("User-agent: *\nDisallow: /admin\n");

        let cases = [("/admin", false), ("/admin/settings", false), ("/public", true)];
        for (path, allowed) in cases {
            assert_eq!(RobotsCache::is_allowed(&rules, path), allowed);
        }
    }

    /// A longer allow pattern beats a shorter disallow pattern.
    #[test]
    fn allow_overrides_disallow_when_more_specific() {
        let rules = RobotsCache::parse("User-agent: *\nDisallow: /tmp\nAllow: /tmp/pub\n");

        let cases = [
            ("/tmp/secret", false),
            ("/tmp/pub", true),
            ("/tmp/pub/file.txt", true),
        ];
        for (path, allowed) in cases {
            assert_eq!(RobotsCache::is_allowed(&rules, path), allowed);
        }
    }

    /// `Disallow:` with no value blocks nothing.
    #[test]
    fn disallow_empty_means_allow_all() {
        let rules = RobotsCache::parse("User-agent: *\nDisallow:\n");

        assert!(RobotsCache::is_allowed(&rules, "/anything"));
    }

    /// With no directives at all, every path is allowed.
    #[test]
    fn default_allows_everything() {
        let rules = RobotsCache::parse("");

        assert!(RobotsCache::is_allowed(&rules, "/whatever"));
    }

    /// Rules inserted into the cache can be read back under the same key.
    #[test]
    fn cache_store_and_get() {
        let parsed = RobotsCache::parse("User-agent: *\nDisallow: /secret\n");
        let store = RobotsCache::new();

        let _ = store
            .cache
            .insert_sync("example.com".to_string(), parsed.clone());

        let stored = store.cache.get_sync("example.com").unwrap();
        assert_eq!(stored.disallow, ["/secret"]);
    }
}