//! robots.txt parser (argus_robots/parser.rs).

use std::time::Duration;

/// A single robots.txt rule: an `Allow` or `Disallow` path pattern.
///
/// Patterns may contain `*` wildcards (anywhere, not just at the end) and an
/// optional trailing `$` end-of-path anchor, per the common robots.txt
/// extensions (Google Robots Exclusion Protocol).
#[derive(Debug, Clone, PartialEq)]
pub enum Rule {
    Allow(String),
    Disallow(String),
}

/// The subset of a robots.txt file that applies to one user agent: the path
/// rules of the matching group plus its optional crawl delay.
#[derive(Debug, Clone)]
pub struct RobotsTxt {
    rules: Vec<Rule>,
    crawl_delay: Option<Duration>,
}

impl RobotsTxt {
    /// Parses `content`, keeping only the group that applies to `user_agent`.
    ///
    /// Group selection is first-match in file order: the first group whose
    /// `User-agent` line is `*` or overlaps `user_agent` case-insensitively
    /// is used. Consecutive `User-agent` lines share one group of rules.
    /// If the file declares no group at all, everything is allowed.
    pub fn parse(content: &str, user_agent: &str) -> Self {
        let mut rules = Vec::new();
        let mut crawl_delay = None;
        let mut in_matching_section = false;
        let mut in_any_section = false;
        // True once a rule line has been seen since the last User-agent line;
        // distinguishes "another agent joins this group" from "a new group".
        let mut saw_rule_line = false;
        // Lowercase once, not per User-agent line.
        let ua_lower = user_agent.to_lowercase();

        for line in content.lines() {
            let line = line.trim();

            if line.is_empty() || line.starts_with('#') {
                continue;
            }

            // Strip inline comments before parsing the key/value pair.
            let line = if let Some(pos) = line.find('#') {
                line[..pos].trim()
            } else {
                line
            };

            let (key, value) = match line.split_once(':') {
                Some((k, v)) => (k.trim().to_lowercase(), v.trim()),
                None => continue,
            };

            match key.as_str() {
                "user-agent" => {
                    if saw_rule_line {
                        // A User-agent line *after* rules starts a new group.
                        if in_matching_section {
                            // Our group is complete; later groups don't apply.
                            break;
                        }
                        saw_rule_line = false;
                        in_matching_section = false;
                    }
                    in_any_section = true;
                    let agent_pattern = value.to_lowercase();
                    // OR across consecutive User-agent lines of one group, so
                    // a non-matching agent cannot cancel an earlier match.
                    in_matching_section = in_matching_section
                        || agent_pattern == "*"
                        || ua_lower.contains(&agent_pattern)
                        || agent_pattern.contains(&ua_lower);
                }
                "allow" | "disallow" | "crawl-delay" => {
                    // Any rule line ends the User-agent run for this group,
                    // whether or not the group applies to us.
                    saw_rule_line = true;
                    if in_matching_section {
                        match key.as_str() {
                            "allow" => rules.push(Rule::Allow(value.to_string())),
                            "disallow" => rules.push(Rule::Disallow(value.to_string())),
                            _ => {
                                // Crawl-delay: reject negative or non-finite
                                // values, which make Duration::from_secs_f64
                                // panic on malicious/malformed input.
                                if let Ok(seconds) = value.parse::<f64>() {
                                    if seconds.is_finite() && seconds >= 0.0 {
                                        crawl_delay =
                                            Some(Duration::from_secs_f64(seconds));
                                    }
                                }
                            }
                        }
                    }
                }
                _ => {}
            }
        }

        if !in_any_section {
            // No user-agent groups at all: treat the site as fully open.
            rules.push(Rule::Allow("/".to_string()));
        }

        Self { rules, crawl_delay }
    }

    /// Returns `true` if `path` may be fetched under the parsed rules.
    ///
    /// Longest-match convention: the matching rule with the longest pattern
    /// decides; on a tie, `Allow` wins (least restrictive, per the REP).
    /// A path matched by no rule is allowed.
    pub fn is_allowed(&self, path: &str) -> bool {
        let mut best_allow = 0usize;
        let mut best_disallow = 0usize;

        for rule in &self.rules {
            let (is_allow, pattern) = match rule {
                Rule::Allow(p) => (true, p),
                Rule::Disallow(p) => (false, p),
            };

            // An empty pattern matches nothing ("Disallow:" = allow all).
            if pattern.is_empty() || !self.matches_pattern(path, pattern) {
                continue;
            }

            if is_allow {
                best_allow = best_allow.max(pattern.len());
            } else {
                best_disallow = best_disallow.max(pattern.len());
            }
        }

        // No match on either side yields 0 >= 0, i.e. allowed.
        best_allow >= best_disallow
    }

    /// Matches `path` against a robots.txt `pattern`.
    ///
    /// Supported syntax: literal prefix match, `*` matching any run of
    /// characters anywhere in the pattern, and a trailing `$` anchoring the
    /// match to the end of the path.
    fn matches_pattern(&self, path: &str, pattern: &str) -> bool {
        // "/" matches every path (kept as an explicit fast path).
        if pattern == "/" {
            return true;
        }

        let (body, anchored) = match pattern.strip_suffix('$') {
            Some(stripped) => (stripped, true),
            None => (pattern, false),
        };

        let mut segments = body.split('*');
        // The part before the first '*' must match at the start of the path.
        let head = segments.next().unwrap_or("");
        if !path.starts_with(head) {
            return false;
        }
        let mut rest = &path[head.len()..];

        let mut tail: Vec<&str> = segments.collect();
        if tail.is_empty() {
            // No wildcard: plain prefix match, or exact match when anchored.
            return !anchored || rest.is_empty();
        }

        // Greedy wildcard match: consume each middle segment at its earliest
        // occurrence, which leaves maximum room for the final segment.
        let last = tail.pop().unwrap();
        for seg in tail {
            match rest.find(seg) {
                Some(i) => rest = &rest[i + seg.len()..],
                None => return false,
            }
        }

        if anchored {
            // The final segment must sit at the very end of the path.
            rest.ends_with(last)
        } else {
            rest.contains(last)
        }
    }

    /// The `Crawl-delay` of the matching group, if one was given.
    pub fn crawl_delay(&self) -> Option<Duration> {
        self.crawl_delay
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_empty_robots_txt() {
        // With no content at all, every path is fetchable.
        let rt = RobotsTxt::parse("", "TestBot");
        assert!(rt.is_allowed("/"));
        assert!(rt.is_allowed("/anything"));
    }

    #[test]
    fn parse_wildcard_user_agent() {
        // A "*" group applies to any agent; Allow carves out an exception.
        let body = "\nUser-agent: *\nDisallow: /admin/\nAllow: /admin/public\n";
        let rt = RobotsTxt::parse(body, "TestBot");
        assert!(!rt.is_allowed("/admin/"));
        assert!(!rt.is_allowed("/admin/secret"));
        assert!(rt.is_allowed("/admin/public"));
        assert!(rt.is_allowed("/"));
    }

    #[test]
    fn parse_specific_user_agent() {
        // Only the group naming our agent should be picked up.
        let body =
            "\nUser-agent: BadBot\nDisallow: /\n\nUser-agent: GoodBot\nDisallow: /private/\n";
        let rt = RobotsTxt::parse(body, "GoodBot");
        assert!(!rt.is_allowed("/private/"));
        assert!(rt.is_allowed("/public/"));
    }

    #[test]
    fn parse_crawl_delay() {
        // Fractional crawl delays parse into a Duration.
        let body = "\nUser-agent: *\nCrawl-delay: 2.5\n";
        let rt = RobotsTxt::parse(body, "TestBot");
        assert_eq!(rt.crawl_delay(), Some(Duration::from_secs_f64(2.5)));
    }

    #[test]
    fn pattern_matching_wildcard() {
        // A trailing "*" makes the pattern a prefix of arbitrary length.
        let body = "\nUser-agent: *\nDisallow: /temp*\n";
        let rt = RobotsTxt::parse(body, "TestBot");
        assert!(!rt.is_allowed("/temp"));
        assert!(!rt.is_allowed("/temporary"));
        assert!(rt.is_allowed("/other"));
    }

    #[test]
    fn pattern_matching_end_anchor() {
        // "$" anchors the pattern to the end of the path.
        let body = "\nUser-agent: *\nDisallow: /file.html$\n";
        let rt = RobotsTxt::parse(body, "TestBot");
        assert!(!rt.is_allowed("/file.html"));
        assert!(rt.is_allowed("/file.html?query=1"));
    }

    #[test]
    fn longest_match_wins() {
        // The rule with the longest matching pattern decides.
        let body = "\nUser-agent: *\nDisallow: /admin/\nAllow: /admin/public/\n";
        let rt = RobotsTxt::parse(body, "TestBot");
        assert!(!rt.is_allowed("/admin/"));
        assert!(!rt.is_allowed("/admin/secret"));
        assert!(rt.is_allowed("/admin/public/"));
        assert!(rt.is_allowed("/admin/public/page"));
    }

    #[test]
    fn case_insensitive_user_agent() {
        // User-agent comparison ignores case on both sides.
        let body = "\nUser-agent: googlebot\nDisallow: /private/\n";
        let rt = RobotsTxt::parse(body, "GoogleBot");
        assert!(!rt.is_allowed("/private/"));
    }

    #[test]
    fn comments_ignored() {
        // Full-line and inline "#" comments must not affect parsing.
        let body = "\n# This is a comment\nUser-agent: *\nDisallow: /admin/ # inline comment\n";
        let rt = RobotsTxt::parse(body, "TestBot");
        assert!(!rt.is_allowed("/admin/"));
    }
}