halldyll_robots/
matcher.rs1use crate::parser::encoding;
4use crate::types::{Decision, DecisionReason, EffectiveRules, Group, Rule, RuleKind, RobotsPolicy};
5use tracing::debug;
6
/// Evaluates robots.txt policies on behalf of a single crawler identity.
///
/// The matcher only stores the crawler's user-agent string; the policy to
/// evaluate is supplied on each call.
#[derive(Debug, Clone)]
pub struct RobotsMatcher {
    /// User-agent string used to select the applicable groups of a policy.
    user_agent: String,
}
12
13impl RobotsMatcher {
14 pub fn new(user_agent: &str) -> Self {
16 Self {
17 user_agent: user_agent.to_string(),
18 }
19 }
20
21 pub fn effective_rules(&self, policy: &RobotsPolicy) -> EffectiveRules {
23 let mut matched_groups: Vec<&Group> = Vec::new();
24 let mut wildcard_group: Option<&Group> = None;
25
26 for group in &policy.groups {
28 if group.is_wildcard() {
29 wildcard_group = Some(group);
30 } else if group.matches_user_agent(&self.user_agent) {
31 matched_groups.push(group);
32 }
33 }
34
35 if matched_groups.is_empty() {
37 if let Some(wg) = wildcard_group {
38 matched_groups.push(wg);
39 }
40 }
41
42 let mut rules: Vec<Rule> = Vec::new();
44 let mut crawl_delay: Option<f64> = None;
45 let mut request_rate = None;
46 let mut matched_agents: Vec<String> = Vec::new();
47
48 for group in matched_groups {
49 matched_agents.extend(group.user_agents.clone());
50 rules.extend(group.rules.clone());
51 if crawl_delay.is_none() && group.crawl_delay.is_some() {
52 crawl_delay = group.crawl_delay;
53 }
54 if request_rate.is_none() && group.request_rate.is_some() {
55 request_rate = group.request_rate;
56 }
57 }
58
59 EffectiveRules {
60 rules,
61 crawl_delay,
62 request_rate,
63 matched_agents,
64 }
65 }
66
67 pub fn is_allowed(&self, policy: &RobotsPolicy, path: &str) -> Decision {
69 if path == "/robots.txt" || path.starts_with("/robots.txt?") {
71 return Decision::allow(DecisionReason::RobotsTxtPath);
72 }
73
74 match &policy.fetch_status {
76 crate::types::FetchStatus::Success => {
77 }
79 crate::types::FetchStatus::NotModified => {
80 }
83 crate::types::FetchStatus::Unavailable { .. } => {
84 return Decision::allow(DecisionReason::RobotsUnavailable);
85 }
86 crate::types::FetchStatus::Unreachable { .. } => {
87 return Decision::deny(DecisionReason::RobotsUnreachable);
88 }
89 crate::types::FetchStatus::Protected { .. } => {
90 return Decision::deny(DecisionReason::RobotsProtected);
91 }
92 }
93
94 let effective = self.effective_rules(policy);
96
97 if effective.rules.is_empty() {
98 return Decision::allow(DecisionReason::NoMatchingRule);
100 }
101
102 let normalized_path = encoding::normalize_for_comparison(path);
104
105 self.find_best_match(&effective.rules, &normalized_path)
107 }
108
109 fn find_best_match(&self, rules: &[Rule], path: &str) -> Decision {
111 let mut best_match: Option<(&Rule, usize)> = None;
112
113 for rule in rules {
114 let normalized_pattern = encoding::normalize_for_comparison(&rule.pattern);
116
117 if rule.matches(path) {
119 let specificity = self.calculate_specificity(&normalized_pattern, path);
120
121 debug!(
122 "Rule {:?} {} matches {} with specificity {}",
123 rule.kind, rule.pattern, path, specificity
124 );
125
126 match best_match {
127 None => {
128 best_match = Some((rule, specificity));
129 }
130 Some((_, best_spec)) => {
131 if specificity > best_spec {
132 best_match = Some((rule, specificity));
134 } else if specificity == best_spec {
135 if let Some((best_rule, _)) = best_match {
137 if rule.kind == RuleKind::Allow && best_rule.kind == RuleKind::Disallow {
138 best_match = Some((rule, specificity));
139 }
140 }
141 }
142 }
143 }
144 }
145 }
146
147 match best_match {
148 Some((rule, _)) => {
149 let allowed = rule.kind == RuleKind::Allow;
150 let reason = if allowed {
151 DecisionReason::AllowRuleMatched
152 } else {
153 DecisionReason::DisallowRuleMatched
154 };
155 Decision {
156 allowed,
157 matched_rule: Some(rule.clone()),
158 reason,
159 }
160 }
161 None => {
162 Decision::allow(DecisionReason::NoMatchingRule)
164 }
165 }
166 }
167
168 fn calculate_specificity(&self, pattern: &str, _path: &str) -> usize {
170 pattern
173 .chars()
174 .filter(|&c| c != '*' && c != '$')
175 .count()
176 }
177}
178
179pub fn is_allowed(policy: &RobotsPolicy, user_agent: &str, path: &str) -> Decision {
181 let matcher = RobotsMatcher::new(user_agent);
182 matcher.is_allowed(policy, path)
183}
184
185pub fn get_crawl_delay(policy: &RobotsPolicy, user_agent: &str) -> Option<f64> {
187 let matcher = RobotsMatcher::new(user_agent);
188 let effective = matcher.effective_rules(policy);
189 effective.crawl_delay
190}
191
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::RobotsParser;
    use std::time::Duration;

    /// Parses `content` with a fresh `RobotsParser`, passing a one-hour
    /// `Duration` as the parser's second argument.
    fn parse_robots(content: &str) -> RobotsPolicy {
        RobotsParser::new().parse(content, Duration::from_secs(3600))
    }

    /// Returns whether `user_agent` is allowed to fetch `path` under `policy`.
    fn permits(policy: &RobotsPolicy, user_agent: &str, path: &str) -> bool {
        RobotsMatcher::new(user_agent).is_allowed(policy, path).allowed
    }

    /// Asserts the expected allow/deny outcome for every `(path, allowed)` pair.
    fn assert_outcomes(policy: &RobotsPolicy, user_agent: &str, cases: &[(&str, bool)]) {
        for &(path, expected) in cases {
            assert_eq!(
                permits(policy, user_agent, path),
                expected,
                "unexpected decision for {} as {}",
                path,
                user_agent
            );
        }
    }

    #[test]
    fn test_basic_disallow() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /private/
"#);

        assert_outcomes(
            &policy,
            "TestBot",
            &[
                ("/private/", false),
                ("/private/secret", false),
                ("/public/", true),
            ],
        );
    }

    #[test]
    fn test_allow_overrides() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /private/
Allow: /private/public/
"#);

        assert_outcomes(
            &policy,
            "TestBot",
            &[
                ("/private/", false),
                ("/private/secret", false),
                ("/private/public/", true),
                ("/private/public/file.html", true),
            ],
        );
    }

    #[test]
    fn test_longest_match_wins() {
        // Same rules as above but in the opposite order: order must not matter.
        let policy = parse_robots(r#"
User-agent: *
Allow: /private/public/
Disallow: /private/
"#);

        assert_outcomes(
            &policy,
            "TestBot",
            &[("/private/public/", true), ("/private/secret", false)],
        );
    }

    #[test]
    fn test_equal_length_allow_wins() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /path
Allow: /path
"#);

        assert_outcomes(&policy, "TestBot", &[("/path", true)]);
    }

    #[test]
    fn test_wildcard_pattern() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /*.php
"#);

        assert_outcomes(
            &policy,
            "TestBot",
            &[
                ("/index.php", false),
                ("/admin/login.php", false),
                ("/index.html", true),
            ],
        );
    }

    #[test]
    fn test_end_anchor() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /exact$
"#);

        assert_outcomes(
            &policy,
            "TestBot",
            &[("/exact", false), ("/exact/", true), ("/exact/more", true)],
        );
    }

    #[test]
    fn test_specific_user_agent() {
        let policy = parse_robots(r#"
User-agent: Googlebot
Disallow: /google-only

User-agent: *
Disallow: /admin
"#);

        // Googlebot follows only its own group.
        assert_outcomes(
            &policy,
            "Googlebot",
            &[("/google-only", false), ("/admin", true)],
        );
        // Any other bot falls back to the wildcard group.
        assert_outcomes(
            &policy,
            "OtherBot",
            &[("/google-only", true), ("/admin", false)],
        );
    }

    #[test]
    fn test_robots_txt_always_allowed() {
        let policy = parse_robots(r#"
User-agent: *
Disallow: /
"#);

        assert_outcomes(
            &policy,
            "TestBot",
            &[("/robots.txt", true), ("/anything", false)],
        );
    }

    #[test]
    fn test_user_agent_prefix_match() {
        let policy = parse_robots(r#"
User-agent: Googlebot
Disallow: /
"#);

        assert_outcomes(&policy, "Googlebot-Image", &[("/test", false)]);
    }

    #[test]
    fn test_crawl_delay() {
        let policy = parse_robots(r#"
User-agent: *
Crawl-delay: 2.5
Disallow: /admin
"#);

        assert_eq!(get_crawl_delay(&policy, "TestBot"), Some(2.5));
    }

    #[test]
    fn test_no_rules_means_allowed() {
        let policy = parse_robots(r#"
User-agent: *
"#);

        assert_outcomes(&policy, "TestBot", &[("/anything", true)]);
    }

    #[test]
    fn test_empty_robots() {
        let policy = parse_robots("");

        assert_outcomes(&policy, "TestBot", &[("/anything", true)]);
    }
}