use crate::parser::encoding;
use crate::types::{Decision, DecisionReason, EffectiveRules, Group, Rule, RuleKind, RobotsPolicy};
use tracing::debug;
/// Evaluates a parsed robots.txt policy ([`RobotsPolicy`]) on behalf of a
/// single user-agent: selects the applicable rule groups and decides whether
/// individual paths may be fetched.
pub struct RobotsMatcher {
// User-agent token compared against each group via `Group::matches_user_agent`.
user_agent: String,
}
impl RobotsMatcher {
pub fn new(user_agent: &str) -> Self {
Self {
user_agent: user_agent.to_string(),
}
}
pub fn effective_rules(&self, policy: &RobotsPolicy) -> EffectiveRules {
let mut matched_groups: Vec<&Group> = Vec::new();
let mut wildcard_group: Option<&Group> = None;
for group in &policy.groups {
if group.is_wildcard() {
wildcard_group = Some(group);
} else if group.matches_user_agent(&self.user_agent) {
matched_groups.push(group);
}
}
if matched_groups.is_empty() {
if let Some(wg) = wildcard_group {
matched_groups.push(wg);
}
}
let mut rules: Vec<Rule> = Vec::new();
let mut crawl_delay: Option<f64> = None;
let mut request_rate = None;
let mut matched_agents: Vec<String> = Vec::new();
for group in matched_groups {
matched_agents.extend(group.user_agents.clone());
rules.extend(group.rules.clone());
if crawl_delay.is_none() && group.crawl_delay.is_some() {
crawl_delay = group.crawl_delay;
}
if request_rate.is_none() && group.request_rate.is_some() {
request_rate = group.request_rate;
}
}
EffectiveRules {
rules,
crawl_delay,
request_rate,
matched_agents,
}
}
pub fn is_allowed(&self, policy: &RobotsPolicy, path: &str) -> Decision {
if path == "/robots.txt" || path.starts_with("/robots.txt?") {
return Decision::allow(DecisionReason::RobotsTxtPath);
}
match &policy.fetch_status {
crate::types::FetchStatus::Success => {
}
crate::types::FetchStatus::NotModified => {
}
crate::types::FetchStatus::Unavailable { .. } => {
return Decision::allow(DecisionReason::RobotsUnavailable);
}
crate::types::FetchStatus::Unreachable { .. } => {
return Decision::deny(DecisionReason::RobotsUnreachable);
}
crate::types::FetchStatus::Protected { .. } => {
return Decision::deny(DecisionReason::RobotsProtected);
}
}
let effective = self.effective_rules(policy);
if effective.rules.is_empty() {
return Decision::allow(DecisionReason::NoMatchingRule);
}
let normalized_path = encoding::normalize_for_comparison(path);
self.find_best_match(&effective.rules, &normalized_path)
}
fn find_best_match(&self, rules: &[Rule], path: &str) -> Decision {
let mut best_match: Option<(&Rule, usize)> = None;
for rule in rules {
let normalized_pattern = encoding::normalize_for_comparison(&rule.pattern);
if rule.matches(path) {
let specificity = self.calculate_specificity(&normalized_pattern, path);
debug!(
"Rule {:?} {} matches {} with specificity {}",
rule.kind, rule.pattern, path, specificity
);
match best_match {
None => {
best_match = Some((rule, specificity));
}
Some((_, best_spec)) => {
if specificity > best_spec {
best_match = Some((rule, specificity));
} else if specificity == best_spec {
if let Some((best_rule, _)) = best_match {
if rule.kind == RuleKind::Allow && best_rule.kind == RuleKind::Disallow {
best_match = Some((rule, specificity));
}
}
}
}
}
}
}
match best_match {
Some((rule, _)) => {
let allowed = rule.kind == RuleKind::Allow;
let reason = if allowed {
DecisionReason::AllowRuleMatched
} else {
DecisionReason::DisallowRuleMatched
};
Decision {
allowed,
matched_rule: Some(rule.clone()),
reason,
}
}
None => {
Decision::allow(DecisionReason::NoMatchingRule)
}
}
}
fn calculate_specificity(&self, pattern: &str, _path: &str) -> usize {
pattern
.chars()
.filter(|&c| c != '*' && c != '$')
.count()
}
}
/// Convenience wrapper: builds a one-shot [`RobotsMatcher`] for `user_agent`
/// and asks whether `path` is allowed under `policy`.
pub fn is_allowed(policy: &RobotsPolicy, user_agent: &str, path: &str) -> Decision {
    RobotsMatcher::new(user_agent).is_allowed(policy, path)
}
/// Convenience wrapper: the effective crawl-delay (if any) for `user_agent`
/// under `policy`.
pub fn get_crawl_delay(policy: &RobotsPolicy, user_agent: &str) -> Option<f64> {
    RobotsMatcher::new(user_agent)
        .effective_rules(policy)
        .crawl_delay
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::RobotsParser;
    use std::time::Duration;

    /// Parses robots.txt `content` into a policy with a fixed one-hour TTL.
    fn parse_robots(content: &str) -> RobotsPolicy {
        RobotsParser::new().parse(content, Duration::from_secs(3600))
    }

    /// Shorthand for "is this path allowed for this matcher?".
    fn allowed(bot: &RobotsMatcher, policy: &RobotsPolicy, path: &str) -> bool {
        bot.is_allowed(policy, path).allowed
    }

    #[test]
    fn test_basic_disallow() {
        let policy = parse_robots(
            r#"
User-agent: *
Disallow: /private/
"#,
        );
        let bot = RobotsMatcher::new("TestBot");
        assert!(!allowed(&bot, &policy, "/private/"));
        assert!(!allowed(&bot, &policy, "/private/secret"));
        assert!(allowed(&bot, &policy, "/public/"));
    }

    #[test]
    fn test_allow_overrides() {
        // A more specific Allow carves an exception out of a Disallow.
        let policy = parse_robots(
            r#"
User-agent: *
Disallow: /private/
Allow: /private/public/
"#,
        );
        let bot = RobotsMatcher::new("TestBot");
        assert!(!allowed(&bot, &policy, "/private/"));
        assert!(!allowed(&bot, &policy, "/private/secret"));
        assert!(allowed(&bot, &policy, "/private/public/"));
        assert!(allowed(&bot, &policy, "/private/public/file.html"));
    }

    #[test]
    fn test_longest_match_wins() {
        // Rule order in the file must not matter — only specificity.
        let policy = parse_robots(
            r#"
User-agent: *
Allow: /private/public/
Disallow: /private/
"#,
        );
        let bot = RobotsMatcher::new("TestBot");
        assert!(allowed(&bot, &policy, "/private/public/"));
        assert!(!allowed(&bot, &policy, "/private/secret"));
    }

    #[test]
    fn test_equal_length_allow_wins() {
        // On a specificity tie, Allow takes precedence over Disallow.
        let policy = parse_robots(
            r#"
User-agent: *
Disallow: /path
Allow: /path
"#,
        );
        let bot = RobotsMatcher::new("TestBot");
        assert!(allowed(&bot, &policy, "/path"));
    }

    #[test]
    fn test_wildcard_pattern() {
        let policy = parse_robots(
            r#"
User-agent: *
Disallow: /*.php
"#,
        );
        let bot = RobotsMatcher::new("TestBot");
        assert!(!allowed(&bot, &policy, "/index.php"));
        assert!(!allowed(&bot, &policy, "/admin/login.php"));
        assert!(allowed(&bot, &policy, "/index.html"));
    }

    #[test]
    fn test_end_anchor() {
        // `$` anchors the pattern to the end of the path.
        let policy = parse_robots(
            r#"
User-agent: *
Disallow: /exact$
"#,
        );
        let bot = RobotsMatcher::new("TestBot");
        assert!(!allowed(&bot, &policy, "/exact"));
        assert!(allowed(&bot, &policy, "/exact/"));
        assert!(allowed(&bot, &policy, "/exact/more"));
    }

    #[test]
    fn test_specific_user_agent() {
        // A bot matching a specific group must ignore the wildcard group.
        let policy = parse_robots(
            r#"
User-agent: Googlebot
Disallow: /google-only
User-agent: *
Disallow: /admin
"#,
        );
        let googlebot = RobotsMatcher::new("Googlebot");
        let otherbot = RobotsMatcher::new("OtherBot");
        assert!(!allowed(&googlebot, &policy, "/google-only"));
        assert!(allowed(&googlebot, &policy, "/admin"));
        assert!(allowed(&otherbot, &policy, "/google-only"));
        assert!(!allowed(&otherbot, &policy, "/admin"));
    }

    #[test]
    fn test_robots_txt_always_allowed() {
        let policy = parse_robots(
            r#"
User-agent: *
Disallow: /
"#,
        );
        let bot = RobotsMatcher::new("TestBot");
        assert!(allowed(&bot, &policy, "/robots.txt"));
        assert!(!allowed(&bot, &policy, "/anything"));
    }

    #[test]
    fn test_user_agent_prefix_match() {
        // "Googlebot-Image" should fall under the "Googlebot" group.
        let policy = parse_robots(
            r#"
User-agent: Googlebot
Disallow: /
"#,
        );
        let image_bot = RobotsMatcher::new("Googlebot-Image");
        assert!(!allowed(&image_bot, &policy, "/test"));
    }

    #[test]
    fn test_crawl_delay() {
        let policy = parse_robots(
            r#"
User-agent: *
Crawl-delay: 2.5
Disallow: /admin
"#,
        );
        assert_eq!(get_crawl_delay(&policy, "TestBot"), Some(2.5));
    }

    #[test]
    fn test_no_rules_means_allowed() {
        let policy = parse_robots(
            r#"
User-agent: *
"#,
        );
        let bot = RobotsMatcher::new("TestBot");
        assert!(allowed(&bot, &policy, "/anything"));
    }

    #[test]
    fn test_empty_robots() {
        let policy = parse_robots("");
        let bot = RobotsMatcher::new("TestBot");
        assert!(allowed(&bot, &policy, "/anything"));
    }
}