use serde::{Deserialize, Serialize};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use url::Url;
/// Cache key identifying the robots.txt scope of a single origin.
///
/// A robots.txt policy applies per scheme + authority, so two URLs share a
/// cached policy exactly when their keys compare equal. Both components are
/// lowercased by `RobotsCacheKey::from_url`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct RobotsCacheKey {
/// Lowercased URL scheme, e.g. "https".
pub scheme: String,
/// Lowercased host, plus ":port" when the URL carries an explicit port.
pub authority: String,
}
/// A request-rate directive value: at most `requests` requests per
/// `seconds`-second window (e.g. "Request-rate: 1/10").
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct RequestRate {
/// Number of requests permitted per window.
pub requests: u32,
/// Window length in seconds.
pub seconds: u32,
}
impl RequestRate {
    /// Creates a rate of `requests` requests per `seconds` seconds.
    pub fn new(requests: u32, seconds: u32) -> Self {
        Self { requests, seconds }
    }

    /// Seconds to wait between consecutive requests.
    ///
    /// A degenerate zero-request rate means "never request", expressed as
    /// `f64::MAX` rather than a division by zero.
    pub fn delay_seconds(&self) -> f64 {
        if self.requests == 0 {
            f64::MAX
        } else {
            self.seconds as f64 / self.requests as f64
        }
    }

    /// Inter-request delay as a `Duration`.
    ///
    /// Fix: the previous implementation fed `delay_seconds()` straight into
    /// `Duration::from_secs_f64`, which panics when the value does not fit in
    /// a `Duration` — so a zero-request rate panicked. That case now
    /// saturates to `Duration::MAX` ("effectively never").
    pub fn delay(&self) -> Duration {
        if self.requests == 0 {
            Duration::MAX
        } else {
            // Max finite value here is u32::MAX seconds, which always fits.
            Duration::from_secs_f64(self.delay_seconds())
        }
    }
}
impl RobotsCacheKey {
    /// Builds a cache key from `url`, lowercasing scheme and authority.
    ///
    /// Returns `None` for URLs without a host component (e.g. `data:` URLs).
    /// The port is included only when it is explicit in the URL.
    pub fn from_url(url: &Url) -> Option<Self> {
        let host = url.host_str()?;
        let authority = url
            .port()
            .map(|p| format!("{}:{}", host, p))
            .unwrap_or_else(|| host.to_string());
        Some(Self {
            scheme: url.scheme().to_lowercase(),
            authority: authority.to_lowercase(),
        })
    }

    /// The well-known robots.txt URL for this origin.
    pub fn robots_url(&self) -> String {
        format!("{}://{}/robots.txt", self.scheme, self.authority)
    }
}
/// Outcome of the most recent robots.txt fetch for an origin.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum FetchStatus {
/// Fetched and parsed successfully.
Success,
/// Server reported the cached copy is still current (see `is_not_modified`
/// and `RobotsPolicy::not_modified`).
NotModified,
/// robots.txt answered with an error status (e.g. 404). Treated as
/// allow-all — see `allows_all`.
Unavailable {
status_code: u16,
},
/// robots.txt could not be fetched at all (network-level failure).
/// Treated as deny-all — see `denies_all`.
Unreachable {
reason: String,
},
/// robots.txt answered with an access-protection status. Treated as
/// deny-all — see `denies_all`.
Protected {
status_code: u16,
},
}
impl FetchStatus {
    /// Whether this outcome means "assume everything is allowed":
    /// robots.txt responded but was unavailable (e.g. 404).
    pub fn allows_all(&self) -> bool {
        match self {
            FetchStatus::Unavailable { .. } => true,
            _ => false,
        }
    }

    /// Whether this outcome means "assume everything is denied":
    /// robots.txt was unreachable or access-protected.
    pub fn denies_all(&self) -> bool {
        match self {
            FetchStatus::Unreachable { .. } | FetchStatus::Protected { .. } => true,
            _ => false,
        }
    }

    /// True when the server reported the cached copy is still current.
    pub fn is_not_modified(&self) -> bool {
        *self == FetchStatus::NotModified
    }
}
/// A single Allow/Disallow rule: a path pattern plus its kind.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Rule {
/// Whether a match allows or disallows the path.
pub kind: RuleKind,
/// The raw pattern text as it appeared in robots.txt.
pub pattern: String,
/// Pre-compiled form of `pattern`. Skipped by serde, so deserialized
/// rules hold `None`; `Rule::matches` recompiles on the fly in that case.
#[serde(skip)]
pub compiled: Option<CompiledPattern>,
}
impl Rule {
    /// Builds a rule and eagerly compiles its pattern.
    pub fn new(kind: RuleKind, pattern: String) -> Self {
        let compiled = Some(CompiledPattern::compile(&pattern));
        Self { kind, pattern, compiled }
    }

    /// Tests `path` against this rule's pattern.
    ///
    /// Rules restored via serde carry no compiled pattern (the field is
    /// `#[serde(skip)]`), so the pattern is compiled on demand then.
    pub fn matches(&self, path: &str) -> bool {
        if let Some(compiled) = &self.compiled {
            compiled.matches(path)
        } else {
            CompiledPattern::compile(&self.pattern).matches(path)
        }
    }

    /// Pattern specificity: the number of non-wildcard characters, used to
    /// prefer the most specific rule when several match.
    pub fn specificity(&self) -> usize {
        self.pattern.chars().filter(|&c| c != '*').count()
    }
}
/// Whether a matching rule permits or forbids crawling the path.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RuleKind {
/// "Allow:" directive.
Allow,
/// "Disallow:" directive.
Disallow,
}
/// A robots.txt path pattern pre-parsed into literal and wildcard segments.
///
/// Supports the two RFC 9309 metacharacters: `*` matches any (possibly
/// empty) character sequence, and a trailing `$` anchors the pattern to the
/// end of the path. A pattern without `$` is a prefix match.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CompiledPattern {
    /// Alternating literal/wildcard segments; never two wildcards adjacent.
    segments: Vec<PatternSegment>,
    /// True when the source pattern ended with `$`.
    anchored_end: bool,
}

/// One parsed piece of a pattern.
#[derive(Debug, Clone, PartialEq, Eq)]
enum PatternSegment {
    /// A run of literal characters that must match exactly.
    Literal(String),
    /// `*`: matches zero or more characters.
    Wildcard,
}

impl CompiledPattern {
    /// Parses `pattern` into segments.
    ///
    /// A final `$` sets the end anchor ('$' is ASCII, so the byte-slice
    /// below is always on a char boundary). Runs of consecutive `*`
    /// collapse into a single wildcard segment.
    pub fn compile(pattern: &str) -> Self {
        let anchored_end = pattern.ends_with('$');
        let pattern = if anchored_end {
            &pattern[..pattern.len() - 1]
        } else {
            pattern
        };
        let mut segments = Vec::new();
        let mut current = String::new();
        for c in pattern.chars() {
            if c == '*' {
                if !current.is_empty() {
                    segments.push(PatternSegment::Literal(std::mem::take(&mut current)));
                }
                // Collapse "**" into one wildcard.
                if !matches!(segments.last(), Some(PatternSegment::Wildcard)) {
                    segments.push(PatternSegment::Wildcard);
                }
            } else {
                current.push(c);
            }
        }
        if !current.is_empty() {
            segments.push(PatternSegment::Literal(current));
        }
        Self { segments, anchored_end }
    }

    /// Returns true when `path` matches this pattern.
    pub fn matches(&self, path: &str) -> bool {
        self.matches_recursive(path, 0)
    }

    /// Matches `remaining` against the segments starting at `segment_idx`,
    /// backtracking over wildcard lengths.
    fn matches_recursive(&self, remaining: &str, segment_idx: usize) -> bool {
        if segment_idx >= self.segments.len() {
            // All segments consumed. Without an end anchor this is a prefix
            // match and any remainder is fine; with `$` the path must be
            // fully consumed.
            return !self.anchored_end || remaining.is_empty();
        }
        match &self.segments[segment_idx] {
            PatternSegment::Literal(lit) => match remaining.strip_prefix(lit.as_str()) {
                Some(rest) => self.matches_recursive(rest, segment_idx + 1),
                None => false,
            },
            PatternSegment::Wildcard => {
                if segment_idx + 1 >= self.segments.len() {
                    // Bug fix: a trailing wildcard consumes everything, so it
                    // matches regardless of the end anchor — "/fish*$" is
                    // equivalent to "/fish*". The old code required an empty
                    // remainder when anchored, wrongly rejecting "/fishbar".
                    return true;
                }
                // Try every possible wildcard length. Bug fix: iterate char
                // boundaries instead of raw byte offsets (0..=len), which
                // panicked on multi-byte UTF-8 paths when slicing mid-char.
                for (i, _) in remaining.char_indices() {
                    if self.matches_recursive(&remaining[i..], segment_idx + 1) {
                        return true;
                    }
                }
                // Finally, let the wildcard swallow the entire remainder.
                self.matches_recursive("", segment_idx + 1)
            }
        }
    }
}
/// One user-agent group from a robots.txt file: the agent tokens it applies
/// to, its rules, and its optional rate directives.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Group {
/// User-agent names this group targets; "*" is the catch-all
/// (see `matches_user_agent` / `is_wildcard`).
pub user_agents: Vec<String>,
/// Allow/Disallow rules in this group.
pub rules: Vec<Rule>,
/// Crawl-delay value in seconds, if present.
/// NOTE(review): units inferred from `EffectiveRules::effective_delay`
/// using `Duration::from_secs_f64` — confirm against the parser.
pub crawl_delay: Option<f64>,
/// Request-rate directive, if present.
pub request_rate: Option<RequestRate>,
}
impl Group {
    /// Case-insensitively tests whether this group applies to the crawler
    /// product token. A group applies when one of its user-agent names is
    /// "*" or is a prefix of the token (so "Googlebot" also covers
    /// "Googlebot-Image").
    pub fn matches_user_agent(&self, token: &str) -> bool {
        let token = token.to_lowercase();
        for ua in &self.user_agents {
            let ua = ua.to_lowercase();
            if ua == "*" || token.starts_with(&ua) {
                return true;
            }
        }
        false
    }

    /// True when this is the catch-all ("User-agent: *") group.
    pub fn is_wildcard(&self) -> bool {
        self.user_agents.iter().any(|ua| ua.as_str() == "*")
    }
}
/// A cached robots.txt policy for one origin: the fetch outcome, parsed
/// groups, and cache-expiry bookkeeping.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RobotsPolicy {
/// When the policy was fetched, in milliseconds since the Unix epoch.
pub fetched_at_ms: u64,
/// When the cached policy expires, in milliseconds since the Unix epoch
/// (see `is_expired` / `ttl` / `extend_ttl`).
pub expires_at_ms: u64,
/// Outcome of the fetch that produced this policy.
pub fetch_status: FetchStatus,
/// Parsed user-agent groups; empty for the error-status constructors.
pub groups: Vec<Group>,
/// Sitemap URLs listed in the file.
pub sitemaps: Vec<String>,
/// Size of the fetched robots.txt body in bytes; 0 when nothing was parsed.
pub content_size: usize,
/// ETag response header, kept for conditional refetches.
/// NOTE(review): usage inferred from `FetchStatus::NotModified` — confirm
/// against the fetcher.
#[serde(skip_serializing_if = "Option::is_none")]
pub etag: Option<String>,
/// Last-Modified response header, kept for conditional refetches.
#[serde(skip_serializing_if = "Option::is_none")]
pub last_modified: Option<String>,
}
impl RobotsPolicy {
fn now_millis() -> u64 {
SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_millis() as u64
}
pub fn unavailable(status_code: u16, ttl: Duration) -> Self {
let now = Self::now_millis();
Self {
fetched_at_ms: now,
expires_at_ms: now + ttl.as_millis() as u64,
fetch_status: FetchStatus::Unavailable { status_code },
groups: Vec::new(),
sitemaps: Vec::new(),
content_size: 0,
etag: None,
last_modified: None,
}
}
pub fn unreachable(reason: String, ttl: Duration) -> Self {
let now = Self::now_millis();
Self {
fetched_at_ms: now,
expires_at_ms: now + ttl.as_millis() as u64,
fetch_status: FetchStatus::Unreachable { reason },
groups: Vec::new(),
sitemaps: Vec::new(),
content_size: 0,
etag: None,
last_modified: None,
}
}
pub fn protected(status_code: u16, ttl: Duration) -> Self {
let now = Self::now_millis();
Self {
fetched_at_ms: now,
expires_at_ms: now + ttl.as_millis() as u64,
fetch_status: FetchStatus::Protected { status_code },
groups: Vec::new(),
sitemaps: Vec::new(),
content_size: 0,
etag: None,
last_modified: None,
}
}
pub fn not_modified(ttl: Duration) -> Self {
let now = Self::now_millis();
Self {
fetched_at_ms: now,
expires_at_ms: now + ttl.as_millis() as u64,
fetch_status: FetchStatus::NotModified,
groups: Vec::new(),
sitemaps: Vec::new(),
content_size: 0,
etag: None,
last_modified: None,
}
}
pub fn extend_ttl(&mut self, ttl: Duration) {
self.expires_at_ms = Self::now_millis() + ttl.as_millis() as u64;
}
pub fn is_expired(&self) -> bool {
Self::now_millis() > self.expires_at_ms
}
pub fn ttl(&self) -> Duration {
let now = Self::now_millis();
if self.expires_at_ms > now {
Duration::from_millis(self.expires_at_ms - now)
} else {
Duration::ZERO
}
}
}
/// The result of checking one URL against the effective rules: allow or
/// deny, plus the reason and (optionally) the rule that decided it.
#[derive(Debug, Clone)]
pub struct Decision {
/// True when fetching the URL is permitted.
pub allowed: bool,
/// The rule that produced the verdict, when one matched (see `with_rule`).
pub matched_rule: Option<Rule>,
/// Why the verdict was reached.
pub reason: DecisionReason,
}
impl Decision {
    /// An allow verdict with no matched rule attached.
    pub fn allow(reason: DecisionReason) -> Self {
        Self { allowed: true, matched_rule: None, reason }
    }

    /// A deny verdict with no matched rule attached.
    pub fn deny(reason: DecisionReason) -> Self {
        Self { allowed: false, matched_rule: None, reason }
    }

    /// Builder-style attachment of the rule that decided the verdict.
    pub fn with_rule(mut self, rule: Rule) -> Self {
        self.matched_rule = Some(rule);
        self
    }
}
/// Why a `Decision` came out the way it did.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DecisionReason {
/// robots.txt was unavailable (e.g. 404) at decision time.
RobotsUnavailable,
/// robots.txt could not be fetched at decision time.
RobotsUnreachable,
/// robots.txt was access-protected at decision time.
RobotsProtected,
/// No rule in the effective group matched the path.
NoMatchingRule,
/// An Allow rule matched the path.
AllowRuleMatched,
/// A Disallow rule matched the path.
DisallowRuleMatched,
/// The requested path is robots.txt itself.
/// NOTE(review): inferred from the name — confirm against the checker.
RobotsTxtPath,
/// robots.txt enforcement is turned off (see `RobotsConfig::respect_robots`).
RobotsDisabled,
}
/// The rule set that applies to one crawler after user-agent group
/// selection has been resolved.
#[derive(Debug, Clone)]
pub struct EffectiveRules {
/// All applicable Allow/Disallow rules.
pub rules: Vec<Rule>,
/// Crawl-delay in seconds, if any group supplied one.
pub crawl_delay: Option<f64>,
/// Request-rate directive, if any group supplied one.
pub request_rate: Option<RequestRate>,
/// The user-agent names whose groups contributed to this set.
pub matched_agents: Vec<String>,
}
impl EffectiveRules {
    /// Rules for a crawler nothing matched: no rules, no delays.
    pub fn empty() -> Self {
        Self {
            rules: Vec::new(),
            crawl_delay: None,
            request_rate: None,
            matched_agents: Vec::new(),
        }
    }

    /// The delay to honor between requests, if any directive supplied one.
    /// `crawl_delay` wins over `request_rate` when both are present.
    ///
    /// Fix: `Duration::from_secs_f64` panics on negative, NaN, infinite, or
    /// Duration-overflowing input, and `crawl_delay` originates from parsed
    /// robots.txt text, so a hostile or malformed file could crash the
    /// crawler. Out-of-range values are now skipped; likewise a
    /// zero-request rate (whose `delay()` would overflow) is ignored.
    pub fn effective_delay(&self) -> Option<Duration> {
        if let Some(delay) = self.crawl_delay {
            // Only finite, non-negative, Duration-representable values.
            if delay.is_finite() && delay >= 0.0 && delay < u64::MAX as f64 {
                return Some(Duration::from_secs_f64(delay));
            }
        }
        if let Some(rate) = self.request_rate {
            if rate.requests > 0 {
                return Some(rate.delay());
            }
        }
        None
    }
}
/// Configuration for robots.txt fetching and enforcement.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RobotsConfig {
/// User-agent string for the crawler (default "HalldyllBot/1.0").
pub user_agent: String,
/// How long fetched policies stay cached, in seconds (default 3600).
pub cache_ttl_secs: u64,
/// When false, robots.txt rules are not enforced (default true).
pub respect_robots: bool,
/// Fallback delay between requests in milliseconds, used when robots.txt
/// supplies none. NOTE(review): inferred from the name — confirm usage.
pub default_crawl_delay_ms: u64,
/// Maximum robots.txt body size accepted, in bytes (default 512 KiB).
pub max_robots_size: usize,
/// Redirect-follow limit for the robots.txt fetch (default 5).
pub max_redirects: u32,
/// Timeout for the robots.txt fetch, in seconds (default 10).
pub fetch_timeout_secs: u64,
/// NOTE(review): semantics not visible in this file — presumably a
/// stricter enforcement mode; confirm against the consumer of this config.
pub safe_mode: bool,
}
impl Default for RobotsConfig {
    /// Conservative defaults: robots.txt respected, safe mode on, one-hour
    /// policy cache, 100 ms fallback delay.
    fn default() -> Self {
        Self {
            user_agent: "HalldyllBot/1.0".to_string(),
            cache_ttl_secs: 3600,
            respect_robots: true,
            default_crawl_delay_ms: 100,
            max_robots_size: 512 * 1024, // 512 KiB
            max_redirects: 5,
            fetch_timeout_secs: 10,
            safe_mode: true,
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Cache keys keep the explicit port and normalize scheme/host case.
#[test]
fn test_cache_key_from_url() {
let url = Url::parse("https://example.com:8080/path").unwrap();
let key = RobotsCacheKey::from_url(&url).unwrap();
assert_eq!(key.scheme, "https");
assert_eq!(key.authority, "example.com:8080");
assert_eq!(key.robots_url(), "https://example.com:8080/robots.txt");
}
// Literal patterns are prefix matches; "/admin/" must not match
// "/administrator" (the trailing slash is part of the literal).
#[test]
fn test_compiled_pattern_literal() {
let pattern = CompiledPattern::compile("/admin/");
assert!(pattern.matches("/admin/"));
assert!(pattern.matches("/admin/users"));
assert!(!pattern.matches("/administrator"));
}
// "*" matches any sequence, including the empty one ("/api//users").
#[test]
fn test_compiled_pattern_wildcard() {
let pattern = CompiledPattern::compile("/api/*/users");
assert!(pattern.matches("/api/v1/users"));
assert!(pattern.matches("/api/v2/users"));
assert!(pattern.matches("/api//users"));
assert!(!pattern.matches("/api/users"));
}
// "$" anchors the pattern to the end of the path.
#[test]
fn test_compiled_pattern_anchored() {
let pattern = CompiledPattern::compile("/exact$");
assert!(pattern.matches("/exact"));
assert!(!pattern.matches("/exact/"));
assert!(!pattern.matches("/exact/more"));
}
// Wildcard plus end anchor: ".php" must be the literal suffix.
#[test]
fn test_compiled_pattern_complex() {
let pattern = CompiledPattern::compile("/*.php$");
assert!(pattern.matches("/index.php"));
assert!(pattern.matches("/admin/login.php"));
assert!(!pattern.matches("/index.php5"));
assert!(!pattern.matches("/index.php/extra"));
}
// Longer literal content means a more specific rule.
#[test]
fn test_rule_specificity() {
let rule1 = Rule::new(RuleKind::Disallow, "/admin".to_string());
let rule2 = Rule::new(RuleKind::Allow, "/admin/public".to_string());
assert!(rule2.specificity() > rule1.specificity());
}
// User-agent matching is case-insensitive and prefix-based:
// "Googlebot" covers "Googlebot-Image" but not "Yandexbot".
#[test]
fn test_group_matches_user_agent() {
let group = Group {
user_agents: vec!["Googlebot".to_string(), "Bingbot".to_string()],
rules: vec![],
crawl_delay: None,
request_rate: None,
};
assert!(group.matches_user_agent("Googlebot"));
assert!(group.matches_user_agent("googlebot")); assert!(group.matches_user_agent("Googlebot-Image")); assert!(!group.matches_user_agent("Yandexbot"));
}
// Delay is seconds divided by requests (1/10 -> 10 s, 2/10 -> 5 s).
#[test]
fn test_request_rate() {
let rate = RequestRate::new(1, 10);
assert_eq!(rate.requests, 1);
assert_eq!(rate.seconds, 10);
assert!((rate.delay_seconds() - 10.0).abs() < 0.001);
assert_eq!(rate.delay(), Duration::from_secs(10));
let rate2 = RequestRate::new(2, 10);
assert!((rate2.delay_seconds() - 5.0).abs() < 0.001);
}
// crawl_delay takes precedence over request_rate when both are set.
#[test]
fn test_effective_rules_delay() {
let mut rules = EffectiveRules::empty();
assert!(rules.effective_delay().is_none());
rules.crawl_delay = Some(2.5);
assert_eq!(rules.effective_delay(), Some(Duration::from_secs_f64(2.5)));
rules.request_rate = Some(RequestRate::new(1, 10));
assert_eq!(rules.effective_delay(), Some(Duration::from_secs_f64(2.5)));
rules.crawl_delay = None;
assert_eq!(rules.effective_delay(), Some(Duration::from_secs(10)));
}
// NotModified is neither allow-all nor deny-all.
#[test]
fn test_fetch_status_not_modified() {
let status = FetchStatus::NotModified;
assert!(status.is_not_modified());
assert!(!status.allows_all());
assert!(!status.denies_all());
}
// extend_ttl must push the expiry timestamp forward.
// The sleep keeps the two now_millis() readings distinct.
#[test]
fn test_policy_extend_ttl() {
let mut policy = RobotsPolicy::unavailable(404, Duration::from_secs(60));
let original_expires = policy.expires_at_ms;
std::thread::sleep(Duration::from_millis(10));
policy.extend_ttl(Duration::from_secs(3600));
assert!(policy.expires_at_ms > original_expires);
}
}