use std::collections::HashSet;
use std::time::Duration;
use tracing::debug;
use url::Url;
use crate::cache::{CacheStatsSnapshot, RobotsCache};
use crate::fetcher::{FetchStatsSnapshot, RobotsFetcher};
use crate::matcher::RobotsMatcher;
use crate::types::{
Decision, DecisionReason, EffectiveRules, FetchStatus, RobotsCacheKey,
RobotsConfig, RobotsPolicy, Rule,
};
/// Facade that coordinates robots.txt fetching, caching, and rule
/// matching for a single configured user agent.
pub struct RobotsChecker {
// Configuration captured at construction time (user agent, TTLs, flags).
config: RobotsConfig,
// Downloads robots.txt documents; supports conditional (ETag/Last-Modified) requests.
fetcher: RobotsFetcher,
// TTL-based cache of parsed policies, keyed by origin.
cache: RobotsCache,
// Evaluates allow/disallow rules for `config.user_agent`.
matcher: RobotsMatcher,
}
impl RobotsChecker {
/// Builds a checker with an in-memory policy cache.
///
/// The matcher is bound to `config.user_agent`, and cached policies
/// live for `config.cache_ttl_secs` seconds.
pub fn new(config: RobotsConfig) -> Self {
    let ttl = Duration::from_secs(config.cache_ttl_secs);
    Self {
        matcher: RobotsMatcher::new(&config.user_agent),
        fetcher: RobotsFetcher::new(config.clone()),
        cache: RobotsCache::new(ttl),
        config,
    }
}
/// Builds a checker whose cache can be saved to and loaded from
/// `persist_dir`; otherwise identical to [`RobotsChecker::new`].
pub fn with_persistence(config: RobotsConfig, persist_dir: &str) -> Self {
    let ttl = Duration::from_secs(config.cache_ttl_secs);
    Self {
        matcher: RobotsMatcher::new(&config.user_agent),
        fetcher: RobotsFetcher::new(config.clone()),
        cache: RobotsCache::with_persistence(ttl, persist_dir),
        config,
    }
}
/// Decides whether `url` may be crawled under its origin's robots policy.
///
/// When robots handling is disabled in the configuration this always
/// allows; otherwise the origin's policy is obtained (from cache or by
/// fetching) and the URL's path-plus-query is matched against it.
pub async fn is_allowed(&self, url: &Url) -> Decision {
    if self.config.respect_robots {
        let policy = self.get_policy(url).await;
        let target = Self::build_full_path(url);
        self.matcher.is_allowed(&policy, &target)
    } else {
        Decision::allow(DecisionReason::RobotsDisabled)
    }
}
/// Decides whether `path` (an already-built path string) may be crawled
/// under `origin`'s robots policy; always allows when robots handling
/// is disabled.
pub async fn is_path_allowed(&self, origin: &Url, path: &str) -> Decision {
    if !self.config.respect_robots {
        return Decision::allow(DecisionReason::RobotsDisabled);
    }
    self.matcher
        .is_allowed(&self.get_policy(origin).await, path)
}
/// Returns the robots policy for `url`'s origin, consulting the cache
/// and revalidating or re-fetching as needed.
///
/// Flow:
/// 1. A URL that yields no cache key gets a short-lived (60 s)
///    "unreachable" policy so it is retried soon.
/// 2. A fresh cached policy is returned as-is.
/// 3. An expired cached policy that carries HTTP validators (ETag or
///    Last-Modified) is revalidated with a conditional request; on
///    Not Modified the cached rules are kept and only the TTL extended.
/// 4. Otherwise a full fetch replaces (or populates) the cache entry.
pub async fn get_policy(&self, url: &Url) -> RobotsPolicy {
let key = match RobotsCacheKey::from_url(url) {
Some(k) => k,
None => {
// No usable origin: synthesize an unreachable policy with a
// short TTL rather than caching a permanent failure.
return RobotsPolicy::unreachable(
"Invalid URL".to_string(),
Duration::from_secs(60),
);
}
};
if let Some(cached_policy) = self.cache.get(&key) {
if !cached_policy.is_expired() {
// Cache hit within TTL: no network traffic needed.
return cached_policy;
}
// Expired, but revalidation is possible if the server gave us
// either validator on the previous fetch.
if cached_policy.etag.is_some() || cached_policy.last_modified.is_some() {
let new_policy = self.fetcher.fetch_for_key_conditional(
&key,
cached_policy.etag.as_deref(),
cached_policy.last_modified.as_deref(),
).await;
if new_policy.fetch_status.is_not_modified() {
// 304: keep the cached rules, just push the expiry out.
let mut refreshed = cached_policy.clone();
refreshed.extend_ttl(new_policy.ttl());
self.cache.insert(key, refreshed.clone());
return refreshed;
}
// Content changed (or fetch failed): the fresh result wins.
self.cache.insert(key, new_policy.clone());
return new_policy;
}
}
// Cache miss, or expired entry without validators: full fetch.
let policy = self.fetcher.fetch_for_key(&key).await;
self.cache.insert(key, policy.clone());
policy
}
/// Returns the crawl delay the policy imposes on this checker's user
/// agent for `url`'s origin, if any.
pub async fn get_crawl_delay(&self, url: &Url) -> Option<Duration> {
    let policy = self.get_policy(url).await;
    self.matcher.effective_rules(&policy).effective_delay()
}
/// Returns the request-rate directive applicable to this checker's
/// user agent for `url`'s origin, if the policy specifies one.
pub async fn get_request_rate(&self, url: &Url) -> Option<crate::types::RequestRate> {
    let policy = self.get_policy(url).await;
    self.matcher.effective_rules(&policy).request_rate
}
/// Returns the sitemap URLs advertised by `url`'s origin's robots.txt.
pub async fn get_sitemaps(&self, url: &Url) -> Vec<String> {
    self.get_policy(url).await.sitemaps
}
/// Returns the subset of the origin's rules that apply to this
/// checker's user agent.
pub async fn get_effective_rules(&self, url: &Url) -> EffectiveRules {
    self.matcher
        .effective_rules(&self.get_policy(url).await)
}
/// Produces a full diagnostic snapshot for `url`: which robots.txt was
/// consulted, its fetch status, the rules that matched this checker's
/// user agent, and the resulting allow/deny decision.
pub async fn diagnose(&self, url: &Url) -> RobotsDiagnostics {
    // Pure derivations from the URL; independent of the policy fetch.
    let robots_url = RobotsCacheKey::from_url(url)
        .map(|key| key.robots_url())
        .unwrap_or_default();
    let target = Self::build_full_path(url);
    let policy = self.get_policy(url).await;
    let rules = self.matcher.effective_rules(&policy);
    RobotsDiagnostics {
        url: url.to_string(),
        robots_url,
        fetch_status: policy.fetch_status.clone(),
        groups_count: policy.groups.len(),
        matched_agents: rules.matched_agents,
        applicable_rules: rules.rules,
        decision: self.matcher.is_allowed(&policy, &target),
    }
}
/// Returns a point-in-time snapshot of cache hit/miss statistics.
pub fn cache_stats(&self) -> CacheStatsSnapshot {
self.cache.stats().snapshot()
}
/// Returns a point-in-time snapshot of fetcher statistics.
pub fn fetch_stats(&self) -> FetchStatsSnapshot {
self.fetcher.stats().snapshot()
}
/// Removes every cached policy.
pub fn clear_cache(&self) {
self.cache.clear();
}
/// Drops expired cache entries; returns how many were removed.
pub fn evict_expired(&self) -> usize {
self.cache.evict_expired()
}
/// Persists the cache to disk; returns the number of entries written.
/// Only meaningful for checkers built via `with_persistence`.
pub async fn save_cache(&self) -> std::io::Result<usize> {
self.cache.save_to_disk().await
}
/// Loads previously persisted cache entries; returns how many loaded.
pub async fn load_cache(&self) -> std::io::Result<usize> {
self.cache.load_from_disk().await
}
/// Lists the domains currently present in the cache.
pub fn cached_domains(&self) -> Vec<String> {
self.cache.domains()
}
/// Borrows the configuration this checker was built with.
pub fn config(&self) -> &RobotsConfig {
&self.config
}
/// Warms the robots.txt cache for the origins of `urls`.
///
/// Each distinct origin is fetched at most once per call. Origins with
/// a fresh cached policy are skipped; origins whose cached policy has
/// expired are re-fetched. (Previously an expired entry suppressed the
/// prefetch entirely, which was inconsistent with `get_policy`'s
/// freshness check and left stale entries in place.)
pub async fn prefetch(&self, urls: &[Url]) {
    let mut seen = HashSet::new();
    for url in urls {
        if let Some(key) = RobotsCacheKey::from_url(url) {
            if seen.insert(key.clone()) {
                // Fetch when the entry is absent *or* past its TTL,
                // mirroring the expiry check in `get_policy`.
                let needs_fetch = self
                    .cache
                    .get(&key)
                    .map_or(true, |policy| policy.is_expired());
                if needs_fetch {
                    debug!("Prefetching robots.txt for {}", key.authority);
                    let policy = self.fetcher.fetch_for_key(&key).await;
                    self.cache.insert(key, policy);
                }
            }
        }
    }
}
/// Joins the URL's path and optional query into the string matched
/// against robots rules, e.g. `/path?query=1`.
fn build_full_path(url: &Url) -> String {
    match url.query() {
        Some(query) => format!("{}?{}", url.path(), query),
        None => url.path().to_string(),
    }
}
}
#[derive(Debug, Clone)]
/// Diagnostic report produced by [`RobotsChecker::diagnose`] for a
/// single URL.
pub struct RobotsDiagnostics {
// The URL that was checked, in string form.
pub url: String,
// The robots.txt URL consulted; empty if no cache key could be derived.
pub robots_url: String,
// Outcome of the robots.txt fetch for this origin.
pub fetch_status: FetchStatus,
// Number of user-agent groups parsed from the robots.txt.
pub groups_count: usize,
// User-agent tokens that matched this checker's configured agent.
pub matched_agents: Vec<String>,
// The rules from the matched group(s) that apply to the agent.
pub applicable_rules: Vec<Rule>,
// The final allow/deny decision for the checked URL.
pub decision: Decision,
}
#[cfg(test)]
mod tests {
use super::*;
// Sanity-checks the defaults the rest of the suite relies on.
#[test]
fn test_config_default() {
let config = RobotsConfig::default();
assert!(config.respect_robots);
assert!(config.safe_mode);
// Default size limit should accommodate at least the common 500 KiB cap.
assert!(config.max_robots_size >= 500 * 1024);
}
// Construction alone must not panic or perform I/O.
#[tokio::test]
async fn test_checker_creation() {
let config = RobotsConfig::default();
let _checker = RobotsChecker::new(config);
}
// With robots disabled, every URL is allowed without any fetch.
#[tokio::test]
async fn test_disabled_robots() {
let config = RobotsConfig {
respect_robots: false,
..Default::default()
};
let checker = RobotsChecker::new(config);
let url = Url::parse("https://example.com/blocked").unwrap();
let decision = checker.is_allowed(&url).await;
assert!(decision.allowed);
assert_eq!(decision.reason, DecisionReason::RobotsDisabled);
}
// Path building must include the query string when present.
#[test]
fn test_build_full_path() {
let url = Url::parse("https://example.com/path?query=1").unwrap();
assert_eq!(RobotsChecker::build_full_path(&url), "/path?query=1");
let url = Url::parse("https://example.com/path").unwrap();
assert_eq!(RobotsChecker::build_full_path(&url), "/path");
}
}