use crate::cache::{parse_cache_control, DEFAULT_CACHE_TTL, MAX_CACHE_TTL};
use crate::parser::RobotsParser;
use crate::types::{RobotsCacheKey, RobotsConfig, RobotsPolicy};
use reqwest::{Client, Response};
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};
use tracing::{debug, info, warn};
use url::Url;
/// Cumulative robots.txt fetch metrics, updated lock-free from concurrent fetches.
///
/// All counters use `Relaxed` ordering: each is an independent statistic, not a
/// synchronization point. `Default` zero-initializes every field; a stored
/// `min_fetch_time_ms` of 0 doubles as the "no samples yet" sentinel.
#[derive(Debug, Default)]
pub struct FetchStats {
    /// Total fetch attempts.
    pub total: AtomicU64,
    /// Attempts that returned a 2xx response.
    pub success: AtomicU64,
    /// Attempts answered with a status treated as "robots.txt unavailable" (allow-all).
    pub unavailable: AtomicU64,
    /// Attempts that failed at the network level, got a 5xx, or an unexpected status.
    pub unreachable: AtomicU64,
    /// 401/403 responses counted as protected (safe mode only).
    pub protected: AtomicU64,
    /// 304 Not Modified responses (conditional-request cache hits).
    pub conditional_hits: AtomicU64,
    /// Total body bytes read from successful responses.
    pub bytes_fetched: AtomicU64,
    /// Summed wall-clock fetch time across all attempts, in milliseconds.
    pub fetch_time_ms: AtomicU64,
    /// Smallest observed fetch duration in ms (0 = no samples yet).
    pub min_fetch_time_ms: AtomicU64,
    /// Largest observed fetch duration in ms.
    pub max_fetch_time_ms: AtomicU64,
}

impl FetchStats {
    /// Reads every counter into a plain-value snapshot.
    ///
    /// Each field is an individual `Relaxed` load, so the snapshot is not
    /// atomic across fields; fine for reporting while fetches run concurrently.
    pub fn snapshot(&self) -> FetchStatsSnapshot {
        FetchStatsSnapshot {
            total: self.total.load(Ordering::Relaxed),
            success: self.success.load(Ordering::Relaxed),
            unavailable: self.unavailable.load(Ordering::Relaxed),
            unreachable: self.unreachable.load(Ordering::Relaxed),
            protected: self.protected.load(Ordering::Relaxed),
            conditional_hits: self.conditional_hits.load(Ordering::Relaxed),
            bytes_fetched: self.bytes_fetched.load(Ordering::Relaxed),
            fetch_time_ms: self.fetch_time_ms.load(Ordering::Relaxed),
            min_fetch_time_ms: self.min_fetch_time_ms.load(Ordering::Relaxed),
            max_fetch_time_ms: self.max_fetch_time_ms.load(Ordering::Relaxed),
        }
    }

    /// Folds one observed fetch duration into the min/max trackers.
    ///
    /// A stored min of 0 means "no samples yet" (the `Default` value), so the
    /// first sample always replaces it. Caveat: a genuine 0 ms fetch is
    /// indistinguishable from the unset state.
    fn update_min_max(&self, elapsed_ms: u64) {
        // `fetch_update` encapsulates the CAS retry loop (replacing a
        // hand-rolled `compare_exchange_weak` loop); the closure declines
        // (returns None) once the stored minimum is already small enough.
        let _ = self.min_fetch_time_ms.fetch_update(
            Ordering::Relaxed,
            Ordering::Relaxed,
            |current| {
                if current == 0 || elapsed_ms < current {
                    Some(elapsed_ms)
                } else {
                    None
                }
            },
        );
        // Max needs no sentinel: `fetch_max` is the atomic maximum primitive.
        self.max_fetch_time_ms.fetch_max(elapsed_ms, Ordering::Relaxed);
    }
}

/// Point-in-time, plain-value copy of [`FetchStats`], suitable for reporting.
#[derive(Debug, Clone)]
pub struct FetchStatsSnapshot {
    pub total: u64,
    pub success: u64,
    pub unavailable: u64,
    pub unreachable: u64,
    pub protected: u64,
    pub conditional_hits: u64,
    pub bytes_fetched: u64,
    pub fetch_time_ms: u64,
    pub min_fetch_time_ms: u64,
    pub max_fetch_time_ms: u64,
}

impl FetchStatsSnapshot {
    /// `numerator / denominator` as f64, returning 0.0 when the denominator is zero.
    fn ratio(numerator: u64, denominator: u64) -> f64 {
        if denominator == 0 {
            0.0
        } else {
            numerator as f64 / denominator as f64
        }
    }

    /// Mean fetch duration in milliseconds across all attempts (0.0 if none).
    pub fn avg_fetch_time_ms(&self) -> f64 {
        Self::ratio(self.fetch_time_ms, self.total)
    }

    /// Fraction of attempts that got a 2xx response (0.0 if none).
    pub fn success_rate(&self) -> f64 {
        Self::ratio(self.success, self.total)
    }

    /// Fraction of attempts answered with 304 Not Modified (0.0 if none).
    pub fn conditional_hit_rate(&self) -> f64 {
        Self::ratio(self.conditional_hits, self.total)
    }
}
/// Fetches robots.txt documents over HTTP and turns responses into [`RobotsPolicy`] values.
pub struct RobotsFetcher {
    /// Shared HTTP client, configured once in `new` (user agent, timeout, redirect limit).
    client: Client,
    /// Behavior knobs read during fetching: timeouts, safe mode, cache TTL, size cap.
    config: RobotsConfig,
    /// Parses robots.txt bodies; capped at `config.max_robots_size`.
    parser: RobotsParser,
    /// Shared fetch metrics, handed out to callers via `stats()`.
    stats: Arc<FetchStats>,
}
impl RobotsFetcher {
    /// Builds a fetcher from `config`, creating the shared HTTP client.
    ///
    /// # Panics
    /// Panics if the reqwest client cannot be constructed (`expect` on `build()`).
    pub fn new(config: RobotsConfig) -> Self {
        // Copy the size cap out before `config` is moved into `Self`.
        let max_robots_size = config.max_robots_size;
        let client = Client::builder()
            .user_agent(&config.user_agent)
            .timeout(Duration::from_secs(config.fetch_timeout_secs))
            .redirect(reqwest::redirect::Policy::limited(config.max_redirects as usize))
            .build()
            .expect("Failed to create HTTP client");
        Self {
            client,
            config,
            parser: RobotsParser::with_max_size(max_robots_size),
            stats: Arc::new(FetchStats::default()),
        }
    }

    /// Returns a shared handle to the fetch metrics (cheap `Arc` clone).
    pub fn stats(&self) -> Arc<FetchStats> {
        self.stats.clone()
    }

    /// Fetches the robots.txt policy governing `url`.
    ///
    /// If the URL yields no cache key (no host), returns an `unreachable`
    /// policy with the default TTL instead of attempting a fetch.
    pub async fn fetch(&self, url: &Url) -> RobotsPolicy {
        let key = match RobotsCacheKey::from_url(url) {
            Some(k) => k,
            None => {
                return RobotsPolicy::unreachable(
                    "Invalid URL: no host".to_string(),
                    DEFAULT_CACHE_TTL,
                );
            }
        };
        self.fetch_for_key(&key).await
    }

    /// Unconditional fetch for a cache key (no ETag / Last-Modified validators sent).
    pub async fn fetch_for_key(&self, key: &RobotsCacheKey) -> RobotsPolicy {
        self.fetch_for_key_conditional(key, None, None).await
    }

    /// Conditional fetch for a cache key.
    ///
    /// Sends `If-None-Match` / `If-Modified-Since` when `etag` /
    /// `last_modified` are provided, records timing into the stats
    /// (total, sum, min, max), and maps both HTTP and network outcomes
    /// to a [`RobotsPolicy`].
    pub async fn fetch_for_key_conditional(
        &self,
        key: &RobotsCacheKey,
        etag: Option<&str>,
        last_modified: Option<&str>,
    ) -> RobotsPolicy {
        let robots_url = key.robots_url();
        let start = Instant::now();
        self.stats.total.fetch_add(1, Ordering::Relaxed);
        debug!("Fetching robots.txt from {}", robots_url);
        let result = self.do_fetch_conditional(&robots_url, etag, last_modified).await;
        // Timing covers the network round-trip only; body read/parse in
        // handle_response happens after the clock is stopped.
        let elapsed = start.elapsed();
        let elapsed_ms = elapsed.as_millis() as u64;
        self.stats.fetch_time_ms.fetch_add(elapsed_ms, Ordering::Relaxed);
        self.stats.update_min_max(elapsed_ms);
        match result {
            Ok((response, ttl)) => {
                self.handle_response(response, ttl).await
            }
            Err(e) => {
                self.handle_network_error(e)
            }
        }
    }

    /// Issues the (optionally conditional) GET and derives the cache TTL.
    ///
    /// TTL comes from the response's `Cache-Control` header when parseable,
    /// otherwise the configured `cache_ttl_secs`; either way it is clamped
    /// to `MAX_CACHE_TTL`.
    async fn do_fetch_conditional(
        &self,
        url: &str,
        etag: Option<&str>,
        last_modified: Option<&str>,
    ) -> Result<(Response, Duration), reqwest::Error> {
        let mut request = self.client.get(url);
        if let Some(etag) = etag {
            request = request.header("If-None-Match", etag);
        }
        if let Some(lm) = last_modified {
            request = request.header("If-Modified-Since", lm);
        }
        let response = request.send().await?;
        let ttl = response
            .headers()
            .get("cache-control")
            .and_then(|v| v.to_str().ok())
            .and_then(parse_cache_control)
            .unwrap_or(Duration::from_secs(self.config.cache_ttl_secs))
            .min(MAX_CACHE_TTL);
        Ok((response, ttl))
    }

    /// Maps an HTTP response to a policy and updates the outcome counters.
    ///
    /// Status handling (arm ORDER matters — the specific arms must precede
    /// the `400..=499` catch-range):
    /// - 304: cached copy still valid (conditional hit).
    /// - 2xx: parse the body into rules.
    /// - 400/404/405/410/451: robots.txt unavailable → allow all.
    /// - 401/403: protected → deny in safe mode, otherwise allow per RFC.
    /// - other 4xx: unavailable → allow all.
    /// - 5xx and anything else: unreachable → deny all.
    async fn handle_response(&self, response: Response, ttl: Duration) -> RobotsPolicy {
        let status = response.status();
        let status_code = status.as_u16();
        // Capture validators up front so a 2xx policy can carry them for
        // future conditional requests.
        let etag = response
            .headers()
            .get("etag")
            .and_then(|v| v.to_str().ok())
            .map(|s| s.to_string());
        let last_modified = response
            .headers()
            .get("last-modified")
            .and_then(|v| v.to_str().ok())
            .map(|s| s.to_string());
        info!(
            "Robots.txt response: {} ({})",
            response.url(),
            status_code
        );
        match status_code {
            304 => {
                self.stats.conditional_hits.fetch_add(1, Ordering::Relaxed);
                debug!("Robots.txt not modified (304), use cached version");
                RobotsPolicy::not_modified(ttl)
            }
            200..=299 => {
                self.stats.success.fetch_add(1, Ordering::Relaxed);
                self.parse_successful_response(response, ttl, etag, last_modified).await
            }
            400 | 404 | 405 | 410 | 451 => {
                self.stats.unavailable.fetch_add(1, Ordering::Relaxed);
                debug!("Robots.txt unavailable ({}), allowing all", status_code);
                RobotsPolicy::unavailable(status_code, ttl)
            }
            401 | 403 => {
                // Safe mode treats auth-protected robots.txt as a deny signal;
                // otherwise fall back to the permissive "unavailable" handling.
                if self.config.safe_mode {
                    self.stats.protected.fetch_add(1, Ordering::Relaxed);
                    warn!("Robots.txt protected ({}), denying in safe mode", status_code);
                    RobotsPolicy::protected(status_code, ttl)
                } else {
                    self.stats.unavailable.fetch_add(1, Ordering::Relaxed);
                    debug!("Robots.txt protected ({}), allowing per RFC", status_code);
                    RobotsPolicy::unavailable(status_code, ttl)
                }
            }
            400..=499 => {
                self.stats.unavailable.fetch_add(1, Ordering::Relaxed);
                debug!("Robots.txt unavailable ({}), allowing all", status_code);
                RobotsPolicy::unavailable(status_code, ttl)
            }
            500..=599 => {
                self.stats.unreachable.fetch_add(1, Ordering::Relaxed);
                warn!("Robots.txt unreachable ({}), denying all", status_code);
                RobotsPolicy::unreachable(
                    format!("Server error: {}", status_code),
                    ttl,
                )
            }
            _ => {
                self.stats.unreachable.fetch_add(1, Ordering::Relaxed);
                warn!("Unexpected robots.txt status ({}), denying all", status_code);
                RobotsPolicy::unreachable(
                    format!("Unexpected status: {}", status_code),
                    ttl,
                )
            }
        }
    }

    /// Reads and parses the body of a 2xx response into a policy.
    ///
    /// - Non-text Content-Type (neither `text/plain` nor `text/html`, substring
    ///   match so parameters like `; charset=utf-8` pass): treated as if
    ///   robots.txt were absent → allow all.
    /// - Empty body: allow all.
    /// - Body read error: unreachable, and NOTE it uses `DEFAULT_CACHE_TTL`
    ///   rather than the response-derived `ttl`.
    async fn parse_successful_response(
        &self,
        response: Response,
        ttl: Duration,
        etag: Option<String>,
        last_modified: Option<String>,
    ) -> RobotsPolicy {
        let content_type = response
            .headers()
            .get("content-type")
            .and_then(|v| v.to_str().ok())
            .unwrap_or("text/plain");
        if !content_type.contains("text/plain") && !content_type.contains("text/html") {
            warn!(
                "Unexpected Content-Type for robots.txt: {}, treating as empty",
                content_type
            );
            return RobotsPolicy::unavailable(200, ttl);
        }
        match response.text().await {
            Ok(content) => {
                let bytes = content.len();
                self.stats.bytes_fetched.fetch_add(bytes as u64, Ordering::Relaxed);
                if content.is_empty() {
                    debug!("Empty robots.txt, allowing all");
                    return RobotsPolicy::unavailable(200, ttl);
                }
                // Attach validators so callers can revalidate with a
                // conditional request when the cached policy expires.
                let mut policy = self.parser.parse(&content, ttl);
                policy.etag = etag;
                policy.last_modified = last_modified;
                policy
            }
            Err(e) => {
                warn!("Failed to read robots.txt body: {}", e);
                RobotsPolicy::unreachable(
                    format!("Body read error: {}", e),
                    DEFAULT_CACHE_TTL,
                )
            }
        }
    }

    /// Converts a transport-level error into an `unreachable` (deny-all) policy
    /// with a human-readable reason, using the default TTL.
    fn handle_network_error(&self, error: reqwest::Error) -> RobotsPolicy {
        self.stats.unreachable.fetch_add(1, Ordering::Relaxed);
        let reason = if error.is_timeout() {
            "Network timeout".to_string()
        } else if error.is_connect() {
            "Connection failed".to_string()
        } else if error.is_redirect() {
            "Too many redirects".to_string()
        } else {
            format!("Network error: {}", error)
        };
        warn!("Robots.txt fetch failed: {}", reason);
        RobotsPolicy::unreachable(reason, DEFAULT_CACHE_TTL)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly-constructed `FetchStats` must snapshot to all-zero counters.
    #[test]
    fn test_fetch_stats_default() {
        let snapshot = FetchStats::default().snapshot();
        assert_eq!(snapshot.total, 0);
        assert_eq!(snapshot.success, 0);
    }

    /// Building a fetcher from the default config must not panic.
    #[tokio::test]
    async fn test_fetcher_creation() {
        let _fetcher = RobotsFetcher::new(RobotsConfig::default());
    }
}