use crate::error::{Result, ScrapeError};
use moka::future::Cache;
use std::sync::LazyLock;
use reqwest::Client;
use robotstxt::DefaultMatcher;
use std::sync::Arc;
use std::time::Duration;
use tracing::{debug, warn};
use url::Url;
/// What this module remembers about one site's `robots.txt`.
#[derive(Clone, Debug)]
pub struct RobotsData {
    /// Raw `robots.txt` body. Left empty when the fetch failed or the server
    /// answered with a non-success status.
    pub content: String,
    /// `Crawl-delay` in seconds, as parsed from `content` when the entry was
    /// fetched; `None` when no body was obtained or no delay was found.
    pub crawl_delay: Option<f64>,
    /// Set to `true` by every constructor in this module; not consulted when
    /// evaluating a URL here.
    pub allowed: bool,
}
/// Process-wide cache of fetched `robots.txt` data, built lazily on first use.
///
/// Holds at most 10,000 entries, and each entry expires 24 hours after it was
/// inserted.
static ROBOTS_CACHE: LazyLock<Arc<Cache<String, RobotsData>>> = LazyLock::new(|| {
    let entry_ttl = Duration::from_secs(24 * 3600);
    let cache: Cache<String, RobotsData> = Cache::builder()
        .max_capacity(10_000)
        .time_to_live(entry_ttl)
        .build();
    Arc::new(cache)
});
/// Derives the cache key for `url` by handing its serialized form to
/// `crate::utils::etld::extract_etld_plus_one`.
///
/// # Errors
///
/// Returns whatever error the eTLD+1 helper reports.
fn extract_domain(url: &Url) -> Result<String> {
    let serialized = url.as_str();
    crate::utils::etld::extract_etld_plus_one(serialized)
}
/// Returns the `Crawl-delay` (in seconds) that `robots_txt` assigns to `user_agent`.
///
/// A run of consecutive `User-agent` lines opens a group, and the directives
/// that follow apply to every agent named in that run. A delay from a group
/// that names `user_agent` (compared ASCII case-insensitively) takes
/// precedence over a delay from a `*` group, regardless of which group comes
/// first in the file. If groups of the same kind set the delay more than
/// once, the last value wins.
///
/// Directive names are matched case-insensitively, text after `#` is treated
/// as a comment, and values that are negative or not finite are ignored.
///
/// Returns `None` when no applicable group declares a usable delay.
fn parse_crawl_delay(robots_txt: &str, user_agent: &str) -> Option<f64> {
    /// Returns the trimmed value of `line` when it is a `name: value` directive.
    fn directive_value<'a>(line: &'a str, name: &str) -> Option<&'a str> {
        let (key, value) = line.split_once(':')?;
        key.trim().eq_ignore_ascii_case(name).then(|| value.trim())
    }

    let mut specific_delay: Option<f64> = None;
    let mut wildcard_delay: Option<f64> = None;
    // What the current group matched, and whether we are still inside its
    // leading run of `User-agent` lines.
    let mut group_names_agent = false;
    let mut group_is_wildcard = false;
    let mut in_agent_run = false;

    for raw_line in robots_txt.lines() {
        // Everything from `#` onwards is a comment.
        let line = raw_line.split('#').next().unwrap_or_default().trim();
        if line.is_empty() {
            continue;
        }

        if let Some(agent) = directive_value(line, "user-agent") {
            // The first `User-agent` line after a rule starts a fresh group;
            // further ones directly below it extend the same group.
            if !in_agent_run {
                group_names_agent = false;
                group_is_wildcard = false;
                in_agent_run = true;
            }
            if agent == "*" {
                group_is_wildcard = true;
            } else if agent.eq_ignore_ascii_case(user_agent) {
                group_names_agent = true;
            }
            continue;
        }
        in_agent_run = false;

        if !(group_names_agent || group_is_wildcard) {
            continue;
        }
        let Some(value) = directive_value(line, "crawl-delay") else {
            continue;
        };
        let Ok(delay) = value.parse::<f64>() else {
            continue;
        };
        // `f64` parsing accepts "NaN", "inf" and negative numbers; none of
        // them is a meaningful wait time.
        if !delay.is_finite() || delay < 0.0 {
            continue;
        }

        debug!("Parsed Crawl-Delay: {} seconds", delay);
        if group_names_agent {
            specific_delay = Some(delay);
        }
        if group_is_wildcard {
            wildcard_delay = Some(delay);
        }
    }

    specific_delay.or(wildcard_delay)
}
/// Downloads `https://{domain}/robots.txt` and packages it as [`RobotsData`].
///
/// The request is sent with `user_agent` as its `User-Agent` header and a
/// five second timeout. The same `user_agent` is then used to pick the
/// applicable `Crawl-delay` out of the body.
///
/// A transport failure or a non-success HTTP status is deliberately not an
/// error: it is logged and an empty, allow-all record is returned.
///
/// # Errors
///
/// Returns `ScrapeError::Internal` when the HTTP client cannot be built
/// (which includes a `user_agent` that is not a valid header value) or when
/// the response body cannot be read.
async fn fetch_robots_txt(domain: &str, user_agent: &str) -> Result<RobotsData> {
    /// The permissive record used whenever no robots.txt body was obtained.
    fn allow_all() -> RobotsData {
        RobotsData {
            content: String::new(),
            crawl_delay: None,
            allowed: true,
        }
    }

    let robots_url = format!("https://{}/robots.txt", domain);
    debug!("Fetching robots.txt from: {}", robots_url);

    let client = Client::builder()
        .timeout(Duration::from_secs(5))
        // Identify the crawler; some servers refuse requests with no User-Agent.
        .user_agent(user_agent)
        .build()
        .map_err(|e| ScrapeError::Internal(format!("Failed to create HTTP client: {}", e)))?;

    let response = match client.get(&robots_url).send().await {
        Ok(resp) => resp,
        Err(e) => {
            warn!("Failed to fetch robots.txt from {}: {}", robots_url, e);
            return Ok(allow_all());
        }
    };

    let status = response.status();
    if !status.is_success() {
        warn!("robots.txt not found at {} (status: {})", robots_url, status);
        return Ok(allow_all());
    }

    let robots_txt = response
        .text()
        .await
        .map_err(|e| ScrapeError::Internal(format!("Failed to read robots.txt: {}", e)))?;
    let crawl_delay = parse_crawl_delay(&robots_txt, user_agent);

    Ok(RobotsData {
        content: robots_txt,
        crawl_delay,
        allowed: true,
    })
}
/// Checks whether `user_agent` may fetch `url` according to the site's
/// `robots.txt`, fetching and caching that file on first use.
///
/// Returns `(allowed, crawl_delay)`. An empty cached body (no robots.txt
/// could be obtained) allows everything. Otherwise the full URL is handed to
/// the matcher so that rules mentioning a query string take part in the
/// decision, not just the path.
///
/// NOTE(review): the cache key is the value returned by `extract_domain`
/// (eTLD+1), so hosts under the same registrable domain share one entry,
/// whereas robots.txt files are normally published per host — confirm this
/// is intended.
///
/// NOTE(review): `crawl_delay` is the value stored when the entry was
/// fetched, i.e. computed for the user agent of the caller that populated the
/// cache; later callers with a different `user_agent` receive that same value.
///
/// # Errors
///
/// Returns `ScrapeError::InvalidUrl` when `url` does not parse, and
/// propagates errors from `extract_domain` and `fetch_robots_txt`.
pub async fn is_allowed_cached(url: &str, user_agent: &str) -> Result<(bool, Option<f64>)> {
    let parsed_url = Url::parse(url)
        .map_err(|e| ScrapeError::InvalidUrl(format!("Invalid URL: {}", e)))?;
    let domain = extract_domain(&parsed_url)?;

    let cached = ROBOTS_CACHE.get(&domain).await;
    let robots_data = match cached {
        Some(hit) => {
            debug!("Robots.txt cache hit for domain: {}", domain);
            hit
        }
        None => {
            debug!("Robots.txt cache miss for domain: {}", domain);
            let fetched = fetch_robots_txt(&domain, user_agent).await?;
            // `domain` is not needed past this point, so hand it to the cache.
            ROBOTS_CACHE.insert(domain, fetched.clone()).await;
            fetched
        }
    };

    let allowed = if robots_data.content.is_empty() {
        true
    } else {
        let mut matcher = DefaultMatcher::default();
        // The matcher takes a URL and extracts path and query itself;
        // passing `Url::path()` would drop the query string.
        matcher.one_agent_allowed_by_robots(&robots_data.content, user_agent, parsed_url.as_str())
    };

    Ok((allowed, robots_data.crawl_delay))
}
/// Convenience wrapper around [`is_allowed_cached`] that checks `url` on
/// behalf of the `"Essence"` user agent.
///
/// Returns the same `(allowed, crawl_delay)` pair and the same errors as
/// [`is_allowed_cached`].
pub async fn is_allowed_default_cached(url: &str) -> Result<(bool, Option<f64>)> {
    const DEFAULT_USER_AGENT: &str = "Essence";
    is_allowed_cached(url, DEFAULT_USER_AGENT).await
}
/// Test-only helper: invalidates every entry in the robots.txt cache and
/// then runs the cache's pending maintenance tasks.
#[cfg(test)]
pub async fn clear_cache() {
    let cache = &*ROBOTS_CACHE;
    cache.invalidate_all();
    cache.run_pending_tasks().await;
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A `www.` host is reduced to its registrable domain.
    #[test]
    fn test_domain_extraction() {
        let page = Url::parse("https://www.example.com/path/to/page").unwrap();
        assert_eq!(extract_domain(&page).unwrap(), "example.com");
    }

    /// Each agent gets the delay of its own group, and unknown agents fall
    /// back to the `*` group.
    #[test]
    fn test_crawl_delay_parsing() {
        let body = r#"
User-agent: *
Crawl-delay: 2
User-agent: Essence
Crawl-delay: 0.5
Disallow: /admin
"#;
        let expectations = [("*", 2.0), ("Essence", 0.5), ("OtherBot", 2.0)];
        for (agent, expected) in expectations {
            assert_eq!(parse_crawl_delay(body, agent), Some(expected));
        }
    }

    /// Directive names are recognised in lowercase as well.
    #[test]
    fn test_crawl_delay_parsing_case_insensitive() {
        let body = r#"
user-agent: *
crawl-delay: 1.5
"#;
        assert_eq!(parse_crawl_delay(body, "*"), Some(1.5));
    }

    /// An entry inserted into the shared cache can be read back.
    #[tokio::test]
    async fn test_cache() {
        clear_cache().await;

        let key = "example.com";
        let entry = RobotsData {
            content: "User-agent: *\nDisallow: /admin".to_string(),
            crawl_delay: Some(1.0),
            allowed: true,
        };
        ROBOTS_CACHE.insert(key.to_string(), entry).await;

        let hit = ROBOTS_CACHE.get(key).await;
        // The outer `Some` proves the entry exists; the inner one is its delay.
        assert_eq!(hit.map(|data| data.crawl_delay), Some(Some(1.0)));
    }
}