use std::collections::HashMap;
use std::sync::RwLock;
use std::time::{Duration, Instant};
use texting_robots::Robot;
use url::Url;
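/// Thread-safe cache of robots.txt bodies keyed by scheme and host, with a per-entry TTL.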
pub struct RobotsCache {
entries: RwLock<HashMap<String, RobotsEntry>>,
ttl_secs: u64,
}
struct RobotsEntry {
content: String,
cached_at: Instant,
crawl_delay: Option<f64>,
}
impl Default for RobotsCache {
fn default() -> Self {
        Self::new(3600)
    }
}
impl RobotsCache {
pub fn new(ttl_secs: u64) -> Self {
Self {
entries: RwLock::new(HashMap::new()),
ttl_secs,
}
}
fn cache_key(url: &Url) -> String {
format!("{}://{}", url.scheme(), url.host_str().unwrap_or(""))
}
pub fn get(&self, url: &Url) -> Option<String> {
let key = Self::cache_key(url);
let entries = self.entries.read().unwrap();
entries.get(&key).and_then(|entry| {
if entry.cached_at.elapsed() < Duration::from_secs(self.ttl_secs) {
Some(entry.content.clone())
} else {
None
}
})
}
pub fn get_crawl_delay(&self, url: &Url) -> Option<f64> {
let key = Self::cache_key(url);
let entries = self.entries.read().unwrap();
entries.get(&key).and_then(|entry| {
if entry.cached_at.elapsed() < Duration::from_secs(self.ttl_secs) {
entry.crawl_delay
} else {
None
}
})
}
pub fn set(&self, url: &Url, content: String, crawl_delay: Option<f64>) {
let key = Self::cache_key(url);
let entry = RobotsEntry {
content,
cached_at: Instant::now(),
crawl_delay,
};
self.entries.write().unwrap().insert(key, entry);
}
pub fn remove(&self, url: &Url) {
let key = Self::cache_key(url);
self.entries.write().unwrap().remove(&key);
}
pub fn clear(&self) {
self.entries.write().unwrap().clear();
}
pub fn len(&self) -> usize {
self.entries.read().unwrap().len()
}
pub fn is_empty(&self) -> bool {
self.entries.read().unwrap().is_empty()
}
}
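/// Checks URLs against robots.txt rules for a single user agent, backed by a `RobotsCache`.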
pub struct RobotsChecker {
user_agent: String,
cache: RobotsCache,
enabled: bool,
}
impl RobotsChecker {
pub fn new(user_agent: &str, cache_ttl_secs: u64) -> Self {
Self {
user_agent: user_agent.to_string(),
cache: RobotsCache::new(cache_ttl_secs),
enabled: true,
}
}
pub fn set_enabled(&mut self, enabled: bool) {
self.enabled = enabled;
}
pub fn is_enabled(&self) -> bool {
self.enabled
}
pub fn robots_url(url: &Url) -> Option<Url> {
let base = format!("{}://{}/robots.txt", url.scheme(), url.host_str()?);
Url::parse(&base).ok()
}
pub fn cache_robots(&self, url: &Url, content: &str) {
let crawl_delay = self.parse_crawl_delay(content);
self.cache.set(url, content.to_string(), crawl_delay);
}
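    /// Extracts a `Crawl-delay` for this user agent from raw robots.txt text,
    /// preferring a delay in a matching user-agent section over one in the
    /// wildcard (`*`) section.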
fn parse_crawl_delay(&self, content: &str) -> Option<f64> {
let ua_lower = self.user_agent.to_lowercase();
        let mut in_matching_section = false;
        let mut in_wildcard_section = false;
let mut found_delay: Option<f64> = None;
let mut wildcard_delay: Option<f64> = None;
for line in content.lines() {
let line = line.trim();
if line.starts_with('#') || line.is_empty() {
continue;
}
let parts: Vec<&str> = line.splitn(2, ':').collect();
if parts.len() != 2 {
continue;
}
let directive = parts[0].trim().to_lowercase();
let value = parts[1].trim();
if directive == "user-agent" {
let ua = value.to_lowercase();
if ua == "*" {
in_matching_section = true;
} else if ua_lower.contains(&ua) || ua.contains(&ua_lower) {
in_matching_section = true;
} else {
in_matching_section = false;
}
} else if directive == "crawl-delay" && in_matching_section {
if let Ok(delay) = value.parse::<f64>() {
if value == "*" {
wildcard_delay = Some(delay);
} else {
found_delay = Some(delay);
}
}
}
}
found_delay.or(wildcard_delay)
}
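    /// Returns whether `url` may be fetched by this user agent. Uses
    /// `robots_content` when provided, otherwise the cached copy, and permits
    /// the fetch when no rules are available or they fail to parse.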
pub fn is_allowed(&self, url: &Url, robots_content: Option<&str>) -> bool {
if !self.enabled {
return true;
}
let content = match robots_content {
Some(c) => c.to_string(),
None => match self.cache.get(url) {
Some(c) => c,
                None => return true,
            },
};
let robot = match Robot::new(&self.user_agent, content.as_bytes()) {
Ok(r) => r,
            Err(_) => return true,
        };
        // Pass the full URL so rules that match on query strings are honored.
        robot.allowed(url.as_str())
}
pub fn get_crawl_delay(&self, url: &Url) -> Option<Duration> {
if !self.enabled {
return None;
}
self.cache
.get_crawl_delay(url)
.map(|secs| Duration::from_secs_f64(secs))
}
pub fn cache(&self) -> &RobotsCache {
&self.cache
}
}
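/// Returns the URLs listed on `Sitemap:` lines of a robots.txt body.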
pub fn extract_sitemaps(robots_content: &str) -> Vec<String> {
robots_content
.lines()
.filter_map(|line| {
let line = line.trim();
if line.to_lowercase().starts_with("sitemap:") {
Some(line[8..].trim().to_string())
} else {
None
}
})
.collect()
}
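// A minimal test sketch exercising the types above; the robots.txt text and the
// "testbot" user agent are invented fixtures, not taken from any real site.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn crawl_delay_prefers_specific_section_over_wildcard() {
        let checker = RobotsChecker::new("testbot", 60);
        let robots = "User-agent: *\nCrawl-delay: 10\n\nUser-agent: testbot\nCrawl-delay: 2\nDisallow: /private\n";
        let url = Url::parse("https://example.com/page").unwrap();
        checker.cache_robots(&url, robots);
        assert_eq!(
            checker.get_crawl_delay(&url),
            Some(Duration::from_secs_f64(2.0))
        );
        // The testbot section disallows /private; other paths stay allowed.
        let blocked = Url::parse("https://example.com/private/page").unwrap();
        assert!(!checker.is_allowed(&blocked, None));
        assert!(checker.is_allowed(&url, None));
    }

    #[test]
    fn sitemaps_are_extracted() {
        let robots = "Sitemap: https://example.com/sitemap.xml\nUser-agent: *\nDisallow:\n";
        assert_eq!(
            extract_sitemaps(robots),
            vec!["https://example.com/sitemap.xml".to_string()]
        );
    }
}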