use case_insensitive_string::compact_str::CompactString;
use dashmap::DashMap;
use std::sync::OnceLock;
use std::time::{Duration, Instant};
use crate::Client;
/// Entry count at which `get_or_fetch` starts evicting before it inserts a new entry.
const MAX_ENTRIES: usize = 10_000;
/// One cached `robots.txt` body together with the data needed to decide whether it is
/// still usable.
#[derive(Debug, Clone)]
struct RobotsCacheEntry {
    /// Text stored for the domain (the fetched response body when written by `get_or_fetch`).
    rules_text: String,
    /// Moment the entry was created; freshness is measured from here.
    fetched_at: Instant,
    /// How long after `fetched_at` the entry counts as fresh.
    ttl: Duration,
}
impl RobotsCacheEntry {
    /// Reports whether the entry is still within its time-to-live.
    ///
    /// The comparison is strict, so an entry whose `ttl` is zero is never fresh.
    fn is_fresh(&self) -> bool {
        let age = self.fetched_at.elapsed();
        self.ttl > age
    }
}
fn global_cache() -> &'static DashMap<CompactString, RobotsCacheEntry> {
static CACHE: OnceLock<DashMap<CompactString, RobotsCacheEntry>> = OnceLock::new();
CACHE.get_or_init(DashMap::new)
}
/// Returns the `robots.txt` text for `domain`, preferring a fresh cached copy.
///
/// When no fresh entry exists, the file is requested from `https://{domain}/robots.txt`.
/// A successful response is stored under `domain` with the given `ttl` and returned.
///
/// Returns `None` when the fetch fails. Nothing is written to the cache in that case, so
/// an existing stale entry for `domain` is left in place.
///
/// # Eviction
///
/// If the map already holds `MAX_ENTRIES` entries and `domain` is not among them, expired
/// entries are purged first. Should the map still be full afterwards, `MAX_ENTRIES / 10`
/// entries are removed in map iteration order, regardless of their freshness.
pub async fn get_or_fetch(domain: &str, client: &Client, ttl: Duration) -> Option<String> {
    let key = CompactString::new(domain);
    let cache = global_cache();

    // Resolve the lookup in a single expression so the map guard is released before the
    // `.await` below.
    let cached = cache
        .get(&key)
        .filter(|entry| entry.is_fresh())
        .map(|entry| entry.rules_text.clone());
    if cached.is_some() {
        return cached;
    }

    let url = format!("https://{domain}/robots.txt");
    let text = fetch_robots_text(&url, client).await?;

    // Replacing an entry that is already present (e.g. one stored by a concurrent call
    // while this fetch was in flight) does not grow the map, so room only has to be made
    // when the key is new.
    if cache.len() >= MAX_ENTRIES && !cache.contains_key(&key) {
        evict_expired();
        if cache.len() >= MAX_ENTRIES {
            // Collect the keys first: removing while iterating would contend with the
            // iterator's own hold on the map.
            let victims: Vec<CompactString> = cache
                .iter()
                .take(MAX_ENTRIES / 10)
                .map(|entry| entry.key().clone())
                .collect();
            for victim in &victims {
                cache.remove(victim);
            }
        }
    }

    cache.insert(
        key,
        RobotsCacheEntry {
            rules_text: text.clone(),
            fetched_at: Instant::now(),
            ttl,
        },
    );
    Some(text)
}
/// Warms the cache for every entry of `domains`.
///
/// One task per domain is spawned on a `JoinSet`, each calling [`get_or_fetch`] with a
/// clone of `client` and the shared `ttl`. The function returns once all tasks have
/// finished; fetch results and task join outcomes are discarded.
pub async fn prefetch(domains: &[&str], client: &Client, ttl: Duration) {
    let mut tasks = tokio::task::JoinSet::new();

    for &name in domains {
        // Spawned tasks must own their inputs, hence the per-task copies.
        let task_client = client.clone();
        let task_domain = name.to_owned();
        tasks.spawn(async move {
            get_or_fetch(&task_domain, &task_client, ttl).await;
        });
    }

    // Drain the set until it reports that no tasks remain.
    loop {
        if tasks.join_next().await.is_none() {
            break;
        }
    }
}
/// Drops the cached entry for `domain`, if one exists.
///
/// Calling this for a domain that is not cached is a no-op.
pub fn invalidate(domain: &str) {
    let lookup = CompactString::new(domain);
    let cache = global_cache();
    cache.remove(&lookup);
}
/// Removes every entry of the global cache that is no longer fresh.
pub fn evict_expired() {
    global_cache().retain(|_domain, cached| cached.is_fresh());
}
/// Downloads `url` and returns the response body as text.
///
/// Sending the request and reading the body are each limited to ten seconds. `None` is
/// returned when either step times out or fails, or when the response status is not a
/// success status.
async fn fetch_robots_text(url: &str, client: &Client) -> Option<String> {
    // Applied separately to the request phase and to the body-read phase.
    const STEP_TIMEOUT: Duration = Duration::from_secs(10);

    let Ok(Ok(response)) = tokio::time::timeout(STEP_TIMEOUT, client.get(url).send()).await
    else {
        return None;
    };

    if !response.status().is_success() {
        return None;
    }

    match tokio::time::timeout(STEP_TIMEOUT, response.text()).await {
        Ok(Ok(body)) => Some(body),
        _ => None,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an entry stamped with the current instant and a lifetime of `ttl_secs` seconds.
    fn entry_with_ttl(rules: &str, ttl_secs: u64) -> RobotsCacheEntry {
        RobotsCacheEntry {
            rules_text: String::from(rules),
            fetched_at: Instant::now(),
            ttl: Duration::from_secs(ttl_secs),
        }
    }

    #[test]
    fn test_cache_entry_freshness() {
        let entry = entry_with_ttl("User-agent: *\nDisallow: /private", 60);
        assert!(entry.is_fresh());
    }

    #[test]
    fn test_cache_entry_expired() {
        // A zero TTL can never exceed the elapsed time, so the entry is stale at once.
        let entry = entry_with_ttl("User-agent: *\nDisallow:", 0);
        assert!(!entry.is_fresh());
    }

    #[test]
    fn test_invalidate_nonexistent() {
        // Must simply not panic.
        invalidate("nonexistent-robots-test.example.com");
    }

    #[test]
    fn test_evict_expired_removes_stale() {
        let store = global_cache();
        let domain = CompactString::new("evict-test.example.com");
        store.insert(domain.clone(), entry_with_ttl("", 0));

        std::thread::sleep(Duration::from_millis(1));
        evict_expired();

        assert!(store.get(&domain).is_none());
    }

    #[test]
    fn test_global_cache_singleton() {
        let first = global_cache();
        let second = global_cache();
        assert!(std::ptr::eq(first, second));
    }

    #[test]
    fn test_manual_cache_insert_and_read() {
        let store = global_cache();
        let domain = CompactString::new("test-manual.example.com");
        store.insert(domain.clone(), entry_with_ttl("User-agent: *\nAllow: /", 300));

        // The lookup guard is released at the end of this statement, before `remove`.
        let fresh = match store.get(&domain) {
            Some(found) => found.is_fresh(),
            None => false,
        };
        assert!(fresh);

        store.remove(&domain);
    }
}