use std::{
collections::HashMap,
sync::Arc,
time::{Duration, Instant},
};
use texting_robots::Robot;
use tokio::sync::Mutex;
/// Default lifetime of a cached `robots.txt` result: 24 hours.
///
/// Used by `RobotsCache::new`; `RobotsCache::with_ttl` accepts a different value.
const DEFAULT_TTL: Duration = Duration::from_secs(24 * 60 * 60);
/// Result of one `robots.txt` fetch attempt, stored per origin.
struct CacheEntry {
/// Body of the fetched `robots.txt`, or `None` when nothing usable was
/// retrieved (the request failed, the status was not a success, or the body
/// could not be read as text). `None` is treated as "everything allowed".
txt: Option<Arc<String>>,
/// When the fetch attempt finished; compared against the cache TTL to
/// decide whether the entry is still fresh.
fetched_at: Instant,
}
/// Cache of `robots.txt` results used to answer "may this user agent fetch this URL?".
///
/// All methods take `&self`; the map is guarded by an async mutex.
pub struct RobotsCache {
/// Fetch results keyed by an origin string derived from the checked URL.
cache: Mutex<HashMap<String, CacheEntry>>,
/// User-agent name the robots rules are evaluated for.
user_agent: String,
/// How long an entry stays valid before `robots.txt` is fetched again.
ttl: Duration,
}
impl RobotsCache {
    /// Creates a cache that evaluates rules for `user_agent` and keeps each
    /// fetched `robots.txt` for [`DEFAULT_TTL`].
    pub fn new(user_agent: impl Into<String>) -> Self {
        Self::with_ttl(user_agent, DEFAULT_TTL)
    }

    /// Creates a cache that evaluates rules for `user_agent` and keeps each
    /// fetched `robots.txt` for `ttl`.
    pub fn with_ttl(user_agent: impl Into<String>, ttl: Duration) -> Self {
        Self {
            cache: Mutex::new(HashMap::new()),
            user_agent: user_agent.into(),
            ttl,
        }
    }

    /// Returns whether `url` may be fetched according to its site's `robots.txt`.
    ///
    /// The `robots.txt` of the URL's origin (scheme, host and any explicit
    /// port) is fetched with `client` on first use and reused until the entry
    /// is older than the configured TTL.
    ///
    /// This method is deliberately permissive and returns `true` when:
    /// - `url` cannot be parsed or has no host;
    /// - `robots.txt` cannot be retrieved (request error, non-success status,
    ///   unreadable body);
    /// - the retrieved `robots.txt` cannot be parsed.
    ///
    /// A failed retrieval is cached like a successful one, so the origin is
    /// treated as unrestricted until the entry expires.
    ///
    /// The cache lock is not held during the network request, so concurrent
    /// calls for an uncached origin may each perform their own fetch; the
    /// last one to finish overwrites the entry.
    pub async fn is_allowed(&self, client: &reqwest::Client, url: &str) -> bool {
        let Ok(parsed) = url::Url::parse(url) else {
            return true;
        };
        // A URL without a host has no robots.txt location. Requesting
        // "scheme:///robots.txt" could only fail, so answer directly instead
        // of issuing the request and caching a meaningless entry.
        let Some(host) = parsed.host_str() else {
            return true;
        };
        // robots.txt is scoped to scheme + host + port. `port()` is `None`
        // for the scheme's default port, so the default-port key stays
        // "scheme://host" while an explicit port gets its own entry and its
        // own robots.txt URL.
        let origin = match parsed.port() {
            Some(port) => format!("{}://{}:{}", parsed.scheme(), host, port),
            None => format!("{}://{}", parsed.scheme(), host),
        };

        // Fast path: answer from a fresh cache entry. The guard is confined
        // to this block so it is released before any network I/O below.
        {
            let cache = self.cache.lock().await;
            if let Some(entry) = cache
                .get(&origin)
                .filter(|entry| entry.fetched_at.elapsed() < self.ttl)
            {
                return Self::robot_allows(
                    &self.user_agent,
                    entry.txt.as_deref().map(String::as_str),
                    url,
                );
            }
        }

        // Slow path: fetch robots.txt. Any failure collapses to `None`.
        let robots_url = format!("{origin}/robots.txt");
        let txt = match client.get(&robots_url).send().await {
            Ok(response) if response.status().is_success() => response.text().await.ok(),
            _ => None,
        };

        let entry = CacheEntry {
            txt: txt.map(Arc::new),
            fetched_at: Instant::now(),
        };
        let allowed = Self::robot_allows(
            &self.user_agent,
            entry.txt.as_deref().map(String::as_str),
            url,
        );
        self.cache.lock().await.insert(origin, entry);
        allowed
    }

    /// Evaluates `url` against the `robots.txt` body `txt` for `user_agent`.
    ///
    /// Returns `true` when there is no body (`None`) or when the body cannot
    /// be parsed; otherwise returns the parser's verdict for `url`.
    fn robot_allows(user_agent: &str, txt: Option<&str>, url: &str) -> bool {
        match txt {
            None => true,
            Some(content) => Robot::new(user_agent, content.as_bytes())
                .map_or(true, |robot| robot.allowed(url)),
        }
    }
}