use crate::error::{AppError, AppResult};
use std::collections::HashMap;
use std::sync::{Mutex, OnceLock};
/// Process-wide map from host name to the robots.txt body stored for that host.
type Cache = Mutex<HashMap<String, String>>;

/// Returns the shared robots.txt cache, creating it empty on first use.
///
/// Every call yields a reference to the same `'static` instance, so entries
/// inserted through one call are visible through all later calls.
fn cache() -> &'static Cache {
    static CACHE: OnceLock<Cache> = OnceLock::new();
    CACHE.get_or_init(|| Mutex::new(HashMap::new()))
}
/// Builds the HTTPS robots.txt URL for `host`.
///
/// Any leading `/` characters on `host` are dropped before the URL is
/// assembled; the rest of `host` is used verbatim.
fn robots_url(host: &str) -> String {
    let authority = host.trim_start_matches('/');
    ["https://", authority, "/robots.txt"].concat()
}
fn client() -> reqwest::Client {
reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(5))
.user_agent(concat!(
env!("CARGO_PKG_NAME"),
"/",
env!("CARGO_PKG_VERSION")
))
.build()
.expect("reqwest client builder is infallible for these options")
}
/// Returns the robots.txt body for `host`, consulting the shared cache first.
///
/// On a cache miss the body is fetched from the URL produced by `robots_url`:
///
/// * a successful response's text is cached and returned;
/// * a non-success status is cached and returned as an empty body;
/// * a transport error, or an error while reading the response body, is
///   logged and an empty body is returned for this call only.
///
/// Transient failures are deliberately *not* cached: the cache has no expiry,
/// so storing an empty body after one failed request would switch off
/// robots.txt enforcement for that host for the rest of the process lifetime.
///
/// # Panics
///
/// Panics if the cache mutex is poisoned.
async fn fetch_robots(host: &str) -> String {
    // Keep the guard in its own scope so the std mutex is released before the
    // first `.await` below.
    {
        let guard = cache().lock().expect("robots cache poisoned");
        if let Some(body) = guard.get(host) {
            return body.clone();
        }
    }

    let url = robots_url(host);
    let body = match client().get(&url).send().await {
        Ok(resp) if resp.status().is_success() => match resp.text().await {
            Ok(text) => text,
            Err(e) => {
                tracing::warn!(
                    target: "events",
                    host,
                    error = %e,
                    "robots.txt body read failed; failing open"
                );
                // Fail open for this call, but leave the cache untouched so
                // the next call retries.
                return String::new();
            }
        },
        Ok(_) => String::new(),
        Err(e) => {
            tracing::warn!(
                target: "events",
                host,
                error = %e,
                "robots.txt fetch failed; failing open"
            );
            // Same reasoning as above: do not remember a transient failure.
            return String::new();
        }
    };

    cache()
        .lock()
        .expect("robots cache poisoned")
        .insert(host.to_string(), body.clone());
    body
}
/// Checks whether `user_agent` may request `path` on `host` according to the
/// host's robots.txt.
///
/// The robots.txt body is obtained through `fetch_robots`. A body that is
/// empty or whitespace-only is treated as "no rules" and the request is
/// permitted; otherwise the decision is delegated to
/// `robotstxt::DefaultMatcher`.
///
/// # Errors
///
/// Returns `AppError::ProviderUnavailable` when the robots.txt rules forbid
/// the request.
pub async fn check_allowed(host: &str, path: &str, user_agent: &str) -> AppResult<()> {
    let robots = fetch_robots(host).await;
    let permitted = if robots.trim().is_empty() {
        // Nothing to evaluate: fail open.
        true
    } else {
        robotstxt::DefaultMatcher::default().allowed_by_robots(&robots, vec![user_agent], path)
    };
    permitted
        .then_some(())
        .ok_or(AppError::ProviderUnavailable)
}
#[cfg(test)]
mod tests {
    use super::*;

    // The cache is process-global and the test harness runs tests in parallel,
    // so no test may clear it: doing so could drop an entry another test has
    // just seeded and send that test out to the real network. Each test uses
    // its own host key instead.

    /// Builds a host key that no other test in this module uses.
    fn unique_host(label: &str) -> String {
        format!("{}-{}", label, std::process::id())
    }

    /// Stores `body` in the shared cache under `host`, so the code under test
    /// is served from the cache and performs no HTTP request.
    fn seed(host: &str, body: &str) {
        cache()
            .lock()
            .expect("poisoned")
            .insert(host.to_string(), body.to_string());
    }

    #[tokio::test]
    async fn check_allowed_rejects_when_robots_disallows_all() {
        let host = unique_host("disallowed");
        seed(&host, "User-agent: *\nDisallow: /\n");
        let result = check_allowed(&host, "/", "test-ua").await;
        assert!(
            matches!(result, Err(AppError::ProviderUnavailable)),
            "disallow all must forbid /"
        );
    }

    #[tokio::test]
    async fn fetch_robots_caches_body_for_repeated_calls() {
        let host = unique_host("cached");
        seed(&host, "User-agent: *\nAllow: /\n");
        let body = fetch_robots(&host).await;
        assert_eq!(body, "User-agent: *\nAllow: /\n");
        let body2 = fetch_robots(&host).await;
        assert_eq!(body2, body);
        let mut matcher = robotstxt::DefaultMatcher::default();
        let allowed = matcher.allowed_by_robots(&body, vec!["test-ua"], "/");
        assert!(allowed, "Allow: / must permit any path");
    }

    #[tokio::test]
    async fn check_allowed_fails_open_on_empty_body() {
        let host = unique_host("absent");
        seed(&host, "");
        let result = check_allowed(&host, "/anything", "test-ua").await;
        assert!(result.is_ok(), "empty robots body must fail open");
    }
}