use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::{Mutex, OnceCell};
/// The `Disallow` prefixes that apply to one crawler, extracted from a
/// robots.txt body.
///
/// Only `Disallow` values are kept. `Allow` lines are recognised while parsing
/// but never relax a `Disallow`, so the result errs on the side of blocking.
#[derive(Debug, Clone, Default)]
pub(crate) struct Rules {
    /// Path prefixes the crawler must not fetch. Empty entries match nothing.
    disallow: Vec<String>,
}

impl Rules {
    /// Returns a rule set with no restrictions.
    fn allow_all() -> Self {
        Self::default()
    }

    /// Returns `true` unless `path` starts with one of the non-empty
    /// `Disallow` values.
    ///
    /// Values are compared as literal prefixes; the characters `*` and `$`
    /// receive no special treatment here.
    pub(crate) fn is_allowed(&self, path: &str) -> bool {
        !self
            .disallow
            .iter()
            .any(|rule| !rule.is_empty() && path.starts_with(rule.as_str()))
    }

    /// Parses a robots.txt `body` and selects the rules for `ua_token`.
    ///
    /// Agent names are compared case-insensitively (ASCII) and must equal
    /// `ua_token` exactly. Every group naming the crawler contributes its
    /// `Disallow` values; when no group names it, every `*` group contributes
    /// instead. When neither exists the result allows everything.
    ///
    /// `#` starts a comment, lines without a `:` are skipped, and unknown
    /// directives are ignored without affecting group boundaries.
    pub(crate) fn parse(body: &str, ua_token: &str) -> Self {
        let ua_token = ua_token.to_ascii_lowercase();
        // Each finished group: (lower-cased agent names, disallow prefixes).
        let mut groups: Vec<(Vec<String>, Vec<String>)> = Vec::new();
        let mut agents: Vec<String> = Vec::new();
        let mut disallow: Vec<String> = Vec::new();
        // True once the current group has seen a rule line; the next
        // `User-agent` line then starts a new group instead of joining it.
        let mut saw_rule = false;

        for raw in body.lines() {
            let line = raw.split('#').next().unwrap_or("").trim();
            // Blank and comment-only lines contain no ':' and fall out here too.
            let Some((key, value)) = line.split_once(':') else {
                continue;
            };
            let key = key.trim().to_ascii_lowercase();
            let value = value.trim();
            match key.as_str() {
                "user-agent" => {
                    if saw_rule {
                        groups.push((std::mem::take(&mut agents), std::mem::take(&mut disallow)));
                        saw_rule = false;
                    }
                    agents.push(value.to_ascii_lowercase());
                }
                "disallow" => {
                    disallow.push(value.to_owned());
                    saw_rule = true;
                }
                "allow" => {
                    // The value is deliberately dropped, but an `Allow` line is
                    // still a rule: it must close the run of `User-agent` lines
                    // so a following agent does not get merged into this group.
                    saw_rule = true;
                }
                _ => {}
            }
        }
        if !agents.is_empty() || !disallow.is_empty() {
            groups.push((agents, disallow));
        }

        // `Some` records that at least one group matched, even if it carried
        // no `Disallow` values; that distinction decides the wildcard fallback.
        let mut specific: Option<Vec<String>> = None;
        let mut wildcard: Option<Vec<String>> = None;
        for (agents, rules) in groups {
            let bucket = if agents.iter().any(|a| a == &ua_token) {
                &mut specific
            } else if agents.iter().any(|a| a == "*") {
                &mut wildcard
            } else {
                continue;
            };
            // All matching groups are combined rather than keeping only the first.
            bucket.get_or_insert_with(Vec::new).extend(rules);
        }

        Self {
            disallow: specific.or(wildcard).unwrap_or_default(),
        }
    }
}
#[derive(Debug, Clone)]
pub(crate) struct RobotsCache {
client: reqwest::Client,
ua_token: String,
cells: Arc<Mutex<HashMap<String, Arc<OnceCell<Rules>>>>>,
}
impl RobotsCache {
pub(crate) fn new(client: reqwest::Client, ua_token: impl Into<String>) -> Self {
Self {
client,
ua_token: ua_token.into(),
cells: Arc::new(Mutex::new(HashMap::new())),
}
}
pub(crate) async fn allowed(&self, origin: &str, path: &str) -> bool {
let cell = {
let mut cells = self.cells.lock().await;
cells
.entry(origin.to_owned())
.or_insert_with(|| Arc::new(OnceCell::new()))
.clone()
};
let rules = cell.get_or_init(|| self.fetch(origin.to_owned())).await;
rules.is_allowed(path)
}
async fn fetch(&self, origin: String) -> Rules {
let url = format!("{origin}/robots.txt");
match self.client.get(&url).send().await {
Ok(resp) if resp.status().is_success() => {
let body = resp.text().await.unwrap_or_default();
Rules::parse(&body, &self.ua_token)
}
_ => Rules::allow_all(),
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A `Disallow` value in the `*` group blocks paths that start with it.
    #[test]
    fn wildcard_group_disallow_prefix() {
        let rules = Rules::parse("User-agent: *\nDisallow: /private", "adler");
        assert!(!rules.is_allowed("/private/x"));
        assert!(rules.is_allowed("/public"));
    }

    /// A group naming the crawler wins over the `*` group.
    #[test]
    fn specific_group_preferred_over_wildcard() {
        let robots_txt = "User-agent: adler\nDisallow: /\n\nUser-agent: *\nDisallow:";
        let rules = Rules::parse(robots_txt, "adler");
        assert!(!rules.is_allowed("/anything"));
    }

    /// Without a group naming the crawler, the `*` group applies and other
    /// crawlers' groups do not.
    #[test]
    fn falls_back_to_wildcard_when_no_specific_group() {
        let robots_txt = "User-agent: googlebot\nDisallow: /g\n\nUser-agent: *\nDisallow: /w";
        let rules = Rules::parse(robots_txt, "adler");
        assert!(!rules.is_allowed("/w/x"));
        assert!(rules.is_allowed("/g/x"));
    }

    /// An empty `Disallow:` value blocks nothing.
    #[test]
    fn empty_disallow_allows_everything() {
        let rules = Rules::parse("User-agent: *\nDisallow:", "adler");
        assert!(rules.is_allowed("/anything"));
    }

    /// Full-line comments, trailing comments and blank lines do not disturb parsing.
    #[test]
    fn comments_and_blank_lines_ignored() {
        let robots_txt = "# a comment\n\nUser-agent: * # us\nDisallow: /no # nope\n";
        let rules = Rules::parse(robots_txt, "adler");
        assert!(!rules.is_allowed("/no"));
        assert!(rules.is_allowed("/ok"));
    }

    /// `Allow` never re-opens a path covered by `Disallow`.
    #[test]
    fn allow_lines_are_ignored_conservatively() {
        let robots_txt = "User-agent: *\nDisallow: /u\nAllow: /u/public";
        let rules = Rules::parse(robots_txt, "adler");
        assert!(!rules.is_allowed("/u/public"));
    }

    /// The allow-all rule set permits any path.
    #[test]
    fn missing_robots_allows_all() {
        let rules = Rules::allow_all();
        assert!(rules.is_allowed("/anything"));
    }

    /// The cache downloads robots.txt from the origin and applies its rules.
    #[tokio::test]
    async fn cache_fetches_and_applies_rules() {
        use wiremock::matchers::{method, path};
        use wiremock::{Mock, MockServer, ResponseTemplate};

        let server = MockServer::start().await;
        let robots = ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no");
        Mock::given(method("GET"))
            .and(path("/robots.txt"))
            .respond_with(robots)
            .mount(&server)
            .await;

        let cache = RobotsCache::new(reqwest::Client::new(), "adler");
        let base = server.uri();
        let blocked = cache.allowed(&base, "/no/alice").await;
        assert!(!blocked);
        let permitted = cache.allowed(&base, "/yes/alice").await;
        assert!(permitted);
    }

    /// A 404 for robots.txt is treated as "everything allowed".
    #[tokio::test]
    async fn missing_robots_txt_allows() {
        use wiremock::matchers::method;
        use wiremock::{Mock, MockServer, ResponseTemplate};

        let server = MockServer::start().await;
        let not_found = ResponseTemplate::new(404);
        Mock::given(method("GET"))
            .respond_with(not_found)
            .mount(&server)
            .await;

        let cache = RobotsCache::new(reqwest::Client::new(), "adler");
        let permitted = cache.allowed(&server.uri(), "/anything").await;
        assert!(permitted);
    }
}