use crate::http_client::HttpClient;
use robots_txt::{Robots, matcher::SimpleMatcher};
use std::sync::OnceLock;
use url::Url;
/// Robots.txt state for one host, used by `is_allowed` to decide whether a URL may be visited.
#[derive(Debug)]
pub struct RobotsTxt {
/// Raw robots.txt body as returned by the HTTP client; `None` when the fetch failed.
robots_text: Option<String>,
/// Host key the rules belong to: the target URL's domain name, or `host[:port]` when the
/// URL has no domain name.
domain: String,
/// Lazily caches whether `robots_text` is present; initialised on the first `is_allowed` call.
parsed_robots: OnceLock<bool>,
}
impl RobotsTxt {
    /// Returns `host[:port]` for `url`, or `None` when the URL has no host.
    ///
    /// The port is only present when it is explicit and not the scheme default,
    /// because that is what `Url::port` reports.
    fn authority(url: &Url) -> Option<String> {
        url.host_str().map(|host| match url.port() {
            Some(port) => format!("{}:{}", host, port),
            None => host.to_string(),
        })
    }

    /// Returns the key used to tie a URL to a fetched robots.txt.
    ///
    /// This is the domain name when the host is one, otherwise `host[:port]`
    /// (for example for IP-address hosts). `None` means the URL has no host at all.
    fn host_key(url: &Url) -> Option<String> {
        url.domain()
            .map(str::to_string)
            .or_else(|| Self::authority(url))
    }

    /// Fetches the robots.txt that applies to `target_url`.
    ///
    /// When `custom_robots_path` is given it must be an absolute URL and is fetched
    /// as-is. Otherwise `/robots.txt` is requested from the scheme, host and port
    /// of `target_url`.
    ///
    /// A failed HTTP fetch is not an error: it is logged and yields a value without
    /// rules, for which `is_allowed` permits every path on the host.
    ///
    /// # Errors
    ///
    /// Returns a message when `target_url` has no host, when `custom_robots_path`
    /// is not a valid URL, or when the default robots.txt URL cannot be built.
    pub async fn fetch(
        http_client: &HttpClient,
        target_url: &Url,
        custom_robots_path: Option<&str>,
    ) -> Result<Self, String> {
        let no_host = || format!("Target URL {} does not have a valid domain or host.", target_url);
        let domain = Self::host_key(target_url).ok_or_else(no_host)?;

        let robots_url = match custom_robots_path {
            Some(path) => Url::parse(path).map_err(|_| {
                format!(
                    "Custom robots_path '{}' is not a valid URL. Local file paths are not yet supported for robots.txt.",
                    path
                )
            })?,
            None => {
                // Build the base from host *and* port: using only the domain name would
                // silently drop a non-default port and query the wrong server.
                let authority = Self::authority(target_url).ok_or_else(no_host)?;
                let base_url_str = format!("{}://{}", target_url.scheme(), authority);
                Url::parse(&base_url_str)
                    .map_err(|e| format!("Failed to parse base URL: {}", e))?
                    .join("/robots.txt")
                    .map_err(|e| format!("Failed to join /robots.txt: {}", e))?
            }
        };

        tracing::info!("Fetching robots.txt from: {}", robots_url);
        let robots_text = match http_client.fetch(robots_url).await {
            Ok(response) => {
                tracing::debug!("robots.txt content for {}:\n{}", domain, response);
                Some(response)
            }
            Err(e) => {
                // Best effort: a missing or unreachable robots.txt means "no rules".
                tracing::warn!("Error fetching robots.txt for {}: {}", domain, e);
                None
            }
        };

        Ok(RobotsTxt {
            robots_text,
            domain,
            parsed_robots: OnceLock::new(),
        })
    }

    /// Reports whether `user_agent` may visit `url_to_check` according to the fetched rules.
    ///
    /// * A URL without any host (for example `file:///...`) is always allowed.
    /// * A URL whose host key differs from the one this robots.txt was fetched for
    ///   is rejected.
    /// * When no robots.txt body is available, every path on the host is allowed.
    ///
    /// Only the path component of the URL is matched against the rules.
    pub fn is_allowed(&self, url_to_check: &Url, user_agent: &str) -> bool {
        // Derive the key exactly like `fetch` does so IP-address hosts are matched too.
        let Some(host_key) = Self::host_key(url_to_check) else {
            tracing::warn!(
                "URL {} has no host; skipping robots.txt check for {}",
                url_to_check,
                self.domain
            );
            return true;
        };
        if host_key != self.domain {
            tracing::warn!(
                "Checking URL {} against robots.txt for a different domain {}",
                url_to_check,
                self.domain
            );
            return false;
        }

        let has_robots = *self.parsed_robots.get_or_init(|| self.robots_text.is_some());
        let text = match self.robots_text.as_deref() {
            Some(text) if has_robots => text,
            _ => return true,
        };

        // The parsed form borrows from `text`, so it is rebuilt per call rather than stored.
        let robots = Robots::from_str_lossy(text);
        let section = robots.choose_section(user_agent);
        let matcher = SimpleMatcher::new(&section.rules);
        matcher.check_path(url_to_check.path())
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use httpmock::{Method::GET, MockServer};
    use std::sync::Once;
    use tokio::runtime::Runtime;

    static INIT: Once = Once::new();

    /// Installs the tracing subscriber at most once for the whole test binary.
    fn init_tracing() {
        INIT.call_once(|| {
            let _ = tracing_subscriber::fmt::try_init();
        });
    }

    /// Sets up tracing, then drives `future` to completion on a fresh Tokio runtime.
    fn run_async<F: std::future::Future>(future: F) -> F::Output {
        init_tracing();
        let runtime = Runtime::new().unwrap();
        runtime.block_on(future)
    }

    /// HTTP client configured the same way for every fetch test.
    fn new_client() -> HttpClient {
        HttpClient::new_reqwest(30.0).unwrap()
    }

    /// Root URL of the mock server.
    fn base_url(server: &MockServer) -> Url {
        Url::parse(&format!("http://{}", server.address())).unwrap()
    }

    /// Parses a URL literal that is known to be valid.
    fn parse_url(raw: &str) -> Url {
        Url::parse(raw).unwrap()
    }

    /// Builds a `RobotsTxt` for `example.com` directly, bypassing the network.
    fn example_robots(text: Option<&str>) -> RobotsTxt {
        RobotsTxt {
            robots_text: text.map(|body| body.to_string()),
            domain: "example.com".to_string(),
            parsed_robots: OnceLock::new(),
        }
    }

    #[test]
    fn test_fetch_successful() {
        run_async(async {
            let server = MockServer::start_async().await;
            let expected_body = "User-agent: *\nDisallow: /private/";
            let robots_mock = server
                .mock_async(|when, then| {
                    when.method(GET).path("/robots.txt");
                    then.status(200).body(expected_body);
                })
                .await;

            let client = new_client();
            let target = base_url(&server);
            let fetched = RobotsTxt::fetch(&client, &target, None).await.unwrap();

            assert_eq!(fetched.domain, server.address().to_string());
            assert_eq!(fetched.robots_text.as_deref(), Some(expected_body));
            robots_mock.assert_async().await;
        });
    }

    #[test]
    fn test_fetch_not_found() {
        run_async(async {
            let server = MockServer::start_async().await;
            let robots_mock = server
                .mock_async(|when, then| {
                    when.method(GET).path("/robots.txt");
                    then.status(404);
                })
                .await;

            let client = new_client();
            let target = base_url(&server);
            let fetched = RobotsTxt::fetch(&client, &target, None).await.unwrap();

            assert_eq!(fetched.domain, server.address().to_string());
            assert!(fetched.robots_text.is_none());
            robots_mock.assert_async().await;
        });
    }

    #[test]
    fn test_fetch_custom_robots_path() {
        run_async(async {
            let server = MockServer::start_async().await;
            let expected_body = "User-agent: *\nDisallow: /foo/";
            let robots_mock = server
                .mock_async(|when, then| {
                    when.method(GET).path("/custom-robots.txt");
                    then.status(200).body(expected_body);
                })
                .await;

            let client = new_client();
            let custom_location = format!("http://{}/custom-robots.txt", server.address());
            let target = base_url(&server);
            let fetched = RobotsTxt::fetch(&client, &target, Some(&custom_location))
                .await
                .unwrap();

            assert_eq!(fetched.domain, server.address().to_string());
            assert_eq!(fetched.robots_text.as_deref(), Some(expected_body));
            robots_mock.assert_async().await;
        });
    }

    #[test]
    fn test_fetch_invalid_custom_path() {
        run_async(async {
            let client = new_client();
            let target = parse_url("http://example.com");
            let outcome = RobotsTxt::fetch(&client, &target, Some("/not/a/url")).await;

            assert!(outcome.is_err());
            assert!(outcome.unwrap_err().contains("not a valid URL"));
        });
    }

    #[test]
    fn test_robots_txt_parsing_basic_direct() {
        let robots = example_robots(Some("User-agent: *\nDisallow: /private/\nAllow: /public/"));

        let private_url = parse_url("http://example.com/private/something");
        let public_url = parse_url("http://example.com/public/page");
        let other_url = parse_url("http://example.com/anywhere_else");

        assert!(!robots.is_allowed(&private_url, "*"));
        assert!(robots.is_allowed(&public_url, "*"));
        assert!(robots.is_allowed(&other_url, "*"));
    }

    #[test]
    fn test_is_allowed_with_no_robots_text() {
        let robots = example_robots(None);
        let target = parse_url("http://example.com/some/path");

        assert!(robots.is_allowed(&target, "test-agent"));
    }

    #[test]
    fn test_is_allowed_with_simple_disallow_text() {
        let rules = "User-agent: test-agent
Disallow: /forbidden/";
        let robots = example_robots(Some(rules));

        let allowed_url = parse_url("http://example.com/allowed/path");
        let disallowed_url = parse_url("http://example.com/forbidden/path");

        assert!(robots.is_allowed(&allowed_url, "test-agent"));
        assert!(!robots.is_allowed(&disallowed_url, "test-agent"));
        // The rule is scoped to `test-agent`, so other agents are unaffected.
        assert!(robots.is_allowed(&disallowed_url, "another-agent"));
    }

    #[test]
    fn test_is_allowed_different_domain() {
        let robots = example_robots(Some("User-agent: *\nDisallow: /private/"));
        let foreign_url = parse_url("http://other.com/private/");

        assert!(!robots.is_allowed(&foreign_url, "*"));
    }

    #[test]
    fn test_is_allowed_url_without_domain() {
        let robots = example_robots(Some("User-agent: *\nDisallow: /private/"));
        let hostless_url = parse_url("file:///private/");

        assert!(robots.is_allowed(&hostless_url, "*"));
    }

    #[test]
    fn test_is_allowed_specific_user_agent() {
        let robots = example_robots(Some(
            "User-agent: special-bot\nDisallow: /blocked/\nUser-agent: *\nAllow: /",
        ));

        let blocked_url = parse_url("http://example.com/blocked/page");
        let allowed_url = parse_url("http://example.com/other/page");

        assert!(!robots.is_allowed(&blocked_url, "special-bot"));
        assert!(robots.is_allowed(&blocked_url, "other-bot"));
        assert!(robots.is_allowed(&allowed_url, "special-bot"));
    }

    #[test]
    fn test_is_allowed_empty_robots_txt() {
        let robots = example_robots(Some(""));
        let target = parse_url("http://example.com/any/path");

        assert!(robots.is_allowed(&target, "*"));
    }
}