use std::collections::HashMap;
use crate::session::SessionManager;
/// Caches parsed `robots.txt` rules and answers crawl-permission queries.
///
/// Rules are fetched at most once per cache key: a key that is already
/// present is never re-fetched, and nothing in this type evicts entries.
pub struct RobotsTxtManager {
/// Parsed rules keyed by the string that `extract_domain` returns for a URL.
cache: HashMap<String, RobotsTxtRules>,
}
/// The subset of one `robots.txt` file that this module keeps.
///
/// Derives the usual data-type traits so values can be printed in test
/// failures, compared, copied and default-constructed (no rules, no delay).
#[derive(Debug, Clone, Default, PartialEq)]
struct RobotsTxtRules {
    /// Values of the retained `Disallow` directives, in file order.
    disallowed: Vec<String>,
    /// Numeric value of the retained `Crawl-delay` directive, if any
    /// (presumably seconds, as is conventional; confirm against callers).
    crawl_delay: Option<f64>,
}
impl Default for RobotsTxtManager {
fn default() -> Self {
Self::new()
}
}
impl RobotsTxtManager {
    /// Creates a manager with an empty rules cache.
    pub fn new() -> Self {
        Self {
            cache: HashMap::new(),
        }
    }

    /// Reports whether `url` may be fetched according to the cached rules for its host.
    ///
    /// Returns `true` when no host can be extracted from `url`. Otherwise the
    /// rules for the host are loaded (fetching them via `session_manager` on
    /// first use) and the URL's path, plus `?query` when a query is present,
    /// is tested against every stored `Disallow` pattern. Only `Disallow`
    /// patterns are consulted.
    ///
    /// Patterns are matched from the start of the path; `*` matches any run of
    /// characters and a trailing `$` anchors the pattern to the end, as
    /// described in RFC 9309.
    pub async fn can_fetch(
        &mut self,
        url: &str,
        sid: &str,
        session_manager: &SessionManager,
    ) -> bool {
        let Some(domain) = extract_domain(url) else {
            return true;
        };
        // Rules may mention the query string (e.g. `Disallow: /*?`), so the
        // match target is the path followed by `?query` when one exists.
        let target = url::Url::parse(url)
            .map(|parsed| match parsed.query() {
                Some(query) => format!("{}?{query}", parsed.path()),
                None => parsed.path().to_owned(),
            })
            .unwrap_or_else(|_| "/".to_owned());
        let rules = self.get_or_fetch(&domain, sid, session_manager).await;
        !rules
            .disallowed
            .iter()
            .any(|pattern| Self::pattern_matches(pattern, &target))
    }

    /// Returns the cached `Crawl-delay` for the host of `url`.
    ///
    /// Yields `None` when no host can be extracted from `url` or when the
    /// host's rules carry no crawl delay. Rules are fetched on first use.
    pub async fn get_crawl_delay(
        &mut self,
        url: &str,
        sid: &str,
        session_manager: &SessionManager,
    ) -> Option<f64> {
        let domain = extract_domain(url)?;
        self.get_or_fetch(&domain, sid, session_manager)
            .await
            .crawl_delay
    }

    /// Loads the rules for every host that appears in `urls`.
    ///
    /// URLs without an extractable host are skipped. Hosts are processed one
    /// after another; a host that occurs several times is fetched only once
    /// because `get_or_fetch` consults the cache before fetching.
    pub async fn prefetch(&mut self, urls: &[String], sid: &str, session_manager: &SessionManager) {
        for domain in urls.iter().filter_map(|url| extract_domain(url)) {
            self.get_or_fetch(&domain, sid, session_manager).await;
        }
    }

    /// Returns the cached rules for `domain`, fetching and caching them first if absent.
    async fn get_or_fetch(
        &mut self,
        domain: &str,
        sid: &str,
        session_manager: &SessionManager,
    ) -> &RobotsTxtRules {
        // `contains_key` + `insert` (rather than an early `return` from `get`)
        // keeps the borrow checker happy and avoids allocating a key on hits.
        if !self.cache.contains_key(domain) {
            let rules = fetch_and_parse(domain, sid, session_manager).await;
            self.cache.insert(domain.to_owned(), rules);
        }
        self.cache
            .get(domain)
            .expect("rules for this domain were cached or inserted just above")
    }

    /// Tests a single robots.txt path pattern against `target`.
    ///
    /// A pattern without `*` or a trailing `$` is a plain prefix test.
    fn pattern_matches(pattern: &str, target: &str) -> bool {
        // A trailing `$` means the pattern must consume the whole target.
        let (pattern, anchored) = match pattern.strip_suffix('$') {
            Some(stripped) => (stripped, true),
            None => (pattern, false),
        };

        // Text between `*`s must appear literally and in order.
        let mut literals = pattern.split('*');
        let prefix = literals.next().unwrap_or("");
        let Some(mut rest) = target.strip_prefix(prefix) else {
            return false;
        };

        let mut literals: Vec<&str> = literals.collect();
        if literals.is_empty() {
            // No wildcard at all: prefix match, or exact match when anchored.
            return !anchored || rest.is_empty();
        }

        // When anchored, the final literal has to sit at the very end of the
        // target; every earlier literal is matched at its leftmost position,
        // which leaves the most room for the ones that follow.
        let suffix = if anchored { literals.pop() } else { None };
        for literal in literals {
            let Some(at) = rest.find(literal) else {
                return false;
            };
            rest = &rest[at + literal.len()..];
        }
        suffix.map_or(true, |suffix| rest.ends_with(suffix))
    }
}
/// Downloads `https://{domain}/robots.txt` and parses it into rules.
///
/// When `sid` is empty, the session manager's default session id is used,
/// falling back to the literal id `"default"`. Any failure (unknown session,
/// request error, or a response for which `is_success()` is false) is treated
/// as an empty file, so the returned rules then contain no restrictions.
async fn fetch_and_parse(
    domain: &str,
    sid: &str,
    session_manager: &SessionManager,
) -> RobotsTxtRules {
    let robots_url = format!("https://{domain}/robots.txt");
    let session_id = if sid.is_empty() {
        session_manager.default_session_id().unwrap_or("default")
    } else {
        sid
    };

    // NOTE(review): the looked-up session is only used as an existence check
    // here; `fetch` is not handed the session or its id. Confirm that is intended.
    let content = match session_manager.get(session_id) {
        Err(_) => String::new(),
        Ok(_session) => {
            let request = crate::request::Request::new(&robots_url);
            session_manager
                .fetch(&request)
                .await
                .ok()
                .filter(|response| response.is_success())
                .map(|response| String::from_utf8_lossy(&response.body).into_owned())
                .unwrap_or_default()
        }
    };

    parse_robots_txt(&content)
}
/// Parses `robots.txt` text and keeps the rules addressed to the `*` user agent.
///
/// Handling details:
/// - A leading UTF-8 byte-order mark is skipped.
/// - Everything from `#` to the end of a line is a comment and is dropped.
/// - Directive names are matched case-insensitively.
/// - Consecutive `User-agent` lines open one group; the group's rules apply
///   to `*` if any of those lines names `*`. The first rule line
///   (`Disallow`, `Allow`, `Crawl-delay`) closes the run, so a later
///   `User-agent` line starts a new group. Unrecognised directives do not
///   close the run.
/// - Non-empty `Disallow` values from `*` groups are collected in file order.
///   `Allow` values are not recorded.
/// - `Crawl-delay` is kept only when it parses as a finite, non-negative
///   number; the last such value in a `*` group wins.
fn parse_robots_txt(content: &str) -> RobotsTxtRules {
    let mut disallowed = Vec::new();
    let mut crawl_delay = None;
    // Whether the group currently being read applies to the `*` agent.
    let mut in_wildcard_agent = false;
    // Whether we are still inside the run of `User-agent` lines that opens a group.
    let mut in_agent_header = false;

    // A BOM is not whitespace, so `trim` would leave it glued to the first key.
    let content = content.strip_prefix('\u{feff}').unwrap_or(content);

    for raw_line in content.lines() {
        let line = raw_line
            .split_once('#')
            .map_or(raw_line, |(before_comment, _)| before_comment)
            .trim();
        if line.is_empty() {
            continue;
        }
        let Some((key, value)) = line.split_once(':') else {
            continue;
        };
        let key = key.trim().to_ascii_lowercase();
        let value = value.trim();

        match key.as_str() {
            "user-agent" => {
                if !in_agent_header {
                    // First agent line after rules: a fresh group begins.
                    in_wildcard_agent = false;
                    in_agent_header = true;
                }
                in_wildcard_agent |= value == "*";
            }
            "disallow" | "allow" => {
                in_agent_header = false;
                if key == "disallow" && in_wildcard_agent && !value.is_empty() {
                    disallowed.push(value.to_owned());
                }
            }
            "crawl-delay" => {
                in_agent_header = false;
                if in_wildcard_agent {
                    // `f64::from_str` accepts "nan", "inf" and negatives, none
                    // of which is a usable delay.
                    let parsed = value
                        .parse::<f64>()
                        .ok()
                        .filter(|delay| delay.is_finite() && *delay >= 0.0);
                    if let Some(delay) = parsed {
                        crawl_delay = Some(delay);
                    }
                }
            }
            _ => {}
        }
    }

    RobotsTxtRules {
        disallowed,
        crawl_delay,
    }
}
/// Returns the host of `url`, followed by `:port` when the URL names an
/// explicit port that is not the default for its scheme.
///
/// Returns `None` when `url` does not parse or has no host. The port is
/// kept because a server on another port serves its own `robots.txt`;
/// dropping it would fetch and cache rules from the wrong server. URLs on the
/// scheme's default port yield the bare host.
fn extract_domain(url: &str) -> Option<String> {
    let parsed = url::Url::parse(url).ok()?;
    let host = parsed.host_str()?;
    Some(match parsed.port() {
        Some(port) => format!("{host}:{port}"),
        None => host.to_owned(),
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A `*` group yields its disallow list and crawl delay.
    #[test]
    fn parse_robots_txt_basic() {
        let input = "User-agent: *\nDisallow: /admin\nDisallow: /private\nCrawl-delay: 2\n";
        let RobotsTxtRules {
            disallowed,
            crawl_delay,
        } = parse_robots_txt(input);
        assert_eq!(disallowed, vec!["/admin", "/private"]);
        assert_eq!(crawl_delay, Some(2.0));
    }

    /// Empty input produces no rules at all.
    #[test]
    fn parse_robots_txt_empty() {
        let parsed = parse_robots_txt("");
        assert_eq!(parsed.disallowed.len(), 0);
        assert_eq!(parsed.crawl_delay, None);
    }

    /// Rules addressed only to a named agent are not collected.
    #[test]
    fn parse_robots_txt_specific_agent_ignored() {
        let parsed = parse_robots_txt("User-agent: Googlebot\nDisallow: /secret\n");
        assert_eq!(parsed.disallowed.len(), 0);
    }

    /// The host is extracted from a valid URL; unparsable input gives `None`.
    #[test]
    fn extract_domain_works() {
        let host = extract_domain("https://example.com/page");
        assert_eq!(host.as_deref(), Some("example.com"));
        assert!(extract_domain("not-a-url").is_none());
    }
}