use crate::error::{Result, ScrapeError};
use reqwest::Client;
use robotstxt::DefaultMatcher;
use std::time::Duration;
use url::Url;
/// Checks whether `user_agent` may fetch `url` according to the site's `robots.txt`.
///
/// The `robots.txt` file is requested from the root of the URL's origin
/// (scheme, host and any explicit non-default port) with a 5 second timeout.
/// The check fails open: if the request cannot be sent, or the server answers
/// with a non-success status, the URL is reported as allowed.
///
/// # Errors
///
/// - [`ScrapeError::InvalidUrl`] if `url` cannot be parsed or has no host.
/// - [`ScrapeError::Internal`] if the HTTP client cannot be built or the
///   `robots.txt` response body cannot be read.
pub async fn is_allowed(url: &str, user_agent: &str) -> Result<bool> {
    let parsed_url =
        Url::parse(url).map_err(|e| ScrapeError::InvalidUrl(format!("Invalid URL: {}", e)))?;
    let host = parsed_url
        .host_str()
        .ok_or_else(|| ScrapeError::InvalidUrl("No host in URL".to_string()))?;

    // `port()` is `Some` only for an explicit, non-default port. It has to be
    // kept, otherwise the robots.txt of a different origin would be consulted
    // (e.g. `http://host/robots.txt` for a page on `http://host:8080`).
    let robots_url = match parsed_url.port() {
        Some(port) => format!("{}://{}:{}/robots.txt", parsed_url.scheme(), host, port),
        None => format!("{}://{}/robots.txt", parsed_url.scheme(), host),
    };

    let client = Client::builder()
        .timeout(Duration::from_secs(5))
        .build()
        .map_err(|e| ScrapeError::Internal(format!("Failed to create HTTP client: {}", e)))?;

    // Fail open: an unreachable robots.txt does not block the fetch.
    let Ok(response) = client.get(&robots_url).send().await else {
        return Ok(true);
    };

    // Fail open on any non-2xx answer as well (missing file, server error, ...).
    if !response.status().is_success() {
        return Ok(true);
    }

    let robots_txt = response
        .text()
        .await
        .map_err(|e| ScrapeError::Internal(format!("Failed to read robots.txt: {}", e)))?;

    // Hand the matcher the whole URL rather than only the path, so that rules
    // which target the query string (e.g. `Disallow: /*?sort=`) are honoured.
    let mut matcher = DefaultMatcher::default();
    Ok(matcher.one_agent_allowed_by_robots(&robots_txt, user_agent, parsed_url.as_str()))
}
/// Runs the robots.txt check for `url` using the fixed user agent `"Essence"`.
///
/// This is a convenience wrapper around [`is_allowed`]; the result and any
/// error are returned unchanged.
pub async fn is_allowed_default(url: &str) -> Result<bool> {
    const DEFAULT_USER_AGENT: &str = "Essence";

    let verdict = is_allowed(url, DEFAULT_USER_AGENT).await;
    verdict
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A page URL on the default port maps to `/robots.txt` at the root of
    /// the same scheme and host.
    #[test]
    fn test_robots_url_construction() {
        let page = Url::parse("https://example.com/page").unwrap();

        let scheme = page.scheme();
        let host = page.host_str().unwrap();
        let built = format!("{}://{}/robots.txt", scheme, host);

        assert_eq!(built, "https://example.com/robots.txt");
    }
}