1use crate::error::{Result, ScrapeError};
2use reqwest::Client;
3use robotstxt::DefaultMatcher;
4use std::time::Duration;
5use url::Url;
6
7pub async fn is_allowed(url: &str, user_agent: &str) -> Result<bool> {
9 let parsed_url =
10 Url::parse(url).map_err(|e| ScrapeError::InvalidUrl(format!("Invalid URL: {}", e)))?;
11
12 let robots_url = format!(
14 "{}://{}/robots.txt",
15 parsed_url.scheme(),
16 parsed_url
17 .host_str()
18 .ok_or_else(|| ScrapeError::InvalidUrl("No host in URL".to_string()))?
19 );
20
21 let client = Client::builder()
23 .timeout(Duration::from_secs(5))
24 .build()
25 .map_err(|e| ScrapeError::Internal(format!("Failed to create HTTP client: {}", e)))?;
26
27 let response = match client.get(&robots_url).send().await {
28 Ok(resp) => resp,
29 Err(_) => {
30 return Ok(true);
32 }
33 };
34
35 if !response.status().is_success() {
36 return Ok(true);
38 }
39
40 let robots_txt = response
41 .text()
42 .await
43 .map_err(|e| ScrapeError::Internal(format!("Failed to read robots.txt: {}", e)))?;
44
45 let mut matcher = DefaultMatcher::default();
47 let path = parsed_url.path();
48
49 Ok(matcher.one_agent_allowed_by_robots(&robots_txt, user_agent, path))
50}
51
52pub async fn is_allowed_default(url: &str) -> Result<bool> {
54 is_allowed(url, "Essence").await
55}
56
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that the scheme and host of a page URL combine into the expected
    /// robots.txt URL.
    #[test]
    fn test_robots_url_construction() {
        let page = Url::parse("https://example.com/page").unwrap();

        let scheme = page.scheme();
        let host = page.host_str().unwrap();
        let built = format!("{}://{}/robots.txt", scheme, host);

        assert_eq!(built, "https://example.com/robots.txt");
    }
}