essence/utils/
robots.rs

1use crate::error::{Result, ScrapeError};
2use reqwest::Client;
3use robotstxt::DefaultMatcher;
4use std::time::Duration;
5use url::Url;
6
7/// Check if a URL is allowed by robots.txt
8pub async fn is_allowed(url: &str, user_agent: &str) -> Result<bool> {
9    let parsed_url =
10        Url::parse(url).map_err(|e| ScrapeError::InvalidUrl(format!("Invalid URL: {}", e)))?;
11
12    // Construct robots.txt URL
13    let robots_url = format!(
14        "{}://{}/robots.txt",
15        parsed_url.scheme(),
16        parsed_url
17            .host_str()
18            .ok_or_else(|| ScrapeError::InvalidUrl("No host in URL".to_string()))?
19    );
20
21    // Fetch robots.txt
22    let client = Client::builder()
23        .timeout(Duration::from_secs(5))
24        .build()
25        .map_err(|e| ScrapeError::Internal(format!("Failed to create HTTP client: {}", e)))?;
26
27    let response = match client.get(&robots_url).send().await {
28        Ok(resp) => resp,
29        Err(_) => {
30            // If robots.txt doesn't exist or can't be fetched, allow by default
31            return Ok(true);
32        }
33    };
34
35    if !response.status().is_success() {
36        // If robots.txt doesn't exist, allow by default
37        return Ok(true);
38    }
39
40    let robots_txt = response
41        .text()
42        .await
43        .map_err(|e| ScrapeError::Internal(format!("Failed to read robots.txt: {}", e)))?;
44
45    // Parse and check robots.txt
46    let mut matcher = DefaultMatcher::default();
47    let path = parsed_url.path();
48
49    Ok(matcher.one_agent_allowed_by_robots(&robots_txt, user_agent, path))
50}
51
52/// Check robots.txt with default user agent
53pub async fn is_allowed_default(url: &str) -> Result<bool> {
54    is_allowed(url, "Essence").await
55}
56
#[cfg(test)]
mod tests {
    use super::*;

    /// The robots.txt location is the page's scheme and host followed by
    /// `/robots.txt`, regardless of the page's own path.
    #[test]
    fn test_robots_url_construction() {
        let page = Url::parse("https://example.com/page").unwrap();
        let scheme = page.scheme();
        let host = page.host_str().unwrap();

        let actual = format!("{}://{}/robots.txt", scheme, host);

        assert_eq!(actual, "https://example.com/robots.txt");
    }
}
essence/utils/robots.rs

essence/utils/
robots.rs