omnivore_core/crawler/robots.rs

use crate::{Error, Result};
use dashmap::DashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
use url::Url;

/// Checks whether URLs may be crawled according to each site's robots.txt,
/// caching the fetched file per domain.
pub struct RobotsChecker {
    cache: Arc<DashMap<String, CachedRobots>>,
    client: reqwest::Client,
}

struct CachedRobots {
    robots_txt: String,
    fetched_at: Instant,
    ttl: Duration,
}

impl RobotsChecker {
    /// Creates a checker with a 10-second request timeout. The user agent is
    /// accepted for API compatibility but not used yet; it becomes relevant
    /// once robots.txt group matching and request headers are implemented.
    pub fn new(_user_agent: String) -> Self {
        let client = reqwest::Client::builder()
            .timeout(Duration::from_secs(10))
            .build()
            .expect("Failed to build HTTP client");

        Self {
            cache: Arc::new(DashMap::new()),
            client,
        }
    }

    /// Returns whether `url` may be crawled. The domain's robots.txt is fetched
    /// on a cache miss (or once the cached copy expires) and cached for an hour.
    pub async fn is_allowed(&self, url: &Url) -> Result<bool> {
        let robots_url = self.get_robots_url(url)?;
        let domain = url
            .domain()
            .ok_or_else(|| Error::Parse("Invalid domain".to_string()))?;

        if let Some(cached) = self.cache.get(domain) {
            if cached.fetched_at.elapsed() < cached.ttl {
                return Ok(self.check_robots_txt(&cached.robots_txt, url.as_str()));
            }
        }

        let robots_txt = self.fetch_robots_txt(&robots_url).await?;

        let allowed = self.check_robots_txt(&robots_txt, url.as_str());

        self.cache.insert(
            domain.to_string(),
            CachedRobots {
                robots_txt,
                fetched_at: Instant::now(),
                ttl: Duration::from_secs(3600),
            },
        );

        Ok(allowed)
    }

    fn check_robots_txt(&self, _robots_txt: &str, _url: &str) -> bool {
        // TODO: Implement proper robots.txt parsing
        // For now, allow all URLs
        true
    }
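
    // Hypothetical sketch, not part of the original implementation: what a
    // std-only matcher could look like if `check_robots_txt` ever delegates
    // to it with `url.path()` and the configured user agent. It understands
    // only `User-agent`, `Allow`, and `Disallow` lines with longest-prefix
    // matching; real parsers also handle grouped `User-agent` lines,
    // `*` wildcards, and `$` anchors.
    #[allow(dead_code)]
    fn is_path_allowed(robots_txt: &str, user_agent: &str, path: &str) -> bool {
        let ua = user_agent.to_ascii_lowercase();
        // Whether the current group applies to this crawler.
        let mut in_group = false;
        // Longest matching rule seen so far: (rule length, is an Allow rule).
        let mut best: Option<(usize, bool)> = None;

        for line in robots_txt.lines() {
            // Strip comments and surrounding whitespace.
            let line = line.split('#').next().unwrap_or("").trim();
            let Some((key, value)) = line.split_once(':') else { continue };
            let (key, value) = (key.trim().to_ascii_lowercase(), value.trim());

            match key.as_str() {
                "user-agent" => {
                    in_group = value == "*" || ua.contains(value.to_ascii_lowercase().as_str());
                }
                "allow" | "disallow" if in_group && !value.is_empty() => {
                    // Keep the most specific (longest) rule that prefixes the path.
                    if path.starts_with(value) && best.map_or(true, |(len, _)| value.len() >= len) {
                        best = Some((value.len(), key == "allow"));
                    }
                }
                _ => {}
            }
        }

        // No applicable rule means the path is allowed.
        best.map_or(true, |(_, allowed)| allowed)
    }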

    fn get_robots_url(&self, url: &Url) -> Result<Url> {
        let mut robots_url = url.clone();
        robots_url.set_path("/robots.txt");
        robots_url.set_query(None);
        robots_url.set_fragment(None);
        Ok(robots_url)
    }

    async fn fetch_robots_txt(&self, url: &Url) -> Result<String> {
        // A missing or unreachable robots.txt is treated as an empty file,
        // which the checker interprets as "allow everything".
        match self.client.get(url.as_str()).send().await {
            Ok(response) => {
                if response.status().is_success() {
                    Ok(response.text().await?)
                } else {
                    Ok(String::new())
                }
            }
            Err(_) => Ok(String::new()),
        }
    }

    /// Returns the crawl delay for `domain`, if one has been recorded.
    /// Crawl-delay directives are not parsed yet, so this is currently always `None`.
    pub fn get_crawl_delay(&self, _domain: &str) -> Option<Duration> {
        // TODO: parse `Crawl-delay` when robots.txt is fetched and store it in
        // `CachedRobots` so it can be returned here.
        None
    }
}
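
// Hypothetical usage sketch, not part of the original module: it shows how a
// crawl loop might consult the checker before fetching a page. It assumes
// `tokio` (with the `macros` and `rt` features) is available as a
// dev-dependency, which an async reqwest-based crate typically has.
#[cfg(test)]
mod robots_usage_sketch {
    use super::*;

    #[tokio::test]
    #[ignore = "hits the network; illustrative only"]
    async fn allows_url_when_robots_permits() {
        let checker = RobotsChecker::new("omnivore/0.1".to_string());
        let url = Url::parse("https://example.com/some/page").expect("valid URL");

        // With the current stub parser this is Ok(true) whenever the
        // robots.txt request itself succeeds or fails cleanly.
        assert!(checker.is_allowed(&url).await.unwrap());
    }
}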