omnivore_core/crawler/
robots.rs1use crate::{Error, Result};
2use dashmap::DashMap;
3use std::sync::Arc;
4use std::time::{Duration, Instant};
5use url::Url;
6
7pub struct RobotsChecker {
8 cache: Arc<DashMap<String, CachedRobots>>,
9 client: reqwest::Client,
10}
11
/// One cache entry: a robots.txt body plus the bookkeeping needed to tell
/// whether it is still fresh.
struct CachedRobots {
    /// Raw text of the robots.txt file (empty when none could be retrieved).
    robots_txt: String,

    /// Moment at which this entry was stored.
    fetched_at: Instant,

    /// How long after `fetched_at` the entry may still be served from cache.
    ttl: Duration,
}
17
18impl RobotsChecker {
19 pub fn new(_user_agent: String) -> Self {
20 let client = reqwest::Client::builder()
21 .timeout(Duration::from_secs(10))
22 .build()
23 .expect("Failed to build HTTP client");
24
25 Self {
26 cache: Arc::new(DashMap::new()),
27 client,
28 }
29 }
30
31 pub async fn is_allowed(&self, url: &Url) -> Result<bool> {
32 let robots_url = self.get_robots_url(url)?;
33 let domain = url
34 .domain()
35 .ok_or_else(|| Error::Parse("Invalid domain".to_string()))?;
36
37 if let Some(cached) = self.cache.get(domain) {
38 if cached.fetched_at.elapsed() < cached.ttl {
39 return Ok(self.check_robots_txt(&cached.robots_txt, url.as_str()));
40 }
41 }
42
43 let robots_txt = self.fetch_robots_txt(&robots_url).await?;
44
45 let allowed = self.check_robots_txt(&robots_txt, url.as_str());
46
47 self.cache.insert(
48 domain.to_string(),
49 CachedRobots {
50 robots_txt,
51 fetched_at: Instant::now(),
52 ttl: Duration::from_secs(3600),
53 },
54 );
55
56 Ok(allowed)
57 }
58
59 fn check_robots_txt(&self, _robots_txt: &str, _url: &str) -> bool {
60 true
63 }
64
65 fn get_robots_url(&self, url: &Url) -> Result<Url> {
66 let mut robots_url = url.clone();
67 robots_url.set_path("/robots.txt");
68 robots_url.set_query(None);
69 robots_url.set_fragment(None);
70 Ok(robots_url)
71 }
72
73 async fn fetch_robots_txt(&self, url: &Url) -> Result<String> {
74 match self.client.get(url.as_str()).send().await {
75 Ok(response) => {
76 if response.status().is_success() {
77 Ok(response.text().await?)
78 } else {
79 Ok(String::new())
80 }
81 }
82 Err(_) => Ok(String::new()),
83 }
84 }
85
86 pub fn get_crawl_delay(&self, domain: &str) -> Option<Duration> {
87 self.cache.get(domain).and(None)
88 }
89}