essence/utils/
robots_enhanced.rs1use crate::error::{Result, ScrapeError};
11use moka::future::Cache;
12use std::sync::LazyLock;
13use reqwest::Client;
14use robotstxt::DefaultMatcher;
15use std::sync::Arc;
16use std::time::Duration;
17use tracing::{debug, warn};
18use url::Url;
19
/// Cached result of fetching a site's robots.txt.
///
/// Derives `PartialEq` so values can be compared directly (for example with
/// `assert_eq!` in tests). `Eq` is intentionally not derived because
/// `crawl_delay` holds an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct RobotsData {
    /// Raw robots.txt body. Empty when the file could not be fetched or the
    /// server answered with a non-success status.
    pub content: String,
    /// `Crawl-delay` in seconds parsed from `content`, if one was found.
    pub crawl_delay: Option<f64>,
    /// Every construction site visible in this file sets this to `true`, and
    /// nothing visible here reads it.
    /// NOTE(review): confirm whether external callers rely on this flag.
    pub allowed: bool,
}
30
31static ROBOTS_CACHE: LazyLock<Arc<Cache<String, RobotsData>>> = LazyLock::new(|| {
35 Arc::new(
36 Cache::builder()
37 .max_capacity(10_000)
38 .time_to_live(Duration::from_secs(24 * 3600)) .build(),
40 )
41});
42
43fn extract_domain(url: &Url) -> Result<String> {
45 crate::utils::etld::extract_etld_plus_one(url.as_str())
46}
47
/// Extracts the `Crawl-delay` (in seconds) that applies to `user_agent`.
///
/// Matching rules:
/// * Directive names are matched ASCII case-insensitively.
/// * Consecutive `User-agent` lines form one group, and the rules that follow
///   apply to every agent named in that group.
/// * A group naming `user_agent` (ASCII case-insensitive, exact match) takes
///   precedence over a `*` group, regardless of the order in which the groups
///   appear in the file.
/// * Anything after `#` on a line is treated as a comment and ignored.
/// * Values that do not parse as a finite, non-negative number are skipped.
/// * If several valid delays exist at the same precedence level, the last one
///   in the file wins.
///
/// Returns `None` when no applicable, valid `Crawl-delay` is present.
/// This function is pure: it performs no I/O and no logging, so callers that
/// want a trace of the effective delay should log the returned value.
fn parse_crawl_delay(robots_txt: &str, user_agent: &str) -> Option<f64> {
    // Returns the trimmed value of `line` when its directive name equals `key`.
    fn directive_value<'a>(line: &'a str, key: &str) -> Option<&'a str> {
        let (name, value) = line.split_once(':')?;
        name.trim().eq_ignore_ascii_case(key).then_some(value.trim())
    }

    let mut specific_delay: Option<f64> = None;
    let mut wildcard_delay: Option<f64> = None;

    // Which kinds of agent the group currently being read applies to.
    let mut group_is_specific = false;
    let mut group_is_wildcard = false;
    // True while we are still inside a run of consecutive `User-agent` lines.
    let mut previous_was_agent_line = false;

    for raw_line in robots_txt.lines() {
        // Drop trailing comments before interpreting the line.
        let line = raw_line.split('#').next().unwrap_or("").trim();
        if line.is_empty() {
            continue;
        }

        if let Some(agent) = directive_value(line, "user-agent") {
            // A `User-agent` line that follows a rule line starts a new group.
            if !previous_was_agent_line {
                group_is_specific = false;
                group_is_wildcard = false;
            }
            if agent == "*" {
                group_is_wildcard = true;
            } else if agent.eq_ignore_ascii_case(user_agent) {
                group_is_specific = true;
            }
            previous_was_agent_line = true;
            continue;
        }
        previous_was_agent_line = false;

        let Some(value) = directive_value(line, "crawl-delay") else {
            continue;
        };
        let Ok(delay) = value.parse::<f64>() else {
            continue;
        };
        // `f64::from_str` accepts "NaN", "inf" and negative numbers; none of
        // them is a usable delay.
        if !delay.is_finite() || delay < 0.0 {
            continue;
        }

        if group_is_specific {
            specific_delay = Some(delay);
        }
        if group_is_wildcard {
            wildcard_delay = Some(delay);
        }
    }

    specific_delay.or(wildcard_delay)
}
80
81async fn fetch_robots_txt(domain: &str, user_agent: &str) -> Result<RobotsData> {
83 let robots_url = format!("https://{}/robots.txt", domain);
84 debug!("Fetching robots.txt from: {}", robots_url);
85
86 let client = Client::builder()
87 .timeout(Duration::from_secs(5))
88 .build()
89 .map_err(|e| ScrapeError::Internal(format!("Failed to create HTTP client: {}", e)))?;
90
91 let response = match client.get(&robots_url).send().await {
92 Ok(resp) => resp,
93 Err(e) => {
94 warn!("Failed to fetch robots.txt from {}: {}", robots_url, e);
95 return Ok(RobotsData {
97 content: String::new(),
98 crawl_delay: None,
99 allowed: true,
100 });
101 }
102 };
103
104 if !response.status().is_success() {
105 warn!("robots.txt not found at {} (status: {})", robots_url, response.status());
106 return Ok(RobotsData {
108 content: String::new(),
109 crawl_delay: None,
110 allowed: true,
111 });
112 }
113
114 let robots_txt = response
115 .text()
116 .await
117 .map_err(|e| ScrapeError::Internal(format!("Failed to read robots.txt: {}", e)))?;
118
119 let crawl_delay = parse_crawl_delay(&robots_txt, user_agent);
121
122 Ok(RobotsData {
123 content: robots_txt,
124 crawl_delay,
125 allowed: true, })
127}
128
129pub async fn is_allowed_cached(url: &str, user_agent: &str) -> Result<(bool, Option<f64>)> {
138 let parsed_url = Url::parse(url)
139 .map_err(|e| ScrapeError::InvalidUrl(format!("Invalid URL: {}", e)))?;
140
141 let domain = extract_domain(&parsed_url)?;
142
143 let robots_data = if let Some(cached) = ROBOTS_CACHE.get(&domain).await {
145 debug!("Robots.txt cache hit for domain: {}", domain);
146 cached
147 } else {
148 debug!("Robots.txt cache miss for domain: {}", domain);
149 let data = fetch_robots_txt(&domain, user_agent).await?;
150 ROBOTS_CACHE.insert(domain.clone(), data.clone()).await;
151 data
152 };
153
154 let path = parsed_url.path();
156 let allowed = if robots_data.content.is_empty() {
157 true
159 } else {
160 let mut matcher = DefaultMatcher::default();
161 matcher.one_agent_allowed_by_robots(&robots_data.content, user_agent, path)
162 };
163
164 Ok((allowed, robots_data.crawl_delay))
165}
166
167pub async fn is_allowed_default_cached(url: &str) -> Result<(bool, Option<f64>)> {
169 is_allowed_cached(url, "Essence").await
170}
171
/// Test-only helper that empties the shared robots.txt cache.
///
/// Invalidates every entry and then awaits the cache's pending maintenance
/// tasks before returning.
#[cfg(test)]
pub async fn clear_cache() {
    let cache = &*ROBOTS_CACHE;
    cache.invalidate_all();
    cache.run_pending_tasks().await;
}
178
#[cfg(test)]
mod tests {
    use super::*;

    /// A `www.` host is expected to collapse to its registrable domain.
    #[test]
    fn test_domain_extraction() {
        let parsed = Url::parse("https://www.example.com/path/to/page").unwrap();
        let extracted = extract_domain(&parsed).unwrap();
        assert_eq!(extracted, "example.com");
    }

    /// Wildcard, named and unknown agents each resolve to the expected delay.
    #[test]
    fn test_crawl_delay_parsing() {
        let robots_txt = r#"
User-agent: *
Crawl-delay: 2

User-agent: Essence
Crawl-delay: 0.5
Disallow: /admin
        "#;

        // The wildcard agent only sees the `*` block.
        assert_eq!(parse_crawl_delay(robots_txt, "*"), Some(2.0));

        // A named agent picks up its own block.
        assert_eq!(parse_crawl_delay(robots_txt, "Essence"), Some(0.5));

        // An agent without its own block falls back to the `*` block.
        assert_eq!(parse_crawl_delay(robots_txt, "OtherBot"), Some(2.0));
    }

    /// Directive names are matched regardless of letter case.
    #[test]
    fn test_crawl_delay_parsing_case_insensitive() {
        let robots_txt = r#"
user-agent: *
crawl-delay: 1.5
        "#;

        let parsed_delay = parse_crawl_delay(robots_txt, "*");
        assert_eq!(parsed_delay, Some(1.5));
    }

    /// An entry inserted into the shared cache can be read back.
    #[tokio::test]
    async fn test_cache() {
        clear_cache().await;

        let key = "example.com";
        let entry = RobotsData {
            content: "User-agent: *\nDisallow: /admin".to_string(),
            crawl_delay: Some(1.0),
            allowed: true,
        };

        ROBOTS_CACHE.insert(key.to_string(), entry.clone()).await;

        let looked_up = ROBOTS_CACHE.get(key).await;
        assert!(looked_up.is_some());
        assert_eq!(looked_up.unwrap().crawl_delay, Some(1.0));
    }
}