Skip to main content

essence/utils/
robots_enhanced.rs

//! Enhanced robots.txt parser with caching and crawl-delay support
//!
//! Features:
//! - In-memory caching with TTL (24 hours default)
//! - Crawl-Delay directive parsing
//! - eTLD+1 domain extraction via `crate::utils::etld` (future: use psl crate)
//! - Configurable user agent
//! - Graceful fallback if robots.txt unavailable
9
10use crate::error::{Result, ScrapeError};
11use moka::future::Cache;
12use std::sync::LazyLock;
13use reqwest::Client;
14use robotstxt::DefaultMatcher;
15use std::sync::Arc;
16use std::time::Duration;
17use tracing::{debug, warn};
18use url::Url;
19
/// One robots.txt document as stored in the cache.
///
/// The struct holds the raw file text rather than a parsed rule set; path
/// matching is performed against `content` each time a URL is checked.
#[derive(Clone, Debug)]
pub struct RobotsData {
    /// Body of the robots.txt file exactly as downloaded. An empty string is
    /// used when no robots.txt could be obtained.
    pub content: String,
    /// `Crawl-delay` in seconds, when the file specifies one.
    pub crawl_delay: Option<f64>,
    /// Allow/deny flag. Every constructor in this file stores `true` here and
    /// the per-URL decision is computed separately from `content`.
    pub allowed: bool,
}
30
31/// Global robots.txt cache
32/// Key: domain (e.g., "example.com")
33/// Value: RobotsData
34static ROBOTS_CACHE: LazyLock<Arc<Cache<String, RobotsData>>> = LazyLock::new(|| {
35    Arc::new(
36        Cache::builder()
37            .max_capacity(10_000)
38            .time_to_live(Duration::from_secs(24 * 3600)) // 24 hours
39            .build(),
40    )
41});
42
43/// Extract domain from URL (uses proper eTLD+1 extraction)
44fn extract_domain(url: &Url) -> Result<String> {
45    crate::utils::etld::extract_etld_plus_one(url.as_str())
46}
47
48/// Parse Crawl-Delay directive from robots.txt
49/// 
50/// Looks for lines like:
51/// - `Crawl-delay: 1`
52/// - `Crawl-Delay: 0.5`
53/// 
54/// Returns the delay in seconds, or None if not specified
55fn parse_crawl_delay(robots_txt: &str, user_agent: &str) -> Option<f64> {
56    let mut in_user_agent_block = false;
57    let mut crawl_delay: Option<f64> = None;
58
59    for line in robots_txt.lines() {
60        let line = line.trim();
61
62        // Check if this is the user agent we're looking for
63        if line.to_lowercase().starts_with("user-agent:") {
64            let agent = line[11..].trim();
65            in_user_agent_block = agent == "*" || agent.eq_ignore_ascii_case(user_agent);
66        }
67
68        // If we're in the right user-agent block, look for Crawl-delay
69        if in_user_agent_block && line.to_lowercase().starts_with("crawl-delay:") {
70            let delay_str = line[12..].trim();
71            if let Ok(delay) = delay_str.parse::<f64>() {
72                crawl_delay = Some(delay);
73                debug!("Parsed Crawl-Delay: {} seconds", delay);
74            }
75        }
76    }
77
78    crawl_delay
79}
80
81/// Fetch and parse robots.txt for a domain
82async fn fetch_robots_txt(domain: &str, user_agent: &str) -> Result<RobotsData> {
83    let robots_url = format!("https://{}/robots.txt", domain);
84    debug!("Fetching robots.txt from: {}", robots_url);
85
86    let client = Client::builder()
87        .timeout(Duration::from_secs(5))
88        .build()
89        .map_err(|e| ScrapeError::Internal(format!("Failed to create HTTP client: {}", e)))?;
90
91    let response = match client.get(&robots_url).send().await {
92        Ok(resp) => resp,
93        Err(e) => {
94            warn!("Failed to fetch robots.txt from {}: {}", robots_url, e);
95            // If robots.txt doesn't exist, allow by default
96            return Ok(RobotsData {
97                content: String::new(),
98                crawl_delay: None,
99                allowed: true,
100            });
101        }
102    };
103
104    if !response.status().is_success() {
105        warn!("robots.txt not found at {} (status: {})", robots_url, response.status());
106        // If robots.txt doesn't exist, allow by default
107        return Ok(RobotsData {
108            content: String::new(),
109            crawl_delay: None,
110            allowed: true,
111        });
112    }
113
114    let robots_txt = response
115        .text()
116        .await
117        .map_err(|e| ScrapeError::Internal(format!("Failed to read robots.txt: {}", e)))?;
118
119    // Parse crawl-delay directive
120    let crawl_delay = parse_crawl_delay(&robots_txt, user_agent);
121
122    Ok(RobotsData {
123        content: robots_txt,
124        crawl_delay,
125        allowed: true, // Will be determined per-URL
126    })
127}
128
129/// Check if a URL is allowed by robots.txt (with caching)
130/// 
131/// This function:
132/// 1. Extracts the domain from the URL
133/// 2. Checks if robots.txt is cached
134/// 3. If not cached, fetches and caches it
135/// 4. Checks if the specific path is allowed
136/// 5. Returns the crawl delay if specified
137pub async fn is_allowed_cached(url: &str, user_agent: &str) -> Result<(bool, Option<f64>)> {
138    let parsed_url = Url::parse(url)
139        .map_err(|e| ScrapeError::InvalidUrl(format!("Invalid URL: {}", e)))?;
140
141    let domain = extract_domain(&parsed_url)?;
142
143    // Check cache first
144    let robots_data = if let Some(cached) = ROBOTS_CACHE.get(&domain).await {
145        debug!("Robots.txt cache hit for domain: {}", domain);
146        cached
147    } else {
148        debug!("Robots.txt cache miss for domain: {}", domain);
149        let data = fetch_robots_txt(&domain, user_agent).await?;
150        ROBOTS_CACHE.insert(domain.clone(), data.clone()).await;
151        data
152    };
153
154    // Check if the specific path is allowed
155    let path = parsed_url.path();
156    let allowed = if robots_data.content.is_empty() {
157        // No robots.txt, allow by default
158        true
159    } else {
160        let mut matcher = DefaultMatcher::default();
161        matcher.one_agent_allowed_by_robots(&robots_data.content, user_agent, path)
162    };
163
164    Ok((allowed, robots_data.crawl_delay))
165}
166
167/// Check robots.txt with default user agent "Essence"
168pub async fn is_allowed_default_cached(url: &str) -> Result<(bool, Option<f64>)> {
169    is_allowed_cached(url, "Essence").await
170}
171
/// Empty the global robots.txt cache. Compiled for tests only.
///
/// Invalidates every entry and then awaits the cache's pending maintenance
/// tasks before returning.
#[cfg(test)]
pub async fn clear_cache() {
    let cache = &*ROBOTS_CACHE;
    cache.invalidate_all();
    cache.run_pending_tasks().await;
}
178
#[cfg(test)]
mod tests {
    use super::*;

    /// `www.example.com` is reduced to its eTLD+1, `example.com`.
    #[test]
    fn test_domain_extraction() {
        let page = Url::parse("https://www.example.com/path/to/page").unwrap();
        assert_eq!(extract_domain(&page).unwrap(), "example.com");
    }

    /// Each agent receives the delay of the group that applies to it.
    #[test]
    fn test_crawl_delay_parsing() {
        let robots_txt = r#"
User-agent: *
Crawl-delay: 2

User-agent: Essence
Crawl-delay: 0.5
Disallow: /admin
        "#;

        let cases = [
            ("*", Some(2.0)),
            ("Essence", Some(0.5)),
            // OtherBot has no group of its own, so the wildcard (*) delay applies
            ("OtherBot", Some(2.0)),
        ];

        for (agent, expected) in cases {
            assert_eq!(parse_crawl_delay(robots_txt, agent), expected);
        }
    }

    /// Directive names are matched without regard to case.
    #[test]
    fn test_crawl_delay_parsing_case_insensitive() {
        let robots_txt = r#"
user-agent: *
crawl-delay: 1.5
        "#;

        assert_eq!(parse_crawl_delay(robots_txt, "*"), Some(1.5));
    }

    /// An entry written to the global cache can be read back.
    #[tokio::test]
    async fn test_cache() {
        clear_cache().await;

        let domain = "example.com";
        let entry = RobotsData {
            content: "User-agent: *\nDisallow: /admin".to_string(),
            crawl_delay: Some(1.0),
            allowed: true,
        };

        // Store the entry ...
        ROBOTS_CACHE.insert(domain.to_string(), entry.clone()).await;

        // ... and read it back.
        let cached = ROBOTS_CACHE.get(domain).await;
        assert!(cached.is_some());
        assert_eq!(cached.unwrap().crawl_delay, Some(1.0));
    }
}
243}