scrapling_spider/
robotstxt.rs

1//! Robots.txt fetching, parsing, and enforcement.
2//!
3//! When [`Spider::robots_txt_obey`](crate::spider::Spider::robots_txt_obey)
4//! returns `true`, the engine creates a [`RobotsTxtManager`] and consults it
5//! before every fetch. The manager lazily fetches and parses each domain's
6//! `robots.txt` the first time it is needed, then caches the result for the
7//! remainder of the crawl.
8//!
9//! Only rules under the `User-agent: *` section are respected. The parser
10//! extracts `Disallow` directives (matched as path prefixes) and an optional
11//! `Crawl-delay` value. Domains whose `robots.txt` cannot be fetched or parsed
12//! are treated as "allow all."
13
14use std::collections::HashMap;
15
16use crate::session::SessionManager;
17
/// Fetches, parses, and caches robots.txt rules per domain.
///
/// You do not construct this directly; the [`CrawlerEngine`](crate::spider::CrawlerEngine)
/// creates one when `robots_txt_obey` is enabled. The manager maintains an
/// internal cache so that each domain's `robots.txt` is fetched at most once
/// per crawl run.
///
/// The [`Default`] value is an empty manager, identical to the one returned
/// by `RobotsTxtManager::new`.
#[derive(Debug, Default)]
pub struct RobotsTxtManager {
    /// Parsed rules keyed by the URL's host, filled in lazily the first time
    /// a host is looked up.
    cache: HashMap<String, RobotsTxtRules>,
}

/// The subset of one robots.txt file that this module enforces.
#[derive(Debug, Clone, Default, PartialEq)]
struct RobotsTxtRules {
    /// Values of the collected `Disallow` directives, in file order. Each one
    /// is matched as a plain prefix.
    disallowed: Vec<String>,
    /// Value of the `Crawl-delay` directive in seconds, if one was present.
    crawl_delay: Option<f64>,
}
38
39impl RobotsTxtManager {
40    /// Creates a new robots.txt manager with an empty cache. Rules will be
41    /// fetched on demand the first time a URL on a new domain is checked.
42    pub fn new() -> Self {
43        Self {
44            cache: HashMap::new(),
45        }
46    }
47
48    /// Returns `true` if the URL is allowed by the domain's robots.txt rules.
49    ///
50    /// The method extracts the domain from the URL, fetches and parses
51    /// `robots.txt` if it hasn't been cached yet, and checks whether the URL's
52    /// path matches any `Disallow` directive. URLs that cannot be parsed (e.g.,
53    /// invalid format) are considered allowed.
54    pub async fn can_fetch(
55        &mut self,
56        url: &str,
57        sid: &str,
58        session_manager: &SessionManager,
59    ) -> bool {
60        let Some(domain) = extract_domain(url) else {
61            return true;
62        };
63
64        let rules = self.get_or_fetch(&domain, sid, session_manager).await;
65
66        let path = url::Url::parse(url)
67            .ok()
68            .map(|u| u.path().to_owned())
69            .unwrap_or_else(|| "/".into());
70
71        !rules.disallowed.iter().any(|d| path.starts_with(d))
72    }
73
74    /// Returns the crawl-delay specified in the domain's robots.txt, if any.
75    /// The delay is in seconds and comes from the `Crawl-delay` directive under
76    /// `User-agent: *`. Returns `None` if no delay is specified or if the
77    /// domain's `robots.txt` could not be fetched.
78    pub async fn get_crawl_delay(
79        &mut self,
80        url: &str,
81        sid: &str,
82        session_manager: &SessionManager,
83    ) -> Option<f64> {
84        let domain = extract_domain(url)?;
85        let rules = self.get_or_fetch(&domain, sid, session_manager).await;
86        rules.crawl_delay
87    }
88
89    /// Pre-fetches robots.txt for all unique domains in the given URLs.
90    ///
91    /// The engine calls this before the crawl loop starts so that the first
92    /// batch of requests is not delayed by on-demand robots.txt lookups.
93    /// Duplicate domains are deduplicated internally.
94    pub async fn prefetch(&mut self, urls: &[String], sid: &str, session_manager: &SessionManager) {
95        let mut domains_seen = std::collections::HashSet::new();
96        for url in urls {
97            if let Some(domain) = extract_domain(url) {
98                if domains_seen.insert(domain.clone()) {
99                    self.get_or_fetch(&domain, sid, session_manager).await;
100                }
101            }
102        }
103    }
104
105    async fn get_or_fetch(
106        &mut self,
107        domain: &str,
108        sid: &str,
109        session_manager: &SessionManager,
110    ) -> &RobotsTxtRules {
111        if !self.cache.contains_key(domain) {
112            let rules = fetch_and_parse(domain, sid, session_manager).await;
113            self.cache.insert(domain.to_owned(), rules);
114        }
115        self.cache.get(domain).unwrap()
116    }
117}
118
119async fn fetch_and_parse(
120    domain: &str,
121    sid: &str,
122    session_manager: &SessionManager,
123) -> RobotsTxtRules {
124    let robots_url = format!("https://{domain}/robots.txt");
125
126    let content = match session_manager.get(if sid.is_empty() {
127        session_manager.default_session_id().unwrap_or("default")
128    } else {
129        sid
130    }) {
131        Ok(_session) => {
132            let req = crate::request::Request::new(&robots_url);
133            match session_manager.fetch(&req).await {
134                Ok(resp) if resp.is_success() => String::from_utf8_lossy(&resp.body).to_string(),
135                _ => String::new(),
136            }
137        }
138        Err(_) => String::new(),
139    };
140
141    parse_robots_txt(&content)
142}
143
144fn parse_robots_txt(content: &str) -> RobotsTxtRules {
145    let mut disallowed = Vec::new();
146    let mut crawl_delay = None;
147    let mut in_wildcard_agent = false;
148
149    for line in content.lines() {
150        let line = line.trim();
151        if line.is_empty() || line.starts_with('#') {
152            continue;
153        }
154
155        let Some((key, value)) = line.split_once(':') else {
156            continue;
157        };
158        let key = key.trim().to_lowercase();
159        let value = value.trim();
160
161        match key.as_str() {
162            "user-agent" => {
163                in_wildcard_agent = value == "*";
164            }
165            "disallow" if in_wildcard_agent && !value.is_empty() => {
166                disallowed.push(value.to_owned());
167            }
168            "crawl-delay" if in_wildcard_agent => {
169                if let Ok(delay) = value.parse::<f64>() {
170                    crawl_delay = Some(delay);
171                }
172            }
173            _ => {}
174        }
175    }
176
177    RobotsTxtRules {
178        disallowed,
179        crawl_delay,
180    }
181}
182
183fn extract_domain(url: &str) -> Option<String> {
184    url::Url::parse(url)
185        .ok()
186        .and_then(|u| u.host_str().map(|h| h.to_owned()))
187}
188
#[cfg(test)]
mod tests {
    use super::*;

    /// A wildcard group yields its disallowed prefixes and crawl delay.
    #[test]
    fn parse_robots_txt_basic() {
        let content = "User-agent: *\nDisallow: /admin\nDisallow: /private\nCrawl-delay: 2\n";
        let parsed = parse_robots_txt(content);

        assert_eq!(parsed.disallowed, ["/admin", "/private"]);
        assert_eq!(parsed.crawl_delay, Some(2.0));
    }

    /// Empty input produces empty rules.
    #[test]
    fn parse_robots_txt_empty() {
        let parsed = parse_robots_txt("");

        assert_eq!(parsed.disallowed.len(), 0);
        assert_eq!(parsed.crawl_delay, None);
    }

    /// Rules addressed to a named agent are not collected.
    #[test]
    fn parse_robots_txt_specific_agent_ignored() {
        let content = "User-agent: Googlebot\nDisallow: /secret\n";
        let parsed = parse_robots_txt(content);

        assert_eq!(parsed.disallowed.len(), 0);
    }

    /// The host is returned for a valid URL and `None` for unparseable input.
    #[test]
    fn extract_domain_works() {
        let host = extract_domain("https://example.com/page");
        assert_eq!(host.as_deref(), Some("example.com"));

        let missing = extract_domain("not-a-url");
        assert!(missing.is_none());
    }
}
scrapling_spider/robotstxt.rs

scrapling_spider/
robotstxt.rs