scrapling_spider/
robotstxt.rs1use std::collections::HashMap;
15
16use crate::session::SessionManager;
17
18pub struct RobotsTxtManager {
25 cache: HashMap<String, RobotsTxtRules>,
26}
27
/// The subset of a robots.txt file that the crawler acts on.
///
/// `Default` yields "no restrictions": nothing disallowed and no crawl delay.
#[derive(Debug, Clone, Default, PartialEq)]
struct RobotsTxtRules {
    /// Path rules taken from `Disallow` directives, in file order.
    disallowed: Vec<String>,
    /// Value of the `Crawl-delay` directive, if one was present and numeric.
    crawl_delay: Option<f64>,
}
32
33impl Default for RobotsTxtManager {
34 fn default() -> Self {
35 Self::new()
36 }
37}
38
39impl RobotsTxtManager {
40 pub fn new() -> Self {
43 Self {
44 cache: HashMap::new(),
45 }
46 }
47
48 pub async fn can_fetch(
55 &mut self,
56 url: &str,
57 sid: &str,
58 session_manager: &SessionManager,
59 ) -> bool {
60 let Some(domain) = extract_domain(url) else {
61 return true;
62 };
63
64 let rules = self.get_or_fetch(&domain, sid, session_manager).await;
65
66 let path = url::Url::parse(url)
67 .ok()
68 .map(|u| u.path().to_owned())
69 .unwrap_or_else(|| "/".into());
70
71 !rules.disallowed.iter().any(|d| path.starts_with(d))
72 }
73
74 pub async fn get_crawl_delay(
79 &mut self,
80 url: &str,
81 sid: &str,
82 session_manager: &SessionManager,
83 ) -> Option<f64> {
84 let domain = extract_domain(url)?;
85 let rules = self.get_or_fetch(&domain, sid, session_manager).await;
86 rules.crawl_delay
87 }
88
89 pub async fn prefetch(&mut self, urls: &[String], sid: &str, session_manager: &SessionManager) {
95 let mut domains_seen = std::collections::HashSet::new();
96 for url in urls {
97 if let Some(domain) = extract_domain(url) {
98 if domains_seen.insert(domain.clone()) {
99 self.get_or_fetch(&domain, sid, session_manager).await;
100 }
101 }
102 }
103 }
104
105 async fn get_or_fetch(
106 &mut self,
107 domain: &str,
108 sid: &str,
109 session_manager: &SessionManager,
110 ) -> &RobotsTxtRules {
111 if !self.cache.contains_key(domain) {
112 let rules = fetch_and_parse(domain, sid, session_manager).await;
113 self.cache.insert(domain.to_owned(), rules);
114 }
115 self.cache.get(domain).unwrap()
116 }
117}
118
119async fn fetch_and_parse(
120 domain: &str,
121 sid: &str,
122 session_manager: &SessionManager,
123) -> RobotsTxtRules {
124 let robots_url = format!("https://{domain}/robots.txt");
125
126 let content = match session_manager.get(if sid.is_empty() {
127 session_manager.default_session_id().unwrap_or("default")
128 } else {
129 sid
130 }) {
131 Ok(_session) => {
132 let req = crate::request::Request::new(&robots_url);
133 match session_manager.fetch(&req).await {
134 Ok(resp) if resp.is_success() => String::from_utf8_lossy(&resp.body).to_string(),
135 _ => String::new(),
136 }
137 }
138 Err(_) => String::new(),
139 };
140
141 parse_robots_txt(&content)
142}
143
144fn parse_robots_txt(content: &str) -> RobotsTxtRules {
145 let mut disallowed = Vec::new();
146 let mut crawl_delay = None;
147 let mut in_wildcard_agent = false;
148
149 for line in content.lines() {
150 let line = line.trim();
151 if line.is_empty() || line.starts_with('#') {
152 continue;
153 }
154
155 let Some((key, value)) = line.split_once(':') else {
156 continue;
157 };
158 let key = key.trim().to_lowercase();
159 let value = value.trim();
160
161 match key.as_str() {
162 "user-agent" => {
163 in_wildcard_agent = value == "*";
164 }
165 "disallow" if in_wildcard_agent && !value.is_empty() => {
166 disallowed.push(value.to_owned());
167 }
168 "crawl-delay" if in_wildcard_agent => {
169 if let Ok(delay) = value.parse::<f64>() {
170 crawl_delay = Some(delay);
171 }
172 }
173 _ => {}
174 }
175 }
176
177 RobotsTxtRules {
178 disallowed,
179 crawl_delay,
180 }
181}
182
183fn extract_domain(url: &str) -> Option<String> {
184 url::Url::parse(url)
185 .ok()
186 .and_then(|u| u.host_str().map(|h| h.to_owned()))
187}
188
#[cfg(test)]
mod tests {
    use super::*;

    /// Wildcard-agent directives are collected in file order.
    #[test]
    fn parse_robots_txt_basic() {
        let input = "User-agent: *\nDisallow: /admin\nDisallow: /private\nCrawl-delay: 2\n";
        let parsed = parse_robots_txt(input);

        assert_eq!(parsed.disallowed, ["/admin", "/private"]);
        assert_eq!(parsed.crawl_delay, Some(2.0));
    }

    /// An empty file imposes no restrictions.
    #[test]
    fn parse_robots_txt_empty() {
        let parsed = parse_robots_txt("");

        assert_eq!(parsed.disallowed.len(), 0);
        assert_eq!(parsed.crawl_delay, None);
    }

    /// Rules addressed to a named crawler do not apply to us.
    #[test]
    fn parse_robots_txt_specific_agent_ignored() {
        let parsed = parse_robots_txt("User-agent: Googlebot\nDisallow: /secret\n");

        assert_eq!(parsed.disallowed.len(), 0);
    }

    /// The host is extracted from valid URLs; anything unparsable yields `None`.
    #[test]
    fn extract_domain_works() {
        let host = extract_domain("https://example.com/page");
        assert_eq!(host.as_deref(), Some("example.com"));

        assert!(extract_domain("not-a-url").is_none());
    }
}