robotstxt_rs/parser.rs
use std::collections::HashMap;

#[cfg(feature = "async")]
use reqwest;

/// A rule for a specific user-agent containing allowed and disallowed paths.
#[derive(Debug, Clone)]
pub struct RobotRule {
    /// The user-agent this rule applies to (e.g., "Googlebot", "*")
    pub user_agent: String,
    /// List of paths explicitly allowed for this user-agent
    pub allowed: Vec<String>,
    /// List of paths disallowed for this user-agent
    pub disallowed: Vec<String>,
}

/// The main structure representing a parsed robots.txt file.
#[derive(Debug)]
pub struct RobotsTxt {
    domain: Option<String>,
    rules: HashMap<String, RobotRule>,
    sitemaps: Vec<String>,
    comments: Vec<String>,
}

impl RobotsTxt {
    /// Parse a robots.txt file from a string.
    ///
    /// # Arguments
    ///
    /// * `content` - The robots.txt file content as a string
    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: *\nDisallow: /admin/";
    /// let robots = RobotsTxt::parse(content);
    /// ```
    pub fn parse(content: &str) -> Self {
        Self::parse_with_domain(content, None)
    }

    /// Parse a robots.txt file from a string with a specified domain.
    ///
    /// # Arguments
    ///
    /// * `content` - The robots.txt file content as a string
    /// * `domain` - Optional domain name to associate with this robots.txt
    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: *\nDisallow: /admin/";
    /// let robots = RobotsTxt::parse_with_domain(content, Some("example.com".to_string()));
    /// ```
    pub fn parse_with_domain(content: &str, domain: Option<String>) -> Self {
        let mut rules: HashMap<String, RobotRule> = HashMap::new();
        let mut sitemaps = Vec::new();
        let mut comments = Vec::new();

        let mut current_agents: Vec<String> = Vec::new();
        // Consecutive User-agent lines share one group; a User-agent line that
        // follows rule lines starts a new group.
        let mut reading_agents = false;

        for line in content.lines() {
            let line = line.trim();

            // Handle comments
            if line.starts_with('#') {
                comments.push(line[1..].trim().to_string());
                continue;
            }

            // Skip empty lines
            if line.is_empty() {
                continue;
            }

            // Split on the first colon
            if let Some((directive, value)) = line.split_once(':') {
                let directive = directive.trim().to_lowercase();
                let value = value.trim().to_string();

                match directive.as_str() {
                    "user-agent" => {
                        // Start a new user-agent group unless we are still
                        // collecting agents for the current one
                        if !reading_agents {
                            current_agents.clear();
                            reading_agents = true;
                        }
                        let agent = value.to_lowercase();
                        if !rules.contains_key(&agent) {
                            rules.insert(
                                agent.clone(),
                                RobotRule {
                                    user_agent: agent.clone(),
                                    allowed: Vec::new(),
                                    disallowed: Vec::new(),
                                },
                            );
                        }
                        current_agents.push(agent);
                    }
                    "allow" => {
                        reading_agents = false;
                        // Add to every agent in the current group
                        for agent in &current_agents {
                            if let Some(rule) = rules.get_mut(agent) {
                                rule.allowed.push(value.clone());
                            }
                        }
                    }
                    "disallow" => {
                        reading_agents = false;
                        // Add to every agent in the current group
                        for agent in &current_agents {
                            if let Some(rule) = rules.get_mut(agent) {
                                rule.disallowed.push(value.clone());
                            }
                        }
                    }
                    "sitemap" => {
                        // Sitemap is a global directive; it does not affect
                        // the current user-agent group
                        sitemaps.push(value);
                    }
                    _ => {
                        // Unknown directive, ignore
                    }
                }
            }
        }

        RobotsTxt {
            domain,
            rules,
            sitemaps,
            comments,
        }
    }

    /// Fetch and parse a robots.txt file from a URL (requires the async feature).
    ///
    /// # Arguments
    ///
    /// * `url` - The URL to the robots.txt file
    ///
    /// # Example
    ///
    /// ```no_run
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// #[tokio::main]
    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
    ///     let robots = RobotsTxt::from_url("https://example.com/robots.txt").await?;
    ///     Ok(())
    /// }
    /// ```
    #[cfg(feature = "async")]
    pub async fn from_url(url: &str) -> Result<Self, Box<dyn std::error::Error>> {
        let client = reqwest::Client::new();
        let content = client.get(url).send().await?.text().await?;

        // Extract domain from URL
        let domain = extract_domain(url);

        Ok(Self::parse_with_domain(&content, Some(domain)))
    }

    /// Check if a user-agent is allowed to fetch a specific path.
    ///
    /// # Arguments
    ///
    /// * `user_agent` - The user-agent string (e.g., "Googlebot")
    /// * `path` - The path to check (e.g., "/admin/panel")
    ///
    /// # Returns
    ///
    /// Returns `true` if the user-agent is allowed to fetch the path, `false` otherwise.
    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: *\nDisallow: /admin/";
    /// let robots = RobotsTxt::parse(content);
    /// assert!(!robots.can_fetch("Googlebot", "/admin/panel"));
    /// assert!(robots.can_fetch("Googlebot", "/public/page"));
    /// ```
    pub fn can_fetch(&self, user_agent: &str, path: &str) -> bool {
        let user_agent = user_agent.to_lowercase();

        // Prefer an exact user-agent match, then fall back to the wildcard group
        let rule = match self.rules.get(&user_agent).or_else(|| self.rules.get("*")) {
            Some(rule) => rule,
            // No applicable rules means everything is allowed
            None => return true,
        };

        // Check disallowed paths first (more restrictive)
        for disallowed in &rule.disallowed {
            if disallowed.is_empty() {
                continue;
            }
            if path_matches(path, disallowed) {
                // A more specific (longer) Allow rule overrides the Disallow
                for allowed in &rule.allowed {
                    if path_matches(path, allowed) && allowed.len() > disallowed.len() {
                        return true;
                    }
                }
                return false;
            }
        }

        // Paths not matched by any Disallow rule are allowed by default
        true
    }

    /// Get the domain associated with this robots.txt file.
    ///
    /// # Returns
    ///
    /// Returns `Some(&str)` if a domain was specified, `None` otherwise.
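    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: *\nDisallow: /admin/";
    /// let robots = RobotsTxt::parse_with_domain(content, Some("example.com".to_string()));
    /// assert_eq!(robots.get_domain(), Some("example.com"));
    /// ```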
    pub fn get_domain(&self) -> Option<&str> {
        self.domain.as_deref()
    }

    /// Get all sitemap URLs from the robots.txt file.
    ///
    /// # Returns
    ///
    /// Returns a slice of sitemap URLs.
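    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: *\nDisallow: /admin/\nSitemap: https://example.com/sitemap.xml";
    /// let robots = RobotsTxt::parse(content);
    /// assert_eq!(robots.get_sitemaps().len(), 1);
    /// assert_eq!(robots.get_sitemaps()[0], "https://example.com/sitemap.xml");
    /// ```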
    pub fn get_sitemaps(&self) -> &[String] {
        &self.sitemaps
    }

    /// Get all comments from the robots.txt file.
    ///
    /// # Returns
    ///
    /// Returns a slice of comment strings (without the `#` prefix).
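    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "# Maintained by the web team\nUser-agent: *\nDisallow: /admin/";
    /// let robots = RobotsTxt::parse(content);
    /// assert_eq!(robots.get_comments()[0], "Maintained by the web team");
    /// ```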
    pub fn get_comments(&self) -> &[String] {
        &self.comments
    }

    /// Get all rules for all user-agents.
    ///
    /// # Returns
    ///
    /// Returns a HashMap where keys are user-agent strings and values are `RobotRule` structs.
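    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: *\nDisallow: /admin/\n\nUser-agent: Googlebot\nDisallow: /private/";
    /// let robots = RobotsTxt::parse(content);
    /// // User-agent keys are stored lowercased.
    /// assert_eq!(robots.get_rules().len(), 2);
    /// assert!(robots.get_rules().contains_key("googlebot"));
    /// ```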
    pub fn get_rules(&self) -> &HashMap<String, RobotRule> {
        &self.rules
    }

    /// Get the rule for a specific user-agent.
    ///
    /// # Arguments
    ///
    /// * `user_agent` - The user-agent string to look up
    ///
    /// # Returns
    ///
    /// Returns `Some(&RobotRule)` if rules exist for this user-agent or the wildcard (`*`), `None` otherwise.
    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: Googlebot\nDisallow: /private/";
    /// let robots = RobotsTxt::parse(content);
    /// if let Some(rule) = robots.get_rule("Googlebot") {
    ///     println!("Disallowed paths: {:?}", rule.disallowed);
    /// }
    /// ```
    pub fn get_rule(&self, user_agent: &str) -> Option<&RobotRule> {
        let user_agent = user_agent.to_lowercase();
        self.rules.get(&user_agent).or_else(|| self.rules.get("*"))
    }
}

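/// Extract the host portion of a URL by stripping the scheme, path, and port.
/// This is a deliberately simple helper; it does not handle userinfo or
/// bracketed IPv6 hosts.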
fn extract_domain(url: &str) -> String {
    // Simple domain extraction - handles common cases
    let url = url.trim();

    // Remove the protocol prefix
    let url = url
        .strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))
        .unwrap_or(url);

    // Take everything before the first slash
    let domain = url.split('/').next().unwrap_or(url);

    // Remove the port if present
    let domain = domain.split(':').next().unwrap_or(domain);

    domain.to_string()
}

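/// Check whether `path` matches a robots.txt `pattern`.
///
/// Patterns are treated as prefixes, `*` matches any sequence of characters,
/// and a trailing `$` anchors the pattern to the end of the path.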
fn path_matches(path: &str, pattern: &str) -> bool {
    // A trailing $ anchors the pattern to the end of the path
    let (pattern, anchored) = match pattern.strip_suffix('$') {
        Some(stripped) => (stripped, true),
        None => (pattern, false),
    };

    // Simple prefix matching (most common case)
    if !pattern.contains('*') {
        return if anchored {
            path == pattern
        } else {
            path.starts_with(pattern)
        };
    }

    // Wildcard matching: the literal parts between * wildcards must appear in
    // the path in order, with the first part anchored at the start and the
    // last part anchored at the end when the pattern carried a $
    let parts: Vec<&str> = pattern.split('*').collect();
    let mut pos = 0;

    for (i, part) in parts.iter().enumerate() {
        if i == 0 {
            // First part must be at the start
            if !path.starts_with(part) {
                return false;
            }
            pos = part.len();
        } else if i == parts.len() - 1 && anchored {
            // Last part must sit at the very end of the path
            return path.len() >= pos + part.len() && path[pos..].ends_with(part);
        } else {
            // Remaining parts just need to appear, in order
            match path[pos..].find(part) {
                Some(found) => pos += found + part.len(),
                None => return false,
            }
        }
    }

    true
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_parsing() {
        let content = r#"
User-agent: *
Disallow: /admin/
Allow: /public/

User-agent: Googlebot
Disallow: /private/

Sitemap: https://example.com/sitemap.xml
# This is a comment
        "#;

        let robots = RobotsTxt::parse(content);

        assert!(robots.can_fetch("Mozilla", "/public/test.html"));
        assert!(!robots.can_fetch("Mozilla", "/admin/panel"));
        assert!(!robots.can_fetch("Googlebot", "/private/data"));
        assert_eq!(robots.get_sitemaps().len(), 1);
        assert_eq!(robots.get_comments().len(), 1);
    }

    #[test]
    fn test_path_matching() {
        assert!(path_matches("/admin/test", "/admin/"));
        assert!(path_matches("/admin/", "/admin/"));
        assert!(!path_matches("/public/", "/admin/"));
        assert!(path_matches("/file.html", "/*.html"));
        assert!(path_matches("/admin/file.php", "/admin/*.php"));
        assert!(path_matches("/test.html", "/test.html$"));
        assert!(!path_matches("/test.html/more", "/test.html$"));
    }
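
    // Additional coverage for $ anchors combined with * wildcards; the
    // expectations assume the matching rules implemented in path_matches above.
    #[test]
    fn test_anchored_wildcard_matching() {
        assert!(path_matches("/page.html", "/*.html$"));
        assert!(!path_matches("/page.html?query=1", "/*.html$"));
        assert!(!path_matches("/page.html.bak", "/*.html$"));
    }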

    #[test]
    fn test_domain_extraction() {
        assert_eq!(
            extract_domain("https://example.com/robots.txt"),
            "example.com"
        );
        assert_eq!(
            extract_domain("http://www.google.com/robots.txt"),
            "www.google.com"
        );
        assert_eq!(
            extract_domain("https://api.github.com:443/robots.txt"),
            "api.github.com"
        );
        assert_eq!(extract_domain("example.org"), "example.org");
    }
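
    // A sketch of extra checks for group handling and the default-allow rule:
    // a User-agent line that follows rule lines starts a new group, and paths
    // matched by no rule are allowed.
    #[test]
    fn test_separate_user_agent_groups() {
        let content = "User-agent: *\nDisallow: /admin/\n\nUser-agent: Googlebot\nDisallow: /private/";
        let robots = RobotsTxt::parse(content);

        // Googlebot's Disallow does not leak into the wildcard group
        assert!(robots.can_fetch("Mozilla", "/private/data"));
        assert!(!robots.can_fetch("Googlebot", "/private/data"));
    }

    #[test]
    fn test_unmatched_path_is_allowed() {
        let content = "User-agent: *\nDisallow: /admin/\nAllow: /public/";
        let robots = RobotsTxt::parse(content);

        // A path matched by neither Allow nor Disallow falls back to allowed
        assert!(robots.can_fetch("Mozilla", "/blog/post"));
    }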
}