robotstxt_rs/
parser.rs

use std::collections::HashMap;

#[cfg(feature = "async")]
use reqwest;

/// A rule for a specific user-agent containing allowed and disallowed paths.
#[derive(Debug, Clone)]
pub struct RobotRule {
    /// The user-agent this rule applies to (e.g., "Googlebot", "*")
    pub user_agent: String,
    /// List of paths explicitly allowed for this user-agent
    pub allowed: Vec<String>,
    /// List of paths disallowed for this user-agent
    pub disallowed: Vec<String>,
}

/// The main structure representing a parsed robots.txt file.
#[derive(Debug)]
pub struct RobotsTxt {
    domain: Option<String>,
    rules: HashMap<String, RobotRule>,
    sitemaps: Vec<String>,
    comments: Vec<String>,
}

impl RobotsTxt {
    /// Parse a robots.txt file from a string.
    ///
    /// # Arguments
    ///
    /// * `content` - The robots.txt file content as a string
    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: *\nDisallow: /admin/";
    /// let robots = RobotsTxt::parse(content);
    /// ```
    pub fn parse(content: &str) -> Self {
        Self::parse_with_domain(content, None)
    }

    /// Parse a robots.txt file from a string with a specified domain.
    ///
    /// # Arguments
    ///
    /// * `content` - The robots.txt file content as a string
    /// * `domain` - Optional domain name to associate with this robots.txt
    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: *\nDisallow: /admin/";
    /// let robots = RobotsTxt::parse_with_domain(content, Some("example.com".to_string()));
    /// ```
    pub fn parse_with_domain(content: &str, domain: Option<String>) -> Self {
        let mut rules: HashMap<String, RobotRule> = HashMap::new();
        let mut sitemaps = Vec::new();
        let mut comments = Vec::new();

        let mut current_agents: Vec<String> = Vec::new();
        // True while consecutive User-agent lines are being collected; a User-agent
        // line that follows rule lines starts a new group.
        let mut collecting_agents = false;

        for line in content.lines() {
            let line = line.trim();

            // Handle comments
            if line.starts_with('#') {
                comments.push(line[1..].trim().to_string());
                continue;
            }

            // Skip empty lines
            if line.is_empty() {
                continue;
            }

            // Split on first colon
            if let Some((directive, value)) = line.split_once(':') {
                let directive = directive.trim().to_lowercase();
                let value = value.trim().to_string();

                match directive.as_str() {
                    "user-agent" => {
                        // A User-agent line after rule lines starts a new group,
                        // so reset the set of agents the following rules apply to
                        if !collecting_agents {
                            current_agents.clear();
                        }
                        collecting_agents = true;
                        let agent = value.to_lowercase();
                        if !rules.contains_key(&agent) {
                            rules.insert(
                                agent.clone(),
                                RobotRule {
                                    user_agent: agent.clone(),
                                    allowed: Vec::new(),
                                    disallowed: Vec::new(),
                                },
                            );
                        }
                        current_agents.push(agent);
                    }
                    "allow" => {
                        // Add to all agents in the current group
                        collecting_agents = false;
                        for agent in &current_agents {
                            if let Some(rule) = rules.get_mut(agent) {
                                rule.allowed.push(value.clone());
                            }
                        }
                    }
                    "disallow" => {
                        // Add to all agents in the current group
                        collecting_agents = false;
                        for agent in &current_agents {
                            if let Some(rule) = rules.get_mut(agent) {
                                rule.disallowed.push(value.clone());
                            }
                        }
                    }
                    "sitemap" => {
                        // Sitemap is global; it also ends the current group
                        collecting_agents = false;
                        sitemaps.push(value);
                        current_agents.clear();
                    }
                    _ => {
                        // Unknown directive, ignore
                    }
                }
            }
        }

        RobotsTxt {
            domain,
            rules,
            sitemaps,
            comments,
        }
    }

    /// Fetch and parse a robots.txt file from a URL (requires the `async` feature).
    ///
    /// # Arguments
    ///
    /// * `url` - The URL to the robots.txt file
    ///
    /// # Example
    ///
    /// ```no_run
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// #[tokio::main]
    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
    ///     let robots = RobotsTxt::from_url("https://example.com/robots.txt").await?;
    ///     Ok(())
    /// }
    /// ```
    #[cfg(feature = "async")]
    pub async fn from_url(url: &str) -> Result<Self, Box<dyn std::error::Error>> {
        let client = reqwest::Client::new();
        let content = client.get(url).send().await?.text().await?;

        // Extract domain from URL
        let domain = extract_domain(url);

        Ok(Self::parse_with_domain(&content, Some(domain)))
    }

    /// Check if a user-agent is allowed to fetch a specific path.
    ///
    /// # Arguments
    ///
    /// * `user_agent` - The user-agent string (e.g., "Googlebot")
    /// * `path` - The path to check (e.g., "/admin/panel")
    ///
    /// # Returns
    ///
    /// Returns `true` if the user-agent is allowed to fetch the path, `false` otherwise.
    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: *\nDisallow: /admin/";
    /// let robots = RobotsTxt::parse(content);
    /// assert!(!robots.can_fetch("Googlebot", "/admin/panel"));
    /// assert!(robots.can_fetch("Googlebot", "/public/page"));
    /// ```
    pub fn can_fetch(&self, user_agent: &str, path: &str) -> bool {
        let user_agent = user_agent.to_lowercase();

        // Try an exact user-agent match first, then fall back to the wildcard group
        let rule = if let Some(rule) = self.rules.get(&user_agent) {
            rule
        } else if let Some(rule) = self.rules.get("*") {
            rule
        } else {
            // No applicable rules means everything is allowed
            return true;
        };

        // Check the disallowed paths
        for disallowed in &rule.disallowed {
            if disallowed.is_empty() {
                // An empty Disallow value disallows nothing
                continue;
            }
            if path_matches(path, disallowed) {
                // An Allow rule that is at least as specific overrides the Disallow
                // (ties go to Allow, as recommended by RFC 9309)
                for allowed in &rule.allowed {
                    if path_matches(path, allowed) && allowed.len() >= disallowed.len() {
                        return true;
                    }
                }
                return false;
            }
        }

        // Paths not matched by any Disallow rule are allowed by default
        true
    }

    /// Get the domain associated with this robots.txt file.
    ///
    /// # Returns
    ///
    /// Returns `Some(&str)` if a domain was specified, `None` otherwise.
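    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: *\nDisallow: /admin/";
    /// let robots = RobotsTxt::parse_with_domain(content, Some("example.com".to_string()));
    /// assert_eq!(robots.get_domain(), Some("example.com"));
    /// ```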
    pub fn get_domain(&self) -> Option<&str> {
        self.domain.as_deref()
    }

    /// Get all sitemap URLs from the robots.txt file.
    ///
    /// # Returns
    ///
    /// Returns a slice of sitemap URLs.
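    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: *\nDisallow: /admin/\nSitemap: https://example.com/sitemap.xml";
    /// let robots = RobotsTxt::parse(content);
    /// assert_eq!(robots.get_sitemaps().len(), 1);
    /// assert_eq!(robots.get_sitemaps()[0], "https://example.com/sitemap.xml");
    /// ```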
    pub fn get_sitemaps(&self) -> &[String] {
        &self.sitemaps
    }

    /// Get all comments from the robots.txt file.
    ///
    /// # Returns
    ///
    /// Returns a slice of comment strings (without the # prefix).
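    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "# Crawl policy\nUser-agent: *\nDisallow: /admin/";
    /// let robots = RobotsTxt::parse(content);
    /// assert_eq!(robots.get_comments()[0], "Crawl policy");
    /// ```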
    pub fn get_comments(&self) -> &[String] {
        &self.comments
    }

    /// Get all rules for all user-agents.
    ///
    /// # Returns
    ///
    /// Returns a `HashMap` keyed by lowercased user-agent string, with `RobotRule` values.
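    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: *\nDisallow: /admin/\n\nUser-agent: Googlebot\nDisallow: /private/";
    /// let robots = RobotsTxt::parse(content);
    /// assert_eq!(robots.get_rules().len(), 2);
    /// // Keys are stored lowercased
    /// assert!(robots.get_rules().contains_key("googlebot"));
    /// ```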
    pub fn get_rules(&self) -> &HashMap<String, RobotRule> {
        &self.rules
    }

    /// Get the rule for a specific user-agent.
    ///
    /// # Arguments
    ///
    /// * `user_agent` - The user-agent string to look up
    ///
    /// # Returns
    ///
    /// Returns `Some(&RobotRule)` if rules exist for this user-agent or the wildcard (*), `None` otherwise.
    ///
    /// # Example
    ///
    /// ```
    /// use robotstxt_rs::RobotsTxt;
    ///
    /// let content = "User-agent: Googlebot\nDisallow: /private/";
    /// let robots = RobotsTxt::parse(content);
    /// if let Some(rule) = robots.get_rule("Googlebot") {
    ///     println!("Disallowed paths: {:?}", rule.disallowed);
    /// }
    /// ```
    pub fn get_rule(&self, user_agent: &str) -> Option<&RobotRule> {
        let user_agent = user_agent.to_lowercase();
        self.rules.get(&user_agent).or_else(|| self.rules.get("*"))
    }
}

fn extract_domain(url: &str) -> String {
    // Simple domain extraction - handles common cases
    let url = url.trim();

    // Remove protocol
    let url = url
        .strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))
        .unwrap_or(url);

    // Take everything before the first slash
    let domain = url.split('/').next().unwrap_or(url);

    // Remove port if present
    let domain = domain.split(':').next().unwrap_or(domain);

    domain.to_string()
}

fn path_matches(path: &str, pattern: &str) -> bool {
    // Handle the end-of-string anchor `$`
    let (pattern, anchored) = match pattern.strip_suffix('$') {
        Some(stripped) => (stripped, true),
        None => (pattern, false),
    };

    // Simple prefix matching (most common case)
    if !pattern.contains('*') {
        return if anchored {
            path == pattern
        } else {
            path.starts_with(pattern)
        };
    }

    // Wildcard matching: the literal parts must appear in the path in order
    let parts: Vec<&str> = pattern.split('*').collect();
    let mut pos = 0;

    for (i, part) in parts.iter().enumerate() {
        if i == 0 {
            // First part must match at the start of the path
            if !path.starts_with(part) {
                return false;
            }
            pos = part.len();
        } else if i == parts.len() - 1 {
            // Last part: with `$` it must end the path, otherwise it only needs
            // to appear somewhere after the current position
            if anchored {
                return path[pos..].ends_with(part);
            }
            if !path[pos..].contains(part) {
                return false;
            }
        } else {
            // Middle parts must appear in order after the current position
            match path[pos..].find(part) {
                Some(found) => pos += found + part.len(),
                None => return false,
            }
        }
    }

    true
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_parsing() {
        let content = r#"
User-agent: *
Disallow: /admin/
Allow: /public/

User-agent: Googlebot
Disallow: /private/

Sitemap: https://example.com/sitemap.xml
# This is a comment
        "#;

        let robots = RobotsTxt::parse(content);

        assert!(robots.can_fetch("Mozilla", "/public/test.html"));
        assert!(!robots.can_fetch("Mozilla", "/admin/panel"));
        assert!(!robots.can_fetch("Googlebot", "/private/data"));
        assert_eq!(robots.get_sitemaps().len(), 1);
        assert_eq!(robots.get_comments().len(), 1);
    }

    #[test]
    fn test_path_matching() {
        assert!(path_matches("/admin/test", "/admin/"));
        assert!(path_matches("/admin/", "/admin/"));
        assert!(!path_matches("/public/", "/admin/"));
        assert!(path_matches("/file.html", "/*.html"));
        assert!(path_matches("/admin/file.php", "/admin/*.php"));
        assert!(path_matches("/test.html", "/test.html$"));
        assert!(!path_matches("/test.html/more", "/test.html$"));
    }
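
    // An extra check of the wildcard-plus-`$` handling in `path_matches` above,
    // assuming the anchored pattern requires the final literal part to end the path.
    #[test]
    fn test_path_matching_wildcard_anchor() {
        assert!(path_matches("/index.php", "/*.php$"));
        assert!(!path_matches("/index.php?x=1", "/*.php$"));
        assert!(path_matches("/a/b/file.pdf", "/a/*.pdf$"));
    }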

    #[test]
    fn test_domain_extraction() {
        assert_eq!(
            extract_domain("https://example.com/robots.txt"),
            "example.com"
        );
        assert_eq!(
            extract_domain("http://www.google.com/robots.txt"),
            "www.google.com"
        );
        assert_eq!(
            extract_domain("https://api.github.com:443/robots.txt"),
            "api.github.com"
        );
        assert_eq!(extract_domain("example.org"), "example.org");
    }
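
    // A small additional check, assuming the precedence implemented in `can_fetch`:
    // a more (or equally) specific Allow overrides a matching Disallow, and paths
    // matched by no rule fall back to the default allow.
    #[test]
    fn test_allow_overrides_and_default_allow() {
        let content = "User-agent: *\nDisallow: /shop/\nAllow: /shop/public/";
        let robots = RobotsTxt::parse(content);

        assert!(!robots.can_fetch("Mozilla", "/shop/cart"));
        assert!(robots.can_fetch("Mozilla", "/shop/public/catalog"));
        assert!(robots.can_fetch("Mozilla", "/blog/post"));
    }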
}