Skip to main content

api_scanner/discovery/
robots.rs

1use std::collections::HashSet;
2
3use tracing::debug;
4
5use crate::{error::CapturedError, http_client::HttpClient};
6
7use super::normalize_path;
8
9pub struct RobotsDiscovery<'a> {
10    client: &'a HttpClient,
11    base_url: &'a str,
12    host: &'a str,
13}
14
15impl<'a> RobotsDiscovery<'a> {
16    pub fn new(client: &'a HttpClient, base_url: &'a str, host: &'a str) -> Self {
17        Self {
18            client,
19            base_url,
20            host,
21        }
22    }
23
24    pub async fn run(&self) -> (HashSet<String>, Vec<CapturedError>) {
25        let mut paths = HashSet::new();
26        let mut errors = Vec::new();
27
28        let robots_url = format!("{}/robots.txt", self.base_url.trim_end_matches('/'));
29
30        match self.client.get(&robots_url).await {
31            Ok(resp) if resp.status < 400 => {
32                for line in resp.body.lines() {
33                    let line = line.trim();
34
35                    // Disallow: /path  or  Allow: /path
36                    if let Some(rest) = line
37                        .strip_prefix("Disallow:")
38                        .or_else(|| line.strip_prefix("Allow:"))
39                    {
40                        let raw = rest.trim().split('#').next().unwrap_or("").trim();
41                        // Skip wildcards / empty / root-only lines
42                        if raw.is_empty() || raw == "/" || raw.contains('*') {
43                            continue;
44                        }
45                        if let Some(p) = normalize_path(raw, self.host) {
46                            paths.insert(p);
47                        }
48                    }
49
50                    // Sitemap: https://... — forward to sitemap discovery
51                    if let Some(rest) = line.strip_prefix("Sitemap:") {
52                        let raw = rest.trim();
53                        if let Some(p) = normalize_path(raw, self.host) {
54                            paths.insert(p);
55                        }
56                    }
57                }
58                debug!("[robots] found {} paths", paths.len());
59            }
60            Ok(_) => {
61                debug!("[robots] non-2xx response, skipping");
62            }
63            Err(e) => errors.push(e),
64        }
65
66        (paths, errors)
67    }
68}