// api_scanner/discovery/robots.rs
use std::collections::HashSet;
2
3use tracing::debug;
4
5use crate::{error::CapturedError, http_client::HttpClient};
6
7use super::normalize_path;
8
/// Discovers candidate paths by fetching and parsing a target's `robots.txt`.
///
/// Borrows the shared HTTP client and target identifiers for the duration of
/// one discovery pass; construct via [`RobotsDiscovery::new`] and drive with
/// [`RobotsDiscovery::run`].
pub struct RobotsDiscovery<'a> {
    // Shared HTTP client used to fetch `{base_url}/robots.txt`.
    client: &'a HttpClient,
    // Target base URL; a trailing '/' is tolerated (trimmed before use).
    base_url: &'a str,
    // Host string forwarded to `normalize_path` when resolving rule values.
    host: &'a str,
}
14
15impl<'a> RobotsDiscovery<'a> {
16 pub fn new(client: &'a HttpClient, base_url: &'a str, host: &'a str) -> Self {
17 Self {
18 client,
19 base_url,
20 host,
21 }
22 }
23
24 pub async fn run(&self) -> (HashSet<String>, Vec<CapturedError>) {
25 let mut paths = HashSet::new();
26 let mut errors = Vec::new();
27
28 let robots_url = format!("{}/robots.txt", self.base_url.trim_end_matches('/'));
29
30 match self.client.get(&robots_url).await {
31 Ok(resp) if resp.status < 400 => {
32 for line in resp.body.lines() {
33 let line = line.trim();
34
35 if let Some(rest) = line
37 .strip_prefix("Disallow:")
38 .or_else(|| line.strip_prefix("Allow:"))
39 {
40 let raw = rest.trim().split('#').next().unwrap_or("").trim();
41 if raw.is_empty() || raw == "/" || raw.contains('*') {
43 continue;
44 }
45 if let Some(p) = normalize_path(raw, self.host) {
46 paths.insert(p);
47 }
48 }
49
50 if let Some(rest) = line.strip_prefix("Sitemap:") {
52 let raw = rest.trim();
53 if let Some(p) = normalize_path(raw, self.host) {
54 paths.insert(p);
55 }
56 }
57 }
58 debug!("[robots] found {} paths", paths.len());
59 }
60 Ok(_) => {
61 debug!("[robots] non-2xx response, skipping");
62 }
63 Err(e) => errors.push(e),
64 }
65
66 (paths, errors)
67 }
68}