use url::Url;
/// Extracts candidate URL paths from the `Allow` / `Disallow` rules of a
/// robots.txt body.
///
/// Each rule value is reduced to its literal prefix (everything before the
/// first `*` wildcard, with any trailing `$` end-anchor removed) and
/// normalised so that it starts with exactly one `/`. Rules that reduce to
/// the site root or to nothing are skipped. The result preserves first-seen
/// order and contains no duplicates.
///
/// The body is treated as untrusted input: any leading run of `/` or `\`
/// characters is collapsed into a single `/`, so a value such as
/// `//other.host/x` cannot later be resolved as a network-path reference
/// pointing at a different host.
pub fn extract_paths(body: &str) -> Vec<String> {
    let mut out = Vec::new();
    let mut seen = std::collections::HashSet::new();

    // A byte-order mark is not whitespace, so `trim` would leave it glued to
    // the first key and that directive would never match.
    let body = body.trim_start_matches('\u{feff}');

    for line in body.lines() {
        // Everything after `#` is a comment.
        let line = line.split('#').next().unwrap_or("").trim();
        let Some((key, val)) = line.split_once(':') else {
            continue;
        };
        let key = key.trim();
        if !key.eq_ignore_ascii_case("disallow") && !key.eq_ignore_ascii_case("allow") {
            continue;
        }

        // Keep only the literal prefix of the pattern: cut at the first
        // wildcard and drop the end-of-path anchor.
        let prefix = val
            .trim()
            .split('*')
            .next()
            .unwrap_or("")
            .trim_end_matches('$');

        // Collapse leading slashes (and backslashes, which WHATWG-style URL
        // resolvers treat like slashes for http/https) so the result is a
        // plain absolute path rather than a `//host` reference.
        let rest = prefix.trim_start_matches(|c: char| c == '/' || c == '\\');
        if rest.is_empty() {
            // Empty value, bare `/`, bare `*`, `$`, ... – nothing specific to seed.
            continue;
        }

        let norm = format!("/{rest}");
        if seen.insert(norm.clone()) {
            out.push(norm);
        }
    }
    out
}
pub fn seed_urls(origin: &Url, paths: &[String]) -> Vec<Url> {
let mut urls = Vec::new();
for p in paths {
if let Ok(u) = origin.join(p) {
urls.push(u);
}
}
urls
}