api_scanner/discovery/
js.rs1use std::collections::HashSet;
2
3use once_cell::sync::Lazy;
4use regex::Regex;
5use tracing::{debug, warn};
6use url::Url;
7
8use crate::{error::CapturedError, http_client::HttpClient};
9
10use super::normalize_path;
11
12static API_PATTERNS: Lazy<Vec<Regex>> = Lazy::new(|| {
14 vec['"]"#,
18 )
19 .unwrap(),
20 Regex::new(r#"['"](?:/api|/v\d|/graphql|/rest|/internal|/private|/admin)([^'"]{0,120})['"]"#)
22 .unwrap(),
23 Regex::new(r#"(?:url|endpoint|path|baseURL|base_url)\s*[=:]\s*['"](/[^'"]{2,120})['"]"#)
25 .unwrap(),
26 ]
27});
28
29static SCRIPT_SRC: Lazy<Regex> =
31 Lazy::new(|| Regex::new(r#"<script[^>]+src=['"]([^'"]+)['"]"#).unwrap());
32
33static INLINE_SCRIPT: Lazy<Regex> =
35 Lazy::new(|| Regex::new(r"(?s)<script[^>]*>(.*?)</script>").unwrap());
36
37static SOURCEMAP: Lazy<Regex> = Lazy::new(|| Regex::new(r"sourceMappingURL=([^\s*]+)").unwrap());
39
40pub struct JsDiscovery<'a> {
41 client: &'a HttpClient,
42 target_url: &'a str,
43 host: &'a str,
44 max_scripts: usize,
45}
46
47impl<'a> JsDiscovery<'a> {
48 pub fn new(
49 client: &'a HttpClient,
50 target_url: &'a str,
51 host: &'a str,
52 max_scripts: usize,
53 ) -> Self {
54 Self {
55 client,
56 target_url,
57 host,
58 max_scripts,
59 }
60 }
61
62 pub async fn run(&self) -> (HashSet<String>, Vec<CapturedError>) {
64 let mut endpoints = HashSet::new();
65 let mut errors: Vec<CapturedError> = Vec::new();
66
67 let resp = match self.client.get(self.target_url).await {
68 Ok(r) => r,
69 Err(e) => {
70 errors.push(e);
71 return (endpoints, errors);
72 }
73 };
74
75 let page = &resp.body;
76
77 let script_urls: Vec<String> = SCRIPT_SRC
79 .captures_iter(page)
80 .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
81 .take(self.max_scripts)
82 .collect();
83
84 for src in &script_urls {
86 let full_url = match self.resolve(src) {
87 Some(u) => u,
88 None => continue,
89 };
90
91 match self.client.get(&full_url).await {
92 Ok(sr) => {
93 self.extract_from_text(&sr.body, &mut endpoints);
94 if let Some(sm_path) = SOURCEMAP
96 .captures(&sr.body)
97 .and_then(|c| c.get(1))
98 .map(|m| m.as_str().to_string())
99 {
100 if let Some(sm_url) = self.resolve_from(&full_url, &sm_path) {
101 let (mut ep, mut er) = self.fetch_sourcemap(&sm_url).await;
102 endpoints.extend(ep.drain());
103 errors.append(&mut er);
104 }
105 }
106 }
107 Err(e) => errors.push(e),
108 }
109 }
110
111 for cap in INLINE_SCRIPT.captures_iter(page) {
113 if let Some(content) = cap.get(1) {
114 self.extract_from_text(content.as_str(), &mut endpoints);
115 }
116 }
117
118 debug!("[js] found {} endpoints", endpoints.len());
119 (endpoints, errors)
120 }
121
122 fn extract_from_text(&self, text: &str, out: &mut HashSet<String>) {
123 for re in API_PATTERNS.iter() {
124 for cap in re.captures_iter(text) {
125 let raw = cap
127 .get(1)
128 .or_else(|| cap.get(0))
129 .map(|m| m.as_str())
130 .unwrap_or("");
131 if let Some(p) = normalize_path(raw, self.host) {
132 out.insert(p);
133 }
134 }
135 }
136 }
137
138 async fn fetch_sourcemap(&self, sm_url: &str) -> (HashSet<String>, Vec<CapturedError>) {
139 let mut out = HashSet::new();
140 let mut errors = Vec::new();
141
142 match self.client.get(sm_url).await {
143 Ok(r) => match serde_json::from_str::<serde_json::Value>(&r.body) {
144 Ok(map) => {
145 let sources = map
146 .get("sourcesContent")
147 .and_then(|v| v.as_array())
148 .cloned()
149 .unwrap_or_default();
150 for src in sources {
151 if let Some(text) = src.as_str() {
152 self.extract_from_text(text, &mut out);
153 }
154 }
155 }
156 Err(e) => {
157 warn!("[js] sourcemap parse error at {sm_url}: {e}");
158 }
159 },
160 Err(e) => errors.push(e),
161 }
162
163 (out, errors)
164 }
165
166 fn resolve(&self, raw: &str) -> Option<String> {
167 self.resolve_from(self.target_url, raw)
168 }
169
170 fn resolve_from(&self, base: &str, raw: &str) -> Option<String> {
171 let base_url = Url::parse(base).ok()?;
172 let resolved = base_url.join(raw).ok()?;
173 if resolved.host_str()? != self.host {
175 return None;
176 }
177 Some(resolved.to_string())
178 }
179}