Skip to main content

// api_scanner/discovery/js.rs

1use std::collections::HashSet;
2
3use once_cell::sync::Lazy;
4use regex::Regex;
5use tracing::{debug, warn};
6use url::Url;
7
8use crate::{error::CapturedError, http_client::HttpClient};
9
10use super::normalize_path;
11
12// Pre-compiled patterns for extracting API paths from JS
13static API_PATTERNS: Lazy<Vec<Regex>> = Lazy::new(|| {
14    vec![
15        // fetch("/api/...") / axios.get("/v1/...")
16        Regex::new(
17            r#"(?:fetch|axios|\.request|\.get|\.post|\.put|\.delete|\.patch)\s*\(\s*['"](/[^'"]{2,150})['"]"#,
18        )
19        .unwrap(),
20        // Explicit API prefixes
21        Regex::new(r#"['"](?:/api|/v\d|/graphql|/rest|/internal|/private|/admin)([^'"]{0,120})['"]"#)
22            .unwrap(),
23        // url: "/something"
24        Regex::new(r#"(?:url|endpoint|path|baseURL|base_url)\s*[=:]\s*['"](/[^'"]{2,120})['"]"#)
25            .unwrap(),
26    ]
27});
28
29// <script src="...">
30static SCRIPT_SRC: Lazy<Regex> =
31    Lazy::new(|| Regex::new(r#"<script[^>]+src=['"]([^'"]+)['"]"#).unwrap());
32
33// Inline <script>...</script>
34static INLINE_SCRIPT: Lazy<Regex> =
35    Lazy::new(|| Regex::new(r"(?s)<script[^>]*>(.*?)</script>").unwrap());
36
37// sourceMappingURL
38static SOURCEMAP: Lazy<Regex> = Lazy::new(|| Regex::new(r"sourceMappingURL=([^\s*]+)").unwrap());
39
40pub struct JsDiscovery<'a> {
41    client: &'a HttpClient,
42    target_url: &'a str,
43    host: &'a str,
44    max_scripts: usize,
45}
46
47impl<'a> JsDiscovery<'a> {
48    pub fn new(
49        client: &'a HttpClient,
50        target_url: &'a str,
51        host: &'a str,
52        max_scripts: usize,
53    ) -> Self {
54        Self {
55            client,
56            target_url,
57            host,
58            max_scripts,
59        }
60    }
61
62    /// Main entry: parse the target page, extract + analyse JS files
63    pub async fn run(&self) -> (HashSet<String>, Vec<CapturedError>) {
64        let mut endpoints = HashSet::new();
65        let mut errors: Vec<CapturedError> = Vec::new();
66
67        let resp = match self.client.get(self.target_url).await {
68            Ok(r) => r,
69            Err(e) => {
70                errors.push(e);
71                return (endpoints, errors);
72            }
73        };
74
75        let page = &resp.body;
76
77        // 1. Collect external script URLs
78        let script_urls: Vec<String> = SCRIPT_SRC
79            .captures_iter(page)
80            .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
81            .take(self.max_scripts)
82            .collect();
83
84        // 2. Analyse external scripts (+ sourcemaps)
85        for src in &script_urls {
86            let full_url = match self.resolve(src) {
87                Some(u) => u,
88                None => continue,
89            };
90
91            match self.client.get(&full_url).await {
92                Ok(sr) => {
93                    self.extract_from_text(&sr.body, &mut endpoints);
94                    // Try sourcemap
95                    if let Some(sm_path) = SOURCEMAP
96                        .captures(&sr.body)
97                        .and_then(|c| c.get(1))
98                        .map(|m| m.as_str().to_string())
99                    {
100                        if let Some(sm_url) = self.resolve_from(&full_url, &sm_path) {
101                            let (mut ep, mut er) = self.fetch_sourcemap(&sm_url).await;
102                            endpoints.extend(ep.drain());
103                            errors.append(&mut er);
104                        }
105                    }
106                }
107                Err(e) => errors.push(e),
108            }
109        }
110
111        // 3. Analyse inline scripts
112        for cap in INLINE_SCRIPT.captures_iter(page) {
113            if let Some(content) = cap.get(1) {
114                self.extract_from_text(content.as_str(), &mut endpoints);
115            }
116        }
117
118        debug!("[js] found {} endpoints", endpoints.len());
119        (endpoints, errors)
120    }
121
122    fn extract_from_text(&self, text: &str, out: &mut HashSet<String>) {
123        for re in API_PATTERNS.iter() {
124            for cap in re.captures_iter(text) {
125                // Group 1 or group 0 depending on pattern
126                let raw = cap
127                    .get(1)
128                    .or_else(|| cap.get(0))
129                    .map(|m| m.as_str())
130                    .unwrap_or("");
131                if let Some(p) = normalize_path(raw, self.host) {
132                    out.insert(p);
133                }
134            }
135        }
136    }
137
138    async fn fetch_sourcemap(&self, sm_url: &str) -> (HashSet<String>, Vec<CapturedError>) {
139        let mut out = HashSet::new();
140        let mut errors = Vec::new();
141
142        match self.client.get(sm_url).await {
143            Ok(r) => match serde_json::from_str::<serde_json::Value>(&r.body) {
144                Ok(map) => {
145                    let sources = map
146                        .get("sourcesContent")
147                        .and_then(|v| v.as_array())
148                        .cloned()
149                        .unwrap_or_default();
150                    for src in sources {
151                        if let Some(text) = src.as_str() {
152                            self.extract_from_text(text, &mut out);
153                        }
154                    }
155                }
156                Err(e) => {
157                    warn!("[js] sourcemap parse error at {sm_url}: {e}");
158                }
159            },
160            Err(e) => errors.push(e),
161        }
162
163        (out, errors)
164    }
165
166    fn resolve(&self, raw: &str) -> Option<String> {
167        self.resolve_from(self.target_url, raw)
168    }
169
170    fn resolve_from(&self, base: &str, raw: &str) -> Option<String> {
171        let base_url = Url::parse(base).ok()?;
172        let resolved = base_url.join(raw).ok()?;
173        // Only follow same-host scripts
174        if resolved.host_str()? != self.host {
175            return None;
176        }
177        Some(resolved.to_string())
178    }
179}