Skip to main content

cortex_runtime/acquisition/
js_analyzer.rs

1//! JavaScript bundle analyzer for API endpoint discovery.
2//!
3//! Fetches JavaScript bundle files referenced in HTML `<script src="...">` tags
4//! and analyzes them for API endpoints using regex-based pattern matching. This
5//! is a best-effort enrichment layer — if bundles are too large or minified beyond
6//! recognition, it degrades gracefully by returning an empty result set.
7
8use crate::acquisition::action_discovery::{self, HttpAction};
9use crate::acquisition::http_client::HttpClient;
10use regex::Regex;
11
/// Maximum number of scripts to fetch and analyze (time budget cap).
///
/// Used twice by `fetch_and_analyze_scripts`: to truncate the list of candidate
/// script URLs, and as the second argument to `client.get_many` (presumably the
/// concurrency limit — confirm against `HttpClient`).
const MAX_SCRIPTS: usize = 5;

/// Maximum script size in bytes (5 MB). Scripts larger than this are skipped.
///
/// The limit is compared against the length of an already-fetched response
/// body, so it bounds how much JavaScript gets analyzed, not how much is
/// downloaded.
const MAX_SCRIPT_SIZE: usize = 5 * 1024 * 1024;
17
18/// Fetch JavaScript bundles referenced in HTML and analyze them for API endpoints.
19///
20/// Extracts `<script src="...">` URLs from the provided HTML, fetches same-origin
21/// scripts via HTTP GET (capped at 5 scripts, skipping any over 5 MB), and runs
22/// regex-based API endpoint discovery on each bundle. Results are deduplicated
23/// by URL before returning.
24pub async fn fetch_and_analyze_scripts(
25    html: &str,
26    base_url: &str,
27    client: &HttpClient,
28) -> Vec<HttpAction> {
29    let script_urls = extract_script_urls(html, base_url);
30    if script_urls.is_empty() {
31        return Vec::new();
32    }
33
34    // Cap at MAX_SCRIPTS to stay within time budget
35    let urls_to_fetch: Vec<String> = script_urls.into_iter().take(MAX_SCRIPTS).collect();
36
37    // Fetch all scripts in parallel
38    let responses = client.get_many(&urls_to_fetch, MAX_SCRIPTS, 10_000).await;
39
40    let mut all_actions: Vec<HttpAction> = Vec::new();
41
42    for resp in responses.into_iter().flatten() {
43        // Skip non-200 responses
44        if resp.status != 200 {
45            continue;
46        }
47
48        // Skip scripts that exceed the size limit
49        if resp.body.len() > MAX_SCRIPT_SIZE {
50            continue;
51        }
52
53        let actions = action_discovery::discover_actions_from_js(&resp.body, base_url);
54        all_actions.extend(actions);
55    }
56
57    // Deduplicate by label (which encodes method + path).
58    all_actions.sort_by(|a, b| a.label.cmp(&b.label));
59    all_actions.dedup_by(|a, b| a.label == b.label);
60
61    all_actions
62}
63
64/// Extract all `<script src="...">` URLs from HTML and resolve them against the base URL.
65///
66/// Filters out:
67/// - Scripts without a `src` attribute (inline scripts)
68/// - Known analytics and tracking scripts (Google Analytics, GTM, Facebook, etc.)
69/// - Known CDN library URLs (cdnjs, unpkg, jsdelivr)
70/// - Cross-origin scripts (different domain than base URL)
71pub fn extract_script_urls(html: &str, base_url: &str) -> Vec<String> {
72    let re = Regex::new(r#"<script[^>]+src\s*=\s*["']([^"']+)["']"#).expect("valid regex");
73
74    let mut urls = Vec::new();
75
76    for cap in re.captures_iter(html) {
77        if let Some(src) = cap.get(1) {
78            let raw_url = src.as_str();
79
80            // Resolve relative URLs
81            let resolved = resolve_script_url(raw_url, base_url);
82
83            // Skip analytics/CDN scripts
84            if is_analytics_or_cdn(&resolved) {
85                continue;
86            }
87
88            // Same-origin check
89            if !is_same_origin(&resolved, base_url) {
90                continue;
91            }
92
93            urls.push(resolved);
94        }
95    }
96
97    urls
98}
99
100/// Check if a script URL is from the same origin as the base URL.
101///
102/// Compares the host (domain + port) of both URLs. Returns `false` if either
103/// URL cannot be parsed.
104fn is_same_origin(script_url: &str, base_url: &str) -> bool {
105    let script_parsed = match url::Url::parse(script_url) {
106        Ok(u) => u,
107        Err(_) => return false,
108    };
109    let base_parsed = match url::Url::parse(base_url) {
110        Ok(u) => u,
111        Err(_) => return false,
112    };
113
114    script_parsed.host_str() == base_parsed.host_str()
115}
116
/// Decide whether a script URL belongs to a known analytics, tracking, chat
/// widget, or public CDN provider and should therefore be skipped.
///
/// The URL is lowercased and tested for each blocked fragment as a plain
/// substring of the whole URL (host, path and query alike). This means broad
/// fragments such as `"analytics."` also match first-party paths like
/// `/js/analytics.bundle.js`.
///
/// Returns `true` as soon as any fragment is found, `false` otherwise.
fn is_analytics_or_cdn(url: &str) -> bool {
    const BLOCKED_FRAGMENTS: &[&str] = &[
        // Google properties
        "google-analytics.com",
        "googletagmanager.com",
        "googlesyndication.com",
        "googleadservices.com",
        "google.com/recaptcha",
        "gstatic.com",
        // Facebook
        "facebook.net",
        "facebook.com/tr",
        "fbcdn.net",
        "connect.facebook.net",
        // Product analytics
        "hotjar.com",
        "segment.com",
        "segment.io",
        "cdn.segment.com",
        "analytics.",
        // Public library CDNs
        "cdnjs.cloudflare.com",
        "unpkg.com",
        "cdn.jsdelivr.net",
        "ajax.googleapis.com",
        "stackpath.bootstrapcdn.com",
        "maxcdn.bootstrapcdn.com",
        "code.jquery.com",
        // Monitoring, session replay, advertising
        "newrelic.com",
        "nr-data.net",
        "sentry.io",
        "browser.sentry-cdn.com",
        "fullstory.com",
        "mixpanel.com",
        "heapanalytics.com",
        "clarity.ms",
        "doubleclick.net",
        "quantserve.com",
        "scorecardresearch.com",
        "optimizely.com",
        "crazyegg.com",
        "mouseflow.com",
        // Chat and support widgets
        "tawk.to",
        "intercom.io",
        "intercomcdn.com",
        "crisp.chat",
        "drift.com",
        "zendesk.com",
    ];

    // Fragments are all lowercase, so normalize the URL once up front.
    let haystack = url.to_lowercase();
    for fragment in BLOCKED_FRAGMENTS.iter().copied() {
        if haystack.contains(fragment) {
            return true;
        }
    }
    false
}
170
171/// Resolve a script `src` attribute value against a base URL.
172///
173/// Handles absolute URLs (returned as-is), protocol-relative URLs, and
174/// relative paths (resolved against the base URL origin).
175fn resolve_script_url(src: &str, base_url: &str) -> String {
176    // Already absolute
177    if src.starts_with("http://") || src.starts_with("https://") {
178        return src.to_string();
179    }
180
181    // Protocol-relative
182    if src.starts_with("//") {
183        return format!("https:{src}");
184    }
185
186    // Resolve relative URL against base
187    if let Ok(base) = url::Url::parse(base_url) {
188        if let Ok(resolved) = base.join(src) {
189            return resolved.to_string();
190        }
191    }
192
193    // Fallback: prepend origin
194    let base_trimmed = base_url.trim_end_matches('/');
195    if src.starts_with('/') {
196        if let Ok(parsed) = url::Url::parse(base_trimmed) {
197            return format!(
198                "{}://{}{}",
199                parsed.scheme(),
200                parsed.host_str().unwrap_or(""),
201                src
202            );
203        }
204    }
205
206    format!("{base_trimmed}/{src}")
207}
208
#[cfg(test)]
mod tests {
    use super::*;

    /// Page URL used as the base for the extraction tests.
    const PAGE: &str = "https://example.com";

    /// Two root-relative scripts are both found and resolved against the page.
    #[test]
    fn test_extract_script_urls_basic() {
        let html = r#"<html><head><script src="/js/main.js"></script><script src="/js/app.js"></script></head></html>"#;
        let found = extract_script_urls(html, PAGE);

        assert_eq!(found.len(), 2);
        for expected in [
            "https://example.com/js/main.js",
            "https://example.com/js/app.js",
        ] {
            assert!(found.iter().any(|u| u.as_str() == expected));
        }
    }

    /// CDN and analytics scripts are dropped; only the first-party script remains.
    #[test]
    fn test_extract_script_urls_filters_cdn() {
        let html = r#"<html><head>
            <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
            <script src="/js/app.js"></script>
            <script src="https://www.google-analytics.com/analytics.js"></script>
        </head></html>"#;
        let found = extract_script_urls(html, PAGE);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0], "https://example.com/js/app.js");
    }

    /// Same host matches; a subdomain or an unrelated domain does not.
    #[test]
    fn test_is_same_origin() {
        let page = "https://example.com/page";
        let cases = [
            ("https://example.com/js/app.js", true),
            ("https://cdn.example.com/js/app.js", false),
            ("https://other.com/js/app.js", false),
        ];

        for (script, expected) in cases {
            assert_eq!(is_same_origin(script, page), expected);
        }
    }

    /// Known third-party providers are flagged; a first-party bundle is not.
    #[test]
    fn test_is_analytics_or_cdn() {
        let blocked = [
            "https://www.google-analytics.com/analytics.js",
            "https://cdnjs.cloudflare.com/ajax/libs/react/18.0.0/react.min.js",
            "https://www.googletagmanager.com/gtm.js?id=GTM-XXX",
        ];
        for url in blocked {
            assert!(is_analytics_or_cdn(url));
        }

        assert!(!is_analytics_or_cdn("https://example.com/js/app.js"));
    }

    /// Empty HTML yields no script URLs.
    #[test]
    fn test_extract_empty_html() {
        assert!(extract_script_urls("", PAGE).is_empty());
    }

    /// Inline scripts (no `src`) are ignored; external ones are still returned.
    #[test]
    fn test_extract_inline_scripts_ignored() {
        let html = r#"<script>console.log("inline")</script><script src="/app.js"></script>"#;
        let found = extract_script_urls(html, PAGE);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0], "https://example.com/app.js");
    }
}