cortex_runtime/acquisition/
js_analyzer.rs1use crate::acquisition::action_discovery::{self, HttpAction};
9use crate::acquisition::http_client::HttpClient;
10use regex::Regex;
11
/// Upper bound on how many script URLs from a single page are fetched.
const MAX_SCRIPTS: usize = 5;

/// Largest script body length that is still analyzed (5 MiB); longer
/// responses are skipped.
const MAX_SCRIPT_SIZE: usize = 5 << 20;
17
18pub async fn fetch_and_analyze_scripts(
25 html: &str,
26 base_url: &str,
27 client: &HttpClient,
28) -> Vec<HttpAction> {
29 let script_urls = extract_script_urls(html, base_url);
30 if script_urls.is_empty() {
31 return Vec::new();
32 }
33
34 let urls_to_fetch: Vec<String> = script_urls.into_iter().take(MAX_SCRIPTS).collect();
36
37 let responses = client.get_many(&urls_to_fetch, MAX_SCRIPTS, 10_000).await;
39
40 let mut all_actions: Vec<HttpAction> = Vec::new();
41
42 for resp in responses.into_iter().flatten() {
43 if resp.status != 200 {
45 continue;
46 }
47
48 if resp.body.len() > MAX_SCRIPT_SIZE {
50 continue;
51 }
52
53 let actions = action_discovery::discover_actions_from_js(&resp.body, base_url);
54 all_actions.extend(actions);
55 }
56
57 all_actions.sort_by(|a, b| a.label.cmp(&b.label));
59 all_actions.dedup_by(|a, b| a.label == b.label);
60
61 all_actions
62}
63
64pub fn extract_script_urls(html: &str, base_url: &str) -> Vec<String> {
72 let re = Regex::new(r#"<script[^>]+src\s*=\s*["']([^"']+)["']"#).expect("valid regex");
73
74 let mut urls = Vec::new();
75
76 for cap in re.captures_iter(html) {
77 if let Some(src) = cap.get(1) {
78 let raw_url = src.as_str();
79
80 let resolved = resolve_script_url(raw_url, base_url);
82
83 if is_analytics_or_cdn(&resolved) {
85 continue;
86 }
87
88 if !is_same_origin(&resolved, base_url) {
90 continue;
91 }
92
93 urls.push(resolved);
94 }
95 }
96
97 urls
98}
99
100fn is_same_origin(script_url: &str, base_url: &str) -> bool {
105 let script_parsed = match url::Url::parse(script_url) {
106 Ok(u) => u,
107 Err(_) => return false,
108 };
109 let base_parsed = match url::Url::parse(base_url) {
110 Ok(u) => u,
111 Err(_) => return false,
112 };
113
114 script_parsed.host_str() == base_parsed.host_str()
115}
116
/// Reports whether `url` looks like a third-party analytics, tracking, chat
/// widget, or public CDN script that is not worth analyzing.
///
/// The check is a case-insensitive substring match of the whole URL (host,
/// path, and query alike) against a fixed list of known fragments.
fn is_analytics_or_cdn(url: &str) -> bool {
    const SKIP_PATTERNS: &[&str] = &[
        "google-analytics.com",
        "googletagmanager.com",
        "googlesyndication.com",
        "googleadservices.com",
        "google.com/recaptcha",
        "gstatic.com",
        "facebook.net",
        "facebook.com/tr",
        "fbcdn.net",
        "connect.facebook.net",
        "hotjar.com",
        "segment.com",
        "segment.io",
        "cdn.segment.com",
        "analytics.",
        "cdnjs.cloudflare.com",
        "unpkg.com",
        "cdn.jsdelivr.net",
        "ajax.googleapis.com",
        "stackpath.bootstrapcdn.com",
        "maxcdn.bootstrapcdn.com",
        "code.jquery.com",
        "newrelic.com",
        "nr-data.net",
        "sentry.io",
        "browser.sentry-cdn.com",
        "fullstory.com",
        "mixpanel.com",
        "heapanalytics.com",
        "clarity.ms",
        "doubleclick.net",
        "quantserve.com",
        "scorecardresearch.com",
        "optimizely.com",
        "crazyegg.com",
        "mouseflow.com",
        "tawk.to",
        "intercom.io",
        "intercomcdn.com",
        "crisp.chat",
        "drift.com",
        "zendesk.com",
    ];

    // All patterns are lowercase, so normalize the URL once up front.
    let haystack = url.to_lowercase();
    for needle in SKIP_PATTERNS {
        if haystack.contains(*needle) {
            return true;
        }
    }
    false
}
170
171fn resolve_script_url(src: &str, base_url: &str) -> String {
176 if src.starts_with("http://") || src.starts_with("https://") {
178 return src.to_string();
179 }
180
181 if src.starts_with("//") {
183 return format!("https:{src}");
184 }
185
186 if let Ok(base) = url::Url::parse(base_url) {
188 if let Ok(resolved) = base.join(src) {
189 return resolved.to_string();
190 }
191 }
192
193 let base_trimmed = base_url.trim_end_matches('/');
195 if src.starts_with('/') {
196 if let Ok(parsed) = url::Url::parse(base_trimmed) {
197 return format!(
198 "{}://{}{}",
199 parsed.scheme(),
200 parsed.host_str().unwrap_or(""),
201 src
202 );
203 }
204 }
205
206 format!("{base_trimmed}/{src}")
207}
208
#[cfg(test)]
mod tests {
    use super::*;

    /// Page origin shared by the extraction tests.
    const SITE: &str = "https://example.com";

    #[test]
    fn test_extract_script_urls_basic() {
        // Two root-relative scripts must both be resolved against the site.
        let html = r#"<html><head><script src="/js/main.js"></script><script src="/js/app.js"></script></head></html>"#;
        let found = extract_script_urls(html, SITE);
        assert_eq!(found.len(), 2);
        for expected in [
            "https://example.com/js/main.js",
            "https://example.com/js/app.js",
        ] {
            assert!(found.iter().any(|url| url == expected));
        }
    }

    #[test]
    fn test_extract_script_urls_filters_cdn() {
        // Only the first-party script survives; CDN and analytics are dropped.
        let html = r#"<html><head>
            <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
            <script src="/js/app.js"></script>
            <script src="https://www.google-analytics.com/analytics.js"></script>
        </head></html>"#;
        let found = extract_script_urls(html, SITE);
        assert_eq!(found, ["https://example.com/js/app.js"]);
    }

    #[test]
    fn test_is_same_origin() {
        let page = "https://example.com/page";
        let cases = [
            ("https://example.com/js/app.js", true),
            ("https://cdn.example.com/js/app.js", false),
            ("https://other.com/js/app.js", false),
        ];
        for (script, expected) in cases {
            assert_eq!(is_same_origin(script, page), expected);
        }
    }

    #[test]
    fn test_is_analytics_or_cdn() {
        let skipped = [
            "https://www.google-analytics.com/analytics.js",
            "https://cdnjs.cloudflare.com/ajax/libs/react/18.0.0/react.min.js",
            "https://www.googletagmanager.com/gtm.js?id=GTM-XXX",
        ];
        for url in skipped {
            assert!(is_analytics_or_cdn(url));
        }
        assert!(!is_analytics_or_cdn("https://example.com/js/app.js"));
    }

    #[test]
    fn test_extract_empty_html() {
        assert!(extract_script_urls("", SITE).is_empty());
    }

    #[test]
    fn test_extract_inline_scripts_ignored() {
        // The inline script has no `src` and must not produce a URL.
        let html = r#"<script>console.log("inline")</script><script src="/app.js"></script>"#;
        let found = extract_script_urls(html, SITE);
        assert_eq!(found, ["https://example.com/app.js"]);
    }
}