// web_analyzer/advanced_content_scanner.rs

1use regex::Regex;
2use reqwest::Client;
3use scraper::{Html, Selector};
4use serde::{Deserialize, Serialize};
5use std::collections::{HashSet, VecDeque};
6use std::time::Duration;
7
8use crate::payloads;
9
10// ── Result structs ──────────────────────────────────────────────────────────
11
/// A single potential hard-coded secret detected in fetched content.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SecretFinding {
    /// Name of the pattern that matched (e.g. "AWS Access Key").
    pub secret_type: String,
    /// Severity label copied from the matching pattern ("Low", "Medium" or "High").
    pub severity: String,
    /// The matched text with its middle replaced by `****` so the secret is not re-leaked.
    pub masked_value: String,
    /// URL of the page or script in which the match was found.
    pub source_url: String,
    /// 1-based line number of the start of the match within the scanned content.
    pub line: usize,
    /// Shannon entropy (bits per byte) of the matched text, rounded to two decimals.
    pub entropy: f64,
    /// Remediation advice copied from the matching pattern.
    pub recommendation: String,
}
22
/// A risky client-side construct or page-level weakness found during the scan.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JsVulnerability {
    /// Category of the finding (e.g. "Weak CSP", "Missing CSRF Protection").
    pub vuln_type: String,
    /// Severity label for the finding.
    pub severity: String,
    /// URL of the page or script the finding relates to.
    pub source_url: String,
    /// Offending snippet when one is available; empty for findings about something missing.
    pub matched_code: String,
    /// Human-readable explanation of the risk.
    pub description: String,
    /// Suggested remediation.
    pub recommendation: String,
}
32
/// A potential or confirmed Server-Side Request Forgery entry point.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SsrfFinding {
    /// Kind of finding (e.g. "Potential SSRF in Form", "Confirmed SSRF in API Endpoint").
    pub finding_type: String,
    /// Severity label for the finding.
    pub severity: String,
    /// Page URL or API endpoint the finding relates to.
    pub source_url: String,
    /// Names of the parameters or form fields considered SSRF-prone.
    pub vulnerable_params: Vec<String>,
    /// Human-readable explanation of the finding.
    pub description: String,
}
41
/// Aggregate counters describing one completed scan.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanSummary {
    /// Number of distinct URLs visited by the crawler.
    pub total_urls_crawled: usize,
    /// Number of distinct JavaScript file URLs collected for analysis.
    pub total_js_files: usize,
    /// Number of distinct API endpoints discovered.
    pub total_api_endpoints: usize,
    /// Number of secret findings after deduplication.
    pub secrets_count: usize,
    /// Number of JavaScript/page findings after deduplication.
    pub js_vulnerabilities_count: usize,
    /// Number of SSRF findings.
    pub ssrf_vulnerabilities_count: usize,
}
51
/// Complete output of `scan_content` for one target.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScannerResult {
    /// The target exactly as it was passed to the scanner.
    pub domain: String,
    /// Potential hard-coded secrets found in pages and scripts.
    pub secrets: Vec<SecretFinding>,
    /// Risky JavaScript constructs and page-level weaknesses.
    pub js_vulnerabilities: Vec<JsVulnerability>,
    /// Potential and confirmed SSRF entry points.
    pub ssrf_vulnerabilities: Vec<SsrfFinding>,
    /// API endpoints discovered in page and script bodies (unordered).
    pub api_endpoints_discovered: Vec<String>,
    /// Counters summarising the fields above.
    pub summary: ScanSummary,
}
61
62// ── Helpers ─────────────────────────────────────────────────────────────────
63
/// Computes the Shannon entropy of `data` in bits per byte.
///
/// The distribution is taken over the raw UTF-8 bytes of the string, so the
/// result lies in `0.0..=8.0`. An empty string has an entropy of `0.0`.
fn shannon_entropy(data: &str) -> f64 {
    let bytes = data.as_bytes();
    if bytes.is_empty() {
        return 0.0;
    }

    // Histogram of byte values.
    let mut histogram = [0u32; 256];
    for &byte in bytes {
        histogram[usize::from(byte)] += 1;
    }

    let total = bytes.len() as f64;
    histogram
        .iter()
        .copied()
        .filter(|&count| count != 0)
        .map(|count| {
            let probability = f64::from(count) / total;
            -(probability * probability.log2())
        })
        .sum()
}
81
/// Masks a secret so that it can be reported without leaking its value.
///
/// * More than 8 characters: first four and last four characters are kept
///   (`AKIA****MNOP`).
/// * 3 to 8 characters: only the last two characters are kept (`****bc`).
/// * 2 characters or fewer: everything is hidden (`****`).
///
/// Lengths are measured in characters rather than bytes so that input
/// containing multi-byte UTF-8 sequences can never be sliced in the middle of
/// a character (which would panic). ASCII input is masked exactly as before.
fn mask_secret(s: &str) -> String {
    let chars: Vec<char> = s.chars().collect();
    let len = chars.len();
    let tail = |count: usize| chars[len - count..].iter().collect::<String>();

    if len > 8 {
        let head: String = chars[..4].iter().collect();
        format!("{}****{}", head, tail(4))
    } else if len > 2 {
        format!("****{}", tail(2))
    } else {
        "****".into()
    }
}
93
/// Returns `true` when the text surrounding a match suggests a placeholder or
/// documentation value rather than a real credential.
///
/// The check is a case-insensitive substring search, so a marker such as
/// `test` also matches inside longer words (e.g. `latest`).
fn is_false_positive_context(context: &str) -> bool {
    const MARKERS: [&str; 12] = [
        "example",
        "sample",
        "placeholder",
        "dummy",
        "test",
        "demo",
        "your_",
        "my_",
        "template",
        "undefined",
        "localhost",
        "127.0.0.1",
    ];

    let lowered = context.to_lowercase();
    for marker in MARKERS.iter() {
        if lowered.contains(marker) {
            return true;
        }
    }
    false
}
112
/// Returns `true` when `url` looks like a third-party library, bundle or
/// static-asset script that should be excluded from JavaScript analysis.
///
/// The check is a case-insensitive substring search over the whole URL.
fn is_known_library(url: &str) -> bool {
    const LIBRARY_MARKERS: [&str; 19] = [
        "jquery",
        "bootstrap",
        "modernizr",
        "polyfill",
        "vendor",
        "bundle",
        "analytics",
        "tracking",
        "ga.js",
        "gtm.js",
        "react",
        "angular",
        "vue",
        "lodash",
        "moment",
        "cdn",
        "static",
        "dist",
        "chunk",
    ];

    let lowered = url.to_lowercase();
    for marker in LIBRARY_MARKERS.iter() {
        if lowered.contains(marker) {
            return true;
        }
    }
    false
}
138
139// ── Secret patterns ─────────────────────────────────────────────────────────
140
/// One secret-detection rule: a regular expression plus reporting metadata.
struct SecretPattern {
    /// Human-readable name reported as the finding's secret type.
    name: &'static str,
    /// Regular expression source, compiled once per scan.
    pattern: &'static str,
    /// Severity label reported with every match ("Low", "Medium" or "High").
    severity: &'static str,
    /// Remediation advice reported with every match.
    recommendation: &'static str,
}

/// Secret-detection rules applied to every fetched page and script.
///
/// Rules are independent: a piece of text may be reported by several of them.
/// The Stripe rules are deliberately disjoint ("Stripe API Key" covers secret
/// and restricted keys, "Stripe Publishable Key" covers `pk_` keys) so that a
/// public publishable key is not also reported as a high-severity secret.
const SECRET_PATTERNS: &[SecretPattern] = &[
    SecretPattern {
        name: "AWS Access Key",
        pattern: r"\bAKIA[0-9A-Z]{16}\b",
        severity: "Medium",
        recommendation: "Rotate the key immediately. Use AWS IAM roles instead of hard-coded keys.",
    },
    SecretPattern {
        name: "AWS Secret Key",
        pattern: r"\b[0-9a-zA-Z/+]{40}\b",
        severity: "High",
        recommendation: "Rotate the key immediately. Store secrets in AWS Secrets Manager.",
    },
    SecretPattern {
        name: "Google API Key",
        pattern: r"\bAIza[0-9A-Za-z\-_]{35}\b",
        severity: "Medium",
        recommendation: "Rotate the key and implement API key restrictions.",
    },
    SecretPattern {
        name: "Google OAuth",
        pattern: r"[0-9]+-[0-9A-Za-z_]{32}\.apps\.googleusercontent\.com",
        severity: "Medium",
        recommendation: "Review and potentially regenerate the OAuth credentials.",
    },
    SecretPattern {
        name: "Stripe API Key",
        // Secret (`sk_`) and restricted (`rk_`) keys only; publishable keys have
        // their own low-severity rule below. No upper length bound: current
        // Stripe keys are considerably longer than 34 characters.
        pattern: r"\b(?:sk|rk)_(live|test)_[0-9a-zA-Z]{24,}\b",
        severity: "High",
        recommendation: "Rotate the key immediately. Only use server-side code for Stripe API.",
    },
    SecretPattern {
        name: "GitHub Token",
        // Classic personal/user/server/refresh tokens (`ghp_`, `ghu_`, `ghs_`,
        // `ghr_`) and fine-grained personal access tokens (`github_pat_`).
        // OAuth tokens (`gho_`) are covered by the "GitHub OAuth" rule.
        pattern: r"\b(?:gh[pusr]_[0-9a-zA-Z]{36,255}|github_pat_[0-9a-zA-Z_]{82})\b",
        severity: "High",
        recommendation: "Revoke and regenerate the token. Use GitHub Actions secrets for CI/CD.",
    },
    SecretPattern {
        name: "GitHub OAuth",
        pattern: r"\bgho_[0-9a-zA-Z]{36,40}\b",
        severity: "High",
        recommendation: "Revoke and regenerate the OAuth token.",
    },
    SecretPattern {
        name: "Facebook Access Token",
        pattern: r"EAACEdEose0cBA[0-9A-Za-z]+",
        severity: "Medium",
        recommendation: "Revoke the token and regenerate. Store tokens securely.",
    },
    SecretPattern {
        name: "JWT Token",
        pattern: r"eyJ[a-zA-Z0-9_\-]*\.[a-zA-Z0-9_\-]*\.[a-zA-Z0-9_\-]*",
        severity: "Medium",
        recommendation: "If valid, rotate the token. Implement proper expiration.",
    },
    SecretPattern {
        name: "SSH Private Key",
        pattern: r"-----BEGIN\s+(?:RSA|DSA|EC|OPENSSH)\s+PRIVATE\s+KEY",
        severity: "High",
        recommendation: "Generate a new key pair. Never store private keys in code.",
    },
    SecretPattern {
        name: "Password in URL",
        pattern: r"[a-zA-Z]{3,10}://[^/\s:@]{3,20}:[^/\s:@]{3,20}@.{1,100}",
        severity: "High",
        recommendation: "Remove the password from the URL and use secure authentication.",
    },
    SecretPattern {
        name: "Firebase URL",
        pattern: r"https://[a-z0-9-]+\.firebaseio\.com",
        severity: "Low",
        recommendation: "Review Firebase security rules and regenerate any associated secrets.",
    },
    SecretPattern {
        name: "MongoDB Connection String",
        pattern: r"mongodb(?:\+srv)?://[^/\s]+:[^/\s]+@[^/\s]+",
        severity: "High",
        recommendation: "Rotate the password and use environment variables instead.",
    },
    SecretPattern {
        name: "Slack Token",
        pattern: r"xox[baprs]-[0-9a-zA-Z\-]{10,48}",
        severity: "Medium",
        recommendation: "Revoke and regenerate the token.",
    },
    SecretPattern {
        name: "Slack Webhook",
        pattern: r"https://hooks\.slack\.com/services/T[a-zA-Z0-9_]+/B[a-zA-Z0-9_]+/[a-zA-Z0-9_]+",
        severity: "Medium",
        recommendation: "Regenerate the webhook URL and store it securely.",
    },
    SecretPattern {
        name: "API Key",
        pattern: r#"(?i)\b(?:api[_\-]?key|apikey)\b\s*[=:]\s*["'`]([a-zA-Z0-9_\-\.]{16,64})["'`]"#,
        severity: "Medium",
        recommendation: "Rotate the key. Store it in environment variables or a secrets manager.",
    },
    SecretPattern {
        name: "Secret Key",
        pattern: r#"(?i)\b(?:secret[_\-]?key|secretkey)\b\s*[=:]\s*["'`]([a-zA-Z0-9_\-\.]{16,64})["'`]"#,
        severity: "Medium",
        recommendation: "Rotate the key and ensure it's stored in a secure vault.",
    },
    SecretPattern {
        name: "Auth Token",
        pattern: r#"(?i)\b(?:auth[_\-]?token|authtoken)\b\s*[=:]\s*["'`]([a-zA-Z0-9_\-\.]{16,64})["'`]"#,
        severity: "Medium",
        recommendation: "Revoke the token and issue a new one.",
    },
    SecretPattern {
        name: "Access Token",
        pattern: r#"(?i)\b(?:access[_\-]?token|accesstoken)\b\s*[=:]\s*["'`]([a-zA-Z0-9_\-\.]{16,64})["'`]"#,
        severity: "Medium",
        recommendation: "Revoke and regenerate the token.",
    },
    SecretPattern {
        name: "Encryption Key",
        pattern: r#"(?i)(?:encryption|aes|des|blowfish)[\s_-]?key[\s=:]+["'`][A-Za-z0-9+/]{16,}={0,2}["'`]"#,
        severity: "High",
        recommendation: "Rotate the key and store it securely using a key management system.",
    },
    SecretPattern {
        name: "Stripe Publishable Key",
        pattern: r"\bpk_(live|test)_[0-9a-zA-Z]{24,}\b",
        severity: "Low",
        recommendation:
            "Publishable keys are public, but verify no secret keys are exposed nearby.",
    },
    SecretPattern {
        name: "Twitter Bearer",
        pattern: r"AAAAAAAAAAAAAAAAAAA[A-Za-z0-9%]+",
        severity: "Medium",
        recommendation: "Rotate the bearer token. Use environment variables for storage.",
    },
    SecretPattern {
        name: "Password",
        pattern: r#"(?i)(?:password|passwd|pwd)[\s=:]+["'`]([^"'`\s]{8,64})["'`]"#,
        severity: "High",
        recommendation:
            "Remove hardcoded passwords. Use a secrets manager or environment variables.",
    },
    SecretPattern {
        name: "Database Credentials",
        pattern: r#"(?i)(?:db_pass|db_password|database_password)[\s=:]+["'`]([^"'`\s]+)["'`]"#,
        severity: "High",
        recommendation: "Change DB credentials immediately. Store in env vars or a vault.",
    },
];
296
297// ── JS vulnerability patterns ───────────────────────────────────────────────
298
/// One category of risky JavaScript construct, detected by any of several regexes.
struct JsVulnCategory {
    /// Category name (e.g. "DOM XSS").
    name: &'static str,
    /// Severity label for the category ("Low", "Medium" or "High").
    severity: &'static str,
    /// Regular expression sources belonging to this category.
    patterns: &'static [&'static str],
    /// Human-readable explanation of the risk.
    description: &'static str,
    /// Suggested remediation.
    recommendation: &'static str,
}
306
/// JavaScript vulnerability categories whose patterns are compiled once per scan
/// and handed to `scan_js_security` for inline and external scripts.
///
/// These are textual heuristics, not data-flow analysis: a match indicates code
/// worth reviewing rather than a proven vulnerability.
const JS_VULN_CATEGORIES: &[JsVulnCategory] = &[
    // Sinks fed (on the same line) by URL-derived sources.
    JsVulnCategory {
        name: "DOM XSS",
        severity: "High",
        patterns: &[
            r"document\.write\s*\(\s*.*?(?:location|URL|documentURI|referrer|href|search|hash)",
            r"\.innerHTML\s*=\s*.*?(?:location|URL|documentURI|referrer|href|search|hash)",
            r"\.outerHTML\s*=\s*.*?(?:location|URL|documentURI|referrer|href|search|hash)",
            r"eval\s*\(\s*.*?(?:location|URL|documentURI|referrer|href|search|hash)",
        ],
        description:
            "DOM-based XSS: user-controllable data passed to a dynamic code execution sink.",
        recommendation:
            "Sanitize all user inputs before DOM operations. Use DOMPurify or a strict CSP.",
    },
    // Navigation assignments whose right-hand side mentions an input-like identifier.
    JsVulnCategory {
        name: "Open Redirect",
        severity: "High",
        patterns: &[
            r"(?:window\.)?location(?:\.href)?\s*=\s*.*?(?:user|input|param|arg)",
            r"(?:window\.)?location\.replace\s*\(\s*.*?(?:user|input|param|arg)",
            r"(?:window\.)?location\.assign\s*\(\s*.*?(?:user|input|param|arg)",
        ],
        description: "User input determines redirect destination, enabling phishing attacks.",
        recommendation: "Implement a whitelist of allowed redirect URLs.",
    },
    JsVulnCategory {
        name: "CORS Misconfiguration",
        severity: "Medium",
        patterns: &[
            r"Access-Control-Allow-Origin\s*:\s*\*",
            r"Access-Control-Allow-Origin\s*:\s*null",
            r"Access-Control-Allow-Credentials\s*:\s*true",
        ],
        description: "CORS misconfiguration can allow unauthorized cross-origin access.",
        recommendation: "Be specific with CORS policies. Avoid wildcard origins.",
    },
    // Matches any script-side cookie write; the flags themselves are not inspected.
    JsVulnCategory {
        name: "Insecure Cookie",
        severity: "Medium",
        patterns: &[r"document\.cookie\s*="],
        description: "Cookies set without secure flags can be vulnerable to theft.",
        recommendation: "Set 'Secure' and 'HttpOnly' flags on sensitive cookies.",
    },
    // postMessage with a literal "*" target origin.
    JsVulnCategory {
        name: "Insecure Data Transmission",
        severity: "Medium",
        patterns: &[r#"\.postMessage\([^,]+,\s*["']\*["']\)"#],
        description: "Data transmitted insecurely via postMessage with wildcard origin.",
        recommendation: "Use specific origin URLs with postMessage() and validate senders.",
    },
    JsVulnCategory {
        name: "Prototype Pollution",
        severity: "Medium",
        patterns: &[r"__proto__\s*[=\[]", r"prototype\["],
        description: "Prototype pollution can lead to property injection attacks.",
        recommendation:
            "Avoid user-controlled data with Object.assign()/prototype. Use Object.create(null).",
    },
    JsVulnCategory {
        name: "Command Injection",
        severity: "High",
        patterns: &[
            r"exec\s*\(\s*.*?(?:user|input|param|arg)",
            r"spawn\s*\(\s*.*?(?:user|input|param|arg)",
        ],
        description: "Command injection allows attackers to execute arbitrary commands.",
        recommendation: "Avoid executing commands with user input. Implement strict validation.",
    },
    JsVulnCategory {
        name: "Insecure Data Storage",
        severity: "Low",
        patterns: &[
            r"localStorage\.setItem\(\s*[^,]+,\s*.*?(?:password|token|key|secret|credentials)",
            r"sessionStorage\.setItem\(\s*[^,]+,\s*.*?(?:password|token|key|secret|credentials)",
        ],
        description: "Sensitive data stored insecurely in client-side storage.",
        recommendation: "Don't store sensitive info in localStorage/sessionStorage.",
    },
    // setAttribute("on…", …) — dynamically attached inline event handlers.
    JsVulnCategory {
        name: "Event Handler XSS",
        severity: "Medium",
        patterns: &[r#"\.setAttribute\(["']on\w+["']\s*,"#],
        description: "Event handlers assigned dynamically can lead to XSS.",
        recommendation: "Validate and sanitize data before assigning to event handlers.",
    },
    JsVulnCategory {
        name: "CSP Bypass",
        severity: "Medium",
        patterns: &[r#"document\.createElement\(["']script["']\)"#],
        description: "Dynamic script creation may bypass Content Security Policy.",
        recommendation: "Implement a strict CSP and avoid dynamic script creation with user input.",
    },
    // Only plaintext ws:// literals are matched.
    JsVulnCategory {
        name: "WebSocket Insecurity",
        severity: "High",
        patterns: &[r#"new\s+WebSocket\(\s*["']ws://"#],
        description: "Insecure WebSocket connections (ws://) can be intercepted.",
        recommendation: "Use secure WebSocket connections (wss://) and validate data.",
    },
    // NOTE(review): the Math.random() pattern matches every use, not only
    // security-sensitive ones — expect noise at "High" severity.
    JsVulnCategory {
        name: "Insecure Crypto",
        severity: "High",
        patterns: &[
            r#"(?:createHash|crypto\.subtle).*?["'](?:md5|sha1)["']"#,
            r"Math\.random\(\)",
        ],
        description: "Weak cryptographic methods (MD5/SHA1/Math.random) in use.",
        recommendation:
            "Use modern crypto algorithms. Use crypto.getRandomValues() instead of Math.random().",
    },
    // Matches any "../" or "..\" sequence, including ordinary relative paths.
    JsVulnCategory {
        name: "Path Traversal",
        severity: "Medium",
        patterns: &[r"\.\./|\.\.\\"],
        description: "Path traversal allows access to files outside the intended directory.",
        recommendation: "Validate and sanitize file paths. Use allowlists.",
    },
];
426
427// ── SSRF parameters ─────────────────────────────────────────────────────────
428
/// Lower-case parameter names that commonly carry a URL, host or path and are
/// therefore treated as potential SSRF entry points.
///
/// `scan_content` matches these as substrings of lower-cased form field names,
/// so short entries such as "get" or "doc" also match longer names that
/// merely contain them.
const SSRF_PARAMS: &[&str] = &[
    "url",
    "uri",
    "link",
    "src",
    "href",
    "target",
    "destination",
    "redirect",
    "redirect_to",
    "redirecturl",
    "redirect_uri",
    "return",
    "return_to",
    "returnurl",
    "return_path",
    "path",
    "load",
    "file",
    "filename",
    "folder",
    "folder_url",
    "image",
    "img",
    "image_url",
    "image_path",
    "avatar",
    "document",
    "doc",
    "document_url",
    "fetch",
    "get",
    "view",
    "content",
    "domain",
    "callback",
    "reference",
    "site",
    "page",
    "data",
    "data_url",
    "resource",
    "template",
    "api_endpoint",
    "endpoint",
    "proxy",
    "feed",
    "host",
    "webhook",
    "address",
    "media",
    "video",
    "audio",
    "download",
    "upload",
    "preview",
    "source",
    "location",
    "goto",
    "callback_url",
    "forward",
    "next",
    "origin",
    "continue",
];
494
495// ── Main scanner ────────────────────────────────────────────────────────────
496
497pub async fn scan_content(
498    domain: &str,
499) -> Result<ScannerResult, Box<dyn std::error::Error + Send + Sync>> {
500    let base_url = if domain.starts_with("http") {
501        domain.to_string()
502    } else {
503        format!("https://{}", domain)
504    };
505
506    let client = Client::builder()
507        .timeout(Duration::from_secs(15))
508        .danger_accept_invalid_certs(true)
509        .build()?;
510
511    let mut secrets = Vec::new();
512    let mut js_vulns = Vec::new();
513    let mut ssrf_findings = Vec::new();
514    let mut visited = HashSet::new();
515    let mut js_file_urls = HashSet::new();
516    let mut api_endpoints: HashSet<String> = HashSet::new();
517    let mut queue: VecDeque<(String, u8)> = VecDeque::new();
518    queue.push_back((base_url.clone(), 0));
519
520    let max_depth: u8 = 2;
521    let max_pages: usize = 50;
522
523    // Compile regex patterns once
524    let secret_regexes: Vec<(&SecretPattern, Regex)> = SECRET_PATTERNS
525        .iter()
526        .filter_map(|sp| Regex::new(sp.pattern).ok().map(|r| (sp, r)))
527        .collect();
528
529    let js_vuln_regexes: Vec<(&JsVulnCategory, Vec<Regex>)> = JS_VULN_CATEGORIES
530        .iter()
531        .map(|cat| {
532            let rxs: Vec<Regex> = cat
533                .patterns
534                .iter()
535                .filter_map(|p| Regex::new(p).ok())
536                .collect();
537            (cat, rxs)
538        })
539        .collect();
540
541    // API endpoint extraction patterns
542    let api_regexes: Vec<Regex> = [
543        r"/api/v\d+/",
544        r"/api/",
545        r"/graphql",
546        r"/rest/",
547        r"/v\d+/\w+",
548        r"/service/",
549        r"/json/",
550        r"/rpc/",
551        r"/gateway/",
552        r"/ajax/",
553        r"/data/",
554        r"/query/",
555        r"/feeds/",
556        r"/svc/",
557        r"/soap/",
558    ]
559    .iter()
560    .filter_map(|p| Regex::new(p).ok())
561    .collect();
562
563    // ── Parse robots.txt ─────────────────────────────────────────────
564    let mut disallowed: Vec<String> = Vec::new();
565    let robots_url = format!("{}/robots.txt", base_url.trim_end_matches('/'));
566    if let Ok(resp) = client.get(&robots_url).send().await {
567        if resp.status().is_success() {
568            if let Ok(body) = resp.text().await {
569                let mut agent_match = false;
570                for line in body.lines() {
571                    let line = line.trim().to_lowercase();
572                    if let Some(agent) = line.strip_prefix("user-agent:") {
573                        let agent = agent.trim();
574                        agent_match = agent == "*";
575                    }
576                    if agent_match {
577                        if let Some(path) = line.strip_prefix("disallow:") {
578                            let path = path.trim();
579                            if !path.is_empty() {
580                                disallowed.push(path.to_string());
581                            }
582                        }
583                    }
584                }
585            }
586        }
587    }
588
589    // ── Process sitemap.xml for seed URLs ─────────────────────────────
590    let sitemap_url = format!("{}/sitemap.xml", base_url.trim_end_matches('/'));
591    if let Ok(resp) = client.get(&sitemap_url).send().await {
592        if resp.status().is_success() {
593            if let Ok(body) = resp.text().await {
594                let loc_rx = Regex::new(r"<loc>([^<]+)</loc>").unwrap();
595                for cap in loc_rx.captures_iter(&body) {
596                    if let Some(url) = cap.get(1) {
597                        let u = url.as_str().to_string();
598                        if is_same_domain(&base_url, &u) && !visited.contains(&u) {
599                            queue.push_back((u, 1));
600                        }
601                    }
602                }
603            }
604        }
605    }
606
607    // ── BFS Crawl ───────────────────────────────────────────────────────
608    while let Some((url, depth)) = queue.pop_front() {
609        if visited.len() >= max_pages || depth > max_depth || visited.contains(&url) {
610            continue;
611        }
612
613        // Respect robots.txt disallow rules
614        let url_path = url.trim_start_matches(&base_url);
615        if disallowed.iter().any(|d| url_path.starts_with(d.as_str())) {
616            continue;
617        }
618
619        visited.insert(url.clone());
620
621        // Check URL parameters for SSRF-vulnerable names
622        check_url_params_ssrf(&url, &mut ssrf_findings);
623
624        let resp = match client.get(&url).send().await {
625            Ok(r) => r,
626            Err(_) => continue,
627        };
628        if !resp.status().is_success() {
629            continue;
630        }
631
632        let content_type = resp
633            .headers()
634            .get("content-type")
635            .and_then(|v| v.to_str().ok())
636            .unwrap_or("")
637            .to_lowercase();
638
639        let body = match resp.text().await {
640            Ok(t) => t,
641            Err(_) => continue,
642        };
643
644        // Scan this page's content for secrets
645        scan_for_secrets(&body, &url, &secret_regexes, &mut secrets);
646
647        // Extract API endpoints from the body
648        extract_api_endpoints(&body, &base_url, &api_regexes, &mut api_endpoints);
649
650        if content_type.contains("text/html") {
651            let doc = Html::parse_document(&body);
652
653            // ── Extract & queue links ───────────────────────────────────
654            if depth < max_depth {
655                let a_sel = Selector::parse("a[href]").unwrap();
656                for el in doc.select(&a_sel) {
657                    if let Some(href) = el.value().attr("href") {
658                        let abs = resolve_url(&base_url, href);
659                        if let Some(abs_url) = abs {
660                            if is_same_domain(&base_url, &abs_url) && !visited.contains(&abs_url) {
661                                queue.push_back((abs_url, depth + 1));
662                            }
663                        }
664                    }
665                }
666            }
667
668            // ── Extract inline JS & external JS URLs ────────────────────
669            let script_sel = Selector::parse("script").unwrap();
670            for el in doc.select(&script_sel) {
671                // Inline JS
672                let inline = el.text().collect::<String>();
673                if inline.len() > 10 {
674                    scan_js_security(&inline, &url, &js_vuln_regexes, &mut js_vulns);
675                    scan_for_secrets(&inline, &url, &secret_regexes, &mut secrets);
676                }
677                // External JS src
678                if let Some(src) = el.value().attr("src") {
679                    if let Some(js_url) = resolve_url(&base_url, src) {
680                        if !is_known_library(&js_url) {
681                            js_file_urls.insert(js_url);
682                        }
683                    }
684                }
685            }
686
687            // ── Check forms for SSRF-vulnerable params ──────────────────
688            let form_sel = Selector::parse("form").unwrap();
689            let input_sel = Selector::parse("input[name], textarea[name]").unwrap();
690            for form in doc.select(&form_sel) {
691                let mut vuln_params = Vec::new();
692                for input in form.select(&input_sel) {
693                    if let Some(name) = input.value().attr("name") {
694                        let name_lower = name.to_lowercase();
695                        if SSRF_PARAMS.iter().any(|p| name_lower.contains(p)) {
696                            vuln_params.push(name.to_string());
697                        }
698                    }
699                }
700                if !vuln_params.is_empty() {
701                    ssrf_findings.push(SsrfFinding {
702                        finding_type: "Potential SSRF in Form".into(),
703                        severity: "Medium".into(),
704                        source_url: url.clone(),
705                        vulnerable_params: vuln_params,
706                        description: "Form contains fields that could be used for Server-Side Request Forgery.".into(),
707                    });
708                }
709            }
710
711            // ── Check meta CSP for weak policies ────────────────────────
712            let meta_sel =
713                Selector::parse(r#"meta[http-equiv="Content-Security-Policy"]"#).unwrap();
714            for meta in doc.select(&meta_sel) {
715                if let Some(content) = meta.value().attr("content") {
716                    let c_lower = content.to_lowercase();
717                    if c_lower.contains("unsafe-inline") || c_lower.contains("unsafe-eval") {
718                        js_vulns.push(JsVulnerability {
719                            vuln_type: "Weak CSP".into(),
720                            severity: "Medium".into(),
721                            source_url: url.clone(),
722                            matched_code: content.to_string(),
723                            description: "CSP allows unsafe-inline or unsafe-eval.".into(),
724                            recommendation: "Remove unsafe-inline and unsafe-eval from your CSP."
725                                .into(),
726                        });
727                    }
728                }
729            }
730
731            // ── Check forms for missing CSRF tokens ─────────────────────
732            let csrf_sel = Selector::parse(
733                r#"input[name*="csrf" i], input[name*="xsrf" i], input[name*="token" i]"#,
734            )
735            .unwrap();
736            for form in doc.select(&form_sel) {
737                if form.select(&csrf_sel).next().is_none() {
738                    js_vulns.push(JsVulnerability {
739                        vuln_type: "Missing CSRF Protection".into(),
740                        severity: "Medium".into(),
741                        source_url: url.clone(),
742                        matched_code: String::new(),
743                        description: "Form found without CSRF token.".into(),
744                        recommendation: "Add CSRF tokens to all state-changing forms.".into(),
745                    });
746                }
747            }
748        } else if (content_type.contains("javascript") || url.ends_with(".js"))
749            && !is_known_library(&url) {
750                js_file_urls.insert(url.clone());
751                scan_js_security(&body, &url, &js_vuln_regexes, &mut js_vulns);
752                scan_for_secrets(&body, &url, &secret_regexes, &mut secrets);
753            }
754    }
755
756    // ── Fetch & analyze external JS files ────────────────────────────────
757    for js_url in &js_file_urls {
758        if visited.contains(js_url) {
759            continue;
760        }
761        if let Ok(resp) = client.get(js_url).send().await {
762            if resp.status().is_success() {
763                if let Ok(js_body) = resp.text().await {
764                    if js_body.len() > 10 {
765                        scan_js_security(&js_body, js_url, &js_vuln_regexes, &mut js_vulns);
766                        scan_for_secrets(&js_body, js_url, &secret_regexes, &mut secrets);
767                        extract_api_endpoints(
768                            &js_body,
769                            &base_url,
770                            &api_regexes,
771                            &mut api_endpoints,
772                        );
773                    }
774                }
775            }
776        }
777    }
778
779    // ── Probe discovered API endpoints for SSRF ─────────────────────────
780    let ssrf_probes = payloads::lines(payloads::SSRF);
781    for endpoint in api_endpoints.iter().take(20) {
782        // limit to 20 to avoid flooding
783        for probe in ssrf_probes.iter().take(5) {
784            // top 5 probes per endpoint
785            let test_url = format!("{}?url={}", endpoint, probe);
786            if let Ok(resp) = client.get(&test_url).header("Accept", "*/*").send().await {
787                // Check if response indicates SSRF (redirect to our probe)
788                if resp.status().is_redirection() {
789                    if let Some(loc) = resp.headers().get("location") {
790                        if let Ok(loc_str) = loc.to_str() {
791                            if loc_str.contains(probe) {
792                                ssrf_findings.push(SsrfFinding {
793                                    finding_type: "Confirmed SSRF in API Endpoint".into(),
794                                    severity: "High".into(),
795                                    source_url: endpoint.clone(),
796                                    vulnerable_params: vec!["url".into()],
797                                    description: format!(
798                                        "API endpoint redirects to SSRF probe: {}",
799                                        loc_str
800                                    ),
801                                });
802                            }
803                        }
804                    }
805                }
806            }
807        }
808    }
809
810    // ── Deduplicate ─────────────────────────────────────────────────────
811    dedup_secrets(&mut secrets);
812    dedup_js_vulns(&mut js_vulns);
813
814    let api_list: Vec<String> = api_endpoints.into_iter().collect();
815
816    let summary = ScanSummary {
817        total_urls_crawled: visited.len(),
818        total_js_files: js_file_urls.len(),
819        total_api_endpoints: api_list.len(),
820        secrets_count: secrets.len(),
821        js_vulnerabilities_count: js_vulns.len(),
822        ssrf_vulnerabilities_count: ssrf_findings.len(),
823    };
824
825    Ok(ScannerResult {
826        domain: domain.to_string(),
827        secrets,
828        js_vulnerabilities: js_vulns,
829        ssrf_vulnerabilities: ssrf_findings,
830        api_endpoints_discovered: api_list,
831        summary,
832    })
833}
834
835// ── Scanner sub-functions ───────────────────────────────────────────────────
836
837fn scan_for_secrets(
838    content: &str,
839    source_url: &str,
840    patterns: &[(&SecretPattern, Regex)],
841    results: &mut Vec<SecretFinding>,
842) {
843    for (sp, rx) in patterns {
844        for m in rx.find_iter(content) {
845            let value = m.as_str();
846            let line = content[..m.start()].matches('\n').count() + 1;
847            let entropy = shannon_entropy(value);
848
849            // Skip low-entropy matches for key-type secrets
850            if matches!(
851                sp.name,
852                "AWS Secret Key" | "Google API Key" | "API Key" | "Secret Key"
853            ) && entropy < 3.5
854            {
855                continue;
856            }
857
858            // Context-based false positive check
859            let ctx_start = m.start().saturating_sub(80);
860            let ctx_end = (m.end() + 80).min(content.len());
861            let context = &content[ctx_start..ctx_end];
862            if is_false_positive_context(context) {
863                continue;
864            }
865
866            results.push(SecretFinding {
867                secret_type: sp.name.to_string(),
868                severity: sp.severity.to_string(),
869                masked_value: mask_secret(value),
870                source_url: source_url.to_string(),
871                line,
872                entropy: (entropy * 100.0).round() / 100.0,
873                recommendation: sp.recommendation.to_string(),
874            });
875        }
876    }
877}
878
879fn scan_js_security(
880    content: &str,
881    source_url: &str,
882    categories: &[(&JsVulnCategory, Vec<Regex>)],
883    results: &mut Vec<JsVulnerability>,
884) {
885    // Skip analysis on very large minified files for non-critical checks
886    let is_minified = content.len() > 5000 && content.matches('\n').count() < 50;
887
888    for (cat, rxs) in categories {
889        // For minified files, only check high-severity issues
890        if is_minified && cat.severity != "High" {
891            continue;
892        }
893
894        for rx in rxs {
895            for m in rx.find_iter(content) {
896                let matched = m.as_str();
897                // Limit matched_code length
898                let display = if matched.len() > 200 {
899                    &matched[..200]
900                } else {
901                    matched
902                };
903
904                results.push(JsVulnerability {
905                    vuln_type: cat.name.to_string(),
906                    severity: cat.severity.to_string(),
907                    source_url: source_url.to_string(),
908                    matched_code: display.to_string(),
909                    description: cat.description.to_string(),
910                    recommendation: cat.recommendation.to_string(),
911                });
912            }
913        }
914    }
915}
916
917fn dedup_secrets(v: &mut Vec<SecretFinding>) {
918    let mut seen = HashSet::new();
919    v.retain(|s| {
920        seen.insert(format!(
921            "{}:{}:{}",
922            s.secret_type, s.source_url, s.masked_value
923        ))
924    });
925}
926
927fn dedup_js_vulns(v: &mut Vec<JsVulnerability>) {
928    let mut seen = HashSet::new();
929    v.retain(|j| {
930        seen.insert(format!(
931            "{}:{}:{}",
932            j.vuln_type, j.source_url, j.matched_code
933        ))
934    });
935}
936
937fn check_url_params_ssrf(url: &str, findings: &mut Vec<SsrfFinding>) {
938    if let Some(query_start) = url.find('?') {
939        let query = &url[query_start + 1..];
940        let mut vuln_params = Vec::new();
941        for pair in query.split('&') {
942            if let Some(eq) = pair.find('=') {
943                let param = pair[..eq].to_lowercase();
944                if SSRF_PARAMS.iter().any(|p| param.contains(p)) {
945                    vuln_params.push(pair[..eq].to_string());
946                }
947            }
948        }
949        if !vuln_params.is_empty() {
950            findings.push(SsrfFinding {
951                finding_type: "Potential SSRF in URL Parameter".into(),
952                severity: "Medium".into(),
953                source_url: url.to_string(),
954                vulnerable_params: vuln_params,
955                description: "URL contains parameters that could be used for SSRF.".into(),
956            });
957        }
958    }
959}
960
961fn extract_api_endpoints(
962    content: &str,
963    base_url: &str,
964    patterns: &[Regex],
965    endpoints: &mut HashSet<String>,
966) {
967    for rx in patterns {
968        for m in rx.find_iter(content) {
969            let path = m.as_str();
970            let full_url = format!("{}{}", base_url.trim_end_matches('/'), path);
971            endpoints.insert(full_url);
972        }
973    }
974}
975
/// Resolves `href`, as found on the page at `base`, to an absolute URL string.
///
/// Returns `None` for links that are not crawlable: `javascript:`, `mailto:`
/// and `tel:` URIs and pure fragment links (`#...`).
///
/// Resolution rules, in order:
/// * `http://` / `https://` hrefs are returned unchanged.
/// * Protocol-relative hrefs (`//host/...`) take the scheme of `base` when
///   it is `http://`, and `https` otherwise.
/// * Root-relative hrefs (`/path`) are joined to the origin of `base`
///   (scheme + authority), not to its directory.
/// * Query-only hrefs (`?a=b`) replace the query of `base`.
/// * Anything else is joined to the directory of `base`'s path.
///
/// The query and fragment of `base` never take part in directory lookup.
/// `.` and `..` segments are left in the result as written.
fn resolve_url(base: &str, href: &str) -> Option<String> {
    if href.starts_with("javascript:")
        || href.starts_with('#')
        || href.starts_with("mailto:")
        || href.starts_with("tel:")
    {
        return None;
    }
    if href.starts_with("http://") || href.starts_with("https://") {
        return Some(href.to_string());
    }
    if href.starts_with("//") {
        let scheme = if base.starts_with("http://") {
            "http"
        } else {
            "https"
        };
        return Some(format!("{}:{}", scheme, href));
    }

    // A '/' inside the base's query or fragment must not be mistaken for a
    // path separator, so drop both before looking at the path.
    let base = base
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or(base);

    // Split the base into origin ("scheme://authority") and path. The path
    // starts at the first '/' after the "://" separator; searching from the
    // start would wrongly hit the slashes of "https://".
    let authority_start = base.find("://").map_or(0, |i| i + 3);
    let origin_end = base[authority_start..]
        .find('/')
        .map_or(base.len(), |i| authority_start + i);
    let (origin, path) = base.split_at(origin_end);

    if href.starts_with('/') {
        return Some(format!("{}{}", origin, href));
    }
    if href.starts_with('?') {
        let path = if path.is_empty() { "/" } else { path };
        return Some(format!("{}{}{}", origin, path, href));
    }

    // Relative URL: keep everything up to and including the last '/' of the
    // base path; a base with no path at all resolves against the root.
    let dir = match path.rfind('/') {
        Some(i) => &path[..=i],
        None => "/",
    };
    Some(format!("{}{}{}", origin, dir, href))
}
998
/// Reports whether `base` and `url` point at the same host.
///
/// The host is taken from each URL by removing a leading `https://` or
/// `http://`, cutting at the first `/`, `?` or `#`, dropping any
/// `userinfo@` prefix, and lower-casing the remainder. A port, if present,
/// stays part of the compared value, so `host:8080` differs from `host`.
/// Subdomains are not treated as equal to their parent domain.
fn is_same_domain(base: &str, url: &str) -> bool {
    fn host_of(u: &str) -> String {
        let rest = u
            .strip_prefix("https://")
            .or_else(|| u.strip_prefix("http://"))
            .unwrap_or(u);
        // The authority ends at the path, query or fragment, whichever comes
        // first; a URL like "https://host?x=1" has no '/' at all.
        let authority = rest
            .split(|c: char| matches!(c, '/' | '?' | '#'))
            .next()
            .unwrap_or("");
        // Everything up to the last '@' is userinfo, not host. Taking the part
        // after it also means "https://good.test@evil.test/" is correctly
        // attributed to evil.test.
        let host = authority.rsplit('@').next().unwrap_or(authority);
        host.to_lowercase()
    }

    host_of(base) == host_of(url)
}