web_analyzer/
advanced_content_scanner.rs

1use regex::Regex;
2use scraper::{Html, Selector};
3use serde::{Deserialize, Serialize};
4use std::collections::{HashSet, VecDeque};
5use std::time::Duration;
6
7use crate::payloads;
8
9// ── Result structs ──────────────────────────────────────────────────────────
10
/// One potential hard-coded secret reported by the scanner.
///
/// All fields are plain strings/numbers so the finding can be serialized as-is.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SecretFinding {
    /// Kind of secret — presumably the `name` of the matching `SecretPattern`; confirm in the code that builds findings.
    pub secret_type: String,
    /// Severity label as free-form text (the pattern table uses "Low", "Medium", "High").
    pub severity: String,
    /// Redacted form of the matched value — presumably produced by `mask_secret`; confirm.
    pub masked_value: String,
    /// URL of the page or script in which the secret was found.
    pub source_url: String,
    /// Line number of the match within the scanned content (base index not visible here — TODO confirm).
    pub line: usize,
    /// Entropy score of the matched value — presumably from `shannon_entropy`; confirm.
    pub entropy: f64,
    /// Suggested remediation text.
    pub recommendation: String,
}
21
/// One client-side security issue found in a page or JavaScript source.
///
/// Besides regex-based JavaScript findings, `scan_content` also uses this type
/// for page-level issues such as "Weak CSP" and "Missing CSRF Protection".
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JsVulnerability {
    /// Category label, e.g. "Weak CSP" or "Missing CSRF Protection".
    pub vuln_type: String,
    /// Severity label as free-form text (e.g. "Medium").
    pub severity: String,
    /// URL of the page or script the issue was found in.
    pub source_url: String,
    /// The offending snippet; may be empty when there is nothing to quote
    /// (the missing-CSRF finding stores an empty string).
    pub matched_code: String,
    /// Human-readable explanation of the issue.
    pub description: String,
    /// Suggested remediation text.
    pub recommendation: String,
}
31
/// One potential or confirmed Server-Side Request Forgery finding.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SsrfFinding {
    /// Finding label, e.g. "Potential SSRF in Form" or "Confirmed SSRF in API Endpoint".
    pub finding_type: String,
    /// Severity label as free-form text (e.g. "Medium", "High").
    pub severity: String,
    /// URL of the page containing the form, or the probed API endpoint.
    pub source_url: String,
    /// Names of the parameters / form fields considered SSRF-prone.
    pub vulnerable_params: Vec<String>,
    /// Human-readable explanation of the finding.
    pub description: String,
}
40
/// Aggregate counters for a completed scan, filled in at the end of `scan_content`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanSummary {
    /// Number of URLs visited by the crawl.
    pub total_urls_crawled: usize,
    /// Number of distinct JavaScript file URLs collected.
    pub total_js_files: usize,
    /// Number of distinct API endpoints discovered.
    pub total_api_endpoints: usize,
    /// Number of secret findings after deduplication.
    pub secrets_count: usize,
    /// Number of JavaScript/page vulnerability findings after deduplication.
    pub js_vulnerabilities_count: usize,
    /// Number of SSRF findings.
    pub ssrf_vulnerabilities_count: usize,
}
50
/// Complete output of `scan_content` for one target.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScannerResult {
    /// The `domain` argument exactly as it was passed to the scan.
    pub domain: String,
    /// Secret findings (deduplicated).
    pub secrets: Vec<SecretFinding>,
    /// JavaScript and page-level vulnerability findings (deduplicated).
    pub js_vulnerabilities: Vec<JsVulnerability>,
    /// SSRF findings from URL parameters, forms and endpoint probes.
    pub ssrf_vulnerabilities: Vec<SsrfFinding>,
    /// API endpoints discovered in page and script bodies (collected from a set, so unordered).
    pub api_endpoints_discovered: Vec<String>,
    /// Aggregate counters for the lists above.
    pub summary: ScanSummary,
}
60
61// ── Helpers ─────────────────────────────────────────────────────────────────
62
/// Returns the Shannon entropy of `data`, in bits per byte.
///
/// The distribution is taken over the raw UTF-8 bytes of the string, not over
/// its characters. An empty string has an entropy of `0.0`.
fn shannon_entropy(data: &str) -> f64 {
    let bytes = data.as_bytes();
    if bytes.is_empty() {
        return 0.0;
    }

    // Histogram of byte values.
    let mut histogram = [0u32; 256];
    bytes
        .iter()
        .for_each(|&byte| histogram[usize::from(byte)] += 1);

    let total = bytes.len() as f64;
    histogram
        .iter()
        .copied()
        .filter(|&count| count != 0)
        .map(|count| {
            let p = f64::from(count) / total;
            -p * p.log2()
        })
        .sum()
}
80
/// Redacts a secret so that it can be reported without leaking its value.
///
/// * more than 8 characters: the first 4 and last 4 are kept, e.g. `abcd****ijkl`;
/// * 3 to 8 characters: only the last 2 are kept, e.g. `****bc`;
/// * 2 characters or fewer: fully masked as `****`.
///
/// Lengths are counted in characters rather than bytes. Slicing a `&str` at a
/// raw byte offset panics when the offset falls inside a multi-byte UTF-8
/// character, which scanned page content can easily contain. For ASCII input
/// the output is unchanged.
fn mask_secret(s: &str) -> String {
    let chars: Vec<char> = s.chars().collect();
    let count = chars.len();

    if count <= 8 {
        if count > 2 {
            let tail: String = chars[count - 2..].iter().collect();
            format!("****{}", tail)
        } else {
            "****".into()
        }
    } else {
        let head: String = chars[..4].iter().collect();
        let tail: String = chars[count - 4..].iter().collect();
        format!("{}****{}", head, tail)
    }
}
92
/// Reports whether `context` looks like placeholder or example material
/// rather than a real secret.
///
/// The check is a case-insensitive substring search, so a marker such as
/// `test` also matches inside longer words (for example `latest`).
fn is_false_positive_context(context: &str) -> bool {
    const PLACEHOLDER_MARKERS: &[&str] = &[
        "example",
        "sample",
        "placeholder",
        "dummy",
        "test",
        "demo",
        "your_",
        "my_",
        "template",
        "undefined",
        "localhost",
        "127.0.0.1",
    ];

    let haystack = context.to_lowercase();
    for marker in PLACEHOLDER_MARKERS {
        if haystack.contains(marker) {
            return true;
        }
    }
    false
}
111
/// Reports whether `url` appears to point at a third-party library or a
/// bundled/static asset that the scanner should not analyze.
///
/// The check is a case-insensitive substring search over the whole URL, so
/// markers such as `cdn`, `static` or `dist` match anywhere in the host or path.
fn is_known_library(url: &str) -> bool {
    const LIBRARY_MARKERS: &[&str] = &[
        "jquery",
        "bootstrap",
        "modernizr",
        "polyfill",
        "vendor",
        "bundle",
        "analytics",
        "tracking",
        "ga.js",
        "gtm.js",
        "react",
        "angular",
        "vue",
        "lodash",
        "moment",
        "cdn",
        "static",
        "dist",
        "chunk",
    ];

    let haystack = url.to_lowercase();
    for marker in LIBRARY_MARKERS {
        if haystack.contains(marker) {
            return true;
        }
    }
    false
}
137
138// ── Secret patterns ─────────────────────────────────────────────────────────
139
/// Static description of one kind of secret the scanner looks for.
struct SecretPattern {
    /// Human-readable name of the secret type (e.g. "AWS Access Key").
    name: &'static str,
    /// Regular-expression source; compiled with `Regex::new` when a scan starts.
    pattern: &'static str,
    /// Severity label ("Low", "Medium" or "High" in the table below).
    severity: &'static str,
    /// Remediation advice attached to findings of this type.
    recommendation: &'static str,
}
146
/// Table of secret signatures used by the scanner.
///
/// `scan_content` compiles every `pattern` once per scan; entries whose regex
/// fails to compile are silently dropped there. Some patterns carry a capture
/// group around the secret value itself, others match the whole token.
///
/// NOTE(review): "Stripe API Key" matches both `sk_` and `pk_` prefixes, so a
/// publishable key matches it (severity "High") as well as the dedicated
/// "Stripe Publishable Key" entry (severity "Low"). Confirm whether the
/// downstream deduplication resolves this overlap as intended.
const SECRET_PATTERNS: &[SecretPattern] = &[
    SecretPattern {
        name: "AWS Access Key",
        pattern: r"\bAKIA[0-9A-Z]{16}\b",
        severity: "Medium",
        recommendation: "Rotate the key immediately. Use AWS IAM roles instead of hard-coded keys.",
    },
    // Any 40-character run of base64-like characters matches this pattern.
    SecretPattern {
        name: "AWS Secret Key",
        pattern: r"\b[0-9a-zA-Z/+]{40}\b",
        severity: "High",
        recommendation: "Rotate the key immediately. Store secrets in AWS Secrets Manager.",
    },
    SecretPattern {
        name: "Google API Key",
        pattern: r"\bAIza[0-9A-Za-z\-_]{35}\b",
        severity: "Medium",
        recommendation: "Rotate the key and implement API key restrictions.",
    },
    SecretPattern {
        name: "Google OAuth",
        pattern: r"[0-9]+-[0-9A-Za-z_]{32}\.apps\.googleusercontent\.com",
        severity: "Medium",
        recommendation: "Review and potentially regenerate the OAuth credentials.",
    },
    SecretPattern {
        name: "Stripe API Key",
        pattern: r"\b(?:sk|pk)_(live|test)_[0-9a-zA-Z]{24,34}\b",
        severity: "High",
        recommendation: "Rotate the key immediately. Only use server-side code for Stripe API.",
    },
    SecretPattern {
        name: "GitHub Token",
        pattern: r"\b(?:github|gh)(?:_pat)?_[0-9a-zA-Z]{36,40}\b",
        severity: "High",
        recommendation: "Revoke and regenerate the token. Use GitHub Actions secrets for CI/CD.",
    },
    SecretPattern {
        name: "GitHub OAuth",
        pattern: r"\bgho_[0-9a-zA-Z]{36,40}\b",
        severity: "High",
        recommendation: "Revoke and regenerate the OAuth token.",
    },
    SecretPattern {
        name: "Facebook Access Token",
        pattern: r"EAACEdEose0cBA[0-9A-Za-z]+",
        severity: "Medium",
        recommendation: "Revoke the token and regenerate. Store tokens securely.",
    },
    SecretPattern {
        name: "JWT Token",
        pattern: r"eyJ[a-zA-Z0-9_\-]*\.[a-zA-Z0-9_\-]*\.[a-zA-Z0-9_\-]*",
        severity: "Medium",
        recommendation: "If valid, rotate the token. Implement proper expiration.",
    },
    SecretPattern {
        name: "SSH Private Key",
        pattern: r"-----BEGIN\s+(?:RSA|DSA|EC|OPENSSH)\s+PRIVATE\s+KEY",
        severity: "High",
        recommendation: "Generate a new key pair. Never store private keys in code.",
    },
    SecretPattern {
        name: "Password in URL",
        pattern: r"[a-zA-Z]{3,10}://[^/\s:@]{3,20}:[^/\s:@]{3,20}@.{1,100}",
        severity: "High",
        recommendation: "Remove the password from the URL and use secure authentication.",
    },
    SecretPattern {
        name: "Firebase URL",
        pattern: r"https://[a-z0-9-]+\.firebaseio\.com",
        severity: "Low",
        recommendation: "Review Firebase security rules and regenerate any associated secrets.",
    },
    SecretPattern {
        name: "MongoDB Connection String",
        pattern: r"mongodb(?:\+srv)?://[^/\s]+:[^/\s]+@[^/\s]+",
        severity: "High",
        recommendation: "Rotate the password and use environment variables instead.",
    },
    SecretPattern {
        name: "Slack Token",
        pattern: r"xox[baprs]-[0-9a-zA-Z\-]{10,48}",
        severity: "Medium",
        recommendation: "Revoke and regenerate the token.",
    },
    SecretPattern {
        name: "Slack Webhook",
        pattern: r"https://hooks\.slack\.com/services/T[a-zA-Z0-9_]+/B[a-zA-Z0-9_]+/[a-zA-Z0-9_]+",
        severity: "Medium",
        recommendation: "Regenerate the webhook URL and store it securely.",
    },
    // Generic `name = "value"` assignments; the capture group is the value.
    SecretPattern {
        name: "API Key",
        pattern: r#"(?i)\b(?:api[_\-]?key|apikey)\b\s*[=:]\s*["'`]([a-zA-Z0-9_\-\.]{16,64})["'`]"#,
        severity: "Medium",
        recommendation: "Rotate the key. Store it in environment variables or a secrets manager.",
    },
    SecretPattern {
        name: "Secret Key",
        pattern: r#"(?i)\b(?:secret[_\-]?key|secretkey)\b\s*[=:]\s*["'`]([a-zA-Z0-9_\-\.]{16,64})["'`]"#,
        severity: "Medium",
        recommendation: "Rotate the key and ensure it's stored in a secure vault.",
    },
    SecretPattern {
        name: "Auth Token",
        pattern: r#"(?i)\b(?:auth[_\-]?token|authtoken)\b\s*[=:]\s*["'`]([a-zA-Z0-9_\-\.]{16,64})["'`]"#,
        severity: "Medium",
        recommendation: "Revoke the token and issue a new one.",
    },
    SecretPattern {
        name: "Access Token",
        pattern: r#"(?i)\b(?:access[_\-]?token|accesstoken)\b\s*[=:]\s*["'`]([a-zA-Z0-9_\-\.]{16,64})["'`]"#,
        severity: "Medium",
        recommendation: "Revoke and regenerate the token.",
    },
    SecretPattern {
        name: "Encryption Key",
        pattern: r#"(?i)(?:encryption|aes|des|blowfish)[\s_-]?key[\s=:]+["'`][A-Za-z0-9+/]{16,}={0,2}["'`]"#,
        severity: "High",
        recommendation: "Rotate the key and store it securely using a key management system.",
    },
    SecretPattern {
        name: "Stripe Publishable Key",
        pattern: r"\bpk_(live|test)_[0-9a-zA-Z]{24,34}\b",
        severity: "Low",
        recommendation:
            "Publishable keys are public, but verify no secret keys are exposed nearby.",
    },
    SecretPattern {
        name: "Twitter Bearer",
        pattern: r"AAAAAAAAAAAAAAAAAAA[A-Za-z0-9%]+",
        severity: "Medium",
        recommendation: "Rotate the bearer token. Use environment variables for storage.",
    },
    // The captured value excludes quotes and whitespace but is otherwise
    // unrestricted, so it may contain non-ASCII characters.
    SecretPattern {
        name: "Password",
        pattern: r#"(?i)(?:password|passwd|pwd)[\s=:]+["'`]([^"'`\s]{8,64})["'`]"#,
        severity: "High",
        recommendation:
            "Remove hardcoded passwords. Use a secrets manager or environment variables.",
    },
    SecretPattern {
        name: "Database Credentials",
        pattern: r#"(?i)(?:db_pass|db_password|database_password)[\s=:]+["'`]([^"'`\s]+)["'`]"#,
        severity: "High",
        recommendation: "Change DB credentials immediately. Store in env vars or a vault.",
    },
];
295
296// ── JS vulnerability patterns ───────────────────────────────────────────────
297
/// Static description of one class of JavaScript weakness the scanner looks for.
struct JsVulnCategory {
    /// Category name (e.g. "DOM XSS").
    name: &'static str,
    /// Severity label ("Low", "Medium" or "High" in the table below).
    severity: &'static str,
    /// Regular-expression sources for this category; each is compiled with
    /// `Regex::new` when a scan starts.
    patterns: &'static [&'static str],
    /// Explanation of the weakness.
    description: &'static str,
    /// Remediation advice.
    recommendation: &'static str,
}
305
/// Table of JavaScript weakness signatures, grouped by category.
///
/// `scan_content` compiles each category's patterns once per scan; patterns
/// that fail to compile are silently dropped there. The patterns are purely
/// textual heuristics over the script source, not a data-flow analysis.
const JS_VULN_CATEGORIES: &[JsVulnCategory] = &[
    // Sinks (document.write / innerHTML / outerHTML / eval) followed on the
    // same line by a name associated with the page URL.
    JsVulnCategory {
        name: "DOM XSS",
        severity: "High",
        patterns: &[
            r"document\.write\s*\(\s*.*?(?:location|URL|documentURI|referrer|href|search|hash)",
            r"\.innerHTML\s*=\s*.*?(?:location|URL|documentURI|referrer|href|search|hash)",
            r"\.outerHTML\s*=\s*.*?(?:location|URL|documentURI|referrer|href|search|hash)",
            r"eval\s*\(\s*.*?(?:location|URL|documentURI|referrer|href|search|hash)",
        ],
        description:
            "DOM-based XSS: user-controllable data passed to a dynamic code execution sink.",
        recommendation:
            "Sanitize all user inputs before DOM operations. Use DOMPurify or a strict CSP.",
    },
    JsVulnCategory {
        name: "Open Redirect",
        severity: "High",
        patterns: &[
            r"(?:window\.)?location(?:\.href)?\s*=\s*.*?(?:user|input|param|arg)",
            r"(?:window\.)?location\.replace\s*\(\s*.*?(?:user|input|param|arg)",
            r"(?:window\.)?location\.assign\s*\(\s*.*?(?:user|input|param|arg)",
        ],
        description: "User input determines redirect destination, enabling phishing attacks.",
        recommendation: "Implement a whitelist of allowed redirect URLs.",
    },
    JsVulnCategory {
        name: "CORS Misconfiguration",
        severity: "Medium",
        patterns: &[
            r"Access-Control-Allow-Origin\s*:\s*\*",
            r"Access-Control-Allow-Origin\s*:\s*null",
            r"Access-Control-Allow-Credentials\s*:\s*true",
        ],
        description: "CORS misconfiguration can allow unauthorized cross-origin access.",
        recommendation: "Be specific with CORS policies. Avoid wildcard origins.",
    },
    // Matches any assignment to document.cookie, regardless of the flags used.
    JsVulnCategory {
        name: "Insecure Cookie",
        severity: "Medium",
        patterns: &[r"document\.cookie\s*="],
        description: "Cookies set without secure flags can be vulnerable to theft.",
        recommendation: "Set 'Secure' and 'HttpOnly' flags on sensitive cookies.",
    },
    JsVulnCategory {
        name: "Insecure Data Transmission",
        severity: "Medium",
        patterns: &[r#"\.postMessage\([^,]+,\s*["']\*["']\)"#],
        description: "Data transmitted insecurely via postMessage with wildcard origin.",
        recommendation: "Use specific origin URLs with postMessage() and validate senders.",
    },
    JsVulnCategory {
        name: "Prototype Pollution",
        severity: "Medium",
        patterns: &[r"__proto__\s*[=\[]", r"prototype\["],
        description: "Prototype pollution can lead to property injection attacks.",
        recommendation:
            "Avoid user-controlled data with Object.assign()/prototype. Use Object.create(null).",
    },
    JsVulnCategory {
        name: "Command Injection",
        severity: "High",
        patterns: &[
            r"exec\s*\(\s*.*?(?:user|input|param|arg)",
            r"spawn\s*\(\s*.*?(?:user|input|param|arg)",
        ],
        description: "Command injection allows attackers to execute arbitrary commands.",
        recommendation: "Avoid executing commands with user input. Implement strict validation.",
    },
    JsVulnCategory {
        name: "Insecure Data Storage",
        severity: "Low",
        patterns: &[
            r"localStorage\.setItem\(\s*[^,]+,\s*.*?(?:password|token|key|secret|credentials)",
            r"sessionStorage\.setItem\(\s*[^,]+,\s*.*?(?:password|token|key|secret|credentials)",
        ],
        description: "Sensitive data stored insecurely in client-side storage.",
        recommendation: "Don't store sensitive info in localStorage/sessionStorage.",
    },
    JsVulnCategory {
        name: "Event Handler XSS",
        severity: "Medium",
        patterns: &[r#"\.setAttribute\(["']on\w+["']\s*,"#],
        description: "Event handlers assigned dynamically can lead to XSS.",
        recommendation: "Validate and sanitize data before assigning to event handlers.",
    },
    // Matches every dynamic <script> element creation.
    JsVulnCategory {
        name: "CSP Bypass",
        severity: "Medium",
        patterns: &[r#"document\.createElement\(["']script["']\)"#],
        description: "Dynamic script creation may bypass Content Security Policy.",
        recommendation: "Implement a strict CSP and avoid dynamic script creation with user input.",
    },
    JsVulnCategory {
        name: "WebSocket Insecurity",
        severity: "High",
        patterns: &[r#"new\s+WebSocket\(\s*["']ws://"#],
        description: "Insecure WebSocket connections (ws://) can be intercepted.",
        recommendation: "Use secure WebSocket connections (wss://) and validate data.",
    },
    // The second pattern matches every Math.random() call, whatever its purpose.
    JsVulnCategory {
        name: "Insecure Crypto",
        severity: "High",
        patterns: &[
            r#"(?:createHash|crypto\.subtle).*?["'](?:md5|sha1)["']"#,
            r"Math\.random\(\)",
        ],
        description: "Weak cryptographic methods (MD5/SHA1/Math.random) in use.",
        recommendation:
            "Use modern crypto algorithms. Use crypto.getRandomValues() instead of Math.random().",
    },
    // Matches any `../` or `..\` sequence, including ordinary relative imports.
    JsVulnCategory {
        name: "Path Traversal",
        severity: "Medium",
        patterns: &[r"\.\./|\.\.\\"],
        description: "Path traversal allows access to files outside the intended directory.",
        recommendation: "Validate and sanitize file paths. Use allowlists.",
    },
];
425
426// ── SSRF parameters ─────────────────────────────────────────────────────────
427
/// Parameter / form-field names that commonly carry a URL, path or host and
/// are therefore treated as SSRF-prone.
///
/// All entries are lower-case. `scan_content` lower-cases each form field name
/// and tests it with `contains`, so an entry matches as a substring (for
/// example `get` also matches a field named `target`). The URL-parameter check
/// lives in `check_url_params_ssrf`, which is not visible here — presumably it
/// uses this list too; confirm.
const SSRF_PARAMS: &[&str] = &[
    "url",
    "uri",
    "link",
    "src",
    "href",
    "target",
    "destination",
    "redirect",
    "redirect_to",
    "redirecturl",
    "redirect_uri",
    "return",
    "return_to",
    "returnurl",
    "return_path",
    "path",
    "load",
    "file",
    "filename",
    "folder",
    "folder_url",
    "image",
    "img",
    "image_url",
    "image_path",
    "avatar",
    "document",
    "doc",
    "document_url",
    "fetch",
    "get",
    "view",
    "content",
    "domain",
    "callback",
    "reference",
    "site",
    "page",
    "data",
    "data_url",
    "resource",
    "template",
    "api_endpoint",
    "endpoint",
    "proxy",
    "feed",
    "host",
    "webhook",
    "address",
    "media",
    "video",
    "audio",
    "download",
    "upload",
    "preview",
    "source",
    "location",
    "goto",
    "callback_url",
    "forward",
    "next",
    "origin",
    "continue",
];
493
494// ── Main scanner ────────────────────────────────────────────────────────────
495
496pub async fn scan_content(
497    domain: &str,
498    progress_tx: Option<tokio::sync::mpsc::Sender<crate::ScanProgress>>,
499) -> Result<ScannerResult, Box<dyn std::error::Error + Send + Sync>> {
500    report_progress(
501        &progress_tx,
502        5.0,
503        format!("Preparing advanced content scan for {}", domain),
504        "Info",
505    );
506
507    let base_url = if domain.starts_with("http") {
508        domain.to_string()
509    } else {
510        format!("https://{}", domain)
511    };
512
513    let client = crate::http_client_builder()
514        .timeout(Duration::from_secs(15))
515        .danger_accept_invalid_certs(true)
516        .build()?;
517
518    let mut secrets = Vec::new();
519    let mut js_vulns = Vec::new();
520    let mut ssrf_findings = Vec::new();
521    let mut visited = HashSet::new();
522    let mut js_file_urls = HashSet::new();
523    let mut api_endpoints: HashSet<String> = HashSet::new();
524    let mut queue: VecDeque<(String, u8)> = VecDeque::new();
525    queue.push_back((base_url.clone(), 0));
526
527    let max_depth: u8 = 2;
528    let max_pages: usize = 50;
529
530    report_progress(&progress_tx, 10.0, "Compiling detection patterns", "Info");
531
532    // Compile regex patterns once
533    let secret_regexes: Vec<(&SecretPattern, Regex)> = SECRET_PATTERNS
534        .iter()
535        .filter_map(|sp| Regex::new(sp.pattern).ok().map(|r| (sp, r)))
536        .collect();
537
538    let js_vuln_regexes: Vec<(&JsVulnCategory, Vec<Regex>)> = JS_VULN_CATEGORIES
539        .iter()
540        .map(|cat| {
541            let rxs: Vec<Regex> = cat
542                .patterns
543                .iter()
544                .filter_map(|p| Regex::new(p).ok())
545                .collect();
546            (cat, rxs)
547        })
548        .collect();
549
550    // API endpoint extraction patterns
551    let api_regexes: Vec<Regex> = [
552        r"/api/v\d+/",
553        r"/api/",
554        r"/graphql",
555        r"/rest/",
556        r"/v\d+/\w+",
557        r"/service/",
558        r"/json/",
559        r"/rpc/",
560        r"/gateway/",
561        r"/ajax/",
562        r"/data/",
563        r"/query/",
564        r"/feeds/",
565        r"/svc/",
566        r"/soap/",
567    ]
568    .iter()
569    .filter_map(|p| Regex::new(p).ok())
570    .collect();
571
572    // ── Parse robots.txt ─────────────────────────────────────────────
573    let mut disallowed: Vec<String> = Vec::new();
574    let robots_url = format!("{}/robots.txt", base_url.trim_end_matches('/'));
575    report_progress(&progress_tx, 15.0, "Checking robots.txt", "Info");
576    if let Ok(resp) = client.get(&robots_url).send().await {
577        if resp.status().is_success() {
578            if let Ok(body) = resp.text().await {
579                let mut agent_match = false;
580                for line in body.lines() {
581                    let line = line.trim().to_lowercase();
582                    if let Some(agent) = line.strip_prefix("user-agent:") {
583                        let agent = agent.trim();
584                        agent_match = agent == "*";
585                    }
586                    if agent_match {
587                        if let Some(path) = line.strip_prefix("disallow:") {
588                            let path = path.trim();
589                            if !path.is_empty() {
590                                disallowed.push(path.to_string());
591                            }
592                        }
593                    }
594                }
595            }
596        }
597    }
598
599    // ── Process sitemap.xml for seed URLs ─────────────────────────────
600    let sitemap_url = format!("{}/sitemap.xml", base_url.trim_end_matches('/'));
601    report_progress(
602        &progress_tx,
603        20.0,
604        "Checking sitemap.xml for seed URLs",
605        "Info",
606    );
607    if let Ok(resp) = client.get(&sitemap_url).send().await {
608        if resp.status().is_success() {
609            if let Ok(body) = resp.text().await {
610                let loc_rx = Regex::new(r"<loc>([^<]+)</loc>").unwrap();
611                for cap in loc_rx.captures_iter(&body) {
612                    if let Some(url) = cap.get(1) {
613                        let u = url.as_str().to_string();
614                        if is_same_domain(&base_url, &u) && !visited.contains(&u) {
615                            queue.push_back((u, 1));
616                        }
617                    }
618                }
619            }
620        }
621    }
622
623    // ── BFS Crawl ───────────────────────────────────────────────────────
624    while let Some((url, depth)) = queue.pop_front() {
625        if visited.len() >= max_pages || depth > max_depth || visited.contains(&url) {
626            continue;
627        }
628
629        // Respect robots.txt disallow rules
630        let url_path = url.trim_start_matches(&base_url);
631        if disallowed.iter().any(|d| url_path.starts_with(d.as_str())) {
632            continue;
633        }
634
635        visited.insert(url.clone());
636        let crawl_progress = 20.0 + (40.0 * (visited.len() as f32 / max_pages as f32)).min(40.0);
637        report_progress(
638            &progress_tx,
639            crawl_progress,
640            format!("Scanning page {}", url),
641            "Info",
642        );
643
644        // Check URL parameters for SSRF-vulnerable names
645        check_url_params_ssrf(&url, &mut ssrf_findings);
646
647        let resp = match client.get(&url).send().await {
648            Ok(r) => r,
649            Err(_) => continue,
650        };
651        if !resp.status().is_success() {
652            continue;
653        }
654
655        let content_type = resp
656            .headers()
657            .get("content-type")
658            .and_then(|v| v.to_str().ok())
659            .unwrap_or("")
660            .to_lowercase();
661
662        let body = match resp.text().await {
663            Ok(t) => t,
664            Err(_) => continue,
665        };
666
667        // Scan this page's content for secrets
668        scan_for_secrets(&body, &url, &secret_regexes, &mut secrets);
669
670        // Extract API endpoints from the body
671        extract_api_endpoints(&body, &base_url, &api_regexes, &mut api_endpoints);
672
673        if content_type.contains("text/html") {
674            let doc = Html::parse_document(&body);
675
676            // ── Extract & queue links ───────────────────────────────────
677            if depth < max_depth {
678                let a_sel = Selector::parse("a[href]").unwrap();
679                for el in doc.select(&a_sel) {
680                    if let Some(href) = el.value().attr("href") {
681                        let abs = resolve_url(&base_url, href);
682                        if let Some(abs_url) = abs {
683                            if is_same_domain(&base_url, &abs_url) && !visited.contains(&abs_url) {
684                                queue.push_back((abs_url, depth + 1));
685                            }
686                        }
687                    }
688                }
689            }
690
691            // ── Extract inline JS & external JS URLs ────────────────────
692            let script_sel = Selector::parse("script").unwrap();
693            for el in doc.select(&script_sel) {
694                // Inline JS
695                let inline = el.text().collect::<String>();
696                if inline.len() > 10 {
697                    scan_js_security(&inline, &url, &js_vuln_regexes, &mut js_vulns);
698                    scan_for_secrets(&inline, &url, &secret_regexes, &mut secrets);
699                }
700                // External JS src
701                if let Some(src) = el.value().attr("src") {
702                    if let Some(js_url) = resolve_url(&base_url, src) {
703                        if !is_known_library(&js_url) {
704                            js_file_urls.insert(js_url);
705                        }
706                    }
707                }
708            }
709
710            // ── Check forms for SSRF-vulnerable params ──────────────────
711            let form_sel = Selector::parse("form").unwrap();
712            let input_sel = Selector::parse("input[name], textarea[name]").unwrap();
713            for form in doc.select(&form_sel) {
714                let mut vuln_params = Vec::new();
715                for input in form.select(&input_sel) {
716                    if let Some(name) = input.value().attr("name") {
717                        let name_lower = name.to_lowercase();
718                        if SSRF_PARAMS.iter().any(|p| name_lower.contains(p)) {
719                            vuln_params.push(name.to_string());
720                        }
721                    }
722                }
723                if !vuln_params.is_empty() {
724                    ssrf_findings.push(SsrfFinding {
725                        finding_type: "Potential SSRF in Form".into(),
726                        severity: "Medium".into(),
727                        source_url: url.clone(),
728                        vulnerable_params: vuln_params,
729                        description: "Form contains fields that could be used for Server-Side Request Forgery.".into(),
730                    });
731                }
732            }
733
734            // ── Check meta CSP for weak policies ────────────────────────
735            let meta_sel =
736                Selector::parse(r#"meta[http-equiv="Content-Security-Policy"]"#).unwrap();
737            for meta in doc.select(&meta_sel) {
738                if let Some(content) = meta.value().attr("content") {
739                    let c_lower = content.to_lowercase();
740                    if c_lower.contains("unsafe-inline") || c_lower.contains("unsafe-eval") {
741                        js_vulns.push(JsVulnerability {
742                            vuln_type: "Weak CSP".into(),
743                            severity: "Medium".into(),
744                            source_url: url.clone(),
745                            matched_code: content.to_string(),
746                            description: "CSP allows unsafe-inline or unsafe-eval.".into(),
747                            recommendation: "Remove unsafe-inline and unsafe-eval from your CSP."
748                                .into(),
749                        });
750                    }
751                }
752            }
753
754            // ── Check forms for missing CSRF tokens ─────────────────────
755            let csrf_sel = Selector::parse(
756                r#"input[name*="csrf" i], input[name*="xsrf" i], input[name*="token" i]"#,
757            )
758            .unwrap();
759            for form in doc.select(&form_sel) {
760                if form.select(&csrf_sel).next().is_none() {
761                    js_vulns.push(JsVulnerability {
762                        vuln_type: "Missing CSRF Protection".into(),
763                        severity: "Medium".into(),
764                        source_url: url.clone(),
765                        matched_code: String::new(),
766                        description: "Form found without CSRF token.".into(),
767                        recommendation: "Add CSRF tokens to all state-changing forms.".into(),
768                    });
769                }
770            }
771        } else if (content_type.contains("javascript") || url.ends_with(".js"))
772            && !is_known_library(&url)
773        {
774            js_file_urls.insert(url.clone());
775            scan_js_security(&body, &url, &js_vuln_regexes, &mut js_vulns);
776            scan_for_secrets(&body, &url, &secret_regexes, &mut secrets);
777        }
778    }
779
780    // ── Fetch & analyze external JS files ────────────────────────────────
781    let total_js_files = js_file_urls.len().max(1);
782    for (index, js_url) in js_file_urls.iter().enumerate() {
783        if visited.contains(js_url) {
784            continue;
785        }
786        report_progress(
787            &progress_tx,
788            65.0 + (15.0 * (index as f32 / total_js_files as f32)),
789            format!("Analyzing JavaScript asset {}", js_url),
790            "Info",
791        );
792        if let Ok(resp) = client.get(js_url).send().await {
793            if resp.status().is_success() {
794                if let Ok(js_body) = resp.text().await {
795                    if js_body.len() > 10 {
796                        scan_js_security(&js_body, js_url, &js_vuln_regexes, &mut js_vulns);
797                        scan_for_secrets(&js_body, js_url, &secret_regexes, &mut secrets);
798                        extract_api_endpoints(
799                            &js_body,
800                            &base_url,
801                            &api_regexes,
802                            &mut api_endpoints,
803                        );
804                    }
805                }
806            }
807        }
808    }
809
810    // ── Probe discovered API endpoints for SSRF ─────────────────────────
811    let ssrf_probes = payloads::lines(payloads::SSRF);
812    let ssrf_limit = api_endpoints.len().clamp(1, 20);
813    for (index, endpoint) in api_endpoints.iter().take(20).enumerate() {
814        // limit to 20 to avoid flooding
815        report_progress(
816            &progress_tx,
817            85.0 + (10.0 * (index as f32 / ssrf_limit as f32)),
818            format!("Probing API endpoint {}", endpoint),
819            "Info",
820        );
821        for probe in ssrf_probes.iter().take(5) {
822            // top 5 probes per endpoint
823            let test_url = format!("{}?url={}", endpoint, probe);
824            if let Ok(resp) = client.get(&test_url).header("Accept", "*/*").send().await {
825                // Check if response indicates SSRF (redirect to our probe)
826                if resp.status().is_redirection() {
827                    if let Some(loc) = resp.headers().get("location") {
828                        if let Ok(loc_str) = loc.to_str() {
829                            if loc_str.contains(probe) {
830                                ssrf_findings.push(SsrfFinding {
831                                    finding_type: "Confirmed SSRF in API Endpoint".into(),
832                                    severity: "High".into(),
833                                    source_url: endpoint.clone(),
834                                    vulnerable_params: vec!["url".into()],
835                                    description: format!(
836                                        "API endpoint redirects to SSRF probe: {}",
837                                        loc_str
838                                    ),
839                                });
840                            }
841                        }
842                    }
843                }
844            }
845        }
846    }
847
848    // ── Deduplicate ─────────────────────────────────────────────────────
849    report_progress(&progress_tx, 98.0, "Deduplicating findings", "Info");
850    dedup_secrets(&mut secrets);
851    dedup_js_vulns(&mut js_vulns);
852
853    let api_list: Vec<String> = api_endpoints.into_iter().collect();
854
855    let summary = ScanSummary {
856        total_urls_crawled: visited.len(),
857        total_js_files: js_file_urls.len(),
858        total_api_endpoints: api_list.len(),
859        secrets_count: secrets.len(),
860        js_vulnerabilities_count: js_vulns.len(),
861        ssrf_vulnerabilities_count: ssrf_findings.len(),
862    };
863
864    report_progress(
865        &progress_tx,
866        100.0,
867        format!(
868            "Advanced content scan complete: {} URL(s), {} JS file(s), {} API endpoint(s)",
869            summary.total_urls_crawled, summary.total_js_files, summary.total_api_endpoints
870        ),
871        "Success",
872    );
873
874    Ok(ScannerResult {
875        domain: domain.to_string(),
876        secrets,
877        js_vulnerabilities: js_vulns,
878        ssrf_vulnerabilities: ssrf_findings,
879        api_endpoints_discovered: api_list,
880        summary,
881    })
882}
883
884fn report_progress(
885    progress_tx: &Option<tokio::sync::mpsc::Sender<crate::ScanProgress>>,
886    percentage: f32,
887    message: impl Into<String>,
888    status: &str,
889) {
890    if let Some(tx) = progress_tx {
891        let _ = tx.try_send(crate::ScanProgress {
892            module: "Advanced Content".into(),
893            percentage,
894            message: message.into(),
895            status: status.into(),
896        });
897    }
898}
899
900// ── Scanner sub-functions ───────────────────────────────────────────────────
901
902fn scan_for_secrets(
903    content: &str,
904    source_url: &str,
905    patterns: &[(&SecretPattern, Regex)],
906    results: &mut Vec<SecretFinding>,
907) {
908    for (sp, rx) in patterns {
909        for m in rx.find_iter(content) {
910            let value = m.as_str();
911            let line = content[..m.start()].matches('\n').count() + 1;
912            let entropy = shannon_entropy(value);
913
914            // Skip low-entropy matches for key-type secrets
915            if matches!(
916                sp.name,
917                "AWS Secret Key" | "Google API Key" | "API Key" | "Secret Key"
918            ) && entropy < 3.5
919            {
920                continue;
921            }
922
923            // Context-based false positive check
924            let ctx_start = m.start().saturating_sub(80);
925            let ctx_end = (m.end() + 80).min(content.len());
926            let context = &content[ctx_start..ctx_end];
927            if is_false_positive_context(context) {
928                continue;
929            }
930
931            results.push(SecretFinding {
932                secret_type: sp.name.to_string(),
933                severity: sp.severity.to_string(),
934                masked_value: mask_secret(value),
935                source_url: source_url.to_string(),
936                line,
937                entropy: (entropy * 100.0).round() / 100.0,
938                recommendation: sp.recommendation.to_string(),
939            });
940        }
941    }
942}
943
944fn scan_js_security(
945    content: &str,
946    source_url: &str,
947    categories: &[(&JsVulnCategory, Vec<Regex>)],
948    results: &mut Vec<JsVulnerability>,
949) {
950    // Skip analysis on very large minified files for non-critical checks
951    let is_minified = content.len() > 5000 && content.matches('\n').count() < 50;
952
953    for (cat, rxs) in categories {
954        // For minified files, only check high-severity issues
955        if is_minified && cat.severity != "High" {
956            continue;
957        }
958
959        for rx in rxs {
960            for m in rx.find_iter(content) {
961                let matched = m.as_str();
962                // Limit matched_code length
963                let display = if matched.len() > 200 {
964                    &matched[..200]
965                } else {
966                    matched
967                };
968
969                results.push(JsVulnerability {
970                    vuln_type: cat.name.to_string(),
971                    severity: cat.severity.to_string(),
972                    source_url: source_url.to_string(),
973                    matched_code: display.to_string(),
974                    description: cat.description.to_string(),
975                    recommendation: cat.recommendation.to_string(),
976                });
977            }
978        }
979    }
980}
981
982fn dedup_secrets(v: &mut Vec<SecretFinding>) {
983    let mut seen = HashSet::new();
984    v.retain(|s| {
985        seen.insert(format!(
986            "{}:{}:{}",
987            s.secret_type, s.source_url, s.masked_value
988        ))
989    });
990}
991
992fn dedup_js_vulns(v: &mut Vec<JsVulnerability>) {
993    let mut seen = HashSet::new();
994    v.retain(|j| {
995        seen.insert(format!(
996            "{}:{}:{}",
997            j.vuln_type, j.source_url, j.matched_code
998        ))
999    });
1000}
1001
1002fn check_url_params_ssrf(url: &str, findings: &mut Vec<SsrfFinding>) {
1003    if let Some(query_start) = url.find('?') {
1004        let query = &url[query_start + 1..];
1005        let mut vuln_params = Vec::new();
1006        for pair in query.split('&') {
1007            if let Some(eq) = pair.find('=') {
1008                let param = pair[..eq].to_lowercase();
1009                if SSRF_PARAMS.iter().any(|p| param.contains(p)) {
1010                    vuln_params.push(pair[..eq].to_string());
1011                }
1012            }
1013        }
1014        if !vuln_params.is_empty() {
1015            findings.push(SsrfFinding {
1016                finding_type: "Potential SSRF in URL Parameter".into(),
1017                severity: "Medium".into(),
1018                source_url: url.to_string(),
1019                vulnerable_params: vuln_params,
1020                description: "URL contains parameters that could be used for SSRF.".into(),
1021            });
1022        }
1023    }
1024}
1025
1026fn extract_api_endpoints(
1027    content: &str,
1028    base_url: &str,
1029    patterns: &[Regex],
1030    endpoints: &mut HashSet<String>,
1031) {
1032    for rx in patterns {
1033        for m in rx.find_iter(content) {
1034            let path = m.as_str();
1035            let full_url = format!("{}{}", base_url.trim_end_matches('/'), path);
1036            endpoints.insert(full_url);
1037        }
1038    }
1039}
1040
/// Resolves `href` (as found in a page) against the page URL `base`.
///
/// Returns `None` for links that are not fetchable pages: `javascript:`,
/// `mailto:`, `tel:` and pure fragment (`#...`) references.
///
/// Otherwise returns the absolute URL:
/// * `//host/path`        -> `https://host/path` (always upgraded to https),
/// * `http(s)://...`      -> returned unchanged,
/// * `/path` (root-relative) -> joined to the scheme and host of `base`,
/// * anything else        -> joined to the directory of `base`'s path.
///
/// The query string and fragment of `base` are ignored when resolving, and a
/// `base` without any path (e.g. `https://example.com`) is treated as if it
/// ended in `/`. Dot segments (`./`, `../`) are not normalised.
fn resolve_url(base: &str, href: &str) -> Option<String> {
    const NON_NAVIGABLE_PREFIXES: [&str; 4] = ["javascript:", "#", "mailto:", "tel:"];
    if NON_NAVIGABLE_PREFIXES
        .iter()
        .any(|prefix| href.starts_with(prefix))
    {
        return None;
    }
    if href.starts_with("//") {
        return Some(format!("https:{}", href));
    }
    if href.starts_with("http://") || href.starts_with("https://") {
        return Some(href.to_string());
    }

    // Only scheme, host and path of the base take part in resolution; a '/'
    // inside the query or fragment must not be mistaken for a path separator.
    let path_end = base
        .find(|c: char| matches!(c, '?' | '#'))
        .unwrap_or(base.len());
    let base = &base[..path_end];

    // Split the base into "scheme://host" and the path that follows it, so the
    // slashes of "://" are never treated as part of the path.
    let authority_start = base.find("://").map_or(0, |idx| idx + 3);
    let origin_end = base[authority_start..]
        .find('/')
        .map_or(base.len(), |idx| authority_start + idx);
    let origin = &base[..origin_end];

    // Root-relative link: replaces the whole path of the base. A leading "//"
    // was handled above, so exactly one slash is stripped here.
    if let Some(rest) = href.strip_prefix('/') {
        return Some(format!("{}/{}", origin, rest));
    }

    // Document-relative link: keep the base path up to and including its last '/'.
    match base[origin_end..].rfind('/') {
        Some(idx) => Some(format!("{}{}", &base[..origin_end + idx + 1], href)),
        // The base has no path at all, so the link lives directly under the host.
        None => Some(format!("{}/{}", origin, href)),
    }
}
1063
/// Reports whether `base` and `url` point at the same host.
///
/// The host is taken as the text after a leading `https://` or `http://` up to
/// the first `/`, `?` or `#`, compared case-insensitively. A port, if present,
/// is part of the compared text, and subdomains are not treated as equal to
/// their parent domain.
fn is_same_domain(base: &str, url: &str) -> bool {
    fn host_of(u: &str) -> String {
        u.trim_start_matches("https://")
            .trim_start_matches("http://")
            // A query or fragment may follow the host directly with no path in
            // between (e.g. "https://example.com?page=2"), so all three
            // delimiters end the host, not just '/'.
            .split(|c: char| matches!(c, '/' | '?' | '#'))
            .next()
            .unwrap_or("")
            .to_lowercase()
    }

    host_of(base) == host_of(url)
}
web_analyzer/advanced_content_scanner.rs

web_analyzer/
advanced_content_scanner.rs