token_analyzer/analyzer.rs

//! Token Security Analyzer - Fast, parallel token usage analysis
//!
//! This module provides a standalone security analyzer that scans codebases
//! for token usage patterns and identifies potential security risks like
//! plaintext exposure in logs, prints, or debug statements.
//!
//! # Features
//! - **Blazing fast**: Uses ripgrep's `ignore` crate for file walking
//! - **Parallel**: Leverages `rayon` for multi-threaded file scanning
//! - **Smart**: Respects `.gitignore` and common ignore patterns
//! - **Security-focused**: Detects dangerous patterns (print, log, echo)
//! - **Context-aware**: Prioritizes sensitive files (.env, configs)
//! - **Entropy detection**: Identifies high-entropy strings (likely real secrets)
//! - **Known prefixes**: Detects known token formats (AWS, GitHub, Slack...)
//!
//! # Example
//! ```no_run
//! use token_analyzer::{TokenSecurityAnalyzer, AnalyzerConfig};
//! use std::path::PathBuf;
//!
//! let analyzer = TokenSecurityAnalyzer::new(AnalyzerConfig::default());
//! let report = analyzer.analyze("API_KEY", &PathBuf::from(".")).unwrap();
//!
//! println!("Found {} calls in {} files", report.total_calls, report.files.len());
//! for file in &report.files {
//!     if file.has_exposure {
//!         println!("⚠️  {} - EXPOSED! (risk: {:?})", file.path.display(), file.risk_level);
//!     }
//! }
//! ```

use anyhow::Result;
use ignore::WalkBuilder;
use parking_lot::Mutex;
use rayon::prelude::*;
use regex::Regex;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::{Duration, Instant};

// ============================================================================
// Risk Classification
// ============================================================================

/// Risk level for a file based on its type and content
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum RiskLevel {
    /// Low risk - regular source code
    Low = 1,
    /// Medium risk - configuration files
    Medium = 2,
    /// High risk - sensitive config files
    High = 3,
    /// Critical risk - environment/secrets files
    Critical = 4,
}

impl RiskLevel {
    /// Returns the risk multiplier for scoring
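    ///
    /// # Example
    ///
    /// A small doctest sketch, assuming `RiskLevel` is re-exported at the
    /// crate root as in the module-level example:
    ///
    /// ```
    /// use token_analyzer::RiskLevel;
    ///
    /// // Multipliers scale linearly with severity, and the derived `Ord`
    /// // follows declaration order (Low < Medium < High < Critical).
    /// assert_eq!(RiskLevel::Critical.multiplier(), 4);
    /// assert!(RiskLevel::Critical > RiskLevel::Low);
    /// ```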
    pub fn multiplier(&self) -> usize {
        match self {
            RiskLevel::Low => 1,
            RiskLevel::Medium => 2,
            RiskLevel::High => 3,
            RiskLevel::Critical => 4,
        }
    }
}

/// Known token prefixes from popular services
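///
/// The table is consulted in order, so longer prefixes must appear before any
/// shorter prefix they overlap with (e.g. `sk-ant-` before `sk-`).
///
/// A doctest sketch, assuming the constant is re-exported at the crate root
/// as in the module-level example:
///
/// ```
/// use token_analyzer::KNOWN_TOKEN_PREFIXES;
///
/// // Hypothetical value for illustration only - not a real credential.
/// let value = "ghp_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX";
/// let hit = KNOWN_TOKEN_PREFIXES
///     .iter()
///     .find(|(prefix, _)| value.starts_with(prefix));
/// assert_eq!(hit.map(|(_, desc)| *desc), Some("GitHub Personal Access Token"));
/// ```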
pub const KNOWN_TOKEN_PREFIXES: &[(&str, &str)] = &[
    // GitHub
    ("ghp_", "GitHub Personal Access Token"),
    ("gho_", "GitHub OAuth Token"),
    ("ghu_", "GitHub User-to-Server Token"),
    ("ghs_", "GitHub Server-to-Server Token"),
    ("ghr_", "GitHub Refresh Token"),
    // AWS
    ("AKIA", "AWS Access Key ID"),
    ("ABIA", "AWS STS Token"),
    ("ACCA", "AWS Context-specific Credential"),
    ("ASIA", "AWS Temporary Access Key"),
    // Slack
    ("xoxb-", "Slack Bot Token"),
    ("xoxp-", "Slack User Token"),
    ("xoxa-", "Slack App Token"),
    ("xoxr-", "Slack Refresh Token"),
    // Stripe
    ("sk_live_", "Stripe Live Secret Key"),
    ("sk_test_", "Stripe Test Secret Key"),
    ("pk_live_", "Stripe Live Publishable Key"),
    ("rk_live_", "Stripe Live Restricted Key"),
    // Anthropic (must precede the shorter OpenAI "sk-" prefix, since the
    // first match wins)
    ("sk-ant-", "Anthropic API Key"),
    // OpenAI
    ("sk-", "OpenAI API Key"),
    // Google
    ("AIza", "Google API Key"),
    // Hugging Face
    ("hf_", "Hugging Face Token"),
    // npm
    ("npm_", "npm Access Token"),
    // PyPI
    ("pypi-", "PyPI API Token"),
    // Discord (short Base64 prefixes; prone to false positives)
    ("NDc", "Discord Bot Token (Base64)"),
    ("MTk", "Discord Bot Token (Base64)"),
    // Telegram (very generic prefix; prone to false positives)
    ("bot", "Telegram Bot Token"),
    // Twilio
    ("SK", "Twilio API Key"),
    // SendGrid
    ("SG.", "SendGrid API Key"),
    // Mailgun
    ("key-", "Mailgun API Key"),
    // DigitalOcean
    ("dop_v1_", "DigitalOcean Personal Access Token"),
    ("doo_v1_", "DigitalOcean OAuth Token"),
    // Vercel
    ("vercel_", "Vercel Token"),
    // Supabase
    ("sbp_", "Supabase Token"),
    // PlanetScale
    ("pscale_", "PlanetScale Token"),
    // Railway
    ("railway_", "Railway Token"),
    // Render
    ("rnd_", "Render Token"),
    // Netlify
    ("netlify_", "Netlify Token"),
];

/// Critical file patterns (highest risk)
const CRITICAL_FILE_PATTERNS: &[&str] = &[
    ".env",
    ".env.local",
    ".env.development",
    ".env.production",
    ".env.staging",
    ".envrc",
    "secrets",
    "credentials",
    ".secrets",
    ".credentials",
    "id_rsa",
    "id_ed25519",
    ".pem",
    ".key",
    ".p12",
    ".pfx",
    ".htpasswd",
    ".netrc",
    ".npmrc",
    ".pypirc",
    ".dockerconfigjson",
    "service_account",
    "serviceaccount",
];

/// High risk file patterns
const HIGH_RISK_FILE_PATTERNS: &[&str] = &[
    "docker-compose",
    "dockerfile",
    "terraform.tfvars",
    "terraform.tfstate",
    ".tfvars",
    "ansible",
    "vault",
    "consul",
    "kubernetes",
    "k8s",
    "helm",
    "kustomize",
    "application.yml",
    "application.yaml",
    "application.properties",
    "appsettings.json",
    "config.yml",
    "config.yaml",
    "config.json",
    "settings.yml",
    "settings.yaml",
    "settings.json",
    "parameters.yml",
    "parameters.yaml",
    "database.yml",
];

/// Medium risk file extensions
const MEDIUM_RISK_EXTENSIONS: &[&str] = &[
    "yml",
    "yaml",
    "toml",
    "ini",
    "cfg",
    "conf",
    "config",
    "properties",
];

// ============================================================================
// Configuration
// ============================================================================

/// Configuration for the token analyzer
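///
/// # Example
///
/// A minimal sketch of overriding selected fields with struct-update syntax;
/// the values here are illustrative, not recommendations:
///
/// ```no_run
/// use token_analyzer::AnalyzerConfig;
///
/// let config = AnalyzerConfig {
///     max_files: 500,
///     timeout_ms: 2_000,
///     ..Default::default()
/// };
/// ```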
#[derive(Debug, Clone)]
pub struct AnalyzerConfig {
    /// Maximum number of files to scan (0 = unlimited)
    pub max_files: usize,
    /// Maximum file size in bytes to scan (skip larger files)
    pub max_file_size: u64,
    /// Timeout for the entire analysis in milliseconds (0 = no timeout)
    pub timeout_ms: u64,
    /// Whether to follow symbolic links
    pub follow_symlinks: bool,
    /// Whether to include hidden files
    pub include_hidden: bool,
    /// File extensions to scan (empty = use defaults)
    pub extensions: Vec<String>,
    /// Additional directories to ignore
    pub ignore_dirs: Vec<String>,
    /// Number of threads to use (0 = auto)
    pub num_threads: usize,
}

impl Default for AnalyzerConfig {
    fn default() -> Self {
        Self {
            max_files: 10_000,
            max_file_size: 10 * 1024 * 1024, // 10 MB
            timeout_ms: 30_000,              // 30 seconds
            follow_symlinks: false,
            include_hidden: false,
            extensions: vec![],
            ignore_dirs: vec![
                "node_modules".into(),
                "target".into(),
                ".git".into(),
                "__pycache__".into(),
                "venv".into(),
                ".venv".into(),
                "dist".into(),
                "build".into(),
                ".cache".into(),
            ],
            num_threads: 0, // Auto-detect
        }
    }
}

impl AnalyzerConfig {
    /// Creates a fast config for quick scans
    pub fn fast() -> Self {
        Self {
            max_files: 1_000,
            max_file_size: 1024 * 1024, // 1 MB
            timeout_ms: 5_000,          // 5 seconds
            ..Default::default()
        }
    }

    /// Creates a thorough config for complete scans
    pub fn thorough() -> Self {
        Self {
            max_files: 0,                    // Unlimited
            max_file_size: 50 * 1024 * 1024, // 50 MB
            timeout_ms: 120_000,             // 2 minutes
            include_hidden: true,
            ..Default::default()
        }
    }

    /// Get default extensions for code files
    fn default_extensions() -> Vec<&'static str> {
        vec![
            "py", "js", "ts", "jsx", "tsx", "rs", "go", "rb", "java", "kt", "swift", "c", "cpp",
            "h", "hpp", "cs", "php", "sh", "bash", "zsh", "fish", "yaml", "yml", "json", "toml",
            "env", "conf", "cfg", "ini", "md", "txt", "sql", "graphql", "prisma",
        ]
    }
}

// ============================================================================
// Results
// ============================================================================

/// Exposure type detected
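///
/// A doctest sketch of the `Display` labels (assuming the crate-root
/// re-export used in the module-level example):
///
/// ```
/// use token_analyzer::ExposureType;
///
/// // `Display` renders a short human-readable label for reports.
/// assert_eq!(ExposureType::LoggedOutput.to_string(), "Logged/printed");
/// ```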
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExposureType {
    /// Hardcoded value in source code
    HardcodedValue,
    /// Logged or printed to output
    LoggedOutput,
    /// Found in environment file (.env)
    EnvironmentFile,
    /// Found in configuration file
    ConfigFile,
    /// High entropy string (likely real secret)
    HighEntropy,
    /// Known token prefix detected
    KnownTokenPrefix(String),
}

impl std::fmt::Display for ExposureType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ExposureType::HardcodedValue => write!(f, "Hardcoded value"),
            ExposureType::LoggedOutput => write!(f, "Logged/printed"),
            ExposureType::EnvironmentFile => write!(f, "In .env file"),
            ExposureType::ConfigFile => write!(f, "In config file"),
            ExposureType::HighEntropy => write!(f, "High entropy (likely real secret)"),
            ExposureType::KnownTokenPrefix(prefix) => write!(f, "Known prefix: {}", prefix),
        }
    }
}

/// Detailed exposure information
#[derive(Debug, Clone)]
pub struct ExposureDetail {
    /// Line number where exposure was detected
    pub line: usize,
    /// Type of exposure
    pub exposure_type: ExposureType,
    /// The actual content that was matched (redacted if sensitive)
    pub context: String,
}

/// Analysis report for a single file
#[derive(Debug, Clone)]
pub struct FileAnalysis {
    /// Path to the file
    pub path: PathBuf,
    /// Number of token occurrences in this file
    pub call_count: usize,
    /// Whether the token appears to be exposed (print, log, etc.)
    pub has_exposure: bool,
    /// Risk level based on file type
    pub risk_level: RiskLevel,
    /// Computed risk score (call_count * risk_multiplier)
    pub risk_score: usize,
    /// Detailed exposure information
    pub exposures: Vec<ExposureDetail>,
    /// Line numbers where exposure was detected (legacy compatibility)
    pub exposure_lines: Vec<usize>,
    /// Line numbers of all occurrences
    pub occurrence_lines: Vec<usize>,
}

/// Complete analysis report
#[derive(Debug, Clone)]
pub struct AnalysisReport {
    /// Token that was analyzed
    pub token_name: String,
    /// Directory that was scanned
    pub search_dir: PathBuf,
    /// Total number of calls found
    pub total_calls: usize,
    /// Number of files with exposure warnings
    pub exposure_count: usize,
    /// Total risk score across all files
    pub total_risk_score: usize,
    /// Number of critical-risk files found
    pub critical_files: usize,
    /// Per-file analysis results
    pub files: Vec<FileAnalysis>,
    /// Time taken for the analysis
    pub duration: Duration,
    /// Number of files scanned
    pub files_scanned: usize,
    /// Whether the analysis was truncated due to limits
    pub truncated: bool,
    /// Error messages encountered during scan
    pub errors: Vec<String>,
}

impl AnalysisReport {
    /// Returns files sorted by risk score (highest first), then exposure, then call count
    pub fn files_sorted(&self) -> Vec<&FileAnalysis> {
        let mut sorted: Vec<_> = self.files.iter().collect();
        sorted.sort_by(|a, b| {
            // Risk score first, then exposure, then call count
            b.risk_score
                .cmp(&a.risk_score)
                .then_with(|| b.has_exposure.cmp(&a.has_exposure))
                .then_with(|| b.call_count.cmp(&a.call_count))
        });
        sorted
    }

    /// Returns only files with exposure warnings
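    ///
    /// # Example
    ///
    /// A `no_run` sketch of triaging a report; the token name and path are
    /// placeholders:
    ///
    /// ```no_run
    /// use token_analyzer::{AnalyzerConfig, TokenSecurityAnalyzer};
    /// use std::path::PathBuf;
    ///
    /// let analyzer = TokenSecurityAnalyzer::new(AnalyzerConfig::default());
    /// let report = analyzer.analyze("API_KEY", &PathBuf::from(".")).unwrap();
    /// for file in report.exposed_files() {
    ///     eprintln!("{}: {} exposure(s)", file.path.display(), file.exposures.len());
    /// }
    /// ```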
    pub fn exposed_files(&self) -> Vec<&FileAnalysis> {
        self.files.iter().filter(|f| f.has_exposure).collect()
    }

    /// Returns files at critical or high risk level
    pub fn high_risk_files(&self) -> Vec<&FileAnalysis> {
        self.files
            .iter()
            .filter(|f| f.risk_level >= RiskLevel::High)
            .collect()
    }

    /// Check if any exposure was found
    pub fn has_security_issues(&self) -> bool {
        self.exposure_count > 0
    }

    /// Check if critical issues were found
    pub fn has_critical_issues(&self) -> bool {
        self.files
            .iter()
            .any(|f| f.has_exposure && f.risk_level == RiskLevel::Critical)
    }
}

// ============================================================================
// Analyzer
// ============================================================================

/// Token Security Analyzer
///
/// Scans directories for token usage and identifies security risks.
pub struct TokenSecurityAnalyzer {
    config: AnalyzerConfig,
}

impl TokenSecurityAnalyzer {
    /// Creates a new analyzer with the given configuration
    pub fn new(config: AnalyzerConfig) -> Self {
        Self { config }
    }

    /// Creates an analyzer with default configuration
    pub fn default_analyzer() -> Self {
        Self::new(AnalyzerConfig::default())
    }

    /// Analyzes token usage in the specified directory
    pub fn analyze(&self, token_name: &str, search_dir: &Path) -> Result<AnalysisReport> {
        let start = Instant::now();
        let timeout = if self.config.timeout_ms > 0 {
            Some(Duration::from_millis(self.config.timeout_ms))
        } else {
            None
        };

        // Validate inputs
        if token_name.is_empty() {
            anyhow::bail!("Token name cannot be empty");
        }
        if !search_dir.exists() {
            anyhow::bail!("Search directory does not exist: {}", search_dir.display());
        }

        // Build the file walker
        let files = self.collect_files(search_dir, &start, timeout)?;
        let files_scanned = files.len();
        let hit_file_limit = self.config.max_files > 0 && files_scanned >= self.config.max_files;

        // Check timeout before processing
        if let Some(t) = timeout {
            if start.elapsed() >= t {
                return Ok(self.timeout_report(token_name, search_dir, start));
            }
        }

        // Build regex patterns
        let patterns = self.build_patterns(token_name)?;

        // Parallel analysis (also reports whether it was cut short by the timeout)
        let (results, timed_out) =
            self.analyze_files_parallel(&files, &patterns, &start, timeout)?;

        // Build report
        let total_calls: usize = results.iter().map(|f| f.call_count).sum();
        let exposure_count = results.iter().filter(|f| f.has_exposure).count();
        let total_risk_score: usize = results.iter().map(|f| f.risk_score).sum();
        let critical_files = results
            .iter()
            .filter(|f| f.risk_level == RiskLevel::Critical)
            .count();

        Ok(AnalysisReport {
            token_name: token_name.to_string(),
            search_dir: search_dir.to_path_buf(),
            total_calls,
            exposure_count,
            total_risk_score,
            critical_files,
            files: results,
            duration: start.elapsed(),
            files_scanned,
            truncated: hit_file_limit || timed_out,
            errors: if timed_out {
                vec!["Analysis timed out before all files were scanned".to_string()]
            } else {
                vec![]
            },
        })
    }

    /// Determines the risk level for a file based on its path and name
    fn get_file_risk_level(path: &Path) -> RiskLevel {
        let filename = path
            .file_name()
            .map(|n| n.to_string_lossy().to_lowercase())
            .unwrap_or_default();
        let path_str = path.to_string_lossy().to_lowercase();

        // Check for critical patterns (`contains` also covers prefix matches)
        for pattern in CRITICAL_FILE_PATTERNS {
            if filename.contains(pattern) {
                return RiskLevel::Critical;
            }
        }

        // Check for high-risk patterns
        for pattern in HIGH_RISK_FILE_PATTERNS {
            if filename.contains(pattern) || path_str.contains(pattern) {
                return RiskLevel::High;
            }
        }

        // Check for medium-risk extensions
        if let Some(ext) = path.extension() {
            let ext_str = ext.to_string_lossy().to_lowercase();
            if MEDIUM_RISK_EXTENSIONS.contains(&ext_str.as_str()) {
                return RiskLevel::Medium;
            }
        }

        RiskLevel::Low
    }

    /// Calculates Shannon entropy of a string (higher = more random = more likely a real secret)
    fn calculate_entropy(s: &str) -> f64 {
        if s.is_empty() {
            return 0.0;
        }

        let mut char_counts = std::collections::HashMap::new();
        for c in s.chars() {
            *char_counts.entry(c).or_insert(0) += 1;
        }

        // Use the character count (not the byte length) so non-ASCII input
        // does not skew the probabilities.
        let len = s.chars().count() as f64;
        let mut entropy = 0.0;

        for count in char_counts.values() {
            let p = *count as f64 / len;
            entropy -= p * p.log2();
        }

        entropy
    }

    /// Checks if a string appears to be a high-entropy secret
    fn is_high_entropy_secret(value: &str) -> bool {
        // Minimum length for a real secret
        if value.len() < 8 {
            return false;
        }

        // Skip obvious placeholders
        let lower = value.to_lowercase();
        if lower.contains("example")
            || lower.contains("placeholder")
            || lower.contains("your_")
            || lower.contains("xxx")
            || lower.contains("todo")
            || lower.contains("replace")
            || lower == "test"
            || lower == "secret"
            || lower == "password"
        {
            return false;
        }

        // Calculate entropy - real secrets typically have entropy > 3.5
        let entropy = Self::calculate_entropy(value);
        entropy > 3.5
    }

    /// Checks if a value matches a known token prefix
    fn detect_known_prefix(value: &str) -> Option<&'static str> {
        // First match wins, so KNOWN_TOKEN_PREFIXES keeps longer prefixes
        // ahead of the shorter ones they overlap with.
        for (prefix, description) in KNOWN_TOKEN_PREFIXES {
            if value.starts_with(prefix) {
                return Some(*description);
            }
        }
        None
    }

    /// Collects files to analyze
    fn collect_files(
        &self,
        search_dir: &Path,
        start: &Instant,
        timeout: Option<Duration>,
    ) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();
        let extensions: Vec<&str> = if self.config.extensions.is_empty() {
            AnalyzerConfig::default_extensions()
        } else {
            self.config.extensions.iter().map(|s| s.as_str()).collect()
        };

        let mut builder = WalkBuilder::new(search_dir);
        builder
            .hidden(!self.config.include_hidden)
            .follow_links(self.config.follow_symlinks)
            .git_ignore(true)
            .git_global(true)
            .git_exclude(true);

        // Set thread count for parallel walking
        if self.config.num_threads > 0 {
            builder.threads(self.config.num_threads);
        }

        for result in builder.build() {
            // Check timeout
            if let Some(t) = timeout {
                if start.elapsed() >= t {
                    break;
                }
            }

            // Check file limit
            if self.config.max_files > 0 && files.len() >= self.config.max_files {
                break;
            }

            let entry = match result {
                Ok(e) => e,
                Err(_) => continue,
            };

            let path = entry.path();

            // Skip directories
            if path.is_dir() {
                continue;
            }

            // Check ignored directories
            if self.is_ignored_dir(path) {
                continue;
            }

            // Check if file should be included based on extension or critical pattern
            let filename = path
                .file_name()
                .map(|n| n.to_string_lossy().to_lowercase())
                .unwrap_or_default();

            // Always include critical files (like .env) regardless of extension
            let is_critical = CRITICAL_FILE_PATTERNS.iter().any(|p| filename.contains(p));

            if !is_critical {
                // Check extension for non-critical files
                if let Some(ext) = path.extension() {
                    let ext_str = ext.to_string_lossy().to_lowercase();
                    if !extensions.contains(&ext_str.as_str()) {
                        continue;
                    }
                } else {
                    // No extension and not critical - skip
                    continue;
                }
            }

            // Check file size
            if let Ok(metadata) = path.metadata() {
                if metadata.len() > self.config.max_file_size {
                    continue;
                }
            }

            files.push(path.to_path_buf());
        }

        Ok(files)
    }

    /// Checks if a path is in an ignored directory
    fn is_ignored_dir(&self, path: &Path) -> bool {
        for component in path.components() {
            if let std::path::Component::Normal(name) = component {
                let name_str = name.to_string_lossy();
                if self
                    .config
                    .ignore_dirs
                    .iter()
                    .any(|d| d == name_str.as_ref())
                {
                    return true;
                }
            }
        }
        false
    }

    /// Builds regex patterns for token detection
    fn build_patterns(&self, token_name: &str) -> Result<AnalysisPatterns> {
        // Escape special regex characters in token name
        let escaped = regex::escape(token_name);

        // Main pattern: exact token match (word boundary)
        let token_pattern = format!(r"\b{}\b", escaped);
        let token_regex = Regex::new(&token_pattern)
            .map_err(|e| anyhow::anyhow!("Failed to build token regex: {}", e))?;

        // Exposure patterns: detect dangerous usage.
        // These patterns match lines that could expose the token value.
        // Note: we explicitly match string literals with quotes to avoid false
        // positives from safe patterns like os.environ.get("TOKEN") or
        // process.env.TOKEN.
        let exposure_patterns = [
            // === HARDCODED VALUES (most critical) ===
            // Direct assignment with string value: TOKEN="value", TOKEN='value', TOKEN = "value"
            // This catches hardcoded secrets but NOT environment variable reads
            format!(r#"\b{}\b\s*=\s*["'][^"']+["']"#, escaped),
            // Dict/object literal with hardcoded value: "TOKEN": "value", 'TOKEN': 'value'
            format!(r#"["']{}\s*["']\s*:\s*["'][^"']+["']"#, escaped),
            // === LOGGING/PRINT STATEMENTS ===
            // Print statements
            format!(
                r"(?i)(print|println!?|printf|echo|puts)\s*[\(\[].*\b{}\b",
                escaped
            ),
            // Console logging (JS/TS)
            format!(
                r"(?i)console\.(log|info|warn|error|debug)\s*\(.*\b{}\b",
                escaped
            ),
            // Python logging
            format!(
                r"(?i)(logging\.|logger\.)(info|debug|warning|error|critical)\s*\(.*\b{}\b",
                escaped
            ),
            // Rust logging
            format!(
                r"(?i)(log::)?(info!|debug!|warn!|error!|trace!)\s*\(.*\b{}\b",
                escaped
            ),
            // Generic log calls
            format!(r"(?i)\blog\s*[\(\[].*\b{}\b", escaped),
            // Write to stdout/stderr
            format!(
                r"(?i)(stdout|stderr|write|writeln!?)\s*[\(\[].*\b{}\b",
                escaped
            ),
            // Format strings with the token (f-strings, format!)
            format!(r#"(?i)f["'].*\b{}\b"#, escaped),
            format!(r"(?i)format!\s*\(.*\b{}\b", escaped),
        ];

        let exposure_regex = Regex::new(&exposure_patterns.join("|"))
            .map_err(|e| anyhow::anyhow!("Failed to build exposure regex: {}", e))?;

        Ok(AnalysisPatterns {
            token_name: token_name.to_string(),
            token_regex,
            exposure_regex,
        })
    }

    /// Analyzes files in parallel, returning the per-file results and whether
    /// the scan was cut short by the timeout.
    fn analyze_files_parallel(
        &self,
        files: &[PathBuf],
        patterns: &AnalysisPatterns,
        start: &Instant,
        timeout: Option<Duration>,
    ) -> Result<(Vec<FileAnalysis>, bool)> {
        let results: Arc<Mutex<Vec<FileAnalysis>>> = Arc::new(Mutex::new(Vec::new()));
        let timed_out = Arc::new(Mutex::new(false));

        files.par_iter().for_each(|file| {
            // Check timeout
            if let Some(t) = timeout {
                if start.elapsed() >= t {
                    *timed_out.lock() = true;
                    return;
                }
            }

            if *timed_out.lock() {
                return;
            }

            if let Ok(analysis) = self.analyze_file(file, patterns) {
                if analysis.call_count > 0 {
                    results.lock().push(analysis);
                }
            }
        });

        let was_timed_out = *timed_out.lock();
        let inner = Arc::try_unwrap(results)
            .map(|m| m.into_inner())
            .unwrap_or_else(|arc| arc.lock().clone());

        Ok((inner, was_timed_out))
    }

    /// Analyzes a single file with advanced detection
    fn analyze_file(&self, path: &Path, patterns: &AnalysisPatterns) -> Result<FileAnalysis> {
        let content = fs::read_to_string(path)?;
        let risk_level = Self::get_file_risk_level(path);
        let is_env_file = path
            .file_name()
            .map(|n| n.to_string_lossy().to_lowercase().contains(".env"))
            .unwrap_or(false);
        let is_config_file = risk_level >= RiskLevel::Medium;

        let mut call_count = 0;
        let mut occurrence_lines = Vec::new();
        let mut exposures: Vec<ExposureDetail> = Vec::new();

        // Regex to extract values from assignments
        let value_pattern =
            Regex::new(r#"[=:]\s*["']([^"']+)["']|[=:]\s*([a-zA-Z0-9_\-./+]{8,})"#).ok();

        for (line_num, line) in content.lines().enumerate() {
            let line_number = line_num + 1; // 1-indexed

            // Skip comments
            let trimmed = line.trim();
            if trimmed.starts_with('#')
                || trimmed.starts_with("//")
                || trimmed.starts_with("/*")
                || trimmed.starts_with('*')
            {
                continue;
            }

            // Count occurrences of the token in this line
            let matches: Vec<_> = patterns.token_regex.find_iter(line).collect();
            if matches.is_empty() {
                continue;
            }

            call_count += matches.len();
            occurrence_lines.push(line_number);

            // === Advanced exposure detection ===

            // 1. Check for .env file exposure (any assignment is dangerous)
            if is_env_file {
                if let Some(ref vp) = value_pattern {
                    if let Some(caps) = vp.captures(line) {
                        let value = caps.get(1).or(caps.get(2)).map(|m| m.as_str());
                        if let Some(v) = value {
                            // Check for known token prefix
                            if let Some(prefix_desc) = Self::detect_known_prefix(v) {
                                exposures.push(ExposureDetail {
                                    line: line_number,
                                    exposure_type: ExposureType::KnownTokenPrefix(
                                        prefix_desc.to_string(),
                                    ),
                                    context: Self::redact_value(v),
                                });
                            } else if Self::is_high_entropy_secret(v) {
                                exposures.push(ExposureDetail {
                                    line: line_number,
                                    exposure_type: ExposureType::HighEntropy,
                                    context: Self::redact_value(v),
                                });
                            } else {
                                exposures.push(ExposureDetail {
                                    line: line_number,
                                    exposure_type: ExposureType::EnvironmentFile,
                                    // Use the plain token name (not the compiled
                                    // regex) for a readable context.
                                    context: format!("{}=***", patterns.token_name),
                                });
                            }
                        }
                    }
                }
                continue;
            }

            // 2. Check for hardcoded values in config files
            if is_config_file {
                if let Some(ref vp) = value_pattern {
                    if let Some(caps) = vp.captures(line) {
                        let value = caps.get(1).or(caps.get(2)).map(|m| m.as_str());
                        if let Some(v) = value {
                            // Skip environment variable references
                            if v.starts_with('$')
                                || v.contains("env.")
                                || v.contains("ENV[")
                                || v.contains("getenv")
                            {
                                continue;
                            }

                            if let Some(prefix_desc) = Self::detect_known_prefix(v) {
                                exposures.push(ExposureDetail {
                                    line: line_number,
                                    exposure_type: ExposureType::KnownTokenPrefix(
                                        prefix_desc.to_string(),
                                    ),
                                    context: Self::redact_value(v),
                                });
                            } else if Self::is_high_entropy_secret(v) {
                                exposures.push(ExposureDetail {
                                    line: line_number,
                                    exposure_type: ExposureType::HighEntropy,
                                    context: Self::redact_value(v),
                                });
                            } else {
                                exposures.push(ExposureDetail {
                                    line: line_number,
                                    exposure_type: ExposureType::ConfigFile,
                                    context: format!(
                                        "Hardcoded in {}",
                                        risk_level_name(risk_level)
                                    ),
                                });
                            }
                        }
                    }
                }
            }

            // 3. Standard exposure pattern check (logging, hardcoded values)
            if patterns.exposure_regex.is_match(line) {
                // Determine exposure type; lowercase once instead of per check
                let lower = line.to_lowercase();
                let exposure_type = if lower.contains("log")
                    || lower.contains("print")
                    || lower.contains("console")
                    || lower.contains("echo")
                {
                    ExposureType::LoggedOutput
                } else {
                    ExposureType::HardcodedValue
                };

                // Avoid duplicates
                if !exposures.iter().any(|e| e.line == line_number) {
                    exposures.push(ExposureDetail {
                        line: line_number,
                        exposure_type,
                        context: Self::truncate_line(line),
                    });
                }
            }
        }

        let exposure_lines: Vec<usize> = exposures.iter().map(|e| e.line).collect();
        let risk_score = call_count * risk_level.multiplier();

        Ok(FileAnalysis {
            path: path.to_path_buf(),
            call_count,
            has_exposure: !exposures.is_empty(),
            risk_level,
            risk_score,
            exposures,
            exposure_lines,
            occurrence_lines,
        })
    }

    /// Redacts a secret value for safe display
    fn redact_value(value: &str) -> String {
        // Work in characters, not bytes, so multi-byte input cannot panic.
        let chars: Vec<char> = value.chars().collect();
        if chars.len() <= 8 {
            return "***".to_string();
        }
        let prefix: String = chars[..4].iter().collect();
        let suffix: String = chars[chars.len() - 4..].iter().collect();
        format!("{}...{}", prefix, suffix)
    }

    /// Truncates a line for display
    fn truncate_line(line: &str) -> String {
        let trimmed = line.trim();
        if trimmed.chars().count() <= 50 {
            trimmed.to_string()
        } else {
            // Truncate on a character boundary to avoid slicing mid-codepoint.
            let head: String = trimmed.chars().take(47).collect();
            format!("{}...", head)
        }
    }

    /// Creates a timeout report
    fn timeout_report(
        &self,
        token_name: &str,
        search_dir: &Path,
        start: Instant,
    ) -> AnalysisReport {
        AnalysisReport {
            token_name: token_name.to_string(),
            search_dir: search_dir.to_path_buf(),
            total_calls: 0,
            exposure_count: 0,
            total_risk_score: 0,
            critical_files: 0,
            files: vec![],
            duration: start.elapsed(),
            files_scanned: 0,
            truncated: true,
            errors: vec!["Analysis timed out".to_string()],
        }
    }
}

/// Helper to get a readable name for risk level
fn risk_level_name(level: RiskLevel) -> &'static str {
    match level {
        RiskLevel::Low => "source file",
        RiskLevel::Medium => "config file",
        RiskLevel::High => "sensitive config",
        RiskLevel::Critical => "secrets file",
    }
}

/// Internal struct for the compiled detection patterns
struct AnalysisPatterns {
    /// Original token name, kept for readable report contexts
    token_name: String,
    token_regex: Regex,
    exposure_regex: Regex,
}

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    fn setup_test_dir() -> TempDir {
        let dir = TempDir::new().unwrap();

        // Create test files
        fs::write(
            dir.path().join("config.py"),
            r#"
import os
API_KEY = os.getenv("API_KEY")
db_url = f"postgres://{API_KEY}@localhost/db"
"#,
        )
        .unwrap();

        fs::write(
            dir.path().join("main.js"),
            r#"
const API_KEY = process.env.API_KEY;
console.log("API Key:", API_KEY);
fetch(url, { headers: { "Authorization": API_KEY } });
"#,
        )
        .unwrap();

        fs::write(
            dir.path().join("safe.rs"),
            r#"
let api_key = std::env::var("API_KEY")?;
client.set_header("Authorization", &api_key);
"#,
        )
        .unwrap();

        fs::write(
            dir.path().join("debug.py"),
            r#"
import logging
logger = logging.getLogger(__name__)
logger.debug(f"Using API_KEY: {API_KEY}")
print(f"Debug: API_KEY = {API_KEY}")
"#,
        )
        .unwrap();

        // Create a subdirectory with more files
        let subdir = dir.path().join("src");
        fs::create_dir(&subdir).unwrap();
        fs::write(
            subdir.join("api.ts"),
            r#"
export const API_KEY = process.env.API_KEY;
export function getHeaders() {
    return { "X-API-Key": API_KEY };
}
"#,
        )
        .unwrap();

        dir
    }

    #[test]
    fn test_analyzer_finds_token_occurrences() {
        let dir = setup_test_dir();
        let analyzer = TokenSecurityAnalyzer::default_analyzer();

        let report = analyzer.analyze("API_KEY", dir.path()).unwrap();

        assert!(report.total_calls > 0, "Should find token occurrences");
        assert!(!report.files.is_empty(), "Should have files with matches");
    }

    #[test]
    fn test_analyzer_detects_exposure() {
        let dir = setup_test_dir();
        let analyzer = TokenSecurityAnalyzer::default_analyzer();

        let report = analyzer.analyze("API_KEY", dir.path()).unwrap();

        assert!(report.exposure_count > 0, "Should detect exposure");
        assert!(report.has_security_issues(), "Should have security issues");

        // Check specific exposure files
        let exposed = report.exposed_files();
        let exposed_paths: Vec<_> = exposed
            .iter()
            .map(|f| f.path.file_name().unwrap().to_string_lossy().to_string())
            .collect();

        assert!(
            exposed_paths.iter().any(|p| p == "main.js"),
            "main.js should be exposed (console.log)"
        );
        assert!(
            exposed_paths.iter().any(|p| p == "debug.py"),
            "debug.py should be exposed (logger.debug, print)"
        );
    }

    #[test]
    fn test_analyzer_respects_word_boundaries() {
        let dir = TempDir::new().unwrap();

        fs::write(
            dir.path().join("test.py"),
            r#"
API_KEY_NAME = "test"
MY_API_KEY = "value"
API_KEY = "secret"
"#,
        )
        .unwrap();

        let analyzer = TokenSecurityAnalyzer::default_analyzer();
        let report = analyzer.analyze("API_KEY", dir.path()).unwrap();

        // Should only find exact "API_KEY", not "API_KEY_NAME" or "MY_API_KEY"
        assert_eq!(report.total_calls, 1, "Should match exact token only");
    }
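
    // A sketch of exposure classification on a hardcoded assignment; the
    // fixture value below is synthetic, not a real credential.
    #[test]
    fn test_analyzer_flags_hardcoded_assignment() {
        let dir = TempDir::new().unwrap();

        fs::write(
            dir.path().join("hardcoded.py"),
            "API_KEY = \"not-a-real-value-123\"\n",
        )
        .unwrap();

        let analyzer = TokenSecurityAnalyzer::default_analyzer();
        let report = analyzer.analyze("API_KEY", dir.path()).unwrap();

        assert!(report.has_security_issues(), "Hardcoded value should be flagged");
        assert!(report.files[0]
            .exposures
            .iter()
            .any(|e| e.exposure_type == ExposureType::HardcodedValue));
    }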

    #[test]
    fn test_analyzer_config_fast() {
        let config = AnalyzerConfig::fast();
        assert_eq!(config.max_files, 1_000);
        assert_eq!(config.timeout_ms, 5_000);
    }

    #[test]
    fn test_analyzer_config_thorough() {
        let config = AnalyzerConfig::thorough();
        assert_eq!(config.max_files, 0);
        assert!(config.include_hidden);
    }
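
    // A sketch exercising the private entropy helpers (callable from this
    // child module); the 3.5 threshold mirrors is_high_entropy_secret.
    #[test]
    fn test_entropy_heuristics() {
        // A single repeated character carries zero entropy.
        assert!(TokenSecurityAnalyzer::calculate_entropy("aaaaaaaa").abs() < 1e-12);

        // Placeholders and short strings are rejected regardless of entropy.
        assert!(!TokenSecurityAnalyzer::is_high_entropy_secret("your_api_key_here"));
        assert!(!TokenSecurityAnalyzer::is_high_entropy_secret("short"));

        // A mixed-character string (synthetic, not a real secret) passes.
        assert!(TokenSecurityAnalyzer::is_high_entropy_secret("q9Zx2mK8vL4pW7nR"));
    }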

    #[test]
    fn test_analyzer_empty_token() {
        let dir = TempDir::new().unwrap();
        let analyzer = TokenSecurityAnalyzer::default_analyzer();

        let result = analyzer.analyze("", dir.path());
        assert!(result.is_err(), "Should reject empty token");
    }
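
    // A sketch of known-prefix detection; the token strings are fabricated.
    #[test]
    fn test_detect_known_prefix() {
        assert_eq!(
            TokenSecurityAnalyzer::detect_known_prefix("ghp_0000000000000000000000000000000000"),
            Some("GitHub Personal Access Token")
        );
        // The longer Anthropic prefix must win over OpenAI's shorter "sk-".
        assert_eq!(
            TokenSecurityAnalyzer::detect_known_prefix("sk-ant-not-a-real-key"),
            Some("Anthropic API Key")
        );
        assert_eq!(TokenSecurityAnalyzer::detect_known_prefix("hello"), None);
    }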

    #[test]
    fn test_analyzer_nonexistent_dir() {
        let analyzer = TokenSecurityAnalyzer::default_analyzer();

        let result = analyzer.analyze("TOKEN", Path::new("/nonexistent/path"));
        assert!(result.is_err(), "Should reject nonexistent directory");
    }
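
    // A sketch of file-risk classification against the pattern tables above.
    #[test]
    fn test_file_risk_levels() {
        assert_eq!(
            TokenSecurityAnalyzer::get_file_risk_level(Path::new(".env")),
            RiskLevel::Critical
        );
        assert_eq!(
            TokenSecurityAnalyzer::get_file_risk_level(Path::new("config.yaml")),
            RiskLevel::High
        );
        assert_eq!(
            TokenSecurityAnalyzer::get_file_risk_level(Path::new("settings.toml")),
            RiskLevel::Medium
        );
        assert_eq!(
            TokenSecurityAnalyzer::get_file_risk_level(Path::new("main.rs")),
            RiskLevel::Low
        );
    }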

    #[test]
    fn test_analyzer_report_sorting() {
        let dir = setup_test_dir();
        let analyzer = TokenSecurityAnalyzer::default_analyzer();
        let report = analyzer.analyze("API_KEY", dir.path()).unwrap();

        let sorted = report.files_sorted();

        // Risk scores must be non-increasing after sorting.
        for pair in sorted.windows(2) {
            assert!(
                pair[0].risk_score >= pair[1].risk_score,
                "Files should be sorted by risk score, highest first"
            );
        }
    }
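
    // A sketch of value redaction; the inputs are arbitrary strings.
    #[test]
    fn test_redact_value() {
        assert_eq!(TokenSecurityAnalyzer::redact_value("abcd"), "***");
        assert_eq!(
            TokenSecurityAnalyzer::redact_value("abcdefghijkl"),
            "abcd...ijkl"
        );
        // Multi-byte input must not panic now that redaction is char-based.
        assert_eq!(
            TokenSecurityAnalyzer::redact_value("ééééééééé"),
            "éééé...éééé"
        );
    }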

    #[test]
    fn test_analyzer_ignores_node_modules() {
        let dir = TempDir::new().unwrap();

        // Create node_modules directory with matching file
        let nm = dir.path().join("node_modules");
        fs::create_dir(&nm).unwrap();
        fs::write(nm.join("test.js"), "const API_KEY = 'test';").unwrap();

        // Create regular file
        fs::write(dir.path().join("main.js"), "const API_KEY = 'test';").unwrap();

        let analyzer = TokenSecurityAnalyzer::default_analyzer();
        let report = analyzer.analyze("API_KEY", dir.path()).unwrap();

        // Should only find in main.js, not node_modules
        assert_eq!(report.files.len(), 1);
        assert_eq!(report.files[0].path.file_name().unwrap(), "main.js");
    }

    #[test]
    fn test_analyzer_performance_metrics() {
        let dir = setup_test_dir();
        let analyzer = TokenSecurityAnalyzer::default_analyzer();

        let report = analyzer.analyze("API_KEY", dir.path()).unwrap();

        assert!(
            report.duration.as_millis() < 5000,
            "Analysis should complete quickly (< 5s)"
        );
        assert!(report.files_scanned > 0, "Should report files scanned");
    }

    #[test]
    fn test_analyzer_multiple_occurrences_per_line() {
        let dir = TempDir::new().unwrap();

        fs::write(
            dir.path().join("test.py"),
            "x = API_KEY + API_KEY + API_KEY\n",
        )
        .unwrap();

        let analyzer = TokenSecurityAnalyzer::default_analyzer();
        let report = analyzer.analyze("API_KEY", dir.path()).unwrap();

        assert_eq!(
            report.total_calls, 3,
            "Should count all occurrences on same line"
        );
        assert_eq!(
            report.files[0].occurrence_lines.len(),
            1,
            "Should only have 1 line"
        );
    }
}