syncable_cli/analyzer/security/turbo/
pattern_engine.rs

1//! # Pattern Engine Module
2//! 
3//! Ultra-fast multi-pattern matching using Aho-Corasick algorithm and compiled regex sets.
4
5use std::sync::Arc;
6use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
7use regex::Regex;
8use ahash::AHashMap;
9use log::debug;
10
11use super::{TurboConfig, SecurityError};
12use crate::analyzer::security::{SecuritySeverity, SecurityCategory};
13
/// A compiled pattern for ultra-fast matching
///
/// Holds the descriptive metadata attached to a detection rule. The literal string or
/// regex that actually triggers the rule is stored next to this struct by the engine,
/// not inside it.
#[derive(Debug, Clone)]
pub struct CompiledPattern {
    /// Rule identifier (e.g. `"openai-api-key"`). The engine inspects it to choose a
    /// confidence threshold and to apply pattern-specific confidence adjustments.
    pub id: String,
    /// Human-readable rule name (e.g. `"OpenAI API Key"`).
    pub name: String,
    /// Severity assigned to findings produced by this rule.
    pub severity: SecuritySeverity,
    /// Security category assigned to findings produced by this rule.
    pub category: SecurityCategory,
    /// Short description of what was detected.
    pub description: String,
    /// Suggested remediation steps.
    pub remediation: Vec<String>,
    /// Reference links for further reading; may be empty.
    pub references: Vec<String>,
    /// Related CWE identifier (e.g. `"CWE-798"`), if any.
    pub cwe_id: Option<String>,
    /// Keywords that raise confidence by 0.1 each when found (case-insensitively)
    /// anywhere in the scanned content.
    pub confidence_boost_keywords: Vec<String>,
    /// Keywords that lower confidence by 0.4 each when found (case-insensitively)
    /// on the matched line.
    pub false_positive_keywords: Vec<String>,
}
28
/// Pattern match result
///
/// One finding produced by `PatternEngine::scan_content`.
#[derive(Debug, Clone)]
pub struct PatternMatch {
    /// The rule that produced this finding (shared, not copied).
    pub pattern: Arc<CompiledPattern>,
    /// 1-based line number of the match.
    pub line_number: usize,
    /// 1-based column of the match start, counted in bytes from the start of the line.
    pub column_number: usize,
    /// The matched line with the matched span replaced by asterisks and outer
    /// whitespace trimmed.
    pub evidence: String,
    /// Confidence score, clamped to the range `0.0..=1.0`.
    pub confidence: f32,
}
38
/// High-performance pattern matching engine
///
/// Combines three Aho-Corasick literal matchers (one per pattern group) with a list of
/// regex-based patterns that are evaluated line by line.
pub struct PatternEngine {
    // Multi-pattern matchers
    // Each matcher is built from the literal strings of one pattern group.
    secret_matcher: AhoCorasick,
    env_var_matcher: AhoCorasick,
    api_key_matcher: AhoCorasick,

    // Pattern lookup maps
    // Keyed by the pattern's position in the list the corresponding matcher was built
    // from, so a matcher's pattern index can be resolved to its metadata.
    secret_patterns: AHashMap<usize, Arc<CompiledPattern>>,
    env_var_patterns: AHashMap<usize, Arc<CompiledPattern>>,
    api_key_patterns: AHashMap<usize, Arc<CompiledPattern>>,

    // Specialized matchers for complex patterns
    // Each regex is paired with the metadata of the rule it implements.
    complex_patterns: Vec<(Regex, Arc<CompiledPattern>)>,

    // Performance counters
    // Total number of loaded patterns across all four groups, fixed at construction.
    total_patterns: usize,
}
57
58impl PatternEngine {
59    pub fn new(config: &TurboConfig) -> Result<Self, SecurityError> {
60        debug!("Initializing pattern engine with pattern sets: {:?}", config.pattern_sets);
61        
62        // Load patterns based on configuration
63        let (secret_patterns, env_var_patterns, api_key_patterns, complex_patterns) = 
64            Self::load_patterns(&config.pattern_sets)?;
65        
66        // Build Aho-Corasick matchers
67        let secret_matcher = Self::build_matcher(&secret_patterns)?;
68        let env_var_matcher = Self::build_matcher(&env_var_patterns)?;
69        let api_key_matcher = Self::build_matcher(&api_key_patterns)?;
70        
71        let total_patterns = secret_patterns.len() + env_var_patterns.len() + 
72                           api_key_patterns.len() + complex_patterns.len();
73        
74        debug!("Pattern engine initialized with {} total patterns", total_patterns);
75        
76        Ok(Self {
77            secret_matcher,
78            env_var_matcher,
79            api_key_matcher,
80            secret_patterns: Self::create_pattern_map(secret_patterns),
81            env_var_patterns: Self::create_pattern_map(env_var_patterns),
82            api_key_patterns: Self::create_pattern_map(api_key_patterns),
83            complex_patterns,
84            total_patterns,
85        })
86    }
87    
    /// Get total pattern count
    ///
    /// Returns the number of patterns counted when the engine was constructed: literal
    /// patterns of all three matcher groups plus the regex-based patterns.
    pub fn pattern_count(&self) -> usize {
        self.total_patterns
    }
92    
93    /// Scan content for all patterns
94    pub fn scan_content(&self, content: &str, quick_reject: bool, file_meta: &super::file_discovery::FileMetadata) -> Vec<PatternMatch> {
95        // Quick reject using Boyer-Moore substring search
96        if quick_reject && !self.quick_contains_secrets(content) {
97            return Vec::new();
98        }
99        
100        let mut matches = Vec::new();
101        
102        // Split content into lines for line number tracking
103        let lines: Vec<&str> = content.lines().collect();
104        let mut line_offsets = vec![0];
105        let mut offset = 0;
106        
107        for line in &lines {
108            offset += line.len() + 1; // +1 for newline
109            line_offsets.push(offset);
110        }
111        
112        // Run multi-pattern matchers
113        matches.extend(self.run_matcher(&self.secret_matcher, content, &self.secret_patterns, &lines, &line_offsets, file_meta));
114        matches.extend(self.run_matcher(&self.env_var_matcher, content, &self.env_var_patterns, &lines, &line_offsets, file_meta));
115        matches.extend(self.run_matcher(&self.api_key_matcher, content, &self.api_key_patterns, &lines, &line_offsets, file_meta));
116        
117        // Run complex patterns (regex-based)
118        for (line_num, line) in lines.iter().enumerate() {
119            for (regex, pattern) in &self.complex_patterns {
120                if let Some(mat) = regex.find(line) {
121                    let confidence = self.calculate_confidence(line, content, &pattern, file_meta);
122                    
123                    matches.push(PatternMatch {
124                        pattern: Arc::clone(pattern),
125                        line_number: line_num + 1,
126                        column_number: mat.start() + 1,
127                        evidence: self.extract_evidence(line, mat.start(), mat.end()),
128                        confidence,
129                    });
130                }
131            }
132        }
133        
134        // Intelligent confidence filtering - adaptive threshold based on pattern type
135        matches.retain(|m| {
136            let threshold = match m.pattern.id.as_str() {
137                id if id.contains("aws-access-key") => 0.4, // AWS keys need higher confidence
138                id if id.contains("openai-api-key") => 0.4, // OpenAI keys need higher confidence
139                id if id.contains("jwt-token") => 0.6, // JWT tokens need high confidence (often in examples)
140                id if id.contains("database-url") => 0.5, // Database URLs medium confidence
141                id if id.contains("bearer-token") => 0.7, // Bearer tokens often in examples
142                id if id.contains("generic") => 0.8, // Generic patterns need very high confidence
143                id if id.contains("long-secret-value") => 0.7, // Long secret values need high confidence
144                _ => 0.7, // Increased default threshold
145            };
146            m.confidence > threshold
147        });
148        
149        matches
150    }
151    
152    /// Quick check if content might contain secrets
153    fn quick_contains_secrets(&self, content: &str) -> bool {
154        // Enhanced quick rejection for common false positive patterns
155        if self.is_likely_false_positive_content(content) {
156            return false;
157        }
158        
159        // Common secret indicators (optimized for speed)
160        const QUICK_PATTERNS: &[&str] = &[
161            "api", "key", "secret", "token", "password", "credential",
162            "auth", "private", "-----BEGIN", "sk_", "pk_", "eyJ",
163        ];
164        
165        let content_lower = content.to_lowercase();
166        QUICK_PATTERNS.iter().any(|&pattern| content_lower.contains(pattern))
167    }
168    
169    /// Check if content is likely a false positive (encoded data, minified code, etc.)
170    fn is_likely_false_positive_content(&self, content: &str) -> bool {
171        let content_len = content.len();
172        
173        // Skip empty or very small content
174        if content_len < 10 {
175            return true;
176        }
177        
178        // Check for base64 data URLs (common in SVG, images)
179        if content.contains("data:image/") || content.contains("data:font/") {
180            return true;
181        }
182        
183        // Check for minified JavaScript (very long lines, no spaces)
184        let lines: Vec<&str> = content.lines().collect();
185        if lines.len() < 5 && lines.iter().any(|line| line.len() > 500 && line.matches(' ').count() < line.len() / 50) {
186            return true;
187        }
188        
189        // Check for high percentage of base64-like characters (but not a JWT)
190        let base64_chars = content.chars().filter(|c| c.is_alphanumeric() || *c == '+' || *c == '/' || *c == '=').count();
191        let base64_ratio = base64_chars as f32 / content_len as f32;
192        
193        // High base64 ratio but doesn't look like JWT tokens
194        if base64_ratio > 0.8 && !content.contains("eyJ") && content_len > 1000 {
195            return true;
196        }
197        
198        // Check for SVG content
199        if content.contains("<svg") || content.contains("xmlns=\"http://www.w3.org/2000/svg\"") {
200            return true;
201        }
202        
203        // Check for CSS content
204        if content.contains("@media") || content.contains("@import") || 
205           (content.contains("{") && content.contains("}") && content.contains(":")) {
206            return true;
207        }
208        
209        false
210    }
211    
212    /// Run Aho-Corasick matcher and collect results
213    fn run_matcher(
214        &self,
215        matcher: &AhoCorasick,
216        content: &str,
217        patterns: &AHashMap<usize, Arc<CompiledPattern>>,
218        lines: &[&str],
219        line_offsets: &[usize],
220        file_meta: &super::file_discovery::FileMetadata,
221    ) -> Vec<PatternMatch> {
222        let mut matches = Vec::new();
223        
224        for mat in matcher.find_iter(content) {
225            let pattern_id = mat.pattern().as_usize();
226            if let Some(pattern) = patterns.get(&pattern_id) {
227                // Find line and column
228                let (line_num, col_num) = self.offset_to_line_col(mat.start(), line_offsets);
229                let line = lines.get(line_num.saturating_sub(1)).unwrap_or(&"");
230                
231                let confidence = self.calculate_confidence(line, content, pattern, file_meta);
232                
233                matches.push(PatternMatch {
234                    pattern: Arc::clone(pattern),
235                    line_number: line_num,
236                    column_number: col_num,
237                    evidence: self.extract_evidence(line, mat.start(), mat.end()),
238                    confidence,
239                });
240            }
241        }
242        
243        matches
244    }
245    
246    /// Convert byte offset to line and column numbers
247    fn offset_to_line_col(&self, offset: usize, line_offsets: &[usize]) -> (usize, usize) {
248        let line_num = line_offsets.binary_search(&offset)
249            .unwrap_or_else(|i| i.saturating_sub(1));
250        
251        let line_start = line_offsets.get(line_num).copied().unwrap_or(0);
252        let col_num = offset - line_start + 1;
253        
254        (line_num + 1, col_num)
255    }
256    
257    /// Calculate confidence score for a match
258    fn calculate_confidence(&self, line: &str, content: &str, pattern: &CompiledPattern, file_meta: &super::file_discovery::FileMetadata) -> f32 {
259        let mut confidence: f32 = 0.6;
260        
261        let _line_lower = line.to_lowercase();
262        let _content_lower = content.to_lowercase();
263        
264        // Enhanced false positive detection
265        if self.is_obvious_false_positive(line, content, file_meta) {
266            return 0.0;
267        }
268        
269        // Context-based confidence adjustments
270        confidence = self.adjust_confidence_for_context(confidence, line, content, pattern);
271        
272        // Pattern-specific adjustments
273        confidence = self.adjust_confidence_for_pattern(confidence, line, content, pattern);
274        
275        confidence.clamp(0.0, 1.0)
276    }
277    
278    /// Check for obvious false positives
279    fn is_obvious_false_positive(&self, line: &str, content: &str, file_meta: &super::file_discovery::FileMetadata) -> bool {
280        let line_lower = line.to_lowercase();
281        
282        // Comments and documentation
283        if line_lower.trim_start().starts_with("//") || 
284           line_lower.trim_start().starts_with("#") ||
285           line_lower.trim_start().starts_with("*") ||
286           line_lower.trim_start().starts_with("<!--") {
287            return true;
288        }
289        
290        // Check for safe keys in common dependency management files
291        if self.is_safe_dependency_metadata(line, file_meta) {
292            return true;
293        }
294        
295        // JavaScript/TypeScript template literals (${...})
296        if line.contains("${") && line.contains("}") {
297            return true;
298        }
299        
300        // Template strings and interpolation patterns
301        if line.contains("${selectedApiKey") || line.contains("${apiKey") || 
302           line.contains("${key") || line.contains("${token") {
303            return true;
304        }
305        
306        // Code generation contexts (functions that generate example code)
307        if self.is_in_code_generation_context(content) && self.looks_like_template_code(line) {
308            return true;
309        }
310        
311        // Common example/placeholder patterns
312        let false_positive_patterns = [
313            "example", "placeholder", "your_", "todo", "fixme", "xxx",
314            "xxxxxxxx", "12345", "abcdef", "test", "demo", "sample",
315            "lorem", "ipsum", "change_me", "replace_me", "insert_",
316            "enter_your", "add_your", "put_your", "use_your",
317            // React/JSX specific patterns
318            "props.", "state.", "this.", "component",
319        ];
320        
321        if false_positive_patterns.iter().any(|&pattern| line_lower.contains(pattern)) {
322            return true;
323        }
324        
325        // Check for JSON schema or TypeScript interfaces
326        if line_lower.contains("@example") || line_lower.contains("@param") ||
327           line_lower.contains("interface") || line_lower.contains("type ") {
328            return true;
329        }
330        
331        // Check for base64 data URLs
332        if line.contains("data:image/") || line.contains("data:font/") || 
333           line.contains("data:application/") {
334            return true;
335        }
336        
337        // Check for URLs in an array context
338        if (line.contains("http://") || line.contains("https://")) && self.is_in_array_or_list(content) {
339            return true;
340        }
341        
342        // Check for command-line scripts which often contain high-entropy strings
343        // that are not secrets (e.g., project IDs, build hashes).
344        if self.is_command_line_script(line) {
345            return true;
346        }
347        
348        // Check for environment variable interpolations, which are secure.
349        if self.is_env_var_interpolation(line, file_meta) {
350            return true;
351        }
352        
353        // Check for minified content (very long line with little whitespace)
354        if line.len() > 200 && line.matches(' ').count() < line.len() / 20 {
355            return true;
356        }
357        
358        // React/JSX template patterns
359        if line.contains("return `") || line.contains("const ") && line.contains(" = `") {
360            return true;
361        }
362        
363        false
364    }
365    
366    /// Check if we're inside an array or list definition
367    fn is_in_array_or_list(&self, content: &str) -> bool {
368        let content_lower = content.to_lowercase();
369        // Language-agnostic checks for array/list definitions
370        let array_patterns = [
371            "const ", "let ", "var ", "export const ", "export let ",
372            "authorized_parties", "allowed_origins", "authorized_domains",
373            "hosts", "urls", "uris", "endpoints", "domains",
374            "redirect_uris", "allowed_hosts", "cors_origins",
375            "trusted_sources",
376        ];
377
378        array_patterns.iter().any(|p| content_lower.contains(p)) &&
379        (content.contains("[") && content.contains("]")) || // JS, Python, Rust arrays/lists
380        (content.contains("(") && content.contains(")")) || // Python tuples
381        (content.contains("{") && content.contains("}")) // Go slices
382    }
383    
384    /// Check if a line looks like a command-line script.
385    /// This is to avoid flagging project IDs, build hashes, or other identifiers
386    /// inside shell commands as secrets.
387    fn is_command_line_script(&self, line: &str) -> bool {
388        // Quick check for flags, which are a strong indicator of a shell command.
389        if !line.contains("--") {
390            return false;
391        }
392
393        let line_lower = line.to_lowercase();
394        
395        // Common script/command keywords.
396        // The presence of these alongside flags increases confidence that it's a script.
397        let command_keywords = [
398            // Verbs
399            "run", "exec", "build", "start", "test", "deploy", "gen", "generate",
400            "get", "set", "create", "delete", "update", "push", "pull", "watch",
401            "serve", "lint", "format",
402            
403            // Nouns/Context
404            "client", "server", "output", "input", "file", "env", "environment",
405            "config", "path", "dir", "port", "host", "watch", "prod", "dev",
406            
407            // Common tools
408            "npm", "yarn", "pnpm", "npx", "node", "python", "pip", "go", "cargo",
409            "docker", "aws", "gcloud", "az", "kubectl", "terraform", "encore", "bun", "bunx",
410            "maven", "gradle", "gradlew", "gradlew.bat", "gradlew.sh", "gradlew.jar", "gradlew.zip",
411            "mvn", "pipx", "pipenv", "poetry", "ruff", "black", "isort", "flake8", "mypy", "pytest",
412            "jest", "mocha", "jasmine", "cypress", "playwright", "selenium", "puppeteer", "webdriver",
413            "puppeteer-extra", "puppeteer-extra-plugin-stealth", "puppeteer-extra-plugin-recaptcha"
414        ];
415
416        // If we find a flag AND a common command keyword, it's very likely a script.
417        if command_keywords.iter().any(|&kw| line_lower.contains(kw)) {
418            return true;
419        }
420        
421        // Also consider it a script if it looks like a file path assignment after a flag
422        if line.contains("--") && (line.contains('/') || line.contains('\\') || line.contains('=')) {
423            return true;
424        }
425
426        false
427    }
428    
429    /// Check if we're in a code generation context
430    fn is_in_code_generation_context(&self, content: &str) -> bool {
431        let content_lower = content.to_lowercase();
432        
433        // Common code generation function names and patterns
434        let code_gen_patterns = [
435            "getcode", "generatecode", "codecomponent", "apicodedialog",
436            "const getcode", "function getcode", "const code", "function code",
437            "codesnippet", "codeexample", "template", "example code",
438            "code generator", "api example", "curl example",
439            // React/JSX specific
440            "codeblock", "copyblock", "syntax highlight"
441        ];
442        
443        code_gen_patterns.iter().any(|&pattern| content_lower.contains(pattern))
444    }
445    
446    /// Check if a line looks like template code
447    fn looks_like_template_code(&self, line: &str) -> bool {
448        // Template string patterns
449        if line.contains("return `") || line.contains("= `") {
450            return true;
451        }
452        
453        // API URL construction patterns
454        if line.contains("API_URL") || line.contains("/api/v1/") || line.contains("/prediction/") {
455            return true;
456        }
457        
458        // Typical code example patterns
459        if line.contains("requests.post") || line.contains("fetch(") || 
460           line.contains("curl ") || line.contains("import requests") {
461            return true;
462        }
463        
464        // Authorization header patterns in templates
465        if line.contains("Authorization:") || line.contains("Bearer ") {
466            return true;
467        }
468        
469        false
470    }
471    
472    /// Adjust confidence based on context
473    fn adjust_confidence_for_context(&self, mut confidence: f32, line: &str, content: &str, _pattern: &CompiledPattern) -> f32 {
474        let line_lower = line.to_lowercase();
475        let content_lower = content.to_lowercase();
476        
477        // Boost confidence for actual assignments
478        if line.contains("=") || line.contains(":") {
479            confidence += 0.2;
480        }
481        
482        // Boost for environment variable assignment
483        if line_lower.contains("export ") || line_lower.contains("process.env") {
484            confidence += 0.3;
485        }
486        
487        // Boost for import statements with API keys
488        if line_lower.contains("import") && (line_lower.contains("api") || line_lower.contains("key")) {
489            confidence += 0.1;
490        }
491        
492        // Reduce confidence for certain file types based on content
493        if content_lower.contains("package.json") || content_lower.contains("node_modules") {
494            confidence -= 0.2;
495        }
496        
497        // Reduce confidence for test files
498        if content_lower.contains("/test/") || content_lower.contains("__test__") ||
499           content_lower.contains(".test.") || content_lower.contains(".spec.") {
500            confidence -= 0.3;
501        }
502        
503        // Reduce confidence for documentation
504        if content_lower.contains("readme") || content_lower.contains("documentation") ||
505           content_lower.contains("docs/") {
506            confidence -= 0.4;
507        }
508        
509        confidence
510    }
511    
512    /// Adjust confidence based on pattern-specific rules
513    fn adjust_confidence_for_pattern(&self, mut confidence: f32, line: &str, content: &str, pattern: &CompiledPattern) -> f32 {
514        let line_lower = line.to_lowercase();
515        let content_lower = content.to_lowercase();
516        
517        // Major confidence reduction for template/code generation contexts
518        if self.is_in_code_generation_context(content) {
519            confidence -= 0.6;
520        }
521        
522        // Check pattern-specific confidence boost keywords
523        for keyword in &pattern.confidence_boost_keywords {
524            if content_lower.contains(&keyword.to_lowercase()) {
525                confidence += 0.1;
526            }
527        }
528        
529        // Check pattern-specific false positive keywords
530        for keyword in &pattern.false_positive_keywords {
531            if line_lower.contains(&keyword.to_lowercase()) {
532                confidence -= 0.4;
533            }
534        }
535        
536        // Special handling for specific pattern types
537        match pattern.id.as_str() {
538            "jwt-token" => {
539                // JWT tokens should have proper structure
540                if !line.contains("eyJ") || line.split('.').count() != 3 {
541                    confidence -= 0.3;
542                }
543                // Less confident if in a comment or documentation
544                if line_lower.contains("example") || line_lower.contains("jwt") {
545                    confidence -= 0.2;
546                }
547                // Very low confidence for template literals
548                if line.contains("${") {
549                    confidence -= 0.8;
550                }
551            }
552            "openai-api-key" => {
553                // OpenAI keys should start with sk- and be proper length
554                if !line.contains("sk-") {
555                    confidence -= 0.5;
556                }
557                // Boost if in actual code context
558                if line_lower.contains("openai") || line_lower.contains("gpt") {
559                    confidence += 0.2;
560                }
561                // Major reduction for template literals
562                if line.contains("${") || line.contains("selectedApiKey") {
563                    confidence -= 0.9;
564                }
565            }
566            "database-url-with-creds" => {
567                // Should be a valid URL format
568                if !line.contains("://") || line.contains("example.com") {
569                    confidence -= 0.4;
570                }
571                
572                // Check for placeholder credentials
573                let placeholder_creds = [
574                    "user:pass", "user:password", "admin:admin", "admin:password",
575                    "username:password", "test:test", "root:root", "postgres:postgres",
576                ];
577                if placeholder_creds.iter().any(|p| line.contains(p)) {
578                    confidence -= 0.8; // Drastically reduce confidence for placeholders
579                }
580                
581                // Reduce for template patterns
582                if line.contains("${") {
583                    confidence -= 0.7;
584                }
585            }
586            "long-secret-value" | "generic-api-key" => {
587                // High reduction for template literals and code generation
588                if line.contains("${") || line.contains("selectedApiKey") || 
589                   line.contains("apiKey") && line.contains("?") {
590                    confidence -= 0.8;
591                }
592                // Reduce for Bearer token patterns in templates
593                if line.contains("Bearer ") && line.contains("${") {
594                    confidence -= 0.9;
595                }
596            }
597            _ => {
598                // General template literal reduction
599                if line.contains("${") {
600                    confidence -= 0.6;
601                }
602            }
603        }
604        
605        // Additional React/JSX specific reductions
606        if content_lower.contains("react") || content_lower.contains("jsx") || 
607           content_lower.contains("component") {
608            if line.contains("${") || line.contains("props.") || line.contains("state.") {
609                confidence -= 0.5;
610            }
611        }
612        
613        confidence
614    }
615    
616    /// Extract evidence with context
617    fn extract_evidence(&self, line: &str, start: usize, end: usize) -> String {
618        // Mask the actual secret value
619        let prefix = &line[..start.min(line.len())];
620        let suffix = &line[end.min(line.len())..];
621        let masked = "*".repeat((end - start).min(20));
622        
623        format!("{}{}{}", prefix, masked, suffix).trim().to_string()
624    }
625    
626    /// Build Aho-Corasick matcher from patterns
627    fn build_matcher(patterns: &[(String, Arc<CompiledPattern>)]) -> Result<AhoCorasick, SecurityError> {
628        let strings: Vec<&str> = patterns.iter().map(|(s, _)| s.as_str()).collect();
629        
630        let matcher = AhoCorasickBuilder::new()
631            .match_kind(MatchKind::LeftmostFirst)
632            .ascii_case_insensitive(true)
633            .build(&strings)
634            .map_err(|e| SecurityError::PatternEngine(format!("Failed to build matcher: {}", e)))?;
635        
636        Ok(matcher)
637    }
638    
639    /// Create pattern lookup map
640    fn create_pattern_map(patterns: Vec<(String, Arc<CompiledPattern>)>) -> AHashMap<usize, Arc<CompiledPattern>> {
641        patterns.into_iter()
642            .enumerate()
643            .map(|(id, (_, pattern))| (id, pattern))
644            .collect()
645    }
646    
647    /// Load patterns based on pattern sets
648    fn load_patterns(pattern_sets: &[String]) -> Result<(
649        Vec<(String, Arc<CompiledPattern>)>,
650        Vec<(String, Arc<CompiledPattern>)>,
651        Vec<(String, Arc<CompiledPattern>)>,
652        Vec<(Regex, Arc<CompiledPattern>)>,
653    ), SecurityError> {
654        let mut secret_patterns = Vec::new();
655        let mut env_var_patterns = Vec::new();
656        let mut api_key_patterns = Vec::new();
657        let mut complex_patterns = Vec::new();
658        
659        // Load default patterns
660        if pattern_sets.contains(&"default".to_string()) {
661            Self::load_default_patterns(&mut secret_patterns, &mut env_var_patterns, 
662                                      &mut api_key_patterns, &mut complex_patterns)?;
663        }
664        
665        // Load additional pattern sets
666        for set in pattern_sets {
667            match set.as_str() {
668                "aws" => Self::load_aws_patterns(&mut api_key_patterns)?,
669                "gcp" => Self::load_gcp_patterns(&mut api_key_patterns)?,
670                "azure" => Self::load_azure_patterns(&mut api_key_patterns)?,
671                "crypto" => Self::load_crypto_patterns(&mut secret_patterns)?,
672                _ => {}
673            }
674        }
675        
676        Ok((secret_patterns, env_var_patterns, api_key_patterns, complex_patterns))
677    }
678    
679    /// Load default security patterns - focused on ACTUAL secrets, not references
680    fn load_default_patterns(
681        secret_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
682        _env_var_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
683        api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
684        complex_patterns: &mut Vec<(Regex, Arc<CompiledPattern>)>,
685    ) -> Result<(), SecurityError> {
686        // ONLY detect actual API key values, not variable names
687        
688        // OpenAI API Keys - actual key format
689        api_key_patterns.push((
690            "sk-".to_string(),
691            Arc::new(CompiledPattern {
692                id: "openai-api-key".to_string(),
693                name: "OpenAI API Key".to_string(),
694                severity: SecuritySeverity::Critical,
695                category: SecurityCategory::SecretsExposure,
696                description: "OpenAI API key detected".to_string(),
697                remediation: vec![
698                    "Remove API key from source code".to_string(),
699                    "Use environment variables".to_string(),
700                ],
701                references: vec!["https://platform.openai.com/docs/api-reference".to_string()],
702                cwe_id: Some("CWE-798".to_string()),
703                confidence_boost_keywords: vec!["openai".to_string(), "gpt".to_string()],
704                false_positive_keywords: vec![
705                    "sk-xxxxxxxx".to_string(), "sk-...".to_string(), "sk_test".to_string(),
706                    "example".to_string(), "placeholder".to_string(), "your_".to_string(),
707                    "TODO".to_string(), "FIXME".to_string(), "XXX".to_string(),
708                ],
709            }),
710        ));
711        
712        // Complex regex patterns for ACTUAL secret assignments with values
713        complex_patterns.push((
714            // Only match when there's an actual long value, not just variable names
715            Regex::new(r#"(?i)(?:api[_-]?key|secret[_-]?key|access[_-]?token)\s*[:=]\s*['"]([a-zA-Z0-9+/=]{32,})['"]"#)
716                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
717            Arc::new(CompiledPattern {
718                id: "long-secret-value".to_string(),
719                name: "Hardcoded Secret Value".to_string(),
720                severity: SecuritySeverity::Critical,
721                category: SecurityCategory::SecretsExposure,
722                description: "Long secret value hardcoded in source code".to_string(),
723                remediation: vec![
724                    "Use environment variables for secrets".to_string(),
725                    "Implement proper secret management".to_string(),
726                ],
727                references: vec![],
728                cwe_id: Some("CWE-798".to_string()),
729                confidence_boost_keywords: vec!["bearer".to_string(), "auth".to_string()],
730                false_positive_keywords: vec![
731                    "process.env".to_string(), "getenv".to_string(), "example".to_string(),
732                    "placeholder".to_string(), "your_".to_string(), "TODO".to_string(),
733                    "test".to_string(), "demo".to_string(), "fake".to_string(),
734                ],
735            }),
736        ));
737        
738        // JWT tokens (actual token format)
739        complex_patterns.push((
740            Regex::new(r#"\beyJ[a-zA-Z0-9+/=]{100,}\b"#)
741                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
742            Arc::new(CompiledPattern {
743                id: "jwt-token".to_string(),
744                name: "JWT Token".to_string(),
745                severity: SecuritySeverity::High,
746                category: SecurityCategory::SecretsExposure,
747                description: "JWT token detected in source code".to_string(),
748                remediation: vec![
749                    "Never hardcode JWT tokens".to_string(),
750                    "Use secure token storage".to_string(),
751                ],
752                references: vec![],
753                cwe_id: Some("CWE-798".to_string()),
754                confidence_boost_keywords: vec!["bearer".to_string(), "authorization".to_string()],
755                false_positive_keywords: vec!["example".to_string(), "demo".to_string()],
756            }),
757        ));
758        
759        // Database connection strings with embedded credentials
760        complex_patterns.push((
761            Regex::new(r#"(?i)(?:postgres|postgresql|mysql|mongodb|redis|mariadb)://[^:\s]+:[^@\s]+@[^/\s]+/[^\s]*"#)
762                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
763            Arc::new(CompiledPattern {
764                id: "database-url-with-creds".to_string(),
765                name: "Database URL with Credentials".to_string(),
766                severity: SecuritySeverity::Critical,
767                category: SecurityCategory::SecretsExposure,
768                description: "Database connection string with embedded credentials".to_string(),
769                remediation: vec![
770                    "Use environment variables for database credentials".to_string(),
771                    "Use connection string without embedded passwords".to_string(),
772                ],
773                references: vec![],
774                cwe_id: Some("CWE-798".to_string()),
775                confidence_boost_keywords: vec!["connection".to_string(), "database".to_string()],
776                false_positive_keywords: vec![
777                    "example.com".to_string(), "localhost".to_string(), "placeholder".to_string(),
778                    "your_".to_string(), "user:pass".to_string(),
779                ],
780            }),
781        ));
782        
783        // Private SSH/SSL keys
784        secret_patterns.push((
785            "-----BEGIN".to_string(),
786            Arc::new(CompiledPattern {
787                id: "private-key-header".to_string(),
788                name: "Private Key".to_string(),
789                severity: SecuritySeverity::Critical,
790                category: SecurityCategory::SecretsExposure,
791                description: "Private key detected".to_string(),
792                remediation: vec![
793                    "Never commit private keys to version control".to_string(),
794                    "Use secure key storage solutions".to_string(),
795                ],
796                references: vec![],
797                cwe_id: Some("CWE-321".to_string()),
798                confidence_boost_keywords: vec!["PRIVATE".to_string(), "RSA".to_string(), "DSA".to_string()],
799                false_positive_keywords: vec!["PUBLIC".to_string(), "CERTIFICATE".to_string()],
800            }),
801        ));
802        
803        Ok(())
804    }
805    
806    /// Load AWS-specific patterns
807    fn load_aws_patterns(api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>) -> Result<(), SecurityError> {
808        api_key_patterns.push((
809            "AKIA".to_string(),
810            Arc::new(CompiledPattern {
811                id: "aws-access-key".to_string(),
812                name: "AWS Access Key".to_string(),
813                severity: SecuritySeverity::Critical,
814                category: SecurityCategory::SecretsExposure,
815                description: "AWS Access Key ID detected".to_string(),
816                remediation: vec![
817                    "Remove AWS credentials from source code".to_string(),
818                    "Use IAM roles or environment variables".to_string(),
819                    "Rotate the exposed key immediately".to_string(),
820                ],
821                references: vec!["https://docs.aws.amazon.com/security/".to_string()],
822                cwe_id: Some("CWE-798".to_string()),
823                confidence_boost_keywords: vec!["aws".to_string(), "s3".to_string(), "ec2".to_string()],
824                false_positive_keywords: vec!["AKIA00000000".to_string()],
825            }),
826        ));
827        
828        Ok(())
829    }
830    
831    /// Load GCP-specific patterns
832    fn load_gcp_patterns(api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>) -> Result<(), SecurityError> {
833        api_key_patterns.push((
834            "AIza".to_string(),
835            Arc::new(CompiledPattern {
836                id: "gcp-api-key".to_string(),
837                name: "Google Cloud API Key".to_string(),
838                severity: SecuritySeverity::High,
839                category: SecurityCategory::SecretsExposure,
840                description: "Google Cloud API key detected".to_string(),
841                remediation: vec![
842                    "Use service accounts instead of API keys".to_string(),
843                    "Restrict API key usage by IP/referrer".to_string(),
844                ],
845                references: vec!["https://cloud.google.com/security/".to_string()],
846                cwe_id: Some("CWE-798".to_string()),
847                confidence_boost_keywords: vec!["google".to_string(), "gcp".to_string(), "firebase".to_string()],
848                false_positive_keywords: vec![],
849            }),
850        ));
851        
852        Ok(())
853    }
854    
855    /// Load Azure-specific patterns
856    fn load_azure_patterns(_api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>) -> Result<(), SecurityError> {
857        // Azure patterns would go here
858        Ok(())
859    }
860    
861    /// Load cryptocurrency-related patterns
862    fn load_crypto_patterns(secret_patterns: &mut Vec<(String, Arc<CompiledPattern>)>) -> Result<(), SecurityError> {
863        secret_patterns.push((
864            "-----BEGIN".to_string(),
865            Arc::new(CompiledPattern {
866                id: "private-key".to_string(),
867                name: "Private Key".to_string(),
868                severity: SecuritySeverity::Critical,
869                category: SecurityCategory::SecretsExposure,
870                description: "Private key detected".to_string(),
871                remediation: vec![
872                    "Never commit private keys to version control".to_string(),
873                    "Use secure key storage solutions".to_string(),
874                ],
875                references: vec![],
876                cwe_id: Some("CWE-321".to_string()),
877                confidence_boost_keywords: vec!["RSA".to_string(), "PRIVATE".to_string()],
878                false_positive_keywords: vec!["PUBLIC".to_string()],
879            }),
880        ));
881        
882        Ok(())
883    }
884
885    /// Checks if a line is a safe, non-secret key-value pair in a known dependency file.
886    fn is_safe_dependency_metadata(&self, line: &str, file_meta: &super::file_discovery::FileMetadata) -> bool {
887        let filename = file_meta.path.file_name().and_then(|s| s.to_str()).unwrap_or("");
888        let line_trimmed = line.trim();
889
890        match filename {
891            "package.json" => {
892                // Keys in JSON are quoted strings
893                let safe_keys = [
894                    "\"name\"", "\"version\"", "\"description\"", "\"main\"", "\"module\"",
895                    "\"type\"", "\"private\"", "\"license\"", "\"author\"", "\"homepage\"",
896                    "\"repository\"", "\"bugs\"", "\"keywords\"", "\"workspaces\"",
897                ];
898                safe_keys.iter().any(|key| line_trimmed.starts_with(key))
899            }
900            "Cargo.toml" | "pyproject.toml" => {
901                // Keys in TOML are typically not quoted
902                let safe_keys = [
903                    "name =", "version =", "description =", "edition =", "license =",
904                    "authors =", "homepage =", "repository =", "documentation =", "keywords =",
905                ];
906                safe_keys.iter().any(|key| line_trimmed.starts_with(key))
907            }
908            "go.mod" => {
909                line_trimmed.starts_with("module ") || line_trimmed.starts_with("go ")
910            }
911            "pom.xml" => {
912                // Keys in XML are tags
913                let safe_tags = ["<groupId>", "<artifactId>", "<version>", "<name>", "<description>", "<url>", "<license>"];
914                safe_tags.iter().any(|tag| line_trimmed.contains(tag))
915            }
916            "build.gradle" | "build.gradle.kts" => {
917                let safe_assignments = ["rootProject.name =", "group =", "version ="];
918                safe_assignments.iter().any(|s| line_trimmed.starts_with(s))
919            }
920            _ => false,
921        }
922    }
923
924    /// Checks if a line contains a reference to an environment variable, not a hardcoded secret.
925    fn is_env_var_interpolation(&self, line: &str, file_meta: &super::file_discovery::FileMetadata) -> bool {
926        let filename = file_meta.path.file_name().and_then(|s| s.to_str()).unwrap_or("");
927
928        // Pattern 1: JSON-based `{"$env": "VAR"}`. This is a very specific and safe pattern.
929        if line.contains("\"$env\"") {
930            return true;
931        }
932
933        // Pattern 2: Shell/YAML/Dockerfile `${VAR}` or `$VAR`. This is more generic.
934        if line.contains('$') {
935            // Check for `${...}` or `$VAR` patterns
936            if line.contains("${") && line.contains("}") {
937                let is_config_file = matches!(
938                    filename,
939                    "docker-compose.yml"
940                        | "docker-compose.yaml"
941                        | "Dockerfile"
942                        | "Jenkinsfile"
943                        | "Makefile"
944                ) || filename.ends_with(".env")
945                    || filename.ends_with(".sh")
946                    || filename.ends_with(".yml")
947                    || filename.ends_with(".yaml");
948
949                if is_config_file {
950                    return true;
951                }
952
953                // Also check for context keywords in any file
954                let line_lower = line.to_lowercase();
955                let env_context_keywords = ["environment:", "command:", "entrypoint:", "value:", "args:"];
956                if env_context_keywords.iter().any(|kw| line_lower.contains(kw)) {
957                    return true;
958                }
959            }
960        }
961
962        false
963    }
964}
965
#[cfg(test)]
mod tests {
    use super::*;
    use crate::analyzer::security::turbo::file_discovery::{FileMetadata, PriorityHints};
    use std::path::PathBuf;
    use std::time::SystemTime;

    /// Builds a `FileMetadata` for `path` with fixed placeholder values.
    ///
    /// The extension is derived from `path`; a path without an extension
    /// produces `Some(String::new())`.
    fn dummy_metadata(path: &str) -> FileMetadata {
        let path = PathBuf::from(path);
        let extension = path
            .extension()
            .and_then(|ext| ext.to_str())
            .unwrap_or("")
            .to_string();

        FileMetadata {
            path,
            size: 100,
            extension: Some(extension),
            is_gitignored: false,
            modified: SystemTime::now(),
            priority_hints: PriorityHints::default(),
        }
    }

    /// Creates an engine from the default configuration, panicking on failure.
    fn default_engine() -> PatternEngine {
        PatternEngine::new(&TurboConfig::default()).unwrap()
    }

    #[test]
    fn test_pattern_engine_creation() {
        let created = PatternEngine::new(&TurboConfig::default());
        assert!(created.is_ok());

        // A default engine must come with at least one pattern loaded.
        assert!(created.unwrap().pattern_count() > 0);
    }

    #[test]
    fn test_pattern_matching() {
        let engine = default_engine();
        let meta = dummy_metadata("test.js");

        let source = r#"
            const apiKey = "sk-1234567890abcdef1234567890abcdef12345678";
            password = "super_secret_password_that_is_long_enough";
            process.env.DATABASE_URL
        "#;

        let matches = engine.scan_content(source, false, &meta);
        assert!(!matches.is_empty());

        // Should find API key (if long enough and not a template)
        let found_key_or_secret = matches.iter().any(|found| {
            let id = &found.pattern.id;
            id.contains("openai") || id.contains("secret")
        });
        assert!(found_key_or_secret);
    }

    #[test]
    fn test_template_literal_filtering() {
        let engine = default_engine();
        let meta = dummy_metadata("test.js");

        // Template literal content (should be filtered out)
        let source = r#"
            const getCode = () => {
                return `Authorization: "Bearer ${selectedApiKey?.apiKey}"`;
            }
            
            function generateExample() {
                return "Bearer " + apiKey;
            }
        "#;

        // Should have very few or no matches due to template literal detection
        let match_count = engine.scan_content(source, false, &meta).len();
        assert!(match_count <= 1, "Template literals should be filtered out");
    }

    #[test]
    fn test_code_generation_context() {
        let engine = default_engine();
        let meta = dummy_metadata("APICodeDialog.jsx");

        // Code generation context (like React component that generates examples)
        let source = r#"
            import { CopyBlock } from 'react-code-blocks';
            
            const APICodeDialog = () => {
                const getCodeWithAuthorization = () => {
                    return `
                        headers: {
                            Authorization: "Bearer ${selectedApiKey?.apiKey}",
                            "Content-Type": "application/json"
                        }
                    `;
                };
                
                return <CopyBlock text={getCodeWithAuthorization()} />;
            };
        "#;

        let matches = engine.scan_content(source, false, &meta);
        // `all` is vacuously true for an empty result, so this accepts either
        // no matches at all or only low-confidence ones.
        let only_low_confidence = matches.iter().all(|found| found.confidence < 0.3);
        assert!(only_low_confidence, "Code generation context should have very low confidence");
    }

    #[test]
    fn test_quick_reject() {
        let engine = default_engine();
        let meta = dummy_metadata("main.rs");

        let harmless = "fn main() { println!(\"Hello, world!\"); }";
        assert!(engine.scan_content(harmless, true, &meta).is_empty());
    }

    #[test]
    fn test_package_json_filtering() {
        let mut engine = default_engine();
        let meta = dummy_metadata("package.json");

        let manifest = r#"
            {
                "name": "my-cool-package-with-a-long-name-that-could-be-a-secret",
                "version": "1.0.0-beta.this.is.a.very.long.version.string.that.is.not.a.key",
                "description": "a string that is not a secret"
            }
        "#;

        // Use a generic regex that would normally match these lines
        let generic_long_string = CompiledPattern {
            id: "generic-long-string".to_string(),
            name: "Generic Long String".to_string(),
            severity: SecuritySeverity::High,
            category: SecurityCategory::SecretsExposure,
            description: "A generic long string.".to_string(),
            remediation: Vec::new(),
            references: Vec::new(),
            cwe_id: None,
            confidence_boost_keywords: Vec::new(),
            false_positive_keywords: Vec::new(),
        };
        engine.complex_patterns.push((
            Regex::new(r#"[a-zA-Z0-9-]{20,}"#).unwrap(),
            Arc::new(generic_long_string),
        ));

        let matches = engine.scan_content(manifest, false, &meta);
        assert!(matches.is_empty(), "Should not find secrets in safe package.json keys");
    }
}