syncable_cli/analyzer/security/turbo/
pattern_engine.rs

1//! # Pattern Engine Module
2//! 
3//! Ultra-fast multi-pattern matching using Aho-Corasick algorithm and compiled regex sets.
4
5use std::sync::Arc;
6use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
7use regex::Regex;
8use ahash::AHashMap;
9use log::debug;
10
11use super::{TurboConfig, SecurityError};
12use crate::analyzer::security::{SecuritySeverity, SecurityCategory};
13
/// A compiled pattern for ultra-fast matching
///
/// Holds the metadata of one detection rule. Instances are shared through
/// `Arc` between the engine's lookup tables and the `PatternMatch` values
/// produced for the rule.
#[derive(Debug, Clone)]
pub struct CompiledPattern {
    /// Rule identifier (e.g. `"openai-api-key"`); the engine selects its
    /// confidence thresholds and rule-specific adjustments from this string.
    pub id: String,
    /// Human-readable rule name.
    pub name: String,
    /// Severity reported for matches of this rule.
    pub severity: SecuritySeverity,
    /// Category reported for matches of this rule.
    pub category: SecurityCategory,
    /// Short description of what the rule detects.
    pub description: String,
    /// Suggested remediation steps.
    pub remediation: Vec<String>,
    /// Reference links; may be empty.
    pub references: Vec<String>,
    /// Associated CWE identifier such as `"CWE-798"`, if any.
    pub cwe_id: Option<String>,
    /// Keywords that each raise a match's confidence by 0.1 when found
    /// (case-insensitively) anywhere in the scanned content.
    pub confidence_boost_keywords: Vec<String>,
    /// Keywords that each lower a match's confidence by 0.4 when found
    /// (case-insensitively) on the matched line.
    pub false_positive_keywords: Vec<String>,
}
28
/// Pattern match result
///
/// One finding produced by `PatternEngine::scan_content`.
#[derive(Debug, Clone)]
pub struct PatternMatch {
    /// The rule that produced this match.
    pub pattern: Arc<CompiledPattern>,
    /// 1-based line number of the match.
    pub line_number: usize,
    /// 1-based column of the match start, counted in bytes from the line start.
    pub column_number: usize,
    /// The matched line with the matched span replaced by `*` characters
    /// (at most 20) and surrounding whitespace trimmed.
    pub evidence: String,
    /// Confidence score, clamped to `0.0..=1.0` by the engine.
    pub confidence: f32,
}
38
/// High-performance pattern matching engine
///
/// Combines three Aho-Corasick literal matchers with a list of regex-based
/// patterns. Each literal matcher has a companion map that translates the
/// matcher's numeric pattern index into the rule metadata.
pub struct PatternEngine {
    // Multi-pattern matchers
    /// Literal matcher for the secret patterns.
    secret_matcher: AhoCorasick,
    /// Literal matcher for the environment-variable patterns.
    env_var_matcher: AhoCorasick,
    /// Literal matcher for the API-key patterns.
    api_key_matcher: AhoCorasick,
    
    // Pattern lookup maps
    /// Pattern index reported by `secret_matcher` -> rule metadata.
    secret_patterns: AHashMap<usize, Arc<CompiledPattern>>,
    /// Pattern index reported by `env_var_matcher` -> rule metadata.
    env_var_patterns: AHashMap<usize, Arc<CompiledPattern>>,
    /// Pattern index reported by `api_key_matcher` -> rule metadata.
    api_key_patterns: AHashMap<usize, Arc<CompiledPattern>>,
    
    // Specialized matchers for complex patterns
    /// Regex patterns, each paired with its rule metadata; applied line by line.
    complex_patterns: Vec<(Regex, Arc<CompiledPattern>)>,
    
    // Performance counters
    /// Number of literal plus regex patterns loaded at construction time.
    total_patterns: usize,
}
57
58impl PatternEngine {
59    pub fn new(config: &TurboConfig) -> Result<Self, SecurityError> {
60        debug!("Initializing pattern engine with pattern sets: {:?}", config.pattern_sets);
61        
62        // Load patterns based on configuration
63        let (secret_patterns, env_var_patterns, api_key_patterns, complex_patterns) = 
64            Self::load_patterns(&config.pattern_sets)?;
65        
66        // Build Aho-Corasick matchers
67        let secret_matcher = Self::build_matcher(&secret_patterns)?;
68        let env_var_matcher = Self::build_matcher(&env_var_patterns)?;
69        let api_key_matcher = Self::build_matcher(&api_key_patterns)?;
70        
71        let total_patterns = secret_patterns.len() + env_var_patterns.len() + 
72                           api_key_patterns.len() + complex_patterns.len();
73        
74        debug!("Pattern engine initialized with {} total patterns", total_patterns);
75        
76        Ok(Self {
77            secret_matcher,
78            env_var_matcher,
79            api_key_matcher,
80            secret_patterns: Self::create_pattern_map(secret_patterns),
81            env_var_patterns: Self::create_pattern_map(env_var_patterns),
82            api_key_patterns: Self::create_pattern_map(api_key_patterns),
83            complex_patterns,
84            total_patterns,
85        })
86    }
87    
    /// Get total pattern count
    ///
    /// Returns the number of patterns counted when the engine was built: the
    /// literal secret, env-var and API-key patterns plus the regex patterns.
    pub fn pattern_count(&self) -> usize {
        self.total_patterns
    }
92    
93    /// Scan content for all patterns
94    pub fn scan_content(&self, content: &str, quick_reject: bool) -> Vec<PatternMatch> {
95        // Quick reject using Boyer-Moore substring search
96        if quick_reject && !self.quick_contains_secrets(content) {
97            return Vec::new();
98        }
99        
100        let mut matches = Vec::new();
101        
102        // Split content into lines for line number tracking
103        let lines: Vec<&str> = content.lines().collect();
104        let mut line_offsets = vec![0];
105        let mut offset = 0;
106        
107        for line in &lines {
108            offset += line.len() + 1; // +1 for newline
109            line_offsets.push(offset);
110        }
111        
112        // Run multi-pattern matchers
113        matches.extend(self.run_matcher(&self.secret_matcher, content, &self.secret_patterns, &lines, &line_offsets));
114        matches.extend(self.run_matcher(&self.env_var_matcher, content, &self.env_var_patterns, &lines, &line_offsets));
115        matches.extend(self.run_matcher(&self.api_key_matcher, content, &self.api_key_patterns, &lines, &line_offsets));
116        
117        // Run complex patterns (regex-based)
118        for (line_num, line) in lines.iter().enumerate() {
119            for (regex, pattern) in &self.complex_patterns {
120                if let Some(mat) = regex.find(line) {
121                    let confidence = self.calculate_confidence(line, content, &pattern);
122                    
123                    matches.push(PatternMatch {
124                        pattern: Arc::clone(pattern),
125                        line_number: line_num + 1,
126                        column_number: mat.start() + 1,
127                        evidence: self.extract_evidence(line, mat.start(), mat.end()),
128                        confidence,
129                    });
130                }
131            }
132        }
133        
134        // Intelligent confidence filtering - adaptive threshold based on pattern type
135        matches.retain(|m| {
136            let threshold = match m.pattern.id.as_str() {
137                id if id.contains("aws-access-key") => 0.4, // AWS keys need higher confidence
138                id if id.contains("openai-api-key") => 0.4, // OpenAI keys need higher confidence
139                id if id.contains("jwt-token") => 0.6, // JWT tokens need high confidence (often in examples)
140                id if id.contains("database-url") => 0.5, // Database URLs medium confidence
141                id if id.contains("bearer-token") => 0.7, // Bearer tokens often in examples
142                id if id.contains("generic") => 0.8, // Generic patterns need very high confidence
143                id if id.contains("long-secret-value") => 0.7, // Long secret values need high confidence
144                _ => 0.7, // Increased default threshold
145            };
146            m.confidence > threshold
147        });
148        
149        matches
150    }
151    
152    /// Quick check if content might contain secrets
153    fn quick_contains_secrets(&self, content: &str) -> bool {
154        // Enhanced quick rejection for common false positive patterns
155        if self.is_likely_false_positive_content(content) {
156            return false;
157        }
158        
159        // Common secret indicators (optimized for speed)
160        const QUICK_PATTERNS: &[&str] = &[
161            "api", "key", "secret", "token", "password", "credential",
162            "auth", "private", "-----BEGIN", "sk_", "pk_", "eyJ",
163        ];
164        
165        let content_lower = content.to_lowercase();
166        QUICK_PATTERNS.iter().any(|&pattern| content_lower.contains(pattern))
167    }
168    
169    /// Check if content is likely a false positive (encoded data, minified code, etc.)
170    fn is_likely_false_positive_content(&self, content: &str) -> bool {
171        let content_len = content.len();
172        
173        // Skip empty or very small content
174        if content_len < 10 {
175            return true;
176        }
177        
178        // Check for base64 data URLs (common in SVG, images)
179        if content.contains("data:image/") || content.contains("data:font/") {
180            return true;
181        }
182        
183        // Check for minified JavaScript (very long lines, no spaces)
184        let lines: Vec<&str> = content.lines().collect();
185        if lines.len() < 5 && lines.iter().any(|line| line.len() > 500 && line.matches(' ').count() < line.len() / 50) {
186            return true;
187        }
188        
189        // Check for high percentage of base64-like characters (but not a JWT)
190        let base64_chars = content.chars().filter(|c| c.is_alphanumeric() || *c == '+' || *c == '/' || *c == '=').count();
191        let base64_ratio = base64_chars as f32 / content_len as f32;
192        
193        // High base64 ratio but doesn't look like JWT tokens
194        if base64_ratio > 0.8 && !content.contains("eyJ") && content_len > 1000 {
195            return true;
196        }
197        
198        // Check for SVG content
199        if content.contains("<svg") || content.contains("xmlns=\"http://www.w3.org/2000/svg\"") {
200            return true;
201        }
202        
203        // Check for CSS content
204        if content.contains("@media") || content.contains("@import") || 
205           (content.contains("{") && content.contains("}") && content.contains(":")) {
206            return true;
207        }
208        
209        false
210    }
211    
212    /// Run Aho-Corasick matcher and collect results
213    fn run_matcher(
214        &self,
215        matcher: &AhoCorasick,
216        content: &str,
217        patterns: &AHashMap<usize, Arc<CompiledPattern>>,
218        lines: &[&str],
219        line_offsets: &[usize],
220    ) -> Vec<PatternMatch> {
221        let mut matches = Vec::new();
222        
223        for mat in matcher.find_iter(content) {
224            let pattern_id = mat.pattern().as_usize();
225            if let Some(pattern) = patterns.get(&pattern_id) {
226                // Find line and column
227                let (line_num, col_num) = self.offset_to_line_col(mat.start(), line_offsets);
228                let line = lines.get(line_num.saturating_sub(1)).unwrap_or(&"");
229                
230                let confidence = self.calculate_confidence(line, content, pattern);
231                
232                matches.push(PatternMatch {
233                    pattern: Arc::clone(pattern),
234                    line_number: line_num,
235                    column_number: col_num,
236                    evidence: self.extract_evidence(line, mat.start(), mat.end()),
237                    confidence,
238                });
239            }
240        }
241        
242        matches
243    }
244    
245    /// Convert byte offset to line and column numbers
246    fn offset_to_line_col(&self, offset: usize, line_offsets: &[usize]) -> (usize, usize) {
247        let line_num = line_offsets.binary_search(&offset)
248            .unwrap_or_else(|i| i.saturating_sub(1));
249        
250        let line_start = line_offsets.get(line_num).copied().unwrap_or(0);
251        let col_num = offset - line_start + 1;
252        
253        (line_num + 1, col_num)
254    }
255    
256    /// Calculate confidence score for a match
257    fn calculate_confidence(&self, line: &str, content: &str, pattern: &CompiledPattern) -> f32 {
258        let mut confidence: f32 = 0.6;
259        
260        let line_lower = line.to_lowercase();
261        let content_lower = content.to_lowercase();
262        
263        // Enhanced false positive detection
264        if self.is_obvious_false_positive(line, content) {
265            return 0.0;
266        }
267        
268        // Context-based confidence adjustments
269        confidence = self.adjust_confidence_for_context(confidence, line, content, pattern);
270        
271        // Pattern-specific adjustments
272        confidence = self.adjust_confidence_for_pattern(confidence, line, content, pattern);
273        
274        confidence.clamp(0.0, 1.0)
275    }
276    
277    /// Check for obvious false positives
278    fn is_obvious_false_positive(&self, line: &str, content: &str) -> bool {
279        let line_lower = line.to_lowercase();
280        
281        // Comments and documentation
282        if line_lower.trim_start().starts_with("//") || 
283           line_lower.trim_start().starts_with("#") ||
284           line_lower.trim_start().starts_with("*") ||
285           line_lower.trim_start().starts_with("<!--") {
286            return true;
287        }
288        
289        // JavaScript/TypeScript template literals (${...})
290        if line.contains("${") && line.contains("}") {
291            return true;
292        }
293        
294        // Template strings and interpolation patterns
295        if line.contains("${selectedApiKey") || line.contains("${apiKey") || 
296           line.contains("${key") || line.contains("${token") {
297            return true;
298        }
299        
300        // Code generation contexts (functions that generate example code)
301        if self.is_in_code_generation_context(content) && self.looks_like_template_code(line) {
302            return true;
303        }
304        
305        // Common example/placeholder patterns
306        let false_positive_patterns = [
307            "example", "placeholder", "your_", "todo", "fixme", "xxx",
308            "xxxxxxxx", "12345", "abcdef", "test", "demo", "sample",
309            "lorem", "ipsum", "change_me", "replace_me", "insert_",
310            "enter_your", "add_your", "put_your", "use_your",
311            // React/JSX specific patterns
312            "props.", "state.", "this.", "component",
313        ];
314        
315        if false_positive_patterns.iter().any(|&pattern| line_lower.contains(pattern)) {
316            return true;
317        }
318        
319        // Check for JSON schema or TypeScript interfaces
320        if line_lower.contains("@example") || line_lower.contains("@param") ||
321           line_lower.contains("interface") || line_lower.contains("type ") {
322            return true;
323        }
324        
325        // Check for base64 data URLs
326        if line.contains("data:image/") || line.contains("data:font/") || 
327           line.contains("data:application/") {
328            return true;
329        }
330        
331        // Check for minified content (very long line with little whitespace)
332        if line.len() > 200 && line.matches(' ').count() < line.len() / 20 {
333            return true;
334        }
335        
336        // React/JSX template patterns
337        if line.contains("return `") || line.contains("const ") && line.contains(" = `") {
338            return true;
339        }
340        
341        false
342    }
343    
344    /// Check if we're in a code generation context
345    fn is_in_code_generation_context(&self, content: &str) -> bool {
346        let content_lower = content.to_lowercase();
347        
348        // Common code generation function names and patterns
349        let code_gen_patterns = [
350            "getcode", "generatecode", "codecomponent", "apicodedialog",
351            "const getcode", "function getcode", "const code", "function code",
352            "codesnippet", "codeexample", "template", "example code",
353            "code generator", "api example", "curl example",
354            // React/JSX specific
355            "codeblock", "copyblock", "syntax highlight"
356        ];
357        
358        code_gen_patterns.iter().any(|&pattern| content_lower.contains(pattern))
359    }
360    
361    /// Check if a line looks like template code
362    fn looks_like_template_code(&self, line: &str) -> bool {
363        // Template string patterns
364        if line.contains("return `") || line.contains("= `") {
365            return true;
366        }
367        
368        // API URL construction patterns
369        if line.contains("API_URL") || line.contains("/api/v1/") || line.contains("/prediction/") {
370            return true;
371        }
372        
373        // Typical code example patterns
374        if line.contains("requests.post") || line.contains("fetch(") || 
375           line.contains("curl ") || line.contains("import requests") {
376            return true;
377        }
378        
379        // Authorization header patterns in templates
380        if line.contains("Authorization:") || line.contains("Bearer ") {
381            return true;
382        }
383        
384        false
385    }
386    
387    /// Adjust confidence based on context
388    fn adjust_confidence_for_context(&self, mut confidence: f32, line: &str, content: &str, _pattern: &CompiledPattern) -> f32 {
389        let line_lower = line.to_lowercase();
390        let content_lower = content.to_lowercase();
391        
392        // Boost confidence for actual assignments
393        if line.contains("=") || line.contains(":") {
394            confidence += 0.2;
395        }
396        
397        // Boost for environment variable assignment
398        if line_lower.contains("export ") || line_lower.contains("process.env") {
399            confidence += 0.3;
400        }
401        
402        // Boost for import statements with API keys
403        if line_lower.contains("import") && (line_lower.contains("api") || line_lower.contains("key")) {
404            confidence += 0.1;
405        }
406        
407        // Reduce confidence for certain file types based on content
408        if content_lower.contains("package.json") || content_lower.contains("node_modules") {
409            confidence -= 0.2;
410        }
411        
412        // Reduce confidence for test files
413        if content_lower.contains("/test/") || content_lower.contains("__test__") ||
414           content_lower.contains(".test.") || content_lower.contains(".spec.") {
415            confidence -= 0.3;
416        }
417        
418        // Reduce confidence for documentation
419        if content_lower.contains("readme") || content_lower.contains("documentation") ||
420           content_lower.contains("docs/") {
421            confidence -= 0.4;
422        }
423        
424        confidence
425    }
426    
427    /// Adjust confidence based on pattern-specific rules
428    fn adjust_confidence_for_pattern(&self, mut confidence: f32, line: &str, content: &str, pattern: &CompiledPattern) -> f32 {
429        let line_lower = line.to_lowercase();
430        let content_lower = content.to_lowercase();
431        
432        // Major confidence reduction for template/code generation contexts
433        if self.is_in_code_generation_context(content) {
434            confidence -= 0.6;
435        }
436        
437        // Check pattern-specific confidence boost keywords
438        for keyword in &pattern.confidence_boost_keywords {
439            if content_lower.contains(&keyword.to_lowercase()) {
440                confidence += 0.1;
441            }
442        }
443        
444        // Check pattern-specific false positive keywords
445        for keyword in &pattern.false_positive_keywords {
446            if line_lower.contains(&keyword.to_lowercase()) {
447                confidence -= 0.4;
448            }
449        }
450        
451        // Special handling for specific pattern types
452        match pattern.id.as_str() {
453            "jwt-token" => {
454                // JWT tokens should have proper structure
455                if !line.contains("eyJ") || line.split('.').count() != 3 {
456                    confidence -= 0.3;
457                }
458                // Less confident if in a comment or documentation
459                if line_lower.contains("example") || line_lower.contains("jwt") {
460                    confidence -= 0.2;
461                }
462                // Very low confidence for template literals
463                if line.contains("${") {
464                    confidence -= 0.8;
465                }
466            }
467            "openai-api-key" => {
468                // OpenAI keys should start with sk- and be proper length
469                if !line.contains("sk-") {
470                    confidence -= 0.5;
471                }
472                // Boost if in actual code context
473                if line_lower.contains("openai") || line_lower.contains("gpt") {
474                    confidence += 0.2;
475                }
476                // Major reduction for template literals
477                if line.contains("${") || line.contains("selectedApiKey") {
478                    confidence -= 0.9;
479                }
480            }
481            "database-url-with-creds" => {
482                // Should be a valid URL format
483                if !line.contains("://") || line.contains("example.com") {
484                    confidence -= 0.4;
485                }
486                // Reduce for template patterns
487                if line.contains("${") {
488                    confidence -= 0.7;
489                }
490            }
491            "long-secret-value" | "generic-api-key" => {
492                // High reduction for template literals and code generation
493                if line.contains("${") || line.contains("selectedApiKey") || 
494                   line.contains("apiKey") && line.contains("?") {
495                    confidence -= 0.8;
496                }
497                // Reduce for Bearer token patterns in templates
498                if line.contains("Bearer ") && line.contains("${") {
499                    confidence -= 0.9;
500                }
501            }
502            _ => {
503                // General template literal reduction
504                if line.contains("${") {
505                    confidence -= 0.6;
506                }
507            }
508        }
509        
510        // Additional React/JSX specific reductions
511        if content_lower.contains("react") || content_lower.contains("jsx") || 
512           content_lower.contains("component") {
513            if line.contains("${") || line.contains("props.") || line.contains("state.") {
514                confidence -= 0.5;
515            }
516        }
517        
518        confidence
519    }
520    
521    /// Extract evidence with context
522    fn extract_evidence(&self, line: &str, start: usize, end: usize) -> String {
523        // Mask the actual secret value
524        let prefix = &line[..start.min(line.len())];
525        let suffix = &line[end.min(line.len())..];
526        let masked = "*".repeat((end - start).min(20));
527        
528        format!("{}{}{}", prefix, masked, suffix).trim().to_string()
529    }
530    
531    /// Build Aho-Corasick matcher from patterns
532    fn build_matcher(patterns: &[(String, Arc<CompiledPattern>)]) -> Result<AhoCorasick, SecurityError> {
533        let strings: Vec<&str> = patterns.iter().map(|(s, _)| s.as_str()).collect();
534        
535        let matcher = AhoCorasickBuilder::new()
536            .match_kind(MatchKind::LeftmostFirst)
537            .ascii_case_insensitive(true)
538            .build(&strings)
539            .map_err(|e| SecurityError::PatternEngine(format!("Failed to build matcher: {}", e)))?;
540        
541        Ok(matcher)
542    }
543    
544    /// Create pattern lookup map
545    fn create_pattern_map(patterns: Vec<(String, Arc<CompiledPattern>)>) -> AHashMap<usize, Arc<CompiledPattern>> {
546        patterns.into_iter()
547            .enumerate()
548            .map(|(id, (_, pattern))| (id, pattern))
549            .collect()
550    }
551    
552    /// Load patterns based on pattern sets
553    fn load_patterns(pattern_sets: &[String]) -> Result<(
554        Vec<(String, Arc<CompiledPattern>)>,
555        Vec<(String, Arc<CompiledPattern>)>,
556        Vec<(String, Arc<CompiledPattern>)>,
557        Vec<(Regex, Arc<CompiledPattern>)>,
558    ), SecurityError> {
559        let mut secret_patterns = Vec::new();
560        let mut env_var_patterns = Vec::new();
561        let mut api_key_patterns = Vec::new();
562        let mut complex_patterns = Vec::new();
563        
564        // Load default patterns
565        if pattern_sets.contains(&"default".to_string()) {
566            Self::load_default_patterns(&mut secret_patterns, &mut env_var_patterns, 
567                                      &mut api_key_patterns, &mut complex_patterns)?;
568        }
569        
570        // Load additional pattern sets
571        for set in pattern_sets {
572            match set.as_str() {
573                "aws" => Self::load_aws_patterns(&mut api_key_patterns)?,
574                "gcp" => Self::load_gcp_patterns(&mut api_key_patterns)?,
575                "azure" => Self::load_azure_patterns(&mut api_key_patterns)?,
576                "crypto" => Self::load_crypto_patterns(&mut secret_patterns)?,
577                _ => {}
578            }
579        }
580        
581        Ok((secret_patterns, env_var_patterns, api_key_patterns, complex_patterns))
582    }
583    
584    /// Load default security patterns - focused on ACTUAL secrets, not references
585    fn load_default_patterns(
586        secret_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
587        env_var_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
588        api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
589        complex_patterns: &mut Vec<(Regex, Arc<CompiledPattern>)>,
590    ) -> Result<(), SecurityError> {
591        // ONLY detect actual API key values, not variable names
592        
593        // OpenAI API Keys - actual key format
594        api_key_patterns.push((
595            "sk-".to_string(),
596            Arc::new(CompiledPattern {
597                id: "openai-api-key".to_string(),
598                name: "OpenAI API Key".to_string(),
599                severity: SecuritySeverity::Critical,
600                category: SecurityCategory::SecretsExposure,
601                description: "OpenAI API key detected".to_string(),
602                remediation: vec![
603                    "Remove API key from source code".to_string(),
604                    "Use environment variables".to_string(),
605                ],
606                references: vec!["https://platform.openai.com/docs/api-reference".to_string()],
607                cwe_id: Some("CWE-798".to_string()),
608                confidence_boost_keywords: vec!["openai".to_string(), "gpt".to_string()],
609                false_positive_keywords: vec![
610                    "sk-xxxxxxxx".to_string(), "sk-...".to_string(), "sk_test".to_string(),
611                    "example".to_string(), "placeholder".to_string(), "your_".to_string(),
612                    "TODO".to_string(), "FIXME".to_string(), "XXX".to_string(),
613                ],
614            }),
615        ));
616        
617        // Complex regex patterns for ACTUAL secret assignments with values
618        complex_patterns.push((
619            // Only match when there's an actual long value, not just variable names
620            Regex::new(r#"(?i)(?:api[_-]?key|secret[_-]?key|access[_-]?token)\s*[:=]\s*['"]([a-zA-Z0-9+/=]{32,})['"]"#)
621                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
622            Arc::new(CompiledPattern {
623                id: "long-secret-value".to_string(),
624                name: "Hardcoded Secret Value".to_string(),
625                severity: SecuritySeverity::Critical,
626                category: SecurityCategory::SecretsExposure,
627                description: "Long secret value hardcoded in source code".to_string(),
628                remediation: vec![
629                    "Use environment variables for secrets".to_string(),
630                    "Implement proper secret management".to_string(),
631                ],
632                references: vec![],
633                cwe_id: Some("CWE-798".to_string()),
634                confidence_boost_keywords: vec!["bearer".to_string(), "auth".to_string()],
635                false_positive_keywords: vec![
636                    "process.env".to_string(), "getenv".to_string(), "example".to_string(),
637                    "placeholder".to_string(), "your_".to_string(), "TODO".to_string(),
638                    "test".to_string(), "demo".to_string(), "fake".to_string(),
639                ],
640            }),
641        ));
642        
643        // JWT tokens (actual token format)
644        complex_patterns.push((
645            Regex::new(r#"\beyJ[a-zA-Z0-9+/=]{100,}\b"#)
646                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
647            Arc::new(CompiledPattern {
648                id: "jwt-token".to_string(),
649                name: "JWT Token".to_string(),
650                severity: SecuritySeverity::High,
651                category: SecurityCategory::SecretsExposure,
652                description: "JWT token detected in source code".to_string(),
653                remediation: vec![
654                    "Never hardcode JWT tokens".to_string(),
655                    "Use secure token storage".to_string(),
656                ],
657                references: vec![],
658                cwe_id: Some("CWE-798".to_string()),
659                confidence_boost_keywords: vec!["bearer".to_string(), "authorization".to_string()],
660                false_positive_keywords: vec!["example".to_string(), "demo".to_string()],
661            }),
662        ));
663        
664        // Database connection strings with embedded credentials
665        complex_patterns.push((
666            Regex::new(r#"(?i)(?:postgres|mysql|mongodb)://[^:\s]+:[^@\s]+@[^/\s]+/[^\s]*"#)
667                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
668            Arc::new(CompiledPattern {
669                id: "database-url-with-creds".to_string(),
670                name: "Database URL with Credentials".to_string(),
671                severity: SecuritySeverity::Critical,
672                category: SecurityCategory::SecretsExposure,
673                description: "Database connection string with embedded credentials".to_string(),
674                remediation: vec![
675                    "Use environment variables for database credentials".to_string(),
676                    "Use connection string without embedded passwords".to_string(),
677                ],
678                references: vec![],
679                cwe_id: Some("CWE-798".to_string()),
680                confidence_boost_keywords: vec!["connection".to_string(), "database".to_string()],
681                false_positive_keywords: vec![
682                    "example.com".to_string(), "localhost".to_string(), "placeholder".to_string(),
683                    "your_".to_string(), "user:pass".to_string(),
684                ],
685            }),
686        ));
687        
688        // Private SSH/SSL keys
689        secret_patterns.push((
690            "-----BEGIN".to_string(),
691            Arc::new(CompiledPattern {
692                id: "private-key-header".to_string(),
693                name: "Private Key".to_string(),
694                severity: SecuritySeverity::Critical,
695                category: SecurityCategory::SecretsExposure,
696                description: "Private key detected".to_string(),
697                remediation: vec![
698                    "Never commit private keys to version control".to_string(),
699                    "Use secure key storage solutions".to_string(),
700                ],
701                references: vec![],
702                cwe_id: Some("CWE-321".to_string()),
703                confidence_boost_keywords: vec!["PRIVATE".to_string(), "RSA".to_string(), "DSA".to_string()],
704                false_positive_keywords: vec!["PUBLIC".to_string(), "CERTIFICATE".to_string()],
705            }),
706        ));
707        
708        Ok(())
709    }
710    
711    /// Load AWS-specific patterns
712    fn load_aws_patterns(api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>) -> Result<(), SecurityError> {
713        api_key_patterns.push((
714            "AKIA".to_string(),
715            Arc::new(CompiledPattern {
716                id: "aws-access-key".to_string(),
717                name: "AWS Access Key".to_string(),
718                severity: SecuritySeverity::Critical,
719                category: SecurityCategory::SecretsExposure,
720                description: "AWS Access Key ID detected".to_string(),
721                remediation: vec![
722                    "Remove AWS credentials from source code".to_string(),
723                    "Use IAM roles or environment variables".to_string(),
724                    "Rotate the exposed key immediately".to_string(),
725                ],
726                references: vec!["https://docs.aws.amazon.com/security/".to_string()],
727                cwe_id: Some("CWE-798".to_string()),
728                confidence_boost_keywords: vec!["aws".to_string(), "s3".to_string(), "ec2".to_string()],
729                false_positive_keywords: vec!["AKIA00000000".to_string()],
730            }),
731        ));
732        
733        Ok(())
734    }
735    
736    /// Load GCP-specific patterns
737    fn load_gcp_patterns(api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>) -> Result<(), SecurityError> {
738        api_key_patterns.push((
739            "AIza".to_string(),
740            Arc::new(CompiledPattern {
741                id: "gcp-api-key".to_string(),
742                name: "Google Cloud API Key".to_string(),
743                severity: SecuritySeverity::High,
744                category: SecurityCategory::SecretsExposure,
745                description: "Google Cloud API key detected".to_string(),
746                remediation: vec![
747                    "Use service accounts instead of API keys".to_string(),
748                    "Restrict API key usage by IP/referrer".to_string(),
749                ],
750                references: vec!["https://cloud.google.com/security/".to_string()],
751                cwe_id: Some("CWE-798".to_string()),
752                confidence_boost_keywords: vec!["google".to_string(), "gcp".to_string(), "firebase".to_string()],
753                false_positive_keywords: vec![],
754            }),
755        ));
756        
757        Ok(())
758    }
759    
760    /// Load Azure-specific patterns
761    fn load_azure_patterns(_api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>) -> Result<(), SecurityError> {
762        // Azure patterns would go here
763        Ok(())
764    }
765    
766    /// Load cryptocurrency-related patterns
767    fn load_crypto_patterns(secret_patterns: &mut Vec<(String, Arc<CompiledPattern>)>) -> Result<(), SecurityError> {
768        secret_patterns.push((
769            "-----BEGIN".to_string(),
770            Arc::new(CompiledPattern {
771                id: "private-key".to_string(),
772                name: "Private Key".to_string(),
773                severity: SecuritySeverity::Critical,
774                category: SecurityCategory::SecretsExposure,
775                description: "Private key detected".to_string(),
776                remediation: vec![
777                    "Never commit private keys to version control".to_string(),
778                    "Use secure key storage solutions".to_string(),
779                ],
780                references: vec![],
781                cwe_id: Some("CWE-321".to_string()),
782                confidence_boost_keywords: vec!["RSA".to_string(), "PRIVATE".to_string()],
783                false_positive_keywords: vec!["PUBLIC".to_string()],
784            }),
785        ));
786        
787        Ok(())
788    }
789}
790
#[cfg(test)]
mod tests {
    use super::*;

    // Building an engine from the default configuration must succeed and
    // yield at least one loaded pattern.
    #[test]
    fn test_pattern_engine_creation() {
        let built = PatternEngine::new(&TurboConfig::default());
        assert!(built.is_ok());

        let loaded = built.unwrap().pattern_count();
        assert!(loaded > 0);
    }

    // A snippet with a hard-coded key, a password and an env lookup must
    // produce findings, one of which is an OpenAI- or secret-style pattern.
    #[test]
    fn test_pattern_matching() {
        let cfg = TurboConfig::default();
        let scanner = PatternEngine::new(&cfg).unwrap();

        let content = r#"
            const apiKey = "sk-1234567890abcdef1234567890abcdef12345678";
            password = "super_secret_password_that_is_long_enough";
            process.env.DATABASE_URL
        "#;

        let findings = scanner.scan_content(content, false);
        assert!(!findings.is_empty());

        // Should find API key (if long enough and not a template)
        let saw_key_or_secret = findings.iter().any(|finding| {
            let id = &finding.pattern.id;
            id.contains("openai") || id.contains("secret")
        });
        assert!(saw_key_or_secret);
    }

    // Interpolated / concatenated bearer tokens are placeholders, not secrets.
    #[test]
    fn test_template_literal_filtering() {
        let cfg = TurboConfig::default();
        let scanner = PatternEngine::new(&cfg).unwrap();

        // Template literal content (should be filtered out)
        let template_content = r#"
            const getCode = () => {
                return `Authorization: "Bearer ${selectedApiKey?.apiKey}"`;
            }
            
            function generateExample() {
                return "Bearer " + apiKey;
            }
        "#;

        let findings = scanner.scan_content(template_content, false);
        // Should have very few or no matches due to template literal detection
        let finding_count = findings.len();
        assert!(finding_count <= 1, "Template literals should be filtered out");
    }

    // Source that merely generates example code must not yield confident hits.
    #[test]
    fn test_code_generation_context() {
        let cfg = TurboConfig::default();
        let scanner = PatternEngine::new(&cfg).unwrap();

        // Code generation context (like React component that generates examples)
        let code_gen_content = r#"
            import { CopyBlock } from 'react-code-blocks';
            
            const APICodeDialog = () => {
                const getCodeWithAuthorization = () => {
                    return `
                        headers: {
                            Authorization: "Bearer ${selectedApiKey?.apiKey}",
                            "Content-Type": "application/json"
                        }
                    `;
                };
                
                return <CopyBlock text={getCodeWithAuthorization()} />;
            };
        "#;

        let findings = scanner.scan_content(code_gen_content, false);
        // Should have minimal matches due to code generation detection.
        // `all` is vacuously true for an empty result, so the "no findings"
        // case is accepted as well.
        let only_low_confidence = findings.iter().all(|finding| finding.confidence < 0.3);
        assert!(only_low_confidence,
                "Code generation context should have very low confidence");
    }

    // Harmless source scanned with the second argument set to `true` must
    // come back with no findings.
    #[test]
    fn test_quick_reject() {
        let cfg = TurboConfig::default();
        let scanner = PatternEngine::new(&cfg).unwrap();

        let safe_content = "fn main() { println!(\"Hello, world!\"); }";
        let findings = scanner.scan_content(safe_content, true);
        assert!(findings.is_empty());
    }
}