syncable_cli/analyzer/security/turbo/
pattern_engine.rs

1//! # Pattern Engine Module
2//! 
3//! Ultra-fast multi-pattern matching using Aho-Corasick algorithm and compiled regex sets.
4
5use std::sync::Arc;
6use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
7use regex::Regex;
8use ahash::AHashMap;
9use log::debug;
10
11use super::{TurboConfig, SecurityError};
12use crate::analyzer::security::{SecuritySeverity, SecurityCategory};
13
14/// A compiled pattern for ultra-fast matching
15#[derive(Debug, Clone)]
16pub struct CompiledPattern {
17    pub id: String,
18    pub name: String,
19    pub severity: SecuritySeverity,
20    pub category: SecurityCategory,
21    pub description: String,
22    pub remediation: Vec<String>,
23    pub references: Vec<String>,
24    pub cwe_id: Option<String>,
25    pub confidence_boost_keywords: Vec<String>,
26    pub false_positive_keywords: Vec<String>,
27}
28
29/// Pattern match result
30#[derive(Debug, Clone)]
31pub struct PatternMatch {
32    pub pattern: Arc<CompiledPattern>,
33    pub line_number: usize,
34    pub column_number: usize,
35    pub evidence: String,
36    pub confidence: f32,
37}
38
39/// High-performance pattern matching engine
40pub struct PatternEngine {
41    // Multi-pattern matchers
42    secret_matcher: AhoCorasick,
43    env_var_matcher: AhoCorasick,
44    api_key_matcher: AhoCorasick,
45    
46    // Pattern lookup maps
47    secret_patterns: AHashMap<usize, Arc<CompiledPattern>>,
48    env_var_patterns: AHashMap<usize, Arc<CompiledPattern>>,
49    api_key_patterns: AHashMap<usize, Arc<CompiledPattern>>,
50    
51    // Specialized matchers for complex patterns
52    complex_patterns: Vec<(Regex, Arc<CompiledPattern>)>,
53    
54    // Performance counters
55    total_patterns: usize,
56}
57
58impl PatternEngine {
59    pub fn new(config: &TurboConfig) -> Result<Self, SecurityError> {
60        debug!("Initializing pattern engine with pattern sets: {:?}", config.pattern_sets);
61        
62        // Load patterns based on configuration
63        let (secret_patterns, env_var_patterns, api_key_patterns, complex_patterns) = 
64            Self::load_patterns(&config.pattern_sets)?;
65        
66        // Build Aho-Corasick matchers
67        let secret_matcher = Self::build_matcher(&secret_patterns)?;
68        let env_var_matcher = Self::build_matcher(&env_var_patterns)?;
69        let api_key_matcher = Self::build_matcher(&api_key_patterns)?;
70        
71        let total_patterns = secret_patterns.len() + env_var_patterns.len() + 
72                           api_key_patterns.len() + complex_patterns.len();
73        
74        debug!("Pattern engine initialized with {} total patterns", total_patterns);
75        
76        Ok(Self {
77            secret_matcher,
78            env_var_matcher,
79            api_key_matcher,
80            secret_patterns: Self::create_pattern_map(secret_patterns),
81            env_var_patterns: Self::create_pattern_map(env_var_patterns),
82            api_key_patterns: Self::create_pattern_map(api_key_patterns),
83            complex_patterns,
84            total_patterns,
85        })
86    }
87    
88    /// Get total pattern count
89    pub fn pattern_count(&self) -> usize {
90        self.total_patterns
91    }
92    
93    /// Scan content for all patterns
94    pub fn scan_content(&self, content: &str, quick_reject: bool) -> Vec<PatternMatch> {
95        // Quick reject using Boyer-Moore substring search
96        if quick_reject && !self.quick_contains_secrets(content) {
97            return Vec::new();
98        }
99        
100        let mut matches = Vec::new();
101        
102        // Split content into lines for line number tracking
103        let lines: Vec<&str> = content.lines().collect();
104        let mut line_offsets = vec![0];
105        let mut offset = 0;
106        
107        for line in &lines {
108            offset += line.len() + 1; // +1 for newline
109            line_offsets.push(offset);
110        }
111        
112        // Run multi-pattern matchers
113        matches.extend(self.run_matcher(&self.secret_matcher, content, &self.secret_patterns, &lines, &line_offsets));
114        matches.extend(self.run_matcher(&self.env_var_matcher, content, &self.env_var_patterns, &lines, &line_offsets));
115        matches.extend(self.run_matcher(&self.api_key_matcher, content, &self.api_key_patterns, &lines, &line_offsets));
116        
117        // Run complex patterns (regex-based)
118        for (line_num, line) in lines.iter().enumerate() {
119            for (regex, pattern) in &self.complex_patterns {
120                if let Some(mat) = regex.find(line) {
121                    let confidence = self.calculate_confidence(line, content, &pattern);
122                    
123                    matches.push(PatternMatch {
124                        pattern: Arc::clone(pattern),
125                        line_number: line_num + 1,
126                        column_number: mat.start() + 1,
127                        evidence: self.extract_evidence(line, mat.start(), mat.end()),
128                        confidence,
129                    });
130                }
131            }
132        }
133        
134        // Intelligent confidence filtering - adaptive threshold based on pattern type
135        matches.retain(|m| {
136            let threshold = match m.pattern.id.as_str() {
137                id if id.contains("aws-access-key") || id.contains("openai-api-key") => 0.3, // High-confidence patterns
138                id if id.contains("jwt-token") || id.contains("database-url") => 0.5, // Medium confidence patterns
139                id if id.contains("generic") => 0.7, // Generic patterns need higher confidence
140                _ => 0.6, // Default threshold
141            };
142            m.confidence > threshold
143        });
144        
145        matches
146    }
147    
148    /// Quick check if content might contain secrets
149    fn quick_contains_secrets(&self, content: &str) -> bool {
150        // Common secret indicators (optimized for speed)
151        const QUICK_PATTERNS: &[&str] = &[
152            "api", "key", "secret", "token", "password", "credential",
153            "auth", "private", "-----BEGIN", "sk_", "pk_", "eyJ",
154        ];
155        
156        let content_lower = content.to_lowercase();
157        QUICK_PATTERNS.iter().any(|&pattern| content_lower.contains(pattern))
158    }
159    
160    /// Run Aho-Corasick matcher and collect results
161    fn run_matcher(
162        &self,
163        matcher: &AhoCorasick,
164        content: &str,
165        patterns: &AHashMap<usize, Arc<CompiledPattern>>,
166        lines: &[&str],
167        line_offsets: &[usize],
168    ) -> Vec<PatternMatch> {
169        let mut matches = Vec::new();
170        
171        for mat in matcher.find_iter(content) {
172            let pattern_id = mat.pattern().as_usize();
173            if let Some(pattern) = patterns.get(&pattern_id) {
174                // Find line and column
175                let (line_num, col_num) = self.offset_to_line_col(mat.start(), line_offsets);
176                let line = lines.get(line_num.saturating_sub(1)).unwrap_or(&"");
177                
178                let confidence = self.calculate_confidence(line, content, pattern);
179                
180                matches.push(PatternMatch {
181                    pattern: Arc::clone(pattern),
182                    line_number: line_num,
183                    column_number: col_num,
184                    evidence: self.extract_evidence(line, mat.start(), mat.end()),
185                    confidence,
186                });
187            }
188        }
189        
190        matches
191    }
192    
193    /// Convert byte offset to line and column numbers
194    fn offset_to_line_col(&self, offset: usize, line_offsets: &[usize]) -> (usize, usize) {
195        let line_num = line_offsets.binary_search(&offset)
196            .unwrap_or_else(|i| i.saturating_sub(1));
197        
198        let line_start = line_offsets.get(line_num).copied().unwrap_or(0);
199        let col_num = offset - line_start + 1;
200        
201        (line_num + 1, col_num)
202    }
203    
204    /// Calculate confidence score for a match
205    fn calculate_confidence(&self, line: &str, content: &str, pattern: &CompiledPattern) -> f32 {
206        let mut confidence: f32 = 0.6;
207        
208        let line_lower = line.to_lowercase();
209        let content_lower = content.to_lowercase();
210        
211        // Basic false positive detection
212        if line_lower.starts_with("//") || line_lower.starts_with("#") || line_lower.contains("example") ||
213           line_lower.contains("placeholder") || line_lower.contains("your_") || line_lower.contains("todo") {
214            return 0.0; // Skip obvious examples/docs
215        }
216        
217        // Boost confidence for actual assignments
218        if line.contains("=") || line.contains(":") {
219            confidence += 0.2;
220        }
221        
222        // Check pattern-specific keywords
223        for keyword in &pattern.confidence_boost_keywords {
224            if content_lower.contains(&keyword.to_lowercase()) {
225                confidence += 0.1;
226            }
227        }
228        
229        for keyword in &pattern.false_positive_keywords {
230            if line_lower.contains(&keyword.to_lowercase()) {
231                confidence -= 0.4;
232            }
233        }
234        
235        confidence.clamp(0.0, 1.0)
236    }
237    
238
239    
240    /// Extract evidence with context
241    fn extract_evidence(&self, line: &str, start: usize, end: usize) -> String {
242        // Mask the actual secret value
243        let prefix = &line[..start.min(line.len())];
244        let suffix = &line[end.min(line.len())..];
245        let masked = "*".repeat((end - start).min(20));
246        
247        format!("{}{}{}", prefix, masked, suffix).trim().to_string()
248    }
249    
250    /// Build Aho-Corasick matcher from patterns
251    fn build_matcher(patterns: &[(String, Arc<CompiledPattern>)]) -> Result<AhoCorasick, SecurityError> {
252        let strings: Vec<&str> = patterns.iter().map(|(s, _)| s.as_str()).collect();
253        
254        let matcher = AhoCorasickBuilder::new()
255            .match_kind(MatchKind::LeftmostFirst)
256            .ascii_case_insensitive(true)
257            .build(&strings)
258            .map_err(|e| SecurityError::PatternEngine(format!("Failed to build matcher: {}", e)))?;
259        
260        Ok(matcher)
261    }
262    
263    /// Create pattern lookup map
264    fn create_pattern_map(patterns: Vec<(String, Arc<CompiledPattern>)>) -> AHashMap<usize, Arc<CompiledPattern>> {
265        patterns.into_iter()
266            .enumerate()
267            .map(|(id, (_, pattern))| (id, pattern))
268            .collect()
269    }
270    
271    /// Load patterns based on pattern sets
272    fn load_patterns(pattern_sets: &[String]) -> Result<(
273        Vec<(String, Arc<CompiledPattern>)>,
274        Vec<(String, Arc<CompiledPattern>)>,
275        Vec<(String, Arc<CompiledPattern>)>,
276        Vec<(Regex, Arc<CompiledPattern>)>,
277    ), SecurityError> {
278        let mut secret_patterns = Vec::new();
279        let mut env_var_patterns = Vec::new();
280        let mut api_key_patterns = Vec::new();
281        let mut complex_patterns = Vec::new();
282        
283        // Load default patterns
284        if pattern_sets.contains(&"default".to_string()) {
285            Self::load_default_patterns(&mut secret_patterns, &mut env_var_patterns, 
286                                      &mut api_key_patterns, &mut complex_patterns)?;
287        }
288        
289        // Load additional pattern sets
290        for set in pattern_sets {
291            match set.as_str() {
292                "aws" => Self::load_aws_patterns(&mut api_key_patterns)?,
293                "gcp" => Self::load_gcp_patterns(&mut api_key_patterns)?,
294                "azure" => Self::load_azure_patterns(&mut api_key_patterns)?,
295                "crypto" => Self::load_crypto_patterns(&mut secret_patterns)?,
296                _ => {}
297            }
298        }
299        
300        Ok((secret_patterns, env_var_patterns, api_key_patterns, complex_patterns))
301    }
302    
303    /// Load default security patterns - focused on ACTUAL secrets, not references
304    fn load_default_patterns(
305        secret_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
306        env_var_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
307        api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
308        complex_patterns: &mut Vec<(Regex, Arc<CompiledPattern>)>,
309    ) -> Result<(), SecurityError> {
310        // ONLY detect actual API key values, not variable names
311        
312        // OpenAI API Keys - actual key format
313        api_key_patterns.push((
314            "sk-".to_string(),
315            Arc::new(CompiledPattern {
316                id: "openai-api-key".to_string(),
317                name: "OpenAI API Key".to_string(),
318                severity: SecuritySeverity::Critical,
319                category: SecurityCategory::SecretsExposure,
320                description: "OpenAI API key detected".to_string(),
321                remediation: vec![
322                    "Remove API key from source code".to_string(),
323                    "Use environment variables".to_string(),
324                ],
325                references: vec!["https://platform.openai.com/docs/api-reference".to_string()],
326                cwe_id: Some("CWE-798".to_string()),
327                confidence_boost_keywords: vec!["openai".to_string(), "gpt".to_string()],
328                false_positive_keywords: vec![
329                    "sk-xxxxxxxx".to_string(), "sk-...".to_string(), "sk_test".to_string(),
330                    "example".to_string(), "placeholder".to_string(), "your_".to_string(),
331                    "TODO".to_string(), "FIXME".to_string(), "XXX".to_string(),
332                ],
333            }),
334        ));
335        
336        // Complex regex patterns for ACTUAL secret assignments with values
337        complex_patterns.push((
338            // Only match when there's an actual long value, not just variable names
339            Regex::new(r#"(?i)(?:api[_-]?key|secret[_-]?key|access[_-]?token)\s*[:=]\s*['"]([a-zA-Z0-9+/=]{32,})['"]"#)
340                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
341            Arc::new(CompiledPattern {
342                id: "long-secret-value".to_string(),
343                name: "Hardcoded Secret Value".to_string(),
344                severity: SecuritySeverity::Critical,
345                category: SecurityCategory::SecretsExposure,
346                description: "Long secret value hardcoded in source code".to_string(),
347                remediation: vec![
348                    "Use environment variables for secrets".to_string(),
349                    "Implement proper secret management".to_string(),
350                ],
351                references: vec![],
352                cwe_id: Some("CWE-798".to_string()),
353                confidence_boost_keywords: vec!["bearer".to_string(), "auth".to_string()],
354                false_positive_keywords: vec![
355                    "process.env".to_string(), "getenv".to_string(), "example".to_string(),
356                    "placeholder".to_string(), "your_".to_string(), "TODO".to_string(),
357                    "test".to_string(), "demo".to_string(), "fake".to_string(),
358                ],
359            }),
360        ));
361        
362        // JWT tokens (actual token format)
363        complex_patterns.push((
364            Regex::new(r#"\beyJ[a-zA-Z0-9+/=]{100,}\b"#)
365                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
366            Arc::new(CompiledPattern {
367                id: "jwt-token".to_string(),
368                name: "JWT Token".to_string(),
369                severity: SecuritySeverity::High,
370                category: SecurityCategory::SecretsExposure,
371                description: "JWT token detected in source code".to_string(),
372                remediation: vec![
373                    "Never hardcode JWT tokens".to_string(),
374                    "Use secure token storage".to_string(),
375                ],
376                references: vec![],
377                cwe_id: Some("CWE-798".to_string()),
378                confidence_boost_keywords: vec!["bearer".to_string(), "authorization".to_string()],
379                false_positive_keywords: vec!["example".to_string(), "demo".to_string()],
380            }),
381        ));
382        
383        // Database connection strings with embedded credentials
384        complex_patterns.push((
385            Regex::new(r#"(?i)(?:postgres|mysql|mongodb)://[^:\s]+:[^@\s]+@[^/\s]+/[^\s]*"#)
386                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
387            Arc::new(CompiledPattern {
388                id: "database-url-with-creds".to_string(),
389                name: "Database URL with Credentials".to_string(),
390                severity: SecuritySeverity::Critical,
391                category: SecurityCategory::SecretsExposure,
392                description: "Database connection string with embedded credentials".to_string(),
393                remediation: vec![
394                    "Use environment variables for database credentials".to_string(),
395                    "Use connection string without embedded passwords".to_string(),
396                ],
397                references: vec![],
398                cwe_id: Some("CWE-798".to_string()),
399                confidence_boost_keywords: vec!["connection".to_string(), "database".to_string()],
400                false_positive_keywords: vec![
401                    "example.com".to_string(), "localhost".to_string(), "placeholder".to_string(),
402                    "your_".to_string(), "user:pass".to_string(),
403                ],
404            }),
405        ));
406        
407        // Private SSH/SSL keys
408        secret_patterns.push((
409            "-----BEGIN".to_string(),
410            Arc::new(CompiledPattern {
411                id: "private-key-header".to_string(),
412                name: "Private Key".to_string(),
413                severity: SecuritySeverity::Critical,
414                category: SecurityCategory::SecretsExposure,
415                description: "Private key detected".to_string(),
416                remediation: vec![
417                    "Never commit private keys to version control".to_string(),
418                    "Use secure key storage solutions".to_string(),
419                ],
420                references: vec![],
421                cwe_id: Some("CWE-321".to_string()),
422                confidence_boost_keywords: vec!["PRIVATE".to_string(), "RSA".to_string(), "DSA".to_string()],
423                false_positive_keywords: vec!["PUBLIC".to_string(), "CERTIFICATE".to_string()],
424            }),
425        ));
426        
427        Ok(())
428    }
429    
430    /// Load AWS-specific patterns
431    fn load_aws_patterns(api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>) -> Result<(), SecurityError> {
432        api_key_patterns.push((
433            "AKIA".to_string(),
434            Arc::new(CompiledPattern {
435                id: "aws-access-key".to_string(),
436                name: "AWS Access Key".to_string(),
437                severity: SecuritySeverity::Critical,
438                category: SecurityCategory::SecretsExposure,
439                description: "AWS Access Key ID detected".to_string(),
440                remediation: vec![
441                    "Remove AWS credentials from source code".to_string(),
442                    "Use IAM roles or environment variables".to_string(),
443                    "Rotate the exposed key immediately".to_string(),
444                ],
445                references: vec!["https://docs.aws.amazon.com/security/".to_string()],
446                cwe_id: Some("CWE-798".to_string()),
447                confidence_boost_keywords: vec!["aws".to_string(), "s3".to_string(), "ec2".to_string()],
448                false_positive_keywords: vec!["AKIA00000000".to_string()],
449            }),
450        ));
451        
452        Ok(())
453    }
454    
455    /// Load GCP-specific patterns
456    fn load_gcp_patterns(api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>) -> Result<(), SecurityError> {
457        api_key_patterns.push((
458            "AIza".to_string(),
459            Arc::new(CompiledPattern {
460                id: "gcp-api-key".to_string(),
461                name: "Google Cloud API Key".to_string(),
462                severity: SecuritySeverity::High,
463                category: SecurityCategory::SecretsExposure,
464                description: "Google Cloud API key detected".to_string(),
465                remediation: vec![
466                    "Use service accounts instead of API keys".to_string(),
467                    "Restrict API key usage by IP/referrer".to_string(),
468                ],
469                references: vec!["https://cloud.google.com/security/".to_string()],
470                cwe_id: Some("CWE-798".to_string()),
471                confidence_boost_keywords: vec!["google".to_string(), "gcp".to_string(), "firebase".to_string()],
472                false_positive_keywords: vec![],
473            }),
474        ));
475        
476        Ok(())
477    }
478    
479    /// Load Azure-specific patterns
480    fn load_azure_patterns(_api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>) -> Result<(), SecurityError> {
481        // Azure patterns would go here
482        Ok(())
483    }
484    
485    /// Load cryptocurrency-related patterns
486    fn load_crypto_patterns(secret_patterns: &mut Vec<(String, Arc<CompiledPattern>)>) -> Result<(), SecurityError> {
487        secret_patterns.push((
488            "-----BEGIN".to_string(),
489            Arc::new(CompiledPattern {
490                id: "private-key".to_string(),
491                name: "Private Key".to_string(),
492                severity: SecuritySeverity::Critical,
493                category: SecurityCategory::SecretsExposure,
494                description: "Private key detected".to_string(),
495                remediation: vec![
496                    "Never commit private keys to version control".to_string(),
497                    "Use secure key storage solutions".to_string(),
498                ],
499                references: vec![],
500                cwe_id: Some("CWE-321".to_string()),
501                confidence_boost_keywords: vec!["RSA".to_string(), "PRIVATE".to_string()],
502                false_positive_keywords: vec!["PUBLIC".to_string()],
503            }),
504        ));
505        
506        Ok(())
507    }
508}
509
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): these tests assume `TurboConfig::default()` enables the
    // "default" pattern set - confirm against `TurboConfig`.

    /// The engine builds from the default configuration and loads rules.
    #[test]
    fn test_pattern_engine_creation() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config);
        assert!(engine.is_ok());

        let engine = engine.unwrap();
        assert!(engine.pattern_count() > 0);
    }

    /// A hardcoded API key and a database URL with credentials are reported.
    #[test]
    fn test_pattern_matching() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config).unwrap();

        let content = r#"
            const apiKey = "sk-1234567890abcdef";
            password = "super_secret_password";
            process.env.DATABASE_URL
            db = "postgres://admin:hunter2@db.internal:5432/app";
        "#;

        let matches = engine.scan_content(content, false);
        assert!(!matches.is_empty());

        // Only ids that a loader in this module actually registers are
        // asserted; there is no generic password rule to match the second line.
        assert!(matches.iter().any(|m| m.pattern.id == "openai-api-key"));
        assert!(matches.iter().any(|m| m.pattern.id == "database-url-with-creds"));
    }

    /// Content without any secret indicator is rejected by the quick check.
    #[test]
    fn test_quick_reject() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config).unwrap();

        let safe_content = "fn main() { println!(\"Hello, world!\"); }";
        let matches = engine.scan_content(safe_content, true);
        assert!(matches.is_empty());
    }
}
552}