syncable_cli/analyzer/security/turbo/
pattern_engine.rs

1//! # Pattern Engine Module
2//!
3//! Ultra-fast multi-pattern matching using Aho-Corasick algorithm and compiled regex sets.
4
5use ahash::AHashMap;
6use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
7use log::debug;
8use regex::Regex;
9use std::sync::Arc;
10
11use super::{SecurityError, TurboConfig};
12use crate::analyzer::security::{SecurityCategory, SecuritySeverity};
13
14/// A compiled pattern for ultra-fast matching
15#[derive(Debug, Clone)]
16pub struct CompiledPattern {
17    pub id: String,
18    pub name: String,
19    pub severity: SecuritySeverity,
20    pub category: SecurityCategory,
21    pub description: String,
22    pub remediation: Vec<String>,
23    pub references: Vec<String>,
24    pub cwe_id: Option<String>,
25    pub confidence_boost_keywords: Vec<String>,
26    pub false_positive_keywords: Vec<String>,
27}
28
29/// Pattern match result
30#[derive(Debug, Clone)]
31pub struct PatternMatch {
32    pub pattern: Arc<CompiledPattern>,
33    pub line_number: usize,
34    pub column_number: usize,
35    pub evidence: String,
36    pub confidence: f32,
37}
38
39/// High-performance pattern matching engine
40pub struct PatternEngine {
41    // Multi-pattern matchers
42    secret_matcher: AhoCorasick,
43    env_var_matcher: AhoCorasick,
44    api_key_matcher: AhoCorasick,
45
46    // Pattern lookup maps
47    secret_patterns: AHashMap<usize, Arc<CompiledPattern>>,
48    env_var_patterns: AHashMap<usize, Arc<CompiledPattern>>,
49    api_key_patterns: AHashMap<usize, Arc<CompiledPattern>>,
50
51    // Specialized matchers for complex patterns
52    complex_patterns: Vec<(Regex, Arc<CompiledPattern>)>,
53
54    // Performance counters
55    total_patterns: usize,
56}
57
58impl PatternEngine {
59    pub fn new(config: &TurboConfig) -> Result<Self, SecurityError> {
60        debug!(
61            "Initializing pattern engine with pattern sets: {:?}",
62            config.pattern_sets
63        );
64
65        // Load patterns based on configuration
66        let (secret_patterns, env_var_patterns, api_key_patterns, complex_patterns) =
67            Self::load_patterns(&config.pattern_sets)?;
68
69        // Build Aho-Corasick matchers
70        let secret_matcher = Self::build_matcher(&secret_patterns)?;
71        let env_var_matcher = Self::build_matcher(&env_var_patterns)?;
72        let api_key_matcher = Self::build_matcher(&api_key_patterns)?;
73
74        let total_patterns = secret_patterns.len()
75            + env_var_patterns.len()
76            + api_key_patterns.len()
77            + complex_patterns.len();
78
79        debug!(
80            "Pattern engine initialized with {} total patterns",
81            total_patterns
82        );
83
84        Ok(Self {
85            secret_matcher,
86            env_var_matcher,
87            api_key_matcher,
88            secret_patterns: Self::create_pattern_map(secret_patterns),
89            env_var_patterns: Self::create_pattern_map(env_var_patterns),
90            api_key_patterns: Self::create_pattern_map(api_key_patterns),
91            complex_patterns,
92            total_patterns,
93        })
94    }
95
96    /// Get total pattern count
97    pub fn pattern_count(&self) -> usize {
98        self.total_patterns
99    }
100
101    /// Scan content for all patterns
102    pub fn scan_content(
103        &self,
104        content: &str,
105        quick_reject: bool,
106        file_meta: &super::file_discovery::FileMetadata,
107    ) -> Vec<PatternMatch> {
108        // Quick reject using Boyer-Moore substring search
109        if quick_reject && !self.quick_contains_secrets(content) {
110            return Vec::new();
111        }
112
113        let mut matches = Vec::new();
114
115        // Split content into lines for line number tracking
116        let lines: Vec<&str> = content.lines().collect();
117        let mut line_offsets = vec![0];
118        let mut offset = 0;
119
120        for line in &lines {
121            offset += line.len() + 1; // +1 for newline
122            line_offsets.push(offset);
123        }
124
125        // Run multi-pattern matchers
126        matches.extend(self.run_matcher(
127            &self.secret_matcher,
128            content,
129            &self.secret_patterns,
130            &lines,
131            &line_offsets,
132            file_meta,
133        ));
134        matches.extend(self.run_matcher(
135            &self.env_var_matcher,
136            content,
137            &self.env_var_patterns,
138            &lines,
139            &line_offsets,
140            file_meta,
141        ));
142        matches.extend(self.run_matcher(
143            &self.api_key_matcher,
144            content,
145            &self.api_key_patterns,
146            &lines,
147            &line_offsets,
148            file_meta,
149        ));
150
151        // Run complex patterns (regex-based)
152        for (line_num, line) in lines.iter().enumerate() {
153            for (regex, pattern) in &self.complex_patterns {
154                if let Some(mat) = regex.find(line) {
155                    let confidence = self.calculate_confidence(line, content, &pattern, file_meta);
156
157                    matches.push(PatternMatch {
158                        pattern: Arc::clone(pattern),
159                        line_number: line_num + 1,
160                        column_number: mat.start() + 1,
161                        evidence: self.extract_evidence(line, mat.start(), mat.end()),
162                        confidence,
163                    });
164                }
165            }
166        }
167
168        // Intelligent confidence filtering - adaptive threshold based on pattern type
169        matches.retain(|m| {
170            let threshold = match m.pattern.id.as_str() {
171                id if id.contains("aws-access-key") => 0.4, // AWS keys need higher confidence
172                id if id.contains("openai-api-key") => 0.4, // OpenAI keys need higher confidence
173                id if id.contains("jwt-token") => 0.6, // JWT tokens need high confidence (often in examples)
174                id if id.contains("database-url") => 0.5, // Database URLs medium confidence
175                id if id.contains("bearer-token") => 0.7, // Bearer tokens often in examples
176                id if id.contains("generic") => 0.8,   // Generic patterns need very high confidence
177                id if id.contains("long-secret-value") => 0.7, // Long secret values need high confidence
178                _ => 0.7,                                      // Increased default threshold
179            };
180            m.confidence > threshold
181        });
182
183        matches
184    }
185
186    /// Quick check if content might contain secrets
187    fn quick_contains_secrets(&self, content: &str) -> bool {
188        // Enhanced quick rejection for common false positive patterns
189        if self.is_likely_false_positive_content(content) {
190            return false;
191        }
192
193        // Common secret indicators (optimized for speed)
194        const QUICK_PATTERNS: &[&str] = &[
195            "api",
196            "key",
197            "secret",
198            "token",
199            "password",
200            "credential",
201            "auth",
202            "private",
203            "-----BEGIN",
204            "sk_",
205            "pk_",
206            "eyJ",
207        ];
208
209        let content_lower = content.to_lowercase();
210        QUICK_PATTERNS
211            .iter()
212            .any(|&pattern| content_lower.contains(pattern))
213    }
214
215    /// Check if content is likely a false positive (encoded data, minified code, etc.)
216    fn is_likely_false_positive_content(&self, content: &str) -> bool {
217        let content_len = content.len();
218
219        // Skip empty or very small content
220        if content_len < 10 {
221            return true;
222        }
223
224        // Check for base64 data URLs (common in SVG, images)
225        if content.contains("data:image/") || content.contains("data:font/") {
226            return true;
227        }
228
229        // Check for minified JavaScript (very long lines, no spaces)
230        let lines: Vec<&str> = content.lines().collect();
231        if lines.len() < 5
232            && lines
233                .iter()
234                .any(|line| line.len() > 500 && line.matches(' ').count() < line.len() / 50)
235        {
236            return true;
237        }
238
239        // Check for high percentage of base64-like characters (but not a JWT)
240        let base64_chars = content
241            .chars()
242            .filter(|c| c.is_alphanumeric() || *c == '+' || *c == '/' || *c == '=')
243            .count();
244        let base64_ratio = base64_chars as f32 / content_len as f32;
245
246        // High base64 ratio but doesn't look like JWT tokens
247        if base64_ratio > 0.8 && !content.contains("eyJ") && content_len > 1000 {
248            return true;
249        }
250
251        // Check for SVG content
252        if content.contains("<svg") || content.contains("xmlns=\"http://www.w3.org/2000/svg\"") {
253            return true;
254        }
255
256        // Check for CSS content
257        if content.contains("@media")
258            || content.contains("@import")
259            || (content.contains("{") && content.contains("}") && content.contains(":"))
260        {
261            return true;
262        }
263
264        false
265    }
266
267    /// Run Aho-Corasick matcher and collect results
268    fn run_matcher(
269        &self,
270        matcher: &AhoCorasick,
271        content: &str,
272        patterns: &AHashMap<usize, Arc<CompiledPattern>>,
273        lines: &[&str],
274        line_offsets: &[usize],
275        file_meta: &super::file_discovery::FileMetadata,
276    ) -> Vec<PatternMatch> {
277        let mut matches = Vec::new();
278
279        for mat in matcher.find_iter(content) {
280            let pattern_id = mat.pattern().as_usize();
281            if let Some(pattern) = patterns.get(&pattern_id) {
282                // Find line and column
283                let (line_num, col_num) = self.offset_to_line_col(mat.start(), line_offsets);
284                let line = lines.get(line_num.saturating_sub(1)).unwrap_or(&"");
285
286                let confidence = self.calculate_confidence(line, content, pattern, file_meta);
287
288                matches.push(PatternMatch {
289                    pattern: Arc::clone(pattern),
290                    line_number: line_num,
291                    column_number: col_num,
292                    evidence: self.extract_evidence(line, mat.start(), mat.end()),
293                    confidence,
294                });
295            }
296        }
297
298        matches
299    }
300
301    /// Convert byte offset to line and column numbers
302    fn offset_to_line_col(&self, offset: usize, line_offsets: &[usize]) -> (usize, usize) {
303        let line_num = line_offsets
304            .binary_search(&offset)
305            .unwrap_or_else(|i| i.saturating_sub(1));
306
307        let line_start = line_offsets.get(line_num).copied().unwrap_or(0);
308        let col_num = offset - line_start + 1;
309
310        (line_num + 1, col_num)
311    }
312
313    /// Calculate confidence score for a match
314    fn calculate_confidence(
315        &self,
316        line: &str,
317        content: &str,
318        pattern: &CompiledPattern,
319        file_meta: &super::file_discovery::FileMetadata,
320    ) -> f32 {
321        let mut confidence: f32 = 0.6;
322
323        let _line_lower = line.to_lowercase();
324        let _content_lower = content.to_lowercase();
325
326        // Enhanced false positive detection
327        if self.is_obvious_false_positive(line, content, file_meta) {
328            return 0.0;
329        }
330
331        // Context-based confidence adjustments
332        confidence = self.adjust_confidence_for_context(confidence, line, content, pattern);
333
334        // Pattern-specific adjustments
335        confidence = self.adjust_confidence_for_pattern(confidence, line, content, pattern);
336
337        confidence.clamp(0.0, 1.0)
338    }
339
340    /// Check for obvious false positives
341    fn is_obvious_false_positive(
342        &self,
343        line: &str,
344        content: &str,
345        file_meta: &super::file_discovery::FileMetadata,
346    ) -> bool {
347        let line_lower = line.to_lowercase();
348
349        // Comments and documentation
350        if line_lower.trim_start().starts_with("//")
351            || line_lower.trim_start().starts_with("#")
352            || line_lower.trim_start().starts_with("*")
353            || line_lower.trim_start().starts_with("<!--")
354        {
355            return true;
356        }
357
358        // Check for safe keys in common dependency management files
359        if self.is_safe_dependency_metadata(line, file_meta) {
360            return true;
361        }
362
363        // JavaScript/TypeScript template literals (${...})
364        if line.contains("${") && line.contains("}") {
365            return true;
366        }
367
368        // Template strings and interpolation patterns
369        if line.contains("${selectedApiKey")
370            || line.contains("${apiKey")
371            || line.contains("${key")
372            || line.contains("${token")
373        {
374            return true;
375        }
376
377        // Code generation contexts (functions that generate example code)
378        if self.is_in_code_generation_context(content) && self.looks_like_template_code(line) {
379            return true;
380        }
381
382        // Common example/placeholder patterns
383        let false_positive_patterns = [
384            "example",
385            "placeholder",
386            "your_",
387            "todo",
388            "fixme",
389            "xxx",
390            "xxxxxxxx",
391            "12345",
392            "abcdef",
393            "test",
394            "demo",
395            "sample",
396            "lorem",
397            "ipsum",
398            "change_me",
399            "replace_me",
400            "insert_",
401            "enter_your",
402            "add_your",
403            "put_your",
404            "use_your",
405            // React/JSX specific patterns
406            "props.",
407            "state.",
408            "this.",
409            "component",
410        ];
411
412        if false_positive_patterns
413            .iter()
414            .any(|&pattern| line_lower.contains(pattern))
415        {
416            return true;
417        }
418
419        // Check for JSON schema or TypeScript interfaces
420        if line_lower.contains("@example")
421            || line_lower.contains("@param")
422            || line_lower.contains("interface")
423            || line_lower.contains("type ")
424        {
425            return true;
426        }
427
428        // Check for base64 data URLs
429        if line.contains("data:image/")
430            || line.contains("data:font/")
431            || line.contains("data:application/")
432        {
433            return true;
434        }
435
436        // Check for URLs in an array context
437        if (line.contains("http://") || line.contains("https://"))
438            && self.is_in_array_or_list(content)
439        {
440            return true;
441        }
442
443        // Check for command-line scripts which often contain high-entropy strings
444        // that are not secrets (e.g., project IDs, build hashes).
445        if self.is_command_line_script(line) {
446            return true;
447        }
448
449        // Check for environment variable interpolations, which are secure.
450        if self.is_env_var_interpolation(line, file_meta) {
451            return true;
452        }
453
454        // Check for minified content (very long line with little whitespace)
455        if line.len() > 200 && line.matches(' ').count() < line.len() / 20 {
456            return true;
457        }
458
459        // React/JSX template patterns
460        if line.contains("return `") || line.contains("const ") && line.contains(" = `") {
461            return true;
462        }
463
464        false
465    }
466
467    /// Check if we're inside an array or list definition
468    fn is_in_array_or_list(&self, content: &str) -> bool {
469        let content_lower = content.to_lowercase();
470        // Language-agnostic checks for array/list definitions
471        let array_patterns = [
472            "const ",
473            "let ",
474            "var ",
475            "export const ",
476            "export let ",
477            "authorized_parties",
478            "allowed_origins",
479            "authorized_domains",
480            "hosts",
481            "urls",
482            "uris",
483            "endpoints",
484            "domains",
485            "redirect_uris",
486            "allowed_hosts",
487            "cors_origins",
488            "trusted_sources",
489        ];
490
491        array_patterns.iter().any(|p| content_lower.contains(p)) &&
492        (content.contains("[") && content.contains("]")) || // JS, Python, Rust arrays/lists
493        (content.contains("(") && content.contains(")")) || // Python tuples
494        (content.contains("{") && content.contains("}")) // Go slices
495    }
496
497    /// Check if a line looks like a command-line script.
498    /// This is to avoid flagging project IDs, build hashes, or other identifiers
499    /// inside shell commands as secrets.
500    fn is_command_line_script(&self, line: &str) -> bool {
501        // Quick check for flags, which are a strong indicator of a shell command.
502        if !line.contains("--") {
503            return false;
504        }
505
506        let line_lower = line.to_lowercase();
507
508        // Common script/command keywords.
509        // The presence of these alongside flags increases confidence that it's a script.
510        let command_keywords = [
511            // Verbs
512            "run",
513            "exec",
514            "build",
515            "start",
516            "test",
517            "deploy",
518            "gen",
519            "generate",
520            "get",
521            "set",
522            "create",
523            "delete",
524            "update",
525            "push",
526            "pull",
527            "watch",
528            "serve",
529            "lint",
530            "format",
531            // Nouns/Context
532            "client",
533            "server",
534            "output",
535            "input",
536            "file",
537            "env",
538            "environment",
539            "config",
540            "path",
541            "dir",
542            "port",
543            "host",
544            "watch",
545            "prod",
546            "dev",
547            // Common tools
548            "npm",
549            "yarn",
550            "pnpm",
551            "npx",
552            "node",
553            "python",
554            "pip",
555            "go",
556            "cargo",
557            "docker",
558            "aws",
559            "gcloud",
560            "az",
561            "kubectl",
562            "terraform",
563            "encore",
564            "bun",
565            "bunx",
566            "maven",
567            "gradle",
568            "gradlew",
569            "gradlew.bat",
570            "gradlew.sh",
571            "gradlew.jar",
572            "gradlew.zip",
573            "mvn",
574            "pipx",
575            "pipenv",
576            "poetry",
577            "ruff",
578            "black",
579            "isort",
580            "flake8",
581            "mypy",
582            "pytest",
583            "jest",
584            "mocha",
585            "jasmine",
586            "cypress",
587            "playwright",
588            "selenium",
589            "puppeteer",
590            "webdriver",
591            "puppeteer-extra",
592            "puppeteer-extra-plugin-stealth",
593            "puppeteer-extra-plugin-recaptcha",
594        ];
595
596        // If we find a flag AND a common command keyword, it's very likely a script.
597        if command_keywords.iter().any(|&kw| line_lower.contains(kw)) {
598            return true;
599        }
600
601        // Also consider it a script if it looks like a file path assignment after a flag
602        if line.contains("--") && (line.contains('/') || line.contains('\\') || line.contains('='))
603        {
604            return true;
605        }
606
607        false
608    }
609
610    /// Check if we're in a code generation context
611    fn is_in_code_generation_context(&self, content: &str) -> bool {
612        let content_lower = content.to_lowercase();
613
614        // Common code generation function names and patterns
615        let code_gen_patterns = [
616            "getcode",
617            "generatecode",
618            "codecomponent",
619            "apicodedialog",
620            "const getcode",
621            "function getcode",
622            "const code",
623            "function code",
624            "codesnippet",
625            "codeexample",
626            "template",
627            "example code",
628            "code generator",
629            "api example",
630            "curl example",
631            // React/JSX specific
632            "codeblock",
633            "copyblock",
634            "syntax highlight",
635        ];
636
637        code_gen_patterns
638            .iter()
639            .any(|&pattern| content_lower.contains(pattern))
640    }
641
642    /// Check if a line looks like template code
643    fn looks_like_template_code(&self, line: &str) -> bool {
644        // Template string patterns
645        if line.contains("return `") || line.contains("= `") {
646            return true;
647        }
648
649        // API URL construction patterns
650        if line.contains("API_URL") || line.contains("/api/v1/") || line.contains("/prediction/") {
651            return true;
652        }
653
654        // Typical code example patterns
655        if line.contains("requests.post")
656            || line.contains("fetch(")
657            || line.contains("curl ")
658            || line.contains("import requests")
659        {
660            return true;
661        }
662
663        // Authorization header patterns in templates
664        if line.contains("Authorization:") || line.contains("Bearer ") {
665            return true;
666        }
667
668        false
669    }
670
671    /// Adjust confidence based on context
672    fn adjust_confidence_for_context(
673        &self,
674        mut confidence: f32,
675        line: &str,
676        content: &str,
677        _pattern: &CompiledPattern,
678    ) -> f32 {
679        let line_lower = line.to_lowercase();
680        let content_lower = content.to_lowercase();
681
682        // Boost confidence for actual assignments
683        if line.contains("=") || line.contains(":") {
684            confidence += 0.2;
685        }
686
687        // Boost for environment variable assignment
688        if line_lower.contains("export ") || line_lower.contains("process.env") {
689            confidence += 0.3;
690        }
691
692        // Boost for import statements with API keys
693        if line_lower.contains("import")
694            && (line_lower.contains("api") || line_lower.contains("key"))
695        {
696            confidence += 0.1;
697        }
698
699        // Reduce confidence for certain file types based on content
700        if content_lower.contains("package.json") || content_lower.contains("node_modules") {
701            confidence -= 0.2;
702        }
703
704        // Reduce confidence for test files
705        if content_lower.contains("/test/")
706            || content_lower.contains("__test__")
707            || content_lower.contains(".test.")
708            || content_lower.contains(".spec.")
709        {
710            confidence -= 0.3;
711        }
712
713        // Reduce confidence for documentation
714        if content_lower.contains("readme")
715            || content_lower.contains("documentation")
716            || content_lower.contains("docs/")
717        {
718            confidence -= 0.4;
719        }
720
721        confidence
722    }
723
724    /// Adjust confidence based on pattern-specific rules
725    fn adjust_confidence_for_pattern(
726        &self,
727        mut confidence: f32,
728        line: &str,
729        content: &str,
730        pattern: &CompiledPattern,
731    ) -> f32 {
732        let line_lower = line.to_lowercase();
733        let content_lower = content.to_lowercase();
734
735        // Major confidence reduction for template/code generation contexts
736        if self.is_in_code_generation_context(content) {
737            confidence -= 0.6;
738        }
739
740        // Check pattern-specific confidence boost keywords
741        for keyword in &pattern.confidence_boost_keywords {
742            if content_lower.contains(&keyword.to_lowercase()) {
743                confidence += 0.1;
744            }
745        }
746
747        // Check pattern-specific false positive keywords
748        for keyword in &pattern.false_positive_keywords {
749            if line_lower.contains(&keyword.to_lowercase()) {
750                confidence -= 0.4;
751            }
752        }
753
754        // Special handling for specific pattern types
755        match pattern.id.as_str() {
756            "jwt-token" => {
757                // JWT tokens should have proper structure
758                if !line.contains("eyJ") || line.split('.').count() != 3 {
759                    confidence -= 0.3;
760                }
761                // Less confident if in a comment or documentation
762                if line_lower.contains("example") || line_lower.contains("jwt") {
763                    confidence -= 0.2;
764                }
765                // Very low confidence for template literals
766                if line.contains("${") {
767                    confidence -= 0.8;
768                }
769            }
770            "openai-api-key" => {
771                // OpenAI keys should start with sk- and be proper length
772                if !line.contains("sk-") {
773                    confidence -= 0.5;
774                }
775                // Boost if in actual code context
776                if line_lower.contains("openai") || line_lower.contains("gpt") {
777                    confidence += 0.2;
778                }
779                // Major reduction for template literals
780                if line.contains("${") || line.contains("selectedApiKey") {
781                    confidence -= 0.9;
782                }
783            }
784            "database-url-with-creds" => {
785                // Should be a valid URL format
786                if !line.contains("://") || line.contains("example.com") {
787                    confidence -= 0.4;
788                }
789
790                // Check for placeholder credentials
791                let placeholder_creds = [
792                    "user:pass",
793                    "user:password",
794                    "admin:admin",
795                    "admin:password",
796                    "username:password",
797                    "test:test",
798                    "root:root",
799                    "postgres:postgres",
800                ];
801                if placeholder_creds.iter().any(|p| line.contains(p)) {
802                    confidence -= 0.8; // Drastically reduce confidence for placeholders
803                }
804
805                // Reduce for template patterns
806                if line.contains("${") {
807                    confidence -= 0.7;
808                }
809            }
810            "long-secret-value" | "generic-api-key" => {
811                // High reduction for template literals and code generation
812                if line.contains("${")
813                    || line.contains("selectedApiKey")
814                    || line.contains("apiKey") && line.contains("?")
815                {
816                    confidence -= 0.8;
817                }
818                // Reduce for Bearer token patterns in templates
819                if line.contains("Bearer ") && line.contains("${") {
820                    confidence -= 0.9;
821                }
822            }
823            _ => {
824                // General template literal reduction
825                if line.contains("${") {
826                    confidence -= 0.6;
827                }
828            }
829        }
830
831        // Additional React/JSX specific reductions
832        if content_lower.contains("react")
833            || content_lower.contains("jsx")
834            || content_lower.contains("component")
835        {
836            if line.contains("${") || line.contains("props.") || line.contains("state.") {
837                confidence -= 0.5;
838            }
839        }
840
841        confidence
842    }
843
844    /// Extract evidence with context
845    fn extract_evidence(&self, line: &str, start: usize, end: usize) -> String {
846        // Mask the actual secret value
847        let prefix = &line[..start.min(line.len())];
848        let suffix = &line[end.min(line.len())..];
849        let masked = "*".repeat((end - start).min(20));
850
851        format!("{}{}{}", prefix, masked, suffix).trim().to_string()
852    }
853
854    /// Build Aho-Corasick matcher from patterns
855    fn build_matcher(
856        patterns: &[(String, Arc<CompiledPattern>)],
857    ) -> Result<AhoCorasick, SecurityError> {
858        let strings: Vec<&str> = patterns.iter().map(|(s, _)| s.as_str()).collect();
859
860        let matcher = AhoCorasickBuilder::new()
861            .match_kind(MatchKind::LeftmostFirst)
862            .ascii_case_insensitive(true)
863            .build(&strings)
864            .map_err(|e| SecurityError::PatternEngine(format!("Failed to build matcher: {}", e)))?;
865
866        Ok(matcher)
867    }
868
869    /// Create pattern lookup map
870    fn create_pattern_map(
871        patterns: Vec<(String, Arc<CompiledPattern>)>,
872    ) -> AHashMap<usize, Arc<CompiledPattern>> {
873        patterns
874            .into_iter()
875            .enumerate()
876            .map(|(id, (_, pattern))| (id, pattern))
877            .collect()
878    }
879
880    /// Load patterns based on pattern sets
881    fn load_patterns(
882        pattern_sets: &[String],
883    ) -> Result<
884        (
885            Vec<(String, Arc<CompiledPattern>)>,
886            Vec<(String, Arc<CompiledPattern>)>,
887            Vec<(String, Arc<CompiledPattern>)>,
888            Vec<(Regex, Arc<CompiledPattern>)>,
889        ),
890        SecurityError,
891    > {
892        let mut secret_patterns = Vec::new();
893        let mut env_var_patterns = Vec::new();
894        let mut api_key_patterns = Vec::new();
895        let mut complex_patterns = Vec::new();
896
897        // Load default patterns
898        if pattern_sets.contains(&"default".to_string()) {
899            Self::load_default_patterns(
900                &mut secret_patterns,
901                &mut env_var_patterns,
902                &mut api_key_patterns,
903                &mut complex_patterns,
904            )?;
905        }
906
907        // Load additional pattern sets
908        for set in pattern_sets {
909            match set.as_str() {
910                "aws" => Self::load_aws_patterns(&mut api_key_patterns)?,
911                "gcp" => Self::load_gcp_patterns(&mut api_key_patterns)?,
912                "azure" => Self::load_azure_patterns(&mut api_key_patterns)?,
913                "crypto" => Self::load_crypto_patterns(&mut secret_patterns)?,
914                _ => {}
915            }
916        }
917
918        Ok((
919            secret_patterns,
920            env_var_patterns,
921            api_key_patterns,
922            complex_patterns,
923        ))
924    }
925
926    /// Load default security patterns - focused on ACTUAL secrets, not references
927    fn load_default_patterns(
928        secret_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
929        _env_var_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
930        api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
931        complex_patterns: &mut Vec<(Regex, Arc<CompiledPattern>)>,
932    ) -> Result<(), SecurityError> {
933        // ONLY detect actual API key values, not variable names
934
935        // OpenAI API Keys - actual key format
936        api_key_patterns.push((
937            "sk-".to_string(),
938            Arc::new(CompiledPattern {
939                id: "openai-api-key".to_string(),
940                name: "OpenAI API Key".to_string(),
941                severity: SecuritySeverity::Critical,
942                category: SecurityCategory::SecretsExposure,
943                description: "OpenAI API key detected".to_string(),
944                remediation: vec![
945                    "Remove API key from source code".to_string(),
946                    "Use environment variables".to_string(),
947                ],
948                references: vec!["https://platform.openai.com/docs/api-reference".to_string()],
949                cwe_id: Some("CWE-798".to_string()),
950                confidence_boost_keywords: vec!["openai".to_string(), "gpt".to_string()],
951                false_positive_keywords: vec![
952                    "sk-xxxxxxxx".to_string(),
953                    "sk-...".to_string(),
954                    "sk_test".to_string(),
955                    "example".to_string(),
956                    "placeholder".to_string(),
957                    "your_".to_string(),
958                    "TODO".to_string(),
959                    "FIXME".to_string(),
960                    "XXX".to_string(),
961                ],
962            }),
963        ));
964
965        // Complex regex patterns for ACTUAL secret assignments with values
966        complex_patterns.push((
967            // Only match when there's an actual long value, not just variable names
968            Regex::new(r#"(?i)(?:api[_-]?key|secret[_-]?key|access[_-]?token)\s*[:=]\s*['"]([a-zA-Z0-9+/=]{32,})['"]"#)
969                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
970            Arc::new(CompiledPattern {
971                id: "long-secret-value".to_string(),
972                name: "Hardcoded Secret Value".to_string(),
973                severity: SecuritySeverity::Critical,
974                category: SecurityCategory::SecretsExposure,
975                description: "Long secret value hardcoded in source code".to_string(),
976                remediation: vec![
977                    "Use environment variables for secrets".to_string(),
978                    "Implement proper secret management".to_string(),
979                ],
980                references: vec![],
981                cwe_id: Some("CWE-798".to_string()),
982                confidence_boost_keywords: vec!["bearer".to_string(), "auth".to_string()],
983                false_positive_keywords: vec![
984                    "process.env".to_string(), "getenv".to_string(), "example".to_string(),
985                    "placeholder".to_string(), "your_".to_string(), "TODO".to_string(),
986                    "test".to_string(), "demo".to_string(), "fake".to_string(),
987                ],
988            }),
989        ));
990
991        // JWT tokens (actual token format)
992        complex_patterns.push((
993            Regex::new(r#"\beyJ[a-zA-Z0-9+/=]{100,}\b"#)
994                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
995            Arc::new(CompiledPattern {
996                id: "jwt-token".to_string(),
997                name: "JWT Token".to_string(),
998                severity: SecuritySeverity::High,
999                category: SecurityCategory::SecretsExposure,
1000                description: "JWT token detected in source code".to_string(),
1001                remediation: vec![
1002                    "Never hardcode JWT tokens".to_string(),
1003                    "Use secure token storage".to_string(),
1004                ],
1005                references: vec![],
1006                cwe_id: Some("CWE-798".to_string()),
1007                confidence_boost_keywords: vec!["bearer".to_string(), "authorization".to_string()],
1008                false_positive_keywords: vec!["example".to_string(), "demo".to_string()],
1009            }),
1010        ));
1011
1012        // Database connection strings with embedded credentials
1013        complex_patterns.push((
1014            Regex::new(r#"(?i)(?:postgres|postgresql|mysql|mongodb|redis|mariadb)://[^:\s]+:[^@\s]+@[^/\s]+/[^\s]*"#)
1015                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
1016            Arc::new(CompiledPattern {
1017                id: "database-url-with-creds".to_string(),
1018                name: "Database URL with Credentials".to_string(),
1019                severity: SecuritySeverity::Critical,
1020                category: SecurityCategory::SecretsExposure,
1021                description: "Database connection string with embedded credentials".to_string(),
1022                remediation: vec![
1023                    "Use environment variables for database credentials".to_string(),
1024                    "Use connection string without embedded passwords".to_string(),
1025                ],
1026                references: vec![],
1027                cwe_id: Some("CWE-798".to_string()),
1028                confidence_boost_keywords: vec!["connection".to_string(), "database".to_string()],
1029                false_positive_keywords: vec![
1030                    "example.com".to_string(), "localhost".to_string(), "placeholder".to_string(),
1031                    "your_".to_string(), "user:pass".to_string(),
1032                ],
1033            }),
1034        ));
1035
1036        // Private SSH/SSL keys
1037        secret_patterns.push((
1038            "-----BEGIN".to_string(),
1039            Arc::new(CompiledPattern {
1040                id: "private-key-header".to_string(),
1041                name: "Private Key".to_string(),
1042                severity: SecuritySeverity::Critical,
1043                category: SecurityCategory::SecretsExposure,
1044                description: "Private key detected".to_string(),
1045                remediation: vec![
1046                    "Never commit private keys to version control".to_string(),
1047                    "Use secure key storage solutions".to_string(),
1048                ],
1049                references: vec![],
1050                cwe_id: Some("CWE-321".to_string()),
1051                confidence_boost_keywords: vec![
1052                    "PRIVATE".to_string(),
1053                    "RSA".to_string(),
1054                    "DSA".to_string(),
1055                ],
1056                false_positive_keywords: vec!["PUBLIC".to_string(), "CERTIFICATE".to_string()],
1057            }),
1058        ));
1059
1060        Ok(())
1061    }
1062
1063    /// Load AWS-specific patterns
1064    fn load_aws_patterns(
1065        api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
1066    ) -> Result<(), SecurityError> {
1067        api_key_patterns.push((
1068            "AKIA".to_string(),
1069            Arc::new(CompiledPattern {
1070                id: "aws-access-key".to_string(),
1071                name: "AWS Access Key".to_string(),
1072                severity: SecuritySeverity::Critical,
1073                category: SecurityCategory::SecretsExposure,
1074                description: "AWS Access Key ID detected".to_string(),
1075                remediation: vec![
1076                    "Remove AWS credentials from source code".to_string(),
1077                    "Use IAM roles or environment variables".to_string(),
1078                    "Rotate the exposed key immediately".to_string(),
1079                ],
1080                references: vec!["https://docs.aws.amazon.com/security/".to_string()],
1081                cwe_id: Some("CWE-798".to_string()),
1082                confidence_boost_keywords: vec![
1083                    "aws".to_string(),
1084                    "s3".to_string(),
1085                    "ec2".to_string(),
1086                ],
1087                false_positive_keywords: vec!["AKIA00000000".to_string()],
1088            }),
1089        ));
1090
1091        Ok(())
1092    }
1093
1094    /// Load GCP-specific patterns
1095    fn load_gcp_patterns(
1096        api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
1097    ) -> Result<(), SecurityError> {
1098        api_key_patterns.push((
1099            "AIza".to_string(),
1100            Arc::new(CompiledPattern {
1101                id: "gcp-api-key".to_string(),
1102                name: "Google Cloud API Key".to_string(),
1103                severity: SecuritySeverity::High,
1104                category: SecurityCategory::SecretsExposure,
1105                description: "Google Cloud API key detected".to_string(),
1106                remediation: vec![
1107                    "Use service accounts instead of API keys".to_string(),
1108                    "Restrict API key usage by IP/referrer".to_string(),
1109                ],
1110                references: vec!["https://cloud.google.com/security/".to_string()],
1111                cwe_id: Some("CWE-798".to_string()),
1112                confidence_boost_keywords: vec![
1113                    "google".to_string(),
1114                    "gcp".to_string(),
1115                    "firebase".to_string(),
1116                ],
1117                false_positive_keywords: vec![],
1118            }),
1119        ));
1120
1121        Ok(())
1122    }
1123
    /// Load Azure-specific patterns
    ///
    /// Currently a placeholder: nothing is added to the supplied list and the call
    /// always returns `Ok(())`.
    fn load_azure_patterns(
        _api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
    ) -> Result<(), SecurityError> {
        // Azure patterns would go here
        Ok(())
    }
1131
1132    /// Load cryptocurrency-related patterns
1133    fn load_crypto_patterns(
1134        secret_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
1135    ) -> Result<(), SecurityError> {
1136        secret_patterns.push((
1137            "-----BEGIN".to_string(),
1138            Arc::new(CompiledPattern {
1139                id: "private-key".to_string(),
1140                name: "Private Key".to_string(),
1141                severity: SecuritySeverity::Critical,
1142                category: SecurityCategory::SecretsExposure,
1143                description: "Private key detected".to_string(),
1144                remediation: vec![
1145                    "Never commit private keys to version control".to_string(),
1146                    "Use secure key storage solutions".to_string(),
1147                ],
1148                references: vec![],
1149                cwe_id: Some("CWE-321".to_string()),
1150                confidence_boost_keywords: vec!["RSA".to_string(), "PRIVATE".to_string()],
1151                false_positive_keywords: vec!["PUBLIC".to_string()],
1152            }),
1153        ));
1154
1155        Ok(())
1156    }
1157
1158    /// Checks if a line is a safe, non-secret key-value pair in a known dependency file.
1159    fn is_safe_dependency_metadata(
1160        &self,
1161        line: &str,
1162        file_meta: &super::file_discovery::FileMetadata,
1163    ) -> bool {
1164        let filename = file_meta
1165            .path
1166            .file_name()
1167            .and_then(|s| s.to_str())
1168            .unwrap_or("");
1169        let line_trimmed = line.trim();
1170
1171        match filename {
1172            "package.json" => {
1173                // Keys in JSON are quoted strings
1174                let safe_keys = [
1175                    "\"name\"",
1176                    "\"version\"",
1177                    "\"description\"",
1178                    "\"main\"",
1179                    "\"module\"",
1180                    "\"type\"",
1181                    "\"private\"",
1182                    "\"license\"",
1183                    "\"author\"",
1184                    "\"homepage\"",
1185                    "\"repository\"",
1186                    "\"bugs\"",
1187                    "\"keywords\"",
1188                    "\"workspaces\"",
1189                ];
1190                safe_keys.iter().any(|key| line_trimmed.starts_with(key))
1191            }
1192            "Cargo.toml" | "pyproject.toml" => {
1193                // Keys in TOML are typically not quoted
1194                let safe_keys = [
1195                    "name =",
1196                    "version =",
1197                    "description =",
1198                    "edition =",
1199                    "license =",
1200                    "authors =",
1201                    "homepage =",
1202                    "repository =",
1203                    "documentation =",
1204                    "keywords =",
1205                ];
1206                safe_keys.iter().any(|key| line_trimmed.starts_with(key))
1207            }
1208            "go.mod" => line_trimmed.starts_with("module ") || line_trimmed.starts_with("go "),
1209            "pom.xml" => {
1210                // Keys in XML are tags
1211                let safe_tags = [
1212                    "<groupId>",
1213                    "<artifactId>",
1214                    "<version>",
1215                    "<name>",
1216                    "<description>",
1217                    "<url>",
1218                    "<license>",
1219                ];
1220                safe_tags.iter().any(|tag| line_trimmed.contains(tag))
1221            }
1222            "build.gradle" | "build.gradle.kts" => {
1223                let safe_assignments = ["rootProject.name =", "group =", "version ="];
1224                safe_assignments.iter().any(|s| line_trimmed.starts_with(s))
1225            }
1226            _ => false,
1227        }
1228    }
1229
1230    /// Checks if a line contains a reference to an environment variable, not a hardcoded secret.
1231    fn is_env_var_interpolation(
1232        &self,
1233        line: &str,
1234        file_meta: &super::file_discovery::FileMetadata,
1235    ) -> bool {
1236        let filename = file_meta
1237            .path
1238            .file_name()
1239            .and_then(|s| s.to_str())
1240            .unwrap_or("");
1241
1242        // Pattern 1: JSON-based `{"$env": "VAR"}`. This is a very specific and safe pattern.
1243        if line.contains("\"$env\"") {
1244            return true;
1245        }
1246
1247        // Pattern 2: Shell/YAML/Dockerfile `${VAR}` or `$VAR`. This is more generic.
1248        if line.contains('$') {
1249            // Check for `${...}` or `$VAR` patterns
1250            if line.contains("${") && line.contains("}") {
1251                let is_config_file = matches!(
1252                    filename,
1253                    "docker-compose.yml"
1254                        | "docker-compose.yaml"
1255                        | "Dockerfile"
1256                        | "Jenkinsfile"
1257                        | "Makefile"
1258                ) || filename.ends_with(".env")
1259                    || filename.ends_with(".sh")
1260                    || filename.ends_with(".yml")
1261                    || filename.ends_with(".yaml");
1262
1263                if is_config_file {
1264                    return true;
1265                }
1266
1267                // Also check for context keywords in any file
1268                let line_lower = line.to_lowercase();
1269                let env_context_keywords =
1270                    ["environment:", "command:", "entrypoint:", "value:", "args:"];
1271                if env_context_keywords
1272                    .iter()
1273                    .any(|kw| line_lower.contains(kw))
1274                {
1275                    return true;
1276                }
1277            }
1278        }
1279
1280        false
1281    }
1282}
1283
#[cfg(test)]
mod tests {
    use super::*;
    use crate::analyzer::security::turbo::file_discovery::{FileMetadata, PriorityHints};
    use std::path::PathBuf;
    use std::time::SystemTime;

    /// Build a minimal `FileMetadata` for `path` without touching the filesystem.
    ///
    /// The extension is derived from the path itself; a path with no extension
    /// yields `None` rather than an empty string.
    fn dummy_metadata(path: &str) -> FileMetadata {
        let path = PathBuf::from(path);
        let extension = path
            .extension()
            .and_then(|ext| ext.to_str())
            .map(str::to_string);

        FileMetadata {
            path,
            size: 100,
            extension,
            is_gitignored: false,
            modified: SystemTime::now(),
            priority_hints: PriorityHints::default(),
        }
    }

    /// The default configuration must yield an engine with at least one pattern.
    #[test]
    fn test_pattern_engine_creation() {
        let config = TurboConfig::default();
        let engine =
            PatternEngine::new(&config).expect("default config should build a pattern engine");
        assert!(engine.pattern_count() > 0);
    }

    /// A hardcoded key in JS source should produce at least one secret-related match.
    #[test]
    #[ignore] // Flaky - pattern matching depends on config/environment
    fn test_pattern_matching() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config).unwrap();
        let meta = dummy_metadata("test.js");

        let content = r#"
            const apiKey = "sk-1234567890abcdef1234567890abcdef12345678";
            password = "super_secret_password_that_is_long_enough";
            process.env.DATABASE_URL
        "#;

        let matches = engine.scan_content(content, false, &meta);
        assert!(!matches.is_empty());

        // Should find API key (if long enough and not a template)
        assert!(
            matches
                .iter()
                .any(|m| m.pattern.id.contains("openai") || m.pattern.id.contains("secret"))
        );
    }

    /// Template-literal interpolations must not be reported as hardcoded secrets.
    #[test]
    fn test_template_literal_filtering() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config).unwrap();
        let meta = dummy_metadata("test.js");

        // Template literal content (should be filtered out)
        let template_content = r#"
            const getCode = () => {
                return `Authorization: "Bearer ${selectedApiKey?.apiKey}"`;
            }
            
            function generateExample() {
                return "Bearer " + apiKey;
            }
        "#;

        let matches = engine.scan_content(template_content, false, &meta);
        // Should have very few or no matches due to template literal detection
        assert!(
            matches.len() <= 1,
            "Template literals should be filtered out"
        );
    }

    /// Code that merely generates example snippets should match nothing, or only
    /// with very low confidence.
    #[test]
    fn test_code_generation_context() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config).unwrap();
        let meta = dummy_metadata("APICodeDialog.jsx");

        // Code generation context (like React component that generates examples)
        let code_gen_content = r#"
            import { CopyBlock } from 'react-code-blocks';
            
            const APICodeDialog = () => {
                const getCodeWithAuthorization = () => {
                    return `
                        headers: {
                            Authorization: "Bearer ${selectedApiKey?.apiKey}",
                            "Content-Type": "application/json"
                        }
                    `;
                };
                
                return <CopyBlock text={getCodeWithAuthorization()} />;
            };
        "#;

        let matches = engine.scan_content(code_gen_content, false, &meta);
        // Should have minimal matches due to code generation detection
        assert!(
            matches.is_empty() || matches.iter().all(|m| m.confidence < 0.3),
            "Code generation context should have very low confidence"
        );
    }

    /// Plain source with nothing secret-like must produce no matches.
    #[test]
    fn test_quick_reject() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config).unwrap();
        let meta = dummy_metadata("main.rs");

        let safe_content = "fn main() { println!(\"Hello, world!\"); }";
        let matches = engine.scan_content(safe_content, true, &meta);
        assert!(matches.is_empty());
    }

    /// Well-known metadata keys in `package.json` are exempt even when a very
    /// permissive pattern would otherwise match their values.
    #[test]
    fn test_package_json_filtering() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config).unwrap();
        let meta = dummy_metadata("package.json");

        let content = r#"
            {
                "name": "my-cool-package-with-a-long-name-that-could-be-a-secret",
                "version": "1.0.0-beta.this.is.a.very.long.version.string.that.is.not.a.key",
                "description": "a string that is not a secret"
            }
        "#;

        // Use a generic regex that would normally match these lines
        let mut test_engine = engine;
        test_engine.complex_patterns.push((
            Regex::new(r#"[a-zA-Z0-9-]{20,}"#).unwrap(),
            Arc::new(CompiledPattern {
                id: "generic-long-string".to_string(),
                name: "Generic Long String".to_string(),
                severity: SecuritySeverity::High,
                category: SecurityCategory::SecretsExposure,
                description: "A generic long string.".to_string(),
                remediation: vec![],
                references: vec![],
                cwe_id: None,
                confidence_boost_keywords: vec![],
                false_positive_keywords: vec![],
            }),
        ));

        let matches = test_engine.scan_content(content, false, &meta);
        assert!(
            matches.is_empty(),
            "Should not find secrets in safe package.json keys"
        );
    }
}
syncable_cli/analyzer/security/turbo/pattern_engine.rs

syncable_cli/analyzer/security/turbo/
pattern_engine.rs