syncable_cli/analyzer/security/turbo/
pattern_engine.rs

1//! # Pattern Engine Module
2//!
3//! Ultra-fast multi-pattern matching using Aho-Corasick algorithm and compiled regex sets.
4
5use ahash::AHashMap;
6use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
7use log::debug;
8use regex::Regex;
9use std::sync::Arc;
10
11use super::{SecurityError, TurboConfig};
12use crate::analyzer::security::{SecurityCategory, SecuritySeverity};
13
/// A compiled pattern for ultra-fast matching
///
/// Holds the metadata reported for a finding plus the keyword lists the
/// engine consults when scoring a match. Instances are shared between the
/// lookup maps and the produced matches via `Arc`.
#[derive(Debug, Clone)]
pub struct CompiledPattern {
    /// Pattern identifier (e.g. `"openai-api-key"`); the engine uses it to
    /// select the confidence threshold and pattern-specific adjustments.
    pub id: String,
    /// Human-readable pattern name.
    pub name: String,
    /// Severity attached to findings of this pattern.
    pub severity: SecuritySeverity,
    /// Security category attached to findings of this pattern.
    pub category: SecurityCategory,
    /// Description of what was detected.
    pub description: String,
    /// Remediation hints, one entry per step.
    pub remediation: Vec<String>,
    /// Reference links for further reading.
    pub references: Vec<String>,
    /// Optional CWE identifier (e.g. `"CWE-798"`).
    pub cwe_id: Option<String>,
    /// Keywords that each raise the confidence when found (case-insensitively)
    /// anywhere in the scanned content.
    pub confidence_boost_keywords: Vec<String>,
    /// Keywords that each lower the confidence when found (case-insensitively)
    /// on the matched line.
    pub false_positive_keywords: Vec<String>,
}
28
/// Pattern match result
///
/// One finding produced by `PatternEngine::scan_content`.
#[derive(Debug, Clone)]
pub struct PatternMatch {
    /// The pattern that produced this match (shared, not copied).
    pub pattern: Arc<CompiledPattern>,
    /// 1-based line number of the match.
    pub line_number: usize,
    /// 1-based column of the match start, counted in bytes.
    pub column_number: usize,
    /// The matched line with the matched span replaced by `*` characters.
    pub evidence: String,
    /// Confidence score, clamped to the range 0.0..=1.0.
    pub confidence: f32,
}
38
/// High-performance pattern matching engine
///
/// Combines three Aho-Corasick literal matchers with a list of regexes.
/// Each literal matcher has a companion map from the matcher's pattern index
/// to the pattern metadata.
pub struct PatternEngine {
    // Multi-pattern matchers
    secret_matcher: AhoCorasick,
    env_var_matcher: AhoCorasick,
    api_key_matcher: AhoCorasick,

    // Pattern lookup maps (key = index of the literal inside the matching
    // Aho-Corasick automaton above)
    secret_patterns: AHashMap<usize, Arc<CompiledPattern>>,
    env_var_patterns: AHashMap<usize, Arc<CompiledPattern>>,
    api_key_patterns: AHashMap<usize, Arc<CompiledPattern>>,

    // Specialized matchers for complex patterns (applied line by line)
    complex_patterns: Vec<(Regex, Arc<CompiledPattern>)>,

    // Performance counters: number of literal plus regex patterns loaded
    total_patterns: usize,
}
57
58impl PatternEngine {
59    pub fn new(config: &TurboConfig) -> Result<Self, SecurityError> {
60        debug!(
61            "Initializing pattern engine with pattern sets: {:?}",
62            config.pattern_sets
63        );
64
65        // Load patterns based on configuration
66        let (secret_patterns, env_var_patterns, api_key_patterns, complex_patterns) =
67            Self::load_patterns(&config.pattern_sets)?;
68
69        // Build Aho-Corasick matchers
70        let secret_matcher = Self::build_matcher(&secret_patterns)?;
71        let env_var_matcher = Self::build_matcher(&env_var_patterns)?;
72        let api_key_matcher = Self::build_matcher(&api_key_patterns)?;
73
74        let total_patterns = secret_patterns.len()
75            + env_var_patterns.len()
76            + api_key_patterns.len()
77            + complex_patterns.len();
78
79        debug!(
80            "Pattern engine initialized with {} total patterns",
81            total_patterns
82        );
83
84        Ok(Self {
85            secret_matcher,
86            env_var_matcher,
87            api_key_matcher,
88            secret_patterns: Self::create_pattern_map(secret_patterns),
89            env_var_patterns: Self::create_pattern_map(env_var_patterns),
90            api_key_patterns: Self::create_pattern_map(api_key_patterns),
91            complex_patterns,
92            total_patterns,
93        })
94    }
95
96    /// Get total pattern count
97    pub fn pattern_count(&self) -> usize {
98        self.total_patterns
99    }
100
101    /// Scan content for all patterns
102    pub fn scan_content(
103        &self,
104        content: &str,
105        quick_reject: bool,
106        file_meta: &super::file_discovery::FileMetadata,
107    ) -> Vec<PatternMatch> {
108        // Quick reject using Boyer-Moore substring search
109        if quick_reject && !self.quick_contains_secrets(content) {
110            return Vec::new();
111        }
112
113        let mut matches = Vec::new();
114
115        // Split content into lines for line number tracking
116        let lines: Vec<&str> = content.lines().collect();
117        let mut line_offsets = vec![0];
118        let mut offset = 0;
119
120        for line in &lines {
121            offset += line.len() + 1; // +1 for newline
122            line_offsets.push(offset);
123        }
124
125        // Run multi-pattern matchers
126        matches.extend(self.run_matcher(
127            &self.secret_matcher,
128            content,
129            &self.secret_patterns,
130            &lines,
131            &line_offsets,
132            file_meta,
133        ));
134        matches.extend(self.run_matcher(
135            &self.env_var_matcher,
136            content,
137            &self.env_var_patterns,
138            &lines,
139            &line_offsets,
140            file_meta,
141        ));
142        matches.extend(self.run_matcher(
143            &self.api_key_matcher,
144            content,
145            &self.api_key_patterns,
146            &lines,
147            &line_offsets,
148            file_meta,
149        ));
150
151        // Run complex patterns (regex-based)
152        for (line_num, line) in lines.iter().enumerate() {
153            for (regex, pattern) in &self.complex_patterns {
154                if let Some(mat) = regex.find(line) {
155                    let confidence = self.calculate_confidence(line, content, pattern, file_meta);
156
157                    matches.push(PatternMatch {
158                        pattern: Arc::clone(pattern),
159                        line_number: line_num + 1,
160                        column_number: mat.start() + 1,
161                        evidence: self.extract_evidence(line, mat.start(), mat.end()),
162                        confidence,
163                    });
164                }
165            }
166        }
167
168        // Intelligent confidence filtering - adaptive threshold based on pattern type
169        matches.retain(|m| {
170            let threshold = match m.pattern.id.as_str() {
171                id if id.contains("aws-access-key") => 0.4, // AWS keys need higher confidence
172                id if id.contains("openai-api-key") => 0.4, // OpenAI keys need higher confidence
173                id if id.contains("jwt-token") => 0.6, // JWT tokens need high confidence (often in examples)
174                id if id.contains("database-url") => 0.5, // Database URLs medium confidence
175                id if id.contains("bearer-token") => 0.7, // Bearer tokens often in examples
176                id if id.contains("generic") => 0.8,   // Generic patterns need very high confidence
177                id if id.contains("long-secret-value") => 0.7, // Long secret values need high confidence
178                _ => 0.7,                                      // Increased default threshold
179            };
180            m.confidence > threshold
181        });
182
183        matches
184    }
185
186    /// Quick check if content might contain secrets
187    fn quick_contains_secrets(&self, content: &str) -> bool {
188        // Enhanced quick rejection for common false positive patterns
189        if self.is_likely_false_positive_content(content) {
190            return false;
191        }
192
193        // Common secret indicators (optimized for speed)
194        const QUICK_PATTERNS: &[&str] = &[
195            "api",
196            "key",
197            "secret",
198            "token",
199            "password",
200            "credential",
201            "auth",
202            "private",
203            "-----BEGIN",
204            "sk_",
205            "pk_",
206            "eyJ",
207        ];
208
209        let content_lower = content.to_lowercase();
210        QUICK_PATTERNS
211            .iter()
212            .any(|&pattern| content_lower.contains(pattern))
213    }
214
215    /// Check if content is likely a false positive (encoded data, minified code, etc.)
216    fn is_likely_false_positive_content(&self, content: &str) -> bool {
217        let content_len = content.len();
218
219        // Skip empty or very small content
220        if content_len < 10 {
221            return true;
222        }
223
224        // Check for base64 data URLs (common in SVG, images)
225        if content.contains("data:image/") || content.contains("data:font/") {
226            return true;
227        }
228
229        // Check for minified JavaScript (very long lines, no spaces)
230        let lines: Vec<&str> = content.lines().collect();
231        if lines.len() < 5
232            && lines
233                .iter()
234                .any(|line| line.len() > 500 && line.matches(' ').count() < line.len() / 50)
235        {
236            return true;
237        }
238
239        // Check for high percentage of base64-like characters (but not a JWT)
240        let base64_chars = content
241            .chars()
242            .filter(|c| c.is_alphanumeric() || *c == '+' || *c == '/' || *c == '=')
243            .count();
244        let base64_ratio = base64_chars as f32 / content_len as f32;
245
246        // High base64 ratio but doesn't look like JWT tokens
247        if base64_ratio > 0.8 && !content.contains("eyJ") && content_len > 1000 {
248            return true;
249        }
250
251        // Check for SVG content
252        if content.contains("<svg") || content.contains("xmlns=\"http://www.w3.org/2000/svg\"") {
253            return true;
254        }
255
256        // Check for CSS content
257        if content.contains("@media")
258            || content.contains("@import")
259            || (content.contains("{") && content.contains("}") && content.contains(":"))
260        {
261            return true;
262        }
263
264        false
265    }
266
267    /// Run Aho-Corasick matcher and collect results
268    fn run_matcher(
269        &self,
270        matcher: &AhoCorasick,
271        content: &str,
272        patterns: &AHashMap<usize, Arc<CompiledPattern>>,
273        lines: &[&str],
274        line_offsets: &[usize],
275        file_meta: &super::file_discovery::FileMetadata,
276    ) -> Vec<PatternMatch> {
277        let mut matches = Vec::new();
278
279        for mat in matcher.find_iter(content) {
280            let pattern_id = mat.pattern().as_usize();
281            if let Some(pattern) = patterns.get(&pattern_id) {
282                // Find line and column
283                let (line_num, col_num) = self.offset_to_line_col(mat.start(), line_offsets);
284                let line = lines.get(line_num.saturating_sub(1)).unwrap_or(&"");
285
286                let confidence = self.calculate_confidence(line, content, pattern, file_meta);
287
288                matches.push(PatternMatch {
289                    pattern: Arc::clone(pattern),
290                    line_number: line_num,
291                    column_number: col_num,
292                    evidence: self.extract_evidence(line, mat.start(), mat.end()),
293                    confidence,
294                });
295            }
296        }
297
298        matches
299    }
300
301    /// Convert byte offset to line and column numbers
302    fn offset_to_line_col(&self, offset: usize, line_offsets: &[usize]) -> (usize, usize) {
303        let line_num = line_offsets
304            .binary_search(&offset)
305            .unwrap_or_else(|i| i.saturating_sub(1));
306
307        let line_start = line_offsets.get(line_num).copied().unwrap_or(0);
308        let col_num = offset - line_start + 1;
309
310        (line_num + 1, col_num)
311    }
312
313    /// Calculate confidence score for a match
314    fn calculate_confidence(
315        &self,
316        line: &str,
317        content: &str,
318        pattern: &CompiledPattern,
319        file_meta: &super::file_discovery::FileMetadata,
320    ) -> f32 {
321        let mut confidence: f32 = 0.6;
322
323        let _line_lower = line.to_lowercase();
324        let _content_lower = content.to_lowercase();
325
326        // Enhanced false positive detection
327        if self.is_obvious_false_positive(line, content, file_meta) {
328            return 0.0;
329        }
330
331        // Context-based confidence adjustments
332        confidence = self.adjust_confidence_for_context(confidence, line, content, pattern);
333
334        // Pattern-specific adjustments
335        confidence = self.adjust_confidence_for_pattern(confidence, line, content, pattern);
336
337        confidence.clamp(0.0, 1.0)
338    }
339
340    /// Check for obvious false positives
341    fn is_obvious_false_positive(
342        &self,
343        line: &str,
344        content: &str,
345        file_meta: &super::file_discovery::FileMetadata,
346    ) -> bool {
347        let line_lower = line.to_lowercase();
348
349        // Comments and documentation
350        if line_lower.trim_start().starts_with("//")
351            || line_lower.trim_start().starts_with("#")
352            || line_lower.trim_start().starts_with("*")
353            || line_lower.trim_start().starts_with("<!--")
354        {
355            return true;
356        }
357
358        // Check for safe keys in common dependency management files
359        if self.is_safe_dependency_metadata(line, file_meta) {
360            return true;
361        }
362
363        // JavaScript/TypeScript template literals (${...})
364        if line.contains("${") && line.contains("}") {
365            return true;
366        }
367
368        // Template strings and interpolation patterns
369        if line.contains("${selectedApiKey")
370            || line.contains("${apiKey")
371            || line.contains("${key")
372            || line.contains("${token")
373        {
374            return true;
375        }
376
377        // Code generation contexts (functions that generate example code)
378        if self.is_in_code_generation_context(content) && self.looks_like_template_code(line) {
379            return true;
380        }
381
382        // Common example/placeholder patterns
383        let false_positive_patterns = [
384            "example",
385            "placeholder",
386            "your_",
387            "todo",
388            "fixme",
389            "xxx",
390            "xxxxxxxx",
391            "12345",
392            "abcdef",
393            "test",
394            "demo",
395            "sample",
396            "lorem",
397            "ipsum",
398            "change_me",
399            "replace_me",
400            "insert_",
401            "enter_your",
402            "add_your",
403            "put_your",
404            "use_your",
405            // React/JSX specific patterns
406            "props.",
407            "state.",
408            "this.",
409            "component",
410        ];
411
412        if false_positive_patterns
413            .iter()
414            .any(|&pattern| line_lower.contains(pattern))
415        {
416            return true;
417        }
418
419        // Check for JSON schema or TypeScript interfaces
420        if line_lower.contains("@example")
421            || line_lower.contains("@param")
422            || line_lower.contains("interface")
423            || line_lower.contains("type ")
424        {
425            return true;
426        }
427
428        // Check for base64 data URLs
429        if line.contains("data:image/")
430            || line.contains("data:font/")
431            || line.contains("data:application/")
432        {
433            return true;
434        }
435
436        // Check for URLs in an array context
437        if (line.contains("http://") || line.contains("https://"))
438            && self.is_in_array_or_list(content)
439        {
440            return true;
441        }
442
443        // Check for command-line scripts which often contain high-entropy strings
444        // that are not secrets (e.g., project IDs, build hashes).
445        if self.is_command_line_script(line) {
446            return true;
447        }
448
449        // Check for environment variable interpolations, which are secure.
450        if self.is_env_var_interpolation(line, file_meta) {
451            return true;
452        }
453
454        // Check for minified content (very long line with little whitespace)
455        if line.len() > 200 && line.matches(' ').count() < line.len() / 20 {
456            return true;
457        }
458
459        // React/JSX template patterns
460        if line.contains("return `") || line.contains("const ") && line.contains(" = `") {
461            return true;
462        }
463
464        false
465    }
466
467    /// Check if we're inside an array or list definition
468    fn is_in_array_or_list(&self, content: &str) -> bool {
469        let content_lower = content.to_lowercase();
470        // Language-agnostic checks for array/list definitions
471        let array_patterns = [
472            "const ",
473            "let ",
474            "var ",
475            "export const ",
476            "export let ",
477            "authorized_parties",
478            "allowed_origins",
479            "authorized_domains",
480            "hosts",
481            "urls",
482            "uris",
483            "endpoints",
484            "domains",
485            "redirect_uris",
486            "allowed_hosts",
487            "cors_origins",
488            "trusted_sources",
489        ];
490
491        array_patterns.iter().any(|p| content_lower.contains(p)) &&
492        (content.contains("[") && content.contains("]")) || // JS, Python, Rust arrays/lists
493        (content.contains("(") && content.contains(")")) || // Python tuples
494        (content.contains("{") && content.contains("}")) // Go slices
495    }
496
497    /// Check if a line looks like a command-line script.
498    /// This is to avoid flagging project IDs, build hashes, or other identifiers
499    /// inside shell commands as secrets.
500    fn is_command_line_script(&self, line: &str) -> bool {
501        // Quick check for flags, which are a strong indicator of a shell command.
502        if !line.contains("--") {
503            return false;
504        }
505
506        let line_lower = line.to_lowercase();
507
508        // Common script/command keywords.
509        // The presence of these alongside flags increases confidence that it's a script.
510        let command_keywords = [
511            // Verbs
512            "run",
513            "exec",
514            "build",
515            "start",
516            "test",
517            "deploy",
518            "gen",
519            "generate",
520            "get",
521            "set",
522            "create",
523            "delete",
524            "update",
525            "push",
526            "pull",
527            "watch",
528            "serve",
529            "lint",
530            "format",
531            // Nouns/Context
532            "client",
533            "server",
534            "output",
535            "input",
536            "file",
537            "env",
538            "environment",
539            "config",
540            "path",
541            "dir",
542            "port",
543            "host",
544            "watch",
545            "prod",
546            "dev",
547            // Common tools
548            "npm",
549            "yarn",
550            "pnpm",
551            "npx",
552            "node",
553            "python",
554            "pip",
555            "go",
556            "cargo",
557            "docker",
558            "aws",
559            "gcloud",
560            "az",
561            "kubectl",
562            "terraform",
563            "encore",
564            "bun",
565            "bunx",
566            "maven",
567            "gradle",
568            "gradlew",
569            "gradlew.bat",
570            "gradlew.sh",
571            "gradlew.jar",
572            "gradlew.zip",
573            "mvn",
574            "pipx",
575            "pipenv",
576            "poetry",
577            "ruff",
578            "black",
579            "isort",
580            "flake8",
581            "mypy",
582            "pytest",
583            "jest",
584            "mocha",
585            "jasmine",
586            "cypress",
587            "playwright",
588            "selenium",
589            "puppeteer",
590            "webdriver",
591            "puppeteer-extra",
592            "puppeteer-extra-plugin-stealth",
593            "puppeteer-extra-plugin-recaptcha",
594        ];
595
596        // If we find a flag AND a common command keyword, it's very likely a script.
597        if command_keywords.iter().any(|&kw| line_lower.contains(kw)) {
598            return true;
599        }
600
601        // Also consider it a script if it looks like a file path assignment after a flag
602        if line.contains("--") && (line.contains('/') || line.contains('\\') || line.contains('='))
603        {
604            return true;
605        }
606
607        false
608    }
609
610    /// Check if we're in a code generation context
611    fn is_in_code_generation_context(&self, content: &str) -> bool {
612        let content_lower = content.to_lowercase();
613
614        // Common code generation function names and patterns
615        let code_gen_patterns = [
616            "getcode",
617            "generatecode",
618            "codecomponent",
619            "apicodedialog",
620            "const getcode",
621            "function getcode",
622            "const code",
623            "function code",
624            "codesnippet",
625            "codeexample",
626            "template",
627            "example code",
628            "code generator",
629            "api example",
630            "curl example",
631            // React/JSX specific
632            "codeblock",
633            "copyblock",
634            "syntax highlight",
635        ];
636
637        code_gen_patterns
638            .iter()
639            .any(|&pattern| content_lower.contains(pattern))
640    }
641
642    /// Check if a line looks like template code
643    fn looks_like_template_code(&self, line: &str) -> bool {
644        // Template string patterns
645        if line.contains("return `") || line.contains("= `") {
646            return true;
647        }
648
649        // API URL construction patterns
650        if line.contains("API_URL") || line.contains("/api/v1/") || line.contains("/prediction/") {
651            return true;
652        }
653
654        // Typical code example patterns
655        if line.contains("requests.post")
656            || line.contains("fetch(")
657            || line.contains("curl ")
658            || line.contains("import requests")
659        {
660            return true;
661        }
662
663        // Authorization header patterns in templates
664        if line.contains("Authorization:") || line.contains("Bearer ") {
665            return true;
666        }
667
668        false
669    }
670
671    /// Adjust confidence based on context
672    fn adjust_confidence_for_context(
673        &self,
674        mut confidence: f32,
675        line: &str,
676        content: &str,
677        _pattern: &CompiledPattern,
678    ) -> f32 {
679        let line_lower = line.to_lowercase();
680        let content_lower = content.to_lowercase();
681
682        // Boost confidence for actual assignments
683        if line.contains("=") || line.contains(":") {
684            confidence += 0.2;
685        }
686
687        // Boost for environment variable assignment
688        if line_lower.contains("export ") || line_lower.contains("process.env") {
689            confidence += 0.3;
690        }
691
692        // Boost for import statements with API keys
693        if line_lower.contains("import")
694            && (line_lower.contains("api") || line_lower.contains("key"))
695        {
696            confidence += 0.1;
697        }
698
699        // Reduce confidence for certain file types based on content
700        if content_lower.contains("package.json") || content_lower.contains("node_modules") {
701            confidence -= 0.2;
702        }
703
704        // Reduce confidence for test files
705        if content_lower.contains("/test/")
706            || content_lower.contains("__test__")
707            || content_lower.contains(".test.")
708            || content_lower.contains(".spec.")
709        {
710            confidence -= 0.3;
711        }
712
713        // Reduce confidence for documentation
714        if content_lower.contains("readme")
715            || content_lower.contains("documentation")
716            || content_lower.contains("docs/")
717        {
718            confidence -= 0.4;
719        }
720
721        confidence
722    }
723
724    /// Adjust confidence based on pattern-specific rules
725    fn adjust_confidence_for_pattern(
726        &self,
727        mut confidence: f32,
728        line: &str,
729        content: &str,
730        pattern: &CompiledPattern,
731    ) -> f32 {
732        let line_lower = line.to_lowercase();
733        let content_lower = content.to_lowercase();
734
735        // Major confidence reduction for template/code generation contexts
736        if self.is_in_code_generation_context(content) {
737            confidence -= 0.6;
738        }
739
740        // Check pattern-specific confidence boost keywords
741        for keyword in &pattern.confidence_boost_keywords {
742            if content_lower.contains(&keyword.to_lowercase()) {
743                confidence += 0.1;
744            }
745        }
746
747        // Check pattern-specific false positive keywords
748        for keyword in &pattern.false_positive_keywords {
749            if line_lower.contains(&keyword.to_lowercase()) {
750                confidence -= 0.4;
751            }
752        }
753
754        // Special handling for specific pattern types
755        match pattern.id.as_str() {
756            "jwt-token" => {
757                // JWT tokens should have proper structure
758                if !line.contains("eyJ") || line.split('.').count() != 3 {
759                    confidence -= 0.3;
760                }
761                // Less confident if in a comment or documentation
762                if line_lower.contains("example") || line_lower.contains("jwt") {
763                    confidence -= 0.2;
764                }
765                // Very low confidence for template literals
766                if line.contains("${") {
767                    confidence -= 0.8;
768                }
769            }
770            "openai-api-key" => {
771                // OpenAI keys should start with sk- and be proper length
772                if !line.contains("sk-") {
773                    confidence -= 0.5;
774                }
775                // Boost if in actual code context
776                if line_lower.contains("openai") || line_lower.contains("gpt") {
777                    confidence += 0.2;
778                }
779                // Major reduction for template literals
780                if line.contains("${") || line.contains("selectedApiKey") {
781                    confidence -= 0.9;
782                }
783            }
784            "database-url-with-creds" => {
785                // Should be a valid URL format
786                if !line.contains("://") || line.contains("example.com") {
787                    confidence -= 0.4;
788                }
789
790                // Check for placeholder credentials
791                let placeholder_creds = [
792                    "user:pass",
793                    "user:password",
794                    "admin:admin",
795                    "admin:password",
796                    "username:password",
797                    "test:test",
798                    "root:root",
799                    "postgres:postgres",
800                ];
801                if placeholder_creds.iter().any(|p| line.contains(p)) {
802                    confidence -= 0.8; // Drastically reduce confidence for placeholders
803                }
804
805                // Reduce for template patterns
806                if line.contains("${") {
807                    confidence -= 0.7;
808                }
809            }
810            "long-secret-value" | "generic-api-key" => {
811                // High reduction for template literals and code generation
812                if line.contains("${")
813                    || line.contains("selectedApiKey")
814                    || line.contains("apiKey") && line.contains("?")
815                {
816                    confidence -= 0.8;
817                }
818                // Reduce for Bearer token patterns in templates
819                if line.contains("Bearer ") && line.contains("${") {
820                    confidence -= 0.9;
821                }
822            }
823            _ => {
824                // General template literal reduction
825                if line.contains("${") {
826                    confidence -= 0.6;
827                }
828            }
829        }
830
831        // Additional React/JSX specific reductions
832        if (content_lower.contains("react")
833            || content_lower.contains("jsx")
834            || content_lower.contains("component"))
835            && (line.contains("${") || line.contains("props.") || line.contains("state."))
836        {
837            confidence -= 0.5;
838        }
839
840        confidence
841    }
842
843    /// Extract evidence with context
844    fn extract_evidence(&self, line: &str, start: usize, end: usize) -> String {
845        // Mask the actual secret value
846        let prefix = &line[..start.min(line.len())];
847        let suffix = &line[end.min(line.len())..];
848        let masked = "*".repeat((end - start).min(20));
849
850        format!("{}{}{}", prefix, masked, suffix).trim().to_string()
851    }
852
853    /// Build Aho-Corasick matcher from patterns
854    fn build_matcher(
855        patterns: &[(String, Arc<CompiledPattern>)],
856    ) -> Result<AhoCorasick, SecurityError> {
857        let strings: Vec<&str> = patterns.iter().map(|(s, _)| s.as_str()).collect();
858
859        let matcher = AhoCorasickBuilder::new()
860            .match_kind(MatchKind::LeftmostFirst)
861            .ascii_case_insensitive(true)
862            .build(&strings)
863            .map_err(|e| SecurityError::PatternEngine(format!("Failed to build matcher: {}", e)))?;
864
865        Ok(matcher)
866    }
867
868    /// Create pattern lookup map
869    fn create_pattern_map(
870        patterns: Vec<(String, Arc<CompiledPattern>)>,
871    ) -> AHashMap<usize, Arc<CompiledPattern>> {
872        patterns
873            .into_iter()
874            .enumerate()
875            .map(|(id, (_, pattern))| (id, pattern))
876            .collect()
877    }
878
879    /// Load patterns based on pattern sets
880    fn load_patterns(
881        pattern_sets: &[String],
882    ) -> Result<
883        (
884            Vec<(String, Arc<CompiledPattern>)>,
885            Vec<(String, Arc<CompiledPattern>)>,
886            Vec<(String, Arc<CompiledPattern>)>,
887            Vec<(Regex, Arc<CompiledPattern>)>,
888        ),
889        SecurityError,
890    > {
891        let mut secret_patterns = Vec::new();
892        let mut env_var_patterns = Vec::new();
893        let mut api_key_patterns = Vec::new();
894        let mut complex_patterns = Vec::new();
895
896        // Load default patterns
897        if pattern_sets.contains(&"default".to_string()) {
898            Self::load_default_patterns(
899                &mut secret_patterns,
900                &mut env_var_patterns,
901                &mut api_key_patterns,
902                &mut complex_patterns,
903            )?;
904        }
905
906        // Load additional pattern sets
907        for set in pattern_sets {
908            match set.as_str() {
909                "aws" => Self::load_aws_patterns(&mut api_key_patterns)?,
910                "gcp" => Self::load_gcp_patterns(&mut api_key_patterns)?,
911                "azure" => Self::load_azure_patterns(&mut api_key_patterns)?,
912                "crypto" => Self::load_crypto_patterns(&mut secret_patterns)?,
913                _ => {}
914            }
915        }
916
917        Ok((
918            secret_patterns,
919            env_var_patterns,
920            api_key_patterns,
921            complex_patterns,
922        ))
923    }
924
925    /// Load default security patterns - focused on ACTUAL secrets, not references
926    fn load_default_patterns(
927        secret_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
928        _env_var_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
929        api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
930        complex_patterns: &mut Vec<(Regex, Arc<CompiledPattern>)>,
931    ) -> Result<(), SecurityError> {
932        // ONLY detect actual API key values, not variable names
933
934        // OpenAI API Keys - actual key format
935        api_key_patterns.push((
936            "sk-".to_string(),
937            Arc::new(CompiledPattern {
938                id: "openai-api-key".to_string(),
939                name: "OpenAI API Key".to_string(),
940                severity: SecuritySeverity::Critical,
941                category: SecurityCategory::SecretsExposure,
942                description: "OpenAI API key detected".to_string(),
943                remediation: vec![
944                    "Remove API key from source code".to_string(),
945                    "Use environment variables".to_string(),
946                ],
947                references: vec!["https://platform.openai.com/docs/api-reference".to_string()],
948                cwe_id: Some("CWE-798".to_string()),
949                confidence_boost_keywords: vec!["openai".to_string(), "gpt".to_string()],
950                false_positive_keywords: vec![
951                    "sk-xxxxxxxx".to_string(),
952                    "sk-...".to_string(),
953                    "sk_test".to_string(),
954                    "example".to_string(),
955                    "placeholder".to_string(),
956                    "your_".to_string(),
957                    "TODO".to_string(),
958                    "FIXME".to_string(),
959                    "XXX".to_string(),
960                ],
961            }),
962        ));
963
964        // Complex regex patterns for ACTUAL secret assignments with values
965        complex_patterns.push((
966            // Only match when there's an actual long value, not just variable names
967            Regex::new(r#"(?i)(?:api[_-]?key|secret[_-]?key|access[_-]?token)\s*[:=]\s*['"]([a-zA-Z0-9+/=]{32,})['"]"#)
968                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
969            Arc::new(CompiledPattern {
970                id: "long-secret-value".to_string(),
971                name: "Hardcoded Secret Value".to_string(),
972                severity: SecuritySeverity::Critical,
973                category: SecurityCategory::SecretsExposure,
974                description: "Long secret value hardcoded in source code".to_string(),
975                remediation: vec![
976                    "Use environment variables for secrets".to_string(),
977                    "Implement proper secret management".to_string(),
978                ],
979                references: vec![],
980                cwe_id: Some("CWE-798".to_string()),
981                confidence_boost_keywords: vec!["bearer".to_string(), "auth".to_string()],
982                false_positive_keywords: vec![
983                    "process.env".to_string(), "getenv".to_string(), "example".to_string(),
984                    "placeholder".to_string(), "your_".to_string(), "TODO".to_string(),
985                    "test".to_string(), "demo".to_string(), "fake".to_string(),
986                ],
987            }),
988        ));
989
990        // JWT tokens (actual token format)
991        complex_patterns.push((
992            Regex::new(r#"\beyJ[a-zA-Z0-9+/=]{100,}\b"#)
993                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
994            Arc::new(CompiledPattern {
995                id: "jwt-token".to_string(),
996                name: "JWT Token".to_string(),
997                severity: SecuritySeverity::High,
998                category: SecurityCategory::SecretsExposure,
999                description: "JWT token detected in source code".to_string(),
1000                remediation: vec![
1001                    "Never hardcode JWT tokens".to_string(),
1002                    "Use secure token storage".to_string(),
1003                ],
1004                references: vec![],
1005                cwe_id: Some("CWE-798".to_string()),
1006                confidence_boost_keywords: vec!["bearer".to_string(), "authorization".to_string()],
1007                false_positive_keywords: vec!["example".to_string(), "demo".to_string()],
1008            }),
1009        ));
1010
1011        // Database connection strings with embedded credentials
1012        complex_patterns.push((
1013            Regex::new(r#"(?i)(?:postgres|postgresql|mysql|mongodb|redis|mariadb)://[^:\s]+:[^@\s]+@[^/\s]+/[^\s]*"#)
1014                .map_err(|e| SecurityError::PatternEngine(format!("Regex error: {}", e)))?,
1015            Arc::new(CompiledPattern {
1016                id: "database-url-with-creds".to_string(),
1017                name: "Database URL with Credentials".to_string(),
1018                severity: SecuritySeverity::Critical,
1019                category: SecurityCategory::SecretsExposure,
1020                description: "Database connection string with embedded credentials".to_string(),
1021                remediation: vec![
1022                    "Use environment variables for database credentials".to_string(),
1023                    "Use connection string without embedded passwords".to_string(),
1024                ],
1025                references: vec![],
1026                cwe_id: Some("CWE-798".to_string()),
1027                confidence_boost_keywords: vec!["connection".to_string(), "database".to_string()],
1028                false_positive_keywords: vec![
1029                    "example.com".to_string(), "localhost".to_string(), "placeholder".to_string(),
1030                    "your_".to_string(), "user:pass".to_string(),
1031                ],
1032            }),
1033        ));
1034
1035        // Private SSH/SSL keys
1036        secret_patterns.push((
1037            "-----BEGIN".to_string(),
1038            Arc::new(CompiledPattern {
1039                id: "private-key-header".to_string(),
1040                name: "Private Key".to_string(),
1041                severity: SecuritySeverity::Critical,
1042                category: SecurityCategory::SecretsExposure,
1043                description: "Private key detected".to_string(),
1044                remediation: vec![
1045                    "Never commit private keys to version control".to_string(),
1046                    "Use secure key storage solutions".to_string(),
1047                ],
1048                references: vec![],
1049                cwe_id: Some("CWE-321".to_string()),
1050                confidence_boost_keywords: vec![
1051                    "PRIVATE".to_string(),
1052                    "RSA".to_string(),
1053                    "DSA".to_string(),
1054                ],
1055                false_positive_keywords: vec!["PUBLIC".to_string(), "CERTIFICATE".to_string()],
1056            }),
1057        ));
1058
1059        Ok(())
1060    }
1061
1062    /// Load AWS-specific patterns
1063    fn load_aws_patterns(
1064        api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
1065    ) -> Result<(), SecurityError> {
1066        api_key_patterns.push((
1067            "AKIA".to_string(),
1068            Arc::new(CompiledPattern {
1069                id: "aws-access-key".to_string(),
1070                name: "AWS Access Key".to_string(),
1071                severity: SecuritySeverity::Critical,
1072                category: SecurityCategory::SecretsExposure,
1073                description: "AWS Access Key ID detected".to_string(),
1074                remediation: vec![
1075                    "Remove AWS credentials from source code".to_string(),
1076                    "Use IAM roles or environment variables".to_string(),
1077                    "Rotate the exposed key immediately".to_string(),
1078                ],
1079                references: vec!["https://docs.aws.amazon.com/security/".to_string()],
1080                cwe_id: Some("CWE-798".to_string()),
1081                confidence_boost_keywords: vec![
1082                    "aws".to_string(),
1083                    "s3".to_string(),
1084                    "ec2".to_string(),
1085                ],
1086                false_positive_keywords: vec!["AKIA00000000".to_string()],
1087            }),
1088        ));
1089
1090        Ok(())
1091    }
1092
1093    /// Load GCP-specific patterns
1094    fn load_gcp_patterns(
1095        api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
1096    ) -> Result<(), SecurityError> {
1097        api_key_patterns.push((
1098            "AIza".to_string(),
1099            Arc::new(CompiledPattern {
1100                id: "gcp-api-key".to_string(),
1101                name: "Google Cloud API Key".to_string(),
1102                severity: SecuritySeverity::High,
1103                category: SecurityCategory::SecretsExposure,
1104                description: "Google Cloud API key detected".to_string(),
1105                remediation: vec![
1106                    "Use service accounts instead of API keys".to_string(),
1107                    "Restrict API key usage by IP/referrer".to_string(),
1108                ],
1109                references: vec!["https://cloud.google.com/security/".to_string()],
1110                cwe_id: Some("CWE-798".to_string()),
1111                confidence_boost_keywords: vec![
1112                    "google".to_string(),
1113                    "gcp".to_string(),
1114                    "firebase".to_string(),
1115                ],
1116                false_positive_keywords: vec![],
1117            }),
1118        ));
1119
1120        Ok(())
1121    }
1122
1123    /// Load Azure-specific patterns
1124    fn load_azure_patterns(
1125        _api_key_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
1126    ) -> Result<(), SecurityError> {
1127        // Azure patterns would go here
1128        Ok(())
1129    }
1130
1131    /// Load cryptocurrency-related patterns
1132    fn load_crypto_patterns(
1133        secret_patterns: &mut Vec<(String, Arc<CompiledPattern>)>,
1134    ) -> Result<(), SecurityError> {
1135        secret_patterns.push((
1136            "-----BEGIN".to_string(),
1137            Arc::new(CompiledPattern {
1138                id: "private-key".to_string(),
1139                name: "Private Key".to_string(),
1140                severity: SecuritySeverity::Critical,
1141                category: SecurityCategory::SecretsExposure,
1142                description: "Private key detected".to_string(),
1143                remediation: vec![
1144                    "Never commit private keys to version control".to_string(),
1145                    "Use secure key storage solutions".to_string(),
1146                ],
1147                references: vec![],
1148                cwe_id: Some("CWE-321".to_string()),
1149                confidence_boost_keywords: vec!["RSA".to_string(), "PRIVATE".to_string()],
1150                false_positive_keywords: vec!["PUBLIC".to_string()],
1151            }),
1152        ));
1153
1154        Ok(())
1155    }
1156
1157    /// Checks if a line is a safe, non-secret key-value pair in a known dependency file.
1158    fn is_safe_dependency_metadata(
1159        &self,
1160        line: &str,
1161        file_meta: &super::file_discovery::FileMetadata,
1162    ) -> bool {
1163        let filename = file_meta
1164            .path
1165            .file_name()
1166            .and_then(|s| s.to_str())
1167            .unwrap_or("");
1168        let line_trimmed = line.trim();
1169
1170        match filename {
1171            "package.json" => {
1172                // Keys in JSON are quoted strings
1173                let safe_keys = [
1174                    "\"name\"",
1175                    "\"version\"",
1176                    "\"description\"",
1177                    "\"main\"",
1178                    "\"module\"",
1179                    "\"type\"",
1180                    "\"private\"",
1181                    "\"license\"",
1182                    "\"author\"",
1183                    "\"homepage\"",
1184                    "\"repository\"",
1185                    "\"bugs\"",
1186                    "\"keywords\"",
1187                    "\"workspaces\"",
1188                ];
1189                safe_keys.iter().any(|key| line_trimmed.starts_with(key))
1190            }
1191            "Cargo.toml" | "pyproject.toml" => {
1192                // Keys in TOML are typically not quoted
1193                let safe_keys = [
1194                    "name =",
1195                    "version =",
1196                    "description =",
1197                    "edition =",
1198                    "license =",
1199                    "authors =",
1200                    "homepage =",
1201                    "repository =",
1202                    "documentation =",
1203                    "keywords =",
1204                ];
1205                safe_keys.iter().any(|key| line_trimmed.starts_with(key))
1206            }
1207            "go.mod" => line_trimmed.starts_with("module ") || line_trimmed.starts_with("go "),
1208            "pom.xml" => {
1209                // Keys in XML are tags
1210                let safe_tags = [
1211                    "<groupId>",
1212                    "<artifactId>",
1213                    "<version>",
1214                    "<name>",
1215                    "<description>",
1216                    "<url>",
1217                    "<license>",
1218                ];
1219                safe_tags.iter().any(|tag| line_trimmed.contains(tag))
1220            }
1221            "build.gradle" | "build.gradle.kts" => {
1222                let safe_assignments = ["rootProject.name =", "group =", "version ="];
1223                safe_assignments.iter().any(|s| line_trimmed.starts_with(s))
1224            }
1225            _ => false,
1226        }
1227    }
1228
1229    /// Checks if a line contains a reference to an environment variable, not a hardcoded secret.
1230    fn is_env_var_interpolation(
1231        &self,
1232        line: &str,
1233        file_meta: &super::file_discovery::FileMetadata,
1234    ) -> bool {
1235        let filename = file_meta
1236            .path
1237            .file_name()
1238            .and_then(|s| s.to_str())
1239            .unwrap_or("");
1240
1241        // Pattern 1: JSON-based `{"$env": "VAR"}`. This is a very specific and safe pattern.
1242        if line.contains("\"$env\"") {
1243            return true;
1244        }
1245
1246        // Pattern 2: Shell/YAML/Dockerfile `${VAR}` or `$VAR`. This is more generic.
1247        if line.contains('$') {
1248            // Check for `${...}` or `$VAR` patterns
1249            if line.contains("${") && line.contains("}") {
1250                let is_config_file = matches!(
1251                    filename,
1252                    "docker-compose.yml"
1253                        | "docker-compose.yaml"
1254                        | "Dockerfile"
1255                        | "Jenkinsfile"
1256                        | "Makefile"
1257                ) || filename.ends_with(".env")
1258                    || filename.ends_with(".sh")
1259                    || filename.ends_with(".yml")
1260                    || filename.ends_with(".yaml");
1261
1262                if is_config_file {
1263                    return true;
1264                }
1265
1266                // Also check for context keywords in any file
1267                let line_lower = line.to_lowercase();
1268                let env_context_keywords =
1269                    ["environment:", "command:", "entrypoint:", "value:", "args:"];
1270                if env_context_keywords
1271                    .iter()
1272                    .any(|kw| line_lower.contains(kw))
1273                {
1274                    return true;
1275                }
1276            }
1277        }
1278
1279        false
1280    }
1281}
1282
#[cfg(test)]
mod tests {
    use super::*;
    use crate::analyzer::security::turbo::file_discovery::{FileMetadata, PriorityHints};
    use std::path::PathBuf;
    use std::time::SystemTime;

    /// Build a minimal `FileMetadata` for `path`, with fixed size and default hints.
    ///
    /// `extension` is taken from the path and is `None` when the path has no
    /// (UTF-8) extension, rather than an empty-string sentinel.
    fn dummy_metadata(path: &str) -> FileMetadata {
        let path = PathBuf::from(path);
        let extension = path
            .extension()
            .and_then(|ext| ext.to_str())
            .map(str::to_string);

        FileMetadata {
            path,
            size: 100,
            extension,
            is_gitignored: false,
            modified: SystemTime::now(),
            priority_hints: PriorityHints::default(),
        }
    }

    /// The default configuration must build an engine with at least one pattern.
    #[test]
    fn test_pattern_engine_creation() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config);
        assert!(engine.is_ok());

        let engine = engine.unwrap();
        assert!(engine.pattern_count() > 0);
    }

    /// A hardcoded key-like value should produce an OpenAI or secret finding.
    #[test]
    #[ignore] // Flaky - pattern matching depends on config/environment
    fn test_pattern_matching() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config).unwrap();
        let meta = dummy_metadata("test.js");

        let content = r#"
            const apiKey = "sk-1234567890abcdef1234567890abcdef12345678";
            password = "super_secret_password_that_is_long_enough";
            process.env.DATABASE_URL
        "#;

        let matches = engine.scan_content(content, false, &meta);
        assert!(!matches.is_empty());

        // Should find API key (if long enough and not a template)
        assert!(
            matches
                .iter()
                .any(|m| m.pattern.id.contains("openai") || m.pattern.id.contains("secret"))
        );
    }

    /// Template literals that interpolate a key variable are not hardcoded secrets.
    #[test]
    fn test_template_literal_filtering() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config).unwrap();
        let meta = dummy_metadata("test.js");

        // Template literal content (should be filtered out)
        let template_content = r#"
            const getCode = () => {
                return `Authorization: "Bearer ${selectedApiKey?.apiKey}"`;
            }
            
            function generateExample() {
                return "Bearer " + apiKey;
            }
        "#;

        let matches = engine.scan_content(template_content, false, &meta);
        // Should have very few or no matches due to template literal detection
        assert!(
            matches.len() <= 1,
            "Template literals should be filtered out"
        );
    }

    /// Components that generate example code should yield no or low-confidence matches.
    #[test]
    fn test_code_generation_context() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config).unwrap();
        let meta = dummy_metadata("APICodeDialog.jsx");

        // Code generation context (like React component that generates examples)
        let code_gen_content = r#"
            import { CopyBlock } from 'react-code-blocks';
            
            const APICodeDialog = () => {
                const getCodeWithAuthorization = () => {
                    return `
                        headers: {
                            Authorization: "Bearer ${selectedApiKey?.apiKey}",
                            "Content-Type": "application/json"
                        }
                    `;
                };
                
                return <CopyBlock text={getCodeWithAuthorization()} />;
            };
        "#;

        let matches = engine.scan_content(code_gen_content, false, &meta);
        // Should have minimal matches due to code generation detection
        assert!(
            matches.is_empty() || matches.iter().all(|m| m.confidence < 0.3),
            "Code generation context should have very low confidence"
        );
    }

    /// Content without any secret-like text must produce no matches when the
    /// second `scan_content` argument is `true`.
    #[test]
    fn test_quick_reject() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config).unwrap();
        let meta = dummy_metadata("main.rs");

        let safe_content = "fn main() { println!(\"Hello, world!\"); }";
        let matches = engine.scan_content(safe_content, true, &meta);
        assert!(matches.is_empty());
    }

    /// Safe `package.json` keys must be skipped even when a very generic regex
    /// would otherwise match their values.
    #[test]
    fn test_package_json_filtering() {
        let config = TurboConfig::default();
        let engine = PatternEngine::new(&config).unwrap();
        let meta = dummy_metadata("package.json");

        let content = r#"
            {
                "name": "my-cool-package-with-a-long-name-that-could-be-a-secret",
                "version": "1.0.0-beta.this.is.a.very.long.version.string.that.is.not.a.key",
                "description": "a string that is not a secret"
            }
        "#;

        // Use a generic regex that would normally match these lines
        let mut test_engine = engine;
        test_engine.complex_patterns.push((
            Regex::new(r#"[a-zA-Z0-9-]{20,}"#).unwrap(),
            Arc::new(CompiledPattern {
                id: "generic-long-string".to_string(),
                name: "Generic Long String".to_string(),
                severity: SecuritySeverity::High,
                category: SecurityCategory::SecretsExposure,
                description: "A generic long string.".to_string(),
                remediation: vec![],
                references: vec![],
                cwe_id: None,
                confidence_boost_keywords: vec![],
                false_positive_keywords: vec![],
            }),
        ));

        let matches = test_engine.scan_content(content, false, &meta);
        assert!(
            matches.is_empty(),
            "Should not find secrets in safe package.json keys"
        );
    }
}
syncable_cli/analyzer/security/turbo/pattern_engine.rs

syncable_cli/analyzer/security/turbo/
pattern_engine.rs