// rma_analyzer — crate root (lib.rs)
//! Code analysis and security scanning for Qryon
//!
//! This crate provides metrics computation, vulnerability detection,
//! and pattern-based analysis on parsed ASTs.
//!
//! NOTE: This crate DETECTS security vulnerabilities - it does not contain them.
//! The security rules detect dangerous patterns like unsafe code, code injection, etc.
//!
//! # Modules
//!
//! - `flow`: Control flow and data flow analysis (CFG, taint tracking)
//! - `knowledge`: Framework-specific security knowledge base
//! - `metrics`: Code metrics computation (complexity, LOC, etc.)
//! - `providers`: External analysis tool integrations (PMD, Oxlint, etc.)
//! - `rules`: Analysis rule trait and implementations
//! - `security`: Security rules organized by language
//! - `semantics`: Language adapter layer for tree-sitter AST mapping

pub mod cache;
pub mod callgraph;
pub mod diff;
pub mod flow;
pub mod imports;
pub mod knowledge;
pub mod metrics;
pub mod project;
pub mod providers;
pub mod rules;
pub mod security;
pub mod semantics;
pub mod semgrep;
pub mod ts_query_matcher;

use std::collections::HashMap;
use std::fs;
use std::path::Path;
use std::sync::Arc;
use std::time::SystemTime;

use anyhow::Result;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use tracing::{debug, info, instrument, warn};

use cache::AnalysisCache;
use providers::{AnalysisProvider, PmdProvider, ProviderRegistry};
use rma_common::{
    CodeMetrics, Finding, Language, ProviderType, ProvidersConfig, RmaConfig, Severity,
};
use rma_parser::ParsedFile;
49
/// Results from analyzing a single file
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileAnalysis {
    /// Stringified path of the analyzed file (used as a merge key when
    /// combining provider findings with native-rule findings).
    pub path: String,
    /// Language detected for this file.
    pub language: Language,
    /// Computed code metrics (complexity, lines of code, etc.).
    pub metrics: CodeMetrics,
    /// Findings produced by native rules and external providers.
    pub findings: Vec<Finding>,
}
58
/// Summary of analysis results
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct AnalysisSummary {
    /// Number of files included in this summary.
    pub files_analyzed: usize,
    /// Total number of findings across all files.
    pub total_findings: usize,
    /// Findings with `Severity::Critical`.
    pub critical_count: usize,
    /// Findings with `Severity::Error`.
    pub error_count: usize,
    /// Findings with `Severity::Warning`.
    pub warning_count: usize,
    /// Findings with `Severity::Info`.
    pub info_count: usize,
    /// Sum of cyclomatic complexity over all analyzed files.
    pub total_complexity: usize,
    /// Sum of lines of code over all analyzed files.
    pub total_loc: usize,
}
71
/// The main analysis engine
///
/// Combines native RMA rules with optional external providers (PMD, Oxlint)
/// for comprehensive code analysis across multiple languages.
pub struct AnalyzerEngine {
    /// Shared engine configuration (minimum severity, etc.).
    config: Arc<RmaConfig>,
    /// All registered native rules; boxed trait objects so heterogeneous
    /// rule implementations can live in one collection.
    rules: Vec<Box<dyn rules::Rule + Send + Sync>>,
    /// Pre-filtered rule indices by language for O(1) lookup
    rules_by_language: HashMap<Language, Vec<usize>>,
    /// Registry of external providers that passed their availability check.
    provider_registry: ProviderRegistry,
    /// Provider types requested in configuration (regardless of availability).
    enabled_providers: Vec<ProviderType>,
}
84
85impl AnalyzerEngine {
86    /// Create a new analyzer with default rules (no external providers)
87    pub fn new(config: RmaConfig) -> Self {
88        Self::with_providers(config, ProvidersConfig::default())
89    }
90
91    /// Create a new analyzer with specified providers
92    pub fn with_providers(config: RmaConfig, providers_config: ProvidersConfig) -> Self {
93        let mut engine = Self {
94            config: Arc::new(config),
95            rules: Vec::new(),
96            rules_by_language: HashMap::new(),
97            provider_registry: ProviderRegistry::new(),
98            enabled_providers: providers_config.enabled.clone(),
99        };
100        engine.register_default_rules();
101        engine.build_language_index();
102        engine.register_providers(&providers_config);
103        engine
104    }
105
106    /// Build index of rules by language for O(1) lookup
107    fn build_language_index(&mut self) {
108        let languages = [
109            Language::Rust,
110            Language::JavaScript,
111            Language::TypeScript,
112            Language::Python,
113            Language::Go,
114            Language::Java,
115            Language::Unknown,
116        ];
117
118        for lang in languages {
119            let indices: Vec<usize> = self
120                .rules
121                .iter()
122                .enumerate()
123                .filter(|(_, rule)| rule.applies_to(lang))
124                .map(|(i, _)| i)
125                .collect();
126            self.rules_by_language.insert(lang, indices);
127        }
128    }
129
130    /// Register external providers based on configuration
131    fn register_providers(&mut self, config: &ProvidersConfig) {
132        for provider_type in &config.enabled {
133            match provider_type {
134                ProviderType::Rma => {
135                    // RMA is always enabled via native rules, nothing to register
136                    debug!("Qryon native rules enabled");
137                }
138                ProviderType::Pmd => {
139                    let pmd = PmdProvider::new(config.pmd.clone());
140                    if pmd.is_available() {
141                        info!("PMD provider registered (version: {:?})", pmd.version());
142                        self.provider_registry.register(Box::new(pmd));
143                    } else {
144                        warn!(
145                            "PMD provider enabled but not available - check pmd_path configuration"
146                        );
147                    }
148                }
149                ProviderType::Oxlint => {
150                    let oxlint = providers::OxlintProvider::new();
151                    if oxlint.is_available() {
152                        info!(
153                            "Oxlint provider registered (version: {:?})",
154                            oxlint.version()
155                        );
156                        self.provider_registry.register(Box::new(oxlint));
157                    } else {
158                        warn!(
159                            "Oxlint provider enabled but not available - install oxlint or check binary_path"
160                        );
161                    }
162                }
163                ProviderType::RustSec => {
164                    let rustsec = providers::RustSecProvider::new();
165                    if rustsec.is_available() {
166                        info!(
167                            "RustSec provider registered (version: {:?})",
168                            rustsec.version()
169                        );
170                        self.provider_registry.register(Box::new(rustsec));
171                    } else {
172                        warn!(
173                            "RustSec provider enabled but database unavailable - check network connection"
174                        );
175                    }
176                }
177                ProviderType::Gosec => {
178                    let gosec = providers::GosecProvider::new(config.gosec.clone());
179                    if gosec.is_available() {
180                        info!("Gosec provider registered (version: {:?})", gosec.version());
181                        self.provider_registry.register(Box::new(gosec));
182                    } else {
183                        warn!(
184                            "Gosec provider enabled but not available - install gosec: go install github.com/securego/gosec/v2/cmd/gosec@latest"
185                        );
186                    }
187                }
188                #[cfg(feature = "oxc")]
189                ProviderType::Oxc => {
190                    let oxc = providers::OxcNativeProvider::new();
191                    if oxc.is_available() {
192                        info!(
193                            "Oxc native provider registered (version: {:?})",
194                            oxc.version()
195                        );
196                        self.provider_registry.register(Box::new(oxc));
197                    }
198                }
199                #[cfg(not(feature = "oxc"))]
200                ProviderType::Oxc => {
201                    warn!("Oxc provider not available - compiled without oxc feature");
202                }
203                ProviderType::Osv => {
204                    let osv = providers::OsvProvider::new(config.osv.clone());
205                    if osv.is_available() {
206                        info!("OSV provider registered (version: {:?})", osv.version());
207                        self.provider_registry.register(Box::new(osv));
208                    } else {
209                        // This should never happen since OsvProvider is always available
210                        warn!("OSV provider unexpectedly unavailable");
211                    }
212                }
213            }
214        }
215    }
216
217    /// Check if a provider is enabled
218    pub fn is_provider_enabled(&self, provider_type: ProviderType) -> bool {
219        self.enabled_providers.contains(&provider_type)
220    }
221
222    /// Get list of available providers
223    pub fn available_providers(&self) -> Vec<&str> {
224        self.provider_registry
225            .providers()
226            .iter()
227            .map(|p| p.name())
228            .collect()
229    }
230
231    /// Register all default security and quality rules
232    ///
233    /// All rules come from the embedded Semgrep rule engine. The 647+ community-vetted
234    /// rules are compiled into the binary at build time and provide comprehensive
235    /// coverage for security vulnerabilities across all supported languages.
236    fn register_default_rules(&mut self) {
237        // =====================================================================
238        // EMBEDDED SEMGREP RULES (647+ community-vetted rules)
239        // =====================================================================
240        // All scanning is done through the rule engine. Rules are:
241        // - Pre-compiled at build time from semgrep-rules repository
242        // - Validated and community-vetted
243        // - Cover: Python, JavaScript, TypeScript, Java, Go, Ruby, Rust, C, etc.
244        // - Categories: Security, quality, correctness, performance
245        self.rules.push(Box::new(semgrep::EmbeddedRulesRule::new()));
246    }
247
248    /// Analyze a single parsed file using native rules only
249    #[instrument(skip(self, parsed), fields(path = %parsed.path.display()))]
250    pub fn analyze_file(&self, parsed: &ParsedFile) -> Result<FileAnalysis> {
251        let metrics = metrics::compute_metrics(parsed);
252
253        let mut findings = Vec::new();
254
255        // Run only applicable rules using pre-built language index (O(1) lookup)
256        if let Some(rule_indices) = self.rules_by_language.get(&parsed.language) {
257            // Check if any applicable rule uses flow analysis
258            let needs_flow = rule_indices.iter().any(|&idx| self.rules[idx].uses_flow());
259
260            // Build flow context lazily only if needed
261            let flow_context = if needs_flow {
262                Some(flow::FlowContext::build(parsed, parsed.language))
263            } else {
264                None
265            };
266
267            for &idx in rule_indices {
268                let rule = &self.rules[idx];
269                let rule_findings = if rule.uses_flow() {
270                    if let Some(ref flow) = flow_context {
271                        rule.check_with_flow(parsed, flow)
272                    } else {
273                        rule.check(parsed)
274                    }
275                } else {
276                    rule.check(parsed)
277                };
278                findings.extend(rule_findings);
279            }
280        }
281
282        // Run applicable providers on this file
283        for provider in self.provider_registry.providers() {
284            if provider.supports_language(parsed.language) {
285                match provider.analyze_file(&parsed.path) {
286                    Ok(provider_findings) => {
287                        debug!(
288                            "Provider {} found {} findings for {}",
289                            provider.name(),
290                            provider_findings.len(),
291                            parsed.path.display()
292                        );
293                        findings.extend(provider_findings);
294                    }
295                    Err(e) => {
296                        warn!(
297                            "Provider {} failed for {}: {}",
298                            provider.name(),
299                            parsed.path.display(),
300                            e
301                        );
302                    }
303                }
304            }
305        }
306
307        // Filter by minimum severity
308        findings.retain(|f| f.severity >= self.config.min_severity);
309
310        debug!(
311            "Analyzed {} - {} findings, complexity {}",
312            parsed.path.display(),
313            findings.len(),
314            metrics.cyclomatic_complexity
315        );
316
317        Ok(FileAnalysis {
318            path: parsed.path.to_string_lossy().to_string(),
319            language: parsed.language,
320            metrics,
321            findings,
322        })
323    }
324
325    /// Analyze multiple parsed files in parallel
326    ///
327    /// This is the legacy method without caching support. For better performance
328    /// on repeated scans, use `analyze_files_cached` instead.
329    #[instrument(skip(self, files))]
330    pub fn analyze_files(
331        &self,
332        files: &[ParsedFile],
333    ) -> Result<(Vec<FileAnalysis>, AnalysisSummary)> {
334        self.analyze_files_cached(files, None)
335    }
336
337    /// Analyze multiple parsed files in parallel with optional caching
338    ///
339    /// When a cache is provided:
340    /// 1. Files with unchanged content (based on hash) use cached results
341    /// 2. Only modified/new files are analyzed
342    /// 3. Fresh analysis results are stored in the cache
343    /// 4. Combined results (cached + fresh) are returned
344    ///
345    /// This can reduce scan time by 80-90% for repeated scans of the same codebase.
346    #[instrument(skip(self, files, cache))]
347    pub fn analyze_files_cached(
348        &self,
349        files: &[ParsedFile],
350        cache: Option<&mut AnalysisCache>,
351    ) -> Result<(Vec<FileAnalysis>, AnalysisSummary)> {
352        info!("Starting parallel analysis of {} files", files.len());
353
354        // If no cache provided, analyze all files
355        let Some(cache) = cache else {
356            let results: Vec<FileAnalysis> = files
357                .par_iter()
358                .filter_map(|parsed| self.analyze_file(parsed).ok())
359                .collect();
360
361            let summary = compute_summary(&results);
362
363            info!(
364                "Analysis complete: {} files, {} findings ({} critical)",
365                summary.files_analyzed, summary.total_findings, summary.critical_count
366            );
367
368            return Ok((results, summary));
369        };
370
371        // Step 1: Partition files into those needing analysis vs cached
372        // Get mtime for each file (fallback to current time if unavailable)
373        let files_with_mtime: Vec<(&ParsedFile, SystemTime)> = files
374            .iter()
375            .map(|f| {
376                let mtime = fs::metadata(&f.path)
377                    .and_then(|m| m.modified())
378                    .unwrap_or_else(|_| SystemTime::now());
379                (f, mtime)
380            })
381            .collect();
382
383        // Separate files into those that need analysis and those that can use cache
384        let mut needs_analysis: Vec<(&ParsedFile, SystemTime)> = Vec::new();
385        let mut cached_results: Vec<FileAnalysis> = Vec::new();
386
387        for (parsed, mtime) in &files_with_mtime {
388            if cache.needs_analysis(&parsed.path, &parsed.content, *mtime) {
389                needs_analysis.push((*parsed, *mtime));
390            } else {
391                // Try to load from cache
392                if let Some(analysis) = cache.load_analysis(&parsed.path, &parsed.content) {
393                    debug!("Using cached analysis for {}", parsed.path.display());
394                    cached_results.push(analysis);
395                } else {
396                    // Cache entry exists but analysis file is missing - need to re-analyze
397                    needs_analysis.push((*parsed, *mtime));
398                }
399            }
400        }
401
402        let cached_count = cached_results.len();
403        let analyze_count = needs_analysis.len();
404
405        info!(
406            "Cache status: {} files cached, {} files need analysis",
407            cached_count, analyze_count
408        );
409
410        // Step 2: Analyze files that need it (in parallel)
411        let fresh_results: Vec<(FileAnalysis, SystemTime)> = needs_analysis
412            .par_iter()
413            .filter_map(|(parsed, mtime)| {
414                self.analyze_file(parsed)
415                    .ok()
416                    .map(|analysis| (analysis, *mtime))
417            })
418            .collect();
419
420        // Step 3: Update cache with fresh results (sequential - cache is mutable)
421        for (analysis, mtime) in &fresh_results {
422            // Find the corresponding parsed file to get content
423            if let Some((parsed, _)) = needs_analysis
424                .iter()
425                .find(|(p, _)| p.path.to_string_lossy() == analysis.path)
426            {
427                cache.mark_analyzed(parsed.path.clone(), &parsed.content, *mtime);
428                if let Err(e) = cache.store_analysis(&parsed.path, &parsed.content, analysis) {
429                    warn!("Failed to store analysis in cache: {}", e);
430                }
431            }
432        }
433
434        // Step 4: Combine cached and fresh results
435        let fresh_analyses: Vec<FileAnalysis> = fresh_results.into_iter().map(|(a, _)| a).collect();
436        let mut results = cached_results;
437        results.extend(fresh_analyses);
438
439        let summary = compute_summary(&results);
440
441        info!(
442            "Analysis complete: {} files ({} cached, {} fresh), {} findings ({} critical)",
443            summary.files_analyzed,
444            cached_count,
445            analyze_count,
446            summary.total_findings,
447            summary.critical_count
448        );
449
450        Ok((results, summary))
451    }
452
453    /// Run provider analysis on a directory
454    ///
455    /// This is more efficient for providers that support batch analysis
456    /// (like PMD which can analyze a whole directory at once).
457    #[instrument(skip(self))]
458    pub fn analyze_directory_with_providers(&self, path: &Path) -> Result<Vec<Finding>> {
459        let mut all_findings = Vec::new();
460
461        for provider in self.provider_registry.providers() {
462            if provider.is_available() {
463                info!("Running {} on {}", provider.name(), path.display());
464                match provider.analyze_directory(path) {
465                    Ok(findings) => {
466                        info!("{} found {} findings", provider.name(), findings.len());
467                        all_findings.extend(findings);
468                    }
469                    Err(e) => {
470                        warn!("Provider {} failed: {}", provider.name(), e);
471                    }
472                }
473            }
474        }
475
476        // Filter by minimum severity
477        all_findings.retain(|f| f.severity >= self.config.min_severity);
478
479        Ok(all_findings)
480    }
481
482    /// Analyze files with both native rules and providers
483    ///
484    /// This combines:
485    /// 1. Native rule analysis (per-file, parallel)
486    /// 2. Provider analysis (batch where possible)
487    #[instrument(skip(self, files))]
488    pub fn analyze_files_with_providers(
489        &self,
490        files: &[ParsedFile],
491        base_path: &Path,
492    ) -> Result<(Vec<FileAnalysis>, AnalysisSummary)> {
493        info!(
494            "Starting analysis of {} files with {} providers",
495            files.len(),
496            self.provider_registry.providers().len()
497        );
498
499        // Step 1: Run native rules in parallel using pre-indexed rules
500        let results: Vec<FileAnalysis> = files
501            .par_iter()
502            .filter_map(|parsed| {
503                let metrics = metrics::compute_metrics(parsed);
504                let mut findings = Vec::new();
505
506                // Run only applicable rules using pre-built language index
507                if let Some(rule_indices) = self.rules_by_language.get(&parsed.language) {
508                    // Check if any applicable rule uses flow analysis
509                    let needs_flow = rule_indices.iter().any(|&idx| self.rules[idx].uses_flow());
510
511                    // Build flow context lazily only if needed
512                    let flow_context = if needs_flow {
513                        Some(flow::FlowContext::build(parsed, parsed.language))
514                    } else {
515                        None
516                    };
517
518                    for &idx in rule_indices {
519                        let rule = &self.rules[idx];
520                        let rule_findings = if rule.uses_flow() {
521                            if let Some(ref flow) = flow_context {
522                                rule.check_with_flow(parsed, flow)
523                            } else {
524                                rule.check(parsed)
525                            }
526                        } else {
527                            rule.check(parsed)
528                        };
529                        findings.extend(rule_findings);
530                    }
531                }
532
533                Some(FileAnalysis {
534                    path: parsed.path.display().to_string(),
535                    language: parsed.language,
536                    metrics,
537                    findings,
538                })
539            })
540            .collect();
541
542        // Step 2: Build HashMap for O(1) result lookups
543        let mut results_map: HashMap<String, FileAnalysis> =
544            results.into_iter().map(|r| (r.path.clone(), r)).collect();
545
546        // Step 3: Run providers on the directory (more efficient for tools like PMD)
547        let provider_findings = self.analyze_directory_with_providers(base_path)?;
548
549        // Step 4: Merge provider findings into file results using O(1) HashMap lookup
550        for finding in provider_findings {
551            let file_path = finding.location.file.display().to_string();
552            if let Some(result) = results_map.get_mut(&file_path) {
553                result.findings.push(finding);
554            } else {
555                // File wasn't in parsed files - create a new result
556                results_map.insert(
557                    file_path.clone(),
558                    FileAnalysis {
559                        path: file_path,
560                        language: finding.language,
561                        metrics: CodeMetrics::default(),
562                        findings: vec![finding],
563                    },
564                );
565            }
566        }
567
568        // Convert back to Vec
569        let mut results: Vec<FileAnalysis> = results_map.into_values().collect();
570
571        // Step 4: Filter by severity
572        for result in &mut results {
573            result
574                .findings
575                .retain(|f| f.severity >= self.config.min_severity);
576        }
577
578        let summary = compute_summary(&results);
579
580        info!(
581            "Analysis complete: {} files, {} findings ({} critical)",
582            summary.files_analyzed, summary.total_findings, summary.critical_count
583        );
584
585        Ok((results, summary))
586    }
587}
588
589/// Compute aggregate summary from analysis results
590fn compute_summary(results: &[FileAnalysis]) -> AnalysisSummary {
591    let mut summary = AnalysisSummary {
592        files_analyzed: results.len(),
593        ..Default::default()
594    };
595
596    for result in results {
597        summary.total_loc += result.metrics.lines_of_code;
598        summary.total_complexity += result.metrics.cyclomatic_complexity;
599
600        for finding in &result.findings {
601            summary.total_findings += 1;
602            match finding.severity {
603                Severity::Critical => summary.critical_count += 1,
604                Severity::Error => summary.error_count += 1,
605                Severity::Warning => summary.warning_count += 1,
606                Severity::Info => summary.info_count += 1,
607            }
608        }
609    }
610
611    summary
612}
613
#[cfg(test)]
mod tests {
    use super::*;
    use rma_parser::ParserEngine;
    use std::path::Path;

    /// End-to-end smoke test: parse a benign Rust snippet and verify the
    /// analyzer produces a result with the expected language and metrics.
    #[test]
    fn test_analyze_rust_file() {
        let config = RmaConfig::default();
        let parser = ParserEngine::new(config.clone());
        let analyzer = AnalyzerEngine::new(config);

        let content = r#"
fn safe_function() {
    println!("Safe!");
}

fn another_function() {
    let x = 42;
    println!("{}", x);
}
"#;

        let parsed = parser.parse_file(Path::new("test.rs"), content).unwrap();
        let analysis = analyzer.analyze_file(&parsed).unwrap();

        // Analysis should complete successfully
        assert_eq!(analysis.language, Language::Rust);
        assert!(analysis.metrics.lines_of_code > 0);
    }

    #[test]
    fn test_embedded_rules_are_active() {
        let config = RmaConfig::default();
        let analyzer = AnalyzerEngine::new(config);

        // Verify that the embedded rules engine is registered
        // The analyzer should have at least one rule (the EmbeddedRulesRule)
        assert!(!analyzer.rules.is_empty());
    }
}
654}