Skip to main content

raxit_core/
scanner.rs

1//! Scanner implementation - orchestrates the scanning pipeline
2
3use crate::{
4    cache::FileCache,
5    config::ScanConfig,
6    error::{RaxitError, Result},
7    extractors,
8    schema::{ScanResult, TrustBoundary},
9};
10use rayon::prelude::*;
11use std::path::{Path, PathBuf};
12use walkdir::WalkDir;
13
/// Main scanner orchestrator
///
/// Bundles the scan configuration with the file cache consulted during
/// incremental scans.
pub struct Scanner {
    /// Scan settings: root path, include/exclude patterns, and the
    /// incremental/parallel switches read by the methods below.
    config: ScanConfig,
    /// File cache; loaded from `cache_path` when incremental mode is enabled,
    /// otherwise created empty.
    cache: FileCache,
    /// Location of the cache file: `<config.path>/<config.cache_dir>/cache.json`.
    cache_path: PathBuf,
}
20
21impl Scanner {
22    /// Create a new scanner with the given configuration
23    pub fn new(config: ScanConfig) -> Result<Self> {
24        // Validate path exists
25        if !config.path.exists() {
26            return Err(RaxitError::InvalidPath(config.path.clone()));
27        }
28
29        // Set up cache path
30        let cache_path = config.path.join(&config.cache_dir).join("cache.json");
31
32        // Load cache if incremental mode is enabled
33        let cache = if config.incremental {
34            FileCache::load(&cache_path)?
35        } else {
36            FileCache::new()
37        };
38
39        Ok(Self {
40            config,
41            cache,
42            cache_path,
43        })
44    }
45
46    /// Discover files to scan based on include/exclude patterns
47    /// Returns (files_to_scan, files_skipped)
48    pub fn discover_files(&mut self) -> Result<(Vec<PathBuf>, usize)> {
49        let mut all_files = Vec::new();
50        let mut files_to_scan = Vec::new();
51        let mut files_skipped = 0;
52
53        let walker = WalkDir::new(&self.config.path)
54            .follow_links(false)
55            .into_iter()
56            .filter_entry(|e| !self.should_exclude(e.path()));
57
58        for entry in walker {
59            let entry = entry?;
60            let path = entry.path();
61
62            if path.is_file() && self.should_include(path) {
63                all_files.push(path.to_path_buf());
64
65                // Check if file has changed (only in incremental mode)
66                if self.config.incremental {
67                    match self.cache.has_changed(path) {
68                        Ok(true) => {
69                            files_to_scan.push(path.to_path_buf());
70                        }
71                        Ok(false) => {
72                            files_skipped += 1;
73                            tracing::debug!("Skipping unchanged file: {}", path.display());
74                        }
75                        Err(e) => {
76                            tracing::warn!("Failed to check cache for {}: {}", path.display(), e);
77                            // On error, scan the file to be safe
78                            files_to_scan.push(path.to_path_buf());
79                        }
80                    }
81                } else {
82                    files_to_scan.push(path.to_path_buf());
83                }
84            }
85        }
86
87        tracing::debug!(
88            "Discovered {} files ({} to scan, {} skipped)",
89            all_files.len(),
90            files_to_scan.len(),
91            files_skipped
92        );
93
94        Ok((files_to_scan, files_skipped))
95    }
96
97    /// Detect AI agent frameworks in the codebase
98    pub fn detect_frameworks(&self, files: &[PathBuf]) -> Result<Vec<String>> {
99        let mut frameworks = std::collections::HashSet::new();
100
101        // Detect frameworks by looking for import patterns
102        for file in files {
103            if let Ok(content) = std::fs::read_to_string(file) {
104                if content.contains("from pydantic_ai import")
105                    || content.contains("import pydantic_ai")
106                {
107                    frameworks.insert("pydantic-ai".to_string());
108                }
109                if content.contains("from langgraph import") || content.contains("import langgraph")
110                {
111                    frameworks.insert("langgraph".to_string());
112                }
113                if content.contains("from crewai import") || content.contains("import crewai") {
114                    frameworks.insert("crewai".to_string());
115                }
116            }
117        }
118
119        Ok(frameworks.into_iter().collect())
120    }
121
122    /// Extract assets from all files using appropriate extractors
123    pub fn extract_all(
124        &mut self,
125        files: &[PathBuf],
126        frameworks: &[String],
127        files_skipped: usize,
128    ) -> Result<ScanResult> {
129        let mut result = ScanResult::new();
130
131        // Populate manifest with scan metadata
132        result.manifest.subject.name = self.detect_project_name();
133        result.manifest.subject.version = self.detect_project_version();
134        result.manifest.subject.source = self.detect_project_source();
135
136        // Add scanned files to manifest
137        result.manifest.files = files
138            .iter()
139            .map(|p| p.to_string_lossy().to_string())
140            .collect();
141
142        // Update scan configuration metadata
143        result.manifest.scan_config.exclude_patterns = self.config.exclude.clone();
144        result.manifest.scan_config.frameworks_detected = frameworks.to_vec();
145        result.manifest.scan_config.parallel_workers = if self.config.parallel {
146            self.config.max_threads.unwrap_or_else(num_cpus::get)
147        } else {
148            1
149        };
150        result.manifest.scan_config.incremental = self.config.incremental;
151        result.manifest.scan_config.files_scanned = files.len();
152        result.manifest.scan_config.files_skipped = files_skipped;
153
154        // Determine primary framework (use first detected)
155        let primary_framework = frameworks.first().map(|s| s.as_str()).unwrap_or("unknown");
156
157        // Process files in parallel or sequentially based on config
158        let extracted_assets: Vec<_> = if self.config.parallel {
159            files
160                .par_iter()
161                .filter_map(|file| extractors::extract_from_file(file, primary_framework).ok())
162                .collect()
163        } else {
164            files
165                .iter()
166                .filter_map(|file| extractors::extract_from_file(file, primary_framework).ok())
167                .collect()
168        };
169
170        // Combine all extracted assets
171        for assets in extracted_assets {
172            result.agents.extend(assets.agents);
173            result.tools.extend(assets.tools);
174            result.models.extend(assets.models);
175            result.memory.extend(assets.memory);
176        }
177
178        // Update cache for scanned files
179        if self.config.incremental {
180            for file in files {
181                if let Err(e) = self.cache.update(file) {
182                    tracing::warn!("Failed to update cache for {}: {}", file.display(), e);
183                }
184            }
185
186            // Save cache
187            if let Err(e) = self.cache.save(&self.cache_path) {
188                tracing::warn!("Failed to save cache: {}", e);
189            }
190        }
191
192        tracing::info!(
193            "Extracted {} agents, {} tools, {} models from {} files",
194            result.agents.len(),
195            result.tools.len(),
196            result.models.len(),
197            files.len()
198        );
199
200        Ok(result)
201    }
202
203    /// Detect project name from directory or pyproject.toml
204    fn detect_project_name(&self) -> String {
205        // Try to read pyproject.toml
206        let pyproject_path = self.config.path.join("pyproject.toml");
207        if let Ok(content) = std::fs::read_to_string(&pyproject_path) {
208            // Simple TOML parsing for project name
209            for line in content.lines() {
210                if line.trim().starts_with("name") {
211                    if let Some(name) = line.split('=').nth(1) {
212                        return name.trim().trim_matches('"').to_string();
213                    }
214                }
215            }
216        }
217
218        // Fallback to directory name
219        self.config
220            .path
221            .file_name()
222            .and_then(|n| n.to_str())
223            .unwrap_or("unknown")
224            .to_string()
225    }
226
227    /// Detect project version from pyproject.toml
228    fn detect_project_version(&self) -> Option<String> {
229        let pyproject_path = self.config.path.join("pyproject.toml");
230        if let Ok(content) = std::fs::read_to_string(&pyproject_path) {
231            for line in content.lines() {
232                if line.trim().starts_with("version") {
233                    if let Some(version) = line.split('=').nth(1) {
234                        return Some(version.trim().trim_matches('"').to_string());
235                    }
236                }
237            }
238        }
239        None
240    }
241
242    /// Detect project source from git config
243    fn detect_project_source(&self) -> Option<String> {
244        let git_config = self.config.path.join(".git/config");
245        if let Ok(content) = std::fs::read_to_string(&git_config) {
246            for line in content.lines() {
247                if line.trim().starts_with("url") {
248                    if let Some(url) = line.split('=').nth(1) {
249                        return Some(url.trim().to_string());
250                    }
251                }
252            }
253        }
254        None
255    }
256
    /// Build call graph from extracted assets
    ///
    /// Thin wrapper that forwards `results` to
    /// `crate::analyzers::build_call_graph` and returns its result unchanged.
    pub fn build_call_graph(&self, results: &ScanResult) -> Result<CallGraph> {
        crate::analyzers::build_call_graph(results)
    }
261
    /// Analyze trust boundaries using Meta's Rule of Two
    ///
    /// Thin wrapper that forwards `results` to
    /// `crate::analyzers::analyze_trust_boundaries` and returns its result
    /// unchanged.
    pub fn analyze_trust_boundaries(&self, results: &ScanResult) -> Result<Vec<TrustBoundary>> {
        crate::analyzers::analyze_trust_boundaries(results)
    }
266
267    /// Generate final schema with all analysis results
268    pub fn generate_schema(
269        &self,
270        results: &ScanResult,
271        boundaries: &[TrustBoundary],
272    ) -> Result<ScanResult> {
273        let mut schema = results.clone();
274        schema.trust_boundaries = boundaries.to_vec();
275        Ok(schema)
276    }
277
278    /// Check if path should be included based on patterns
279    fn should_include(&self, path: &Path) -> bool {
280        let path_str = path.to_string_lossy();
281
282        // Check include patterns
283        self.config.include.iter().any(|pattern| {
284            glob::Pattern::new(pattern)
285                .map(|p| p.matches(&path_str))
286                .unwrap_or(false)
287        })
288    }
289
290    /// Check if path should be excluded based on patterns
291    fn should_exclude(&self, path: &Path) -> bool {
292        let path_str = path.to_string_lossy();
293
294        // Check exclude patterns
295        self.config.exclude.iter().any(|pattern| {
296            glob::Pattern::new(pattern)
297                .map(|p| p.matches(&path_str))
298                .unwrap_or(false)
299        })
300    }
301}
302
/// Call graph representation (placeholder)
///
/// Currently only stores a flat list of node names.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CallGraph {
    /// Names of the nodes in the graph.
    nodes: Vec<String>,
}

impl CallGraph {
    /// Create an empty call graph.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Borrow the node names stored in the graph.
    #[must_use]
    pub fn nodes(&self) -> &[String] {
        &self.nodes
    }
}
323
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration must produce a usable scanner.
    #[test]
    fn test_scanner_creation() {
        assert!(Scanner::new(ScanConfig::default()).is_ok());
    }

    /// A scan root that does not exist must be rejected.
    #[test]
    fn test_invalid_path() {
        let outcome = Scanner::new(ScanConfig::new("/nonexistent/path"));
        assert!(outcome.is_err());
    }
}
341}