Skip to main content

codemod_core/scanner/
mod.rs

1//! Codebase scanning for pattern matches.
2//!
3//! The scanner walks a directory tree, filters files by language and glob
4//! patterns, and runs the [`PatternMatcher`](crate::pattern::PatternMatcher)
5//! against each file. It supports parallel file processing via the
6//! [`parallel`] sub-module and configurable include/exclude rules via
7//! [`walker`].
8
9pub mod parallel;
10pub mod walker;
11
12pub use walker::FileWalker;
13
14use serde::{Deserialize, Serialize};
15use std::path::PathBuf;
16use std::time::Instant;
17
18use crate::language::LanguageAdapter;
19use crate::pattern::matcher::PatternMatcher;
20use crate::pattern::Pattern;
21
22// ---------------------------------------------------------------------------
23// Configuration
24// ---------------------------------------------------------------------------
25
/// Settings that control which files a scan considers.
#[derive(Debug, Clone)]
pub struct ScanConfig {
    /// Directory at which the scan starts.
    pub target_dir: PathBuf,
    /// Glob patterns selecting files to *include*; an empty list includes
    /// everything.
    pub include_patterns: Vec<String>,
    /// Glob patterns selecting files to *exclude*.
    pub exclude_patterns: Vec<String>,
    /// When `true`, `.gitignore` rules are respected.
    pub respect_gitignore: bool,
    /// Upper bound on file size, in bytes; larger files are skipped.
    pub max_file_size: usize,
}

impl Default for ScanConfig {
    /// Scan the current directory with no include/exclude globs, respecting
    /// `.gitignore`, and with a 1 MB (1,000,000 byte) file-size limit.
    fn default() -> Self {
        let target_dir = PathBuf::from(".");
        let one_megabyte: usize = 1_000_000;

        ScanConfig {
            target_dir,
            include_patterns: Vec::new(),
            exclude_patterns: Vec::new(),
            respect_gitignore: true,
            max_file_size: one_megabyte,
        }
    }
}
52
53// ---------------------------------------------------------------------------
54// Scan results
55// ---------------------------------------------------------------------------
56
57/// Aggregated result of scanning a codebase.
58#[derive(Debug, Clone, Serialize, Deserialize)]
59pub struct ScanResult {
60    /// Total number of files scanned.
61    pub total_files_scanned: usize,
62    /// Total number of pattern matches found across all files.
63    pub total_matches: usize,
64    /// Individual matches.
65    pub matches: Vec<ScanMatch>,
66    /// Wall-clock duration of the scan in milliseconds.
67    pub duration_ms: u64,
68}
69
70/// A single match found during scanning.
71#[derive(Debug, Clone, Serialize, Deserialize)]
72pub struct ScanMatch {
73    /// File in which the match was found.
74    pub file_path: PathBuf,
75    /// 1-indexed line number.
76    pub line: usize,
77    /// 0-indexed column (byte offset within the line).
78    pub column: usize,
79    /// The matched source text.
80    pub matched_text: String,
81    /// A few lines of context *before* the match.
82    pub context_before: String,
83    /// A few lines of context *after* the match.
84    pub context_after: String,
85}
86
87// ---------------------------------------------------------------------------
88// Scanner
89// ---------------------------------------------------------------------------
90
91/// Main scanner that orchestrates file walking and pattern matching.
92pub struct Scanner {
93    config: ScanConfig,
94    language: Box<dyn LanguageAdapter>,
95}
96
97impl Scanner {
98    /// Create a new scanner with the given configuration and language adapter.
99    pub fn new(config: ScanConfig, language: Box<dyn LanguageAdapter>) -> Self {
100        Self { config, language }
101    }
102
103    /// Scan the target directory for pattern matches.
104    ///
105    /// Files are filtered by extension (via the language adapter) and the
106    /// configured include/exclude globs. Each eligible file is parsed and
107    /// matched against the pattern.
108    ///
109    /// # Errors
110    ///
111    /// Returns [`CodemodError::Scan`] if the target directory cannot be
112    /// walked or a file cannot be read.
113    pub fn scan(&self, pattern: &Pattern) -> crate::Result<ScanResult> {
114        let start = Instant::now();
115
116        // 1. Collect eligible files.
117        let walker = FileWalker::new(&self.config)?;
118        let files = walker.collect_files(&*self.language)?;
119
120        log::info!("Found {} eligible files to scan", files.len());
121
122        // 2. Scan files (sequentially; for parallel see `parallel` module).
123        let matcher = PatternMatcher::new(self.make_language_clone());
124        let mut scan_matches = Vec::new();
125        let mut total_files_scanned: usize = 0;
126
127        for file_path in &files {
128            let source = match std::fs::read_to_string(file_path) {
129                Ok(s) => s,
130                Err(e) => {
131                    log::warn!("Skipping {}: {e}", file_path.display());
132                    continue;
133                }
134            };
135
136            total_files_scanned += 1;
137
138            match matcher.find_matches(&source, pattern) {
139                Ok(matches) => {
140                    for m in matches {
141                        let (ctx_before, ctx_after) =
142                            Self::extract_context(&source, m.start_position.line, 3);
143                        scan_matches.push(ScanMatch {
144                            file_path: file_path.clone(),
145                            line: m.start_position.line + 1, // 1-indexed
146                            column: m.start_position.column,
147                            matched_text: m.matched_text.clone(),
148                            context_before: ctx_before,
149                            context_after: ctx_after,
150                        });
151                    }
152                }
153                Err(e) => {
154                    log::warn!("Error matching in {}: {e}", file_path.display());
155                }
156            }
157        }
158
159        let duration_ms = start.elapsed().as_millis() as u64;
160        let total_matches = scan_matches.len();
161
162        Ok(ScanResult {
163            total_files_scanned,
164            total_matches,
165            matches: scan_matches,
166            duration_ms,
167        })
168    }
169
170    // -----------------------------------------------------------------
171    // Helpers
172    // -----------------------------------------------------------------
173
174    /// Extract a few lines of context around a given line number.
175    fn extract_context(source: &str, line: usize, radius: usize) -> (String, String) {
176        let lines: Vec<&str> = source.lines().collect();
177        let start_before = line.saturating_sub(radius);
178        let end_after = (line + radius + 1).min(lines.len());
179
180        let before = lines[start_before..line].join("\n");
181        let after = if line + 1 < lines.len() {
182            lines[(line + 1)..end_after].join("\n")
183        } else {
184            String::new()
185        };
186
187        (before, after)
188    }
189
190    /// Create a boxed clone of the language adapter.
191    ///
192    /// Because `LanguageAdapter` is object-safe but not `Clone`, we capture
193    /// the data returned by the trait methods into a small owned struct.
194    fn make_language_clone(&self) -> Box<dyn LanguageAdapter> {
195        Box::new(StaticLanguageInfo::snapshot(&*self.language))
196    }
197}
198
199// ---------------------------------------------------------------------------
200// StaticLanguageInfo — owned snapshot of a LanguageAdapter
201// ---------------------------------------------------------------------------
202
203/// A small, owned snapshot of a [`LanguageAdapter`] that can be cheaply moved
204/// into a new [`PatternMatcher`].
205///
206/// This exists because `Box<dyn LanguageAdapter>` is not `Clone`. The snapshot
207/// captures all returned slices and the `Language` value so that a new
208/// `PatternMatcher` can be constructed without requiring `Arc`.
209pub(crate) struct StaticLanguageInfo {
210    name: String,
211    lang: tree_sitter::Language,
212    extensions: Vec<String>,
213    statements: Vec<String>,
214    expressions: Vec<String>,
215    identifiers: Vec<String>,
216}
217
218impl StaticLanguageInfo {
219    pub(crate) fn snapshot(adapter: &dyn LanguageAdapter) -> Self {
220        Self {
221            name: adapter.name().to_string(),
222            lang: adapter.language(),
223            extensions: adapter
224                .file_extensions()
225                .iter()
226                .map(|s| s.to_string())
227                .collect(),
228            statements: adapter
229                .statement_node_types()
230                .iter()
231                .map(|s| s.to_string())
232                .collect(),
233            expressions: adapter
234                .expression_node_types()
235                .iter()
236                .map(|s| s.to_string())
237                .collect(),
238            identifiers: adapter
239                .identifier_node_types()
240                .iter()
241                .map(|s| s.to_string())
242                .collect(),
243        }
244    }
245}
246
247impl LanguageAdapter for StaticLanguageInfo {
248    fn name(&self) -> &str {
249        &self.name
250    }
251
252    fn language(&self) -> tree_sitter::Language {
253        self.lang.clone()
254    }
255
256    fn file_extensions(&self) -> &[&str] {
257        // Leak a small slice for the lifetime of the program. This is bounded
258        // because StaticLanguageInfo is created at most a handful of times per
259        // scan invocation.
260        let refs: Vec<&str> = self.extensions.iter().map(|s| s.as_str()).collect();
261        Box::leak(refs.into_boxed_slice())
262    }
263
264    fn statement_node_types(&self) -> &[&str] {
265        let refs: Vec<&str> = self.statements.iter().map(|s| s.as_str()).collect();
266        Box::leak(refs.into_boxed_slice())
267    }
268
269    fn expression_node_types(&self) -> &[&str] {
270        let refs: Vec<&str> = self.expressions.iter().map(|s| s.as_str()).collect();
271        Box::leak(refs.into_boxed_slice())
272    }
273
274    fn identifier_node_types(&self) -> &[&str] {
275        let refs: Vec<&str> = self.identifiers.iter().map(|s| s.as_str()).collect();
276        Box::leak(refs.into_boxed_slice())
277    }
278}