Skip to main content

tirith_core/
scan.rs

1use std::path::{Path, PathBuf};
2
3use crate::engine::{self, AnalysisContext};
4use crate::extract::ScanContext;
5use crate::tokenize::ShellType;
6use crate::verdict::{Finding, Severity};
7
/// Configuration for a file scan operation.
///
/// Passed by reference to [`scan`], which reads `path`, `recursive`,
/// `ignore_patterns` and `max_files`.
pub struct ScanConfig {
    /// Path to scan (directory or single file).
    pub path: PathBuf,
    /// Recurse into subdirectories. Known AI config directories (see
    /// `is_known_config_dir`) are entered even when this is `false`.
    pub recursive: bool,
    /// Severity threshold for CI failure.
    // NOTE(review): not read by `scan` in this module; presumably the caller
    // feeds it to `ScanResult::has_findings_at_or_above` — confirm.
    pub fail_on: Severity,
    /// Patterns to ignore.
    ///
    /// Despite the historical "glob" wording, the collector matches each
    /// pattern as a plain substring of the file's basename. Patterns are not
    /// applied to directory names, nor to a `path` that is itself a file.
    pub ignore_patterns: Vec<String>,
    /// Max files to scan (None = unlimited).
    pub max_files: Option<usize>,
}
21
/// Result of a complete scan operation.
pub struct ScanResult {
    /// One entry per file that was actually analyzed, in scan order
    /// (priority config files first, then lexicographic by path).
    pub file_results: Vec<FileScanResult>,
    /// Number of files analyzed; `scan` sets this to `file_results.len()`.
    pub scanned_count: usize,
    /// Files that were collected but not analyzed: those cut off by the
    /// `max_files` cap plus those `scan_single_file` could not process.
    pub skipped_count: usize,
    /// `true` when the `max_files` cap dropped at least one file.
    pub truncated: bool,
    /// Human-readable explanation of the truncation; `None` when
    /// `truncated` is `false`.
    pub truncation_reason: Option<String>,
}
30
/// Result of scanning a single file.
pub struct FileScanResult {
    /// Path of the scanned file, or the literal `<stdin>` for `scan_stdin`.
    pub path: PathBuf,
    /// Findings that remain after the policy's paranoia filter was applied.
    pub findings: Vec<Finding>,
    /// Whether `is_priority_file` classified the path as an AI config file.
    /// Always `false` for stdin input.
    pub is_config_file: bool,
}
37
/// Known AI config file basenames (scanned first for priority ordering).
/// Only includes names specific to AI tooling — generic names like settings.json
/// are only prioritized when found inside a known config directory (handled by
/// `is_priority_file` checking the parent directory against
/// `PRIORITY_PARENT_DIRS`).
///
/// Matching is an exact, case-sensitive comparison against the basename.
const PRIORITY_BASENAMES: &[&str] = &[
    ".cursorrules",
    ".cursorignore",
    ".clinerules",
    ".windsurfrules",
    "CLAUDE.md",
    "AGENTS.md",
    "copilot-instructions.md",
    "mcp.json",
    ".mcp.json",
    "mcp_settings.json",
    "devcontainer.json",
];
55
/// Parent directories that make generic filenames count as priority.
///
/// `is_priority_file` compares only the *immediate* parent directory name
/// against this list (exact, case-sensitive).
///
/// NOTE: `is_known_config_dir` hard-codes the same set of names; keep the two
/// lists in sync when adding or removing an entry.
const PRIORITY_PARENT_DIRS: &[&str] = &[
    ".claude",
    ".vscode",
    ".cursor",
    ".windsurf",
    ".cline",
    ".continue",
    ".github",
    ".devcontainer",
    ".roo",
];
68
69/// Run a file scan operation.
70///
71/// Detection is always free (ADR-13). `max_files` is a caller-provided safety
72/// cap (e.g. for resource-constrained CI), not a license gate.
73pub fn scan(config: &ScanConfig) -> ScanResult {
74    let mut files = collect_files(&config.path, config.recursive, &config.ignore_patterns);
75
76    // Sort: known config files first, then lexicographic
77    files.sort_by(|a, b| {
78        let a_priority = is_priority_file(a);
79        let b_priority = is_priority_file(b);
80        match (a_priority, b_priority) {
81            (true, false) => std::cmp::Ordering::Less,
82            (false, true) => std::cmp::Ordering::Greater,
83            _ => a.cmp(b),
84        }
85    });
86
87    let mut truncated = false;
88    let mut truncation_reason = None;
89    let mut skipped_count = 0;
90
91    // Apply caller-provided safety cap (not a license gate)
92    if let Some(max) = config.max_files {
93        if files.len() > max {
94            skipped_count = files.len() - max;
95            files.truncate(max);
96            truncated = true;
97            truncation_reason = Some(format!(
98                "Scan capped at {max} files ({skipped_count} skipped)."
99            ));
100        }
101    }
102
103    let mut file_results = Vec::new();
104    for file_path in &files {
105        if let Some(result) = scan_single_file(file_path) {
106            file_results.push(result);
107        } else {
108            skipped_count += 1;
109        }
110    }
111
112    ScanResult {
113        scanned_count: file_results.len(),
114        skipped_count,
115        truncated,
116        truncation_reason,
117        file_results,
118    }
119}
120
121/// Scan a single file and return its results.
122pub fn scan_single_file(file_path: &Path) -> Option<FileScanResult> {
123    // Read file content with size cap (10 MiB)
124    const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;
125
126    let metadata = match std::fs::metadata(file_path) {
127        Ok(m) => m,
128        Err(e) => {
129            eprintln!(
130                "tirith: scan: cannot read metadata for {}: {e}",
131                file_path.display()
132            );
133            return None;
134        }
135    };
136    if metadata.len() > MAX_FILE_SIZE {
137        eprintln!(
138            "tirith: scan: skipping {} ({}B exceeds {}B limit)",
139            file_path.display(),
140            metadata.len(),
141            MAX_FILE_SIZE
142        );
143        return None;
144    }
145
146    let raw_bytes = match std::fs::read(file_path) {
147        Ok(b) => b,
148        Err(e) => {
149            eprintln!("tirith: scan: cannot read {}: {e}", file_path.display());
150            return None;
151        }
152    };
153    let content = String::from_utf8_lossy(&raw_bytes).into_owned();
154
155    let is_config = is_priority_file(file_path);
156
157    let cwd = file_path
158        .parent()
159        .map(|p| p.display().to_string())
160        .filter(|s| !s.is_empty());
161    let ctx = AnalysisContext {
162        input: content,
163        shell: ShellType::Posix,
164        scan_context: ScanContext::FileScan,
165        raw_bytes: Some(raw_bytes),
166        interactive: false,
167        cwd: cwd.clone(),
168        file_path: Some(file_path.to_path_buf()),
169        repo_root: None,
170        is_config_override: false,
171        clipboard_html: None,
172    };
173
174    let verdict = engine::analyze(&ctx);
175
176    // Apply paranoia filter to scan findings
177    let policy = crate::policy::Policy::discover(cwd.as_deref());
178    let mut findings = verdict.findings;
179    engine::filter_findings_by_paranoia_vec(&mut findings, policy.paranoia);
180
181    Some(FileScanResult {
182        path: file_path.to_path_buf(),
183        findings,
184        is_config_file: is_config,
185    })
186}
187
188/// Scan content from stdin (no file path).
189pub fn scan_stdin(content: &str, raw_bytes: &[u8]) -> FileScanResult {
190    let cwd = std::env::current_dir()
191        .ok()
192        .map(|p| p.display().to_string());
193    let ctx = AnalysisContext {
194        input: content.to_string(),
195        shell: ShellType::Posix,
196        scan_context: ScanContext::FileScan,
197        raw_bytes: Some(raw_bytes.to_vec()),
198        interactive: false,
199        cwd: cwd.clone(),
200        file_path: None,
201        repo_root: None,
202        is_config_override: false,
203        clipboard_html: None,
204    };
205
206    let verdict = engine::analyze(&ctx);
207
208    // Apply paranoia filter to scan findings
209    let policy = crate::policy::Policy::discover(cwd.as_deref());
210    let mut findings = verdict.findings;
211    engine::filter_findings_by_paranoia_vec(&mut findings, policy.paranoia);
212
213    FileScanResult {
214        path: PathBuf::from("<stdin>"),
215        findings,
216        is_config_file: false,
217    }
218}
219
220/// Check if a path matches a priority config file.
221/// Matches either by AI-specific basename or by being inside a known config directory.
222fn is_priority_file(path: &Path) -> bool {
223    let basename = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
224
225    // Direct AI-specific basename match
226    if PRIORITY_BASENAMES.contains(&basename) {
227        return true;
228    }
229
230    // Generic filenames are priority only inside known config dirs
231    if let Some(parent) = path.parent() {
232        let parent_name = parent.file_name().and_then(|n| n.to_str()).unwrap_or("");
233        if PRIORITY_PARENT_DIRS.contains(&parent_name) {
234            return true;
235        }
236    }
237
238    false
239}
240
241/// Collect files from a path (directory or single file).
242fn collect_files(path: &Path, recursive: bool, ignore_patterns: &[String]) -> Vec<PathBuf> {
243    if path.is_file() {
244        return vec![path.to_path_buf()];
245    }
246
247    if !path.is_dir() {
248        eprintln!("tirith: scan: path does not exist: {}", path.display());
249        return vec![];
250    }
251
252    let mut files = Vec::new();
253    collect_files_recursive(path, recursive, ignore_patterns, &mut files);
254    files
255}
256
257fn collect_files_recursive(
258    dir: &Path,
259    recursive: bool,
260    ignore_patterns: &[String],
261    files: &mut Vec<PathBuf>,
262) {
263    let entries = match std::fs::read_dir(dir) {
264        Ok(e) => e,
265        Err(e) => {
266            eprintln!("tirith: scan: cannot read directory {}: {e}", dir.display());
267            return;
268        }
269    };
270
271    for entry in entries {
272        let entry = match entry {
273            Ok(e) => e,
274            Err(e) => {
275                eprintln!(
276                    "tirith: scan: error reading entry in {}: {e}",
277                    dir.display()
278                );
279                continue;
280            }
281        };
282        let path = entry.path();
283        let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
284
285        // Skip hidden dirs (except known config dirs) and common non-useful dirs
286        if path.is_dir() {
287            if should_skip_dir(name) && !is_known_config_dir(name) {
288                continue;
289            }
290            if recursive || is_known_config_dir(name) {
291                collect_files_recursive(&path, recursive, ignore_patterns, files);
292            }
293            continue;
294        }
295
296        // Skip binary/non-text files by extension
297        if is_binary_extension(name) {
298            continue;
299        }
300
301        // Apply ignore patterns
302        if ignore_patterns
303            .iter()
304            .any(|pat| name.contains(pat.as_str()))
305        {
306            continue;
307        }
308
309        files.push(path);
310    }
311}
312
/// Report whether a directory with this exact name is excluded from scans.
///
/// Covers VCS metadata, dependency trees, build output and caches. The
/// comparison is case-sensitive and applies to the bare directory name only.
fn should_skip_dir(name: &str) -> bool {
    const SKIPPED_DIRS: &[&str] = &[
        ".git",
        "node_modules",
        "target",
        "__pycache__",
        ".tox",
        "dist",
        "build",
        ".next",
        "vendor",
        ".cache",
    ];
    SKIPPED_DIRS.contains(&name)
}
329
/// Report whether `name` is one of the AI/editor config directories that the
/// collector always descends into, even in non-recursive scans.
///
/// The comparison is an exact, case-sensitive match on the directory name.
fn is_known_config_dir(name: &str) -> bool {
    const CONFIG_DIRS: &[&str] = &[
        ".claude",
        ".vscode",
        ".cursor",
        ".windsurf",
        ".cline",
        ".continue",
        ".github",
        ".devcontainer",
        ".roo",
    ];
    CONFIG_DIRS.contains(&name)
}
345
/// Report whether a file name ends in an extension treated as binary content,
/// in which case the file is not scanned.
///
/// The check is a case-insensitive suffix match on the whole name, so a
/// dot-file whose entire name is an extension (e.g. `.gz`) also matches.
fn is_binary_extension(name: &str) -> bool {
    const BINARY_SUFFIXES: &[&str] = &[
        // images
        ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".svg", ".webp",
        // audio / video
        ".mp3", ".mp4", ".wav", ".avi", ".mov",
        // archives
        ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar",
        // executables, libraries, compiled artifacts
        ".exe", ".dll", ".so", ".dylib", ".o", ".a", ".wasm", ".pyc", ".class", ".jar",
    ];

    let lowered = name.to_lowercase();
    for suffix in BINARY_SUFFIXES {
        if lowered.ends_with(*suffix) {
            return true;
        }
    }
    false
}
356
357impl ScanResult {
358    /// Check if any finding meets or exceeds the given severity threshold.
359    pub fn has_findings_at_or_above(&self, threshold: Severity) -> bool {
360        self.file_results
361            .iter()
362            .flat_map(|r| &r.findings)
363            .any(|f| f.severity >= threshold)
364    }
365
366    /// Total number of findings across all files.
367    pub fn total_findings(&self) -> usize {
368        self.file_results.iter().map(|r| r.findings.len()).sum()
369    }
370}
371
#[cfg(test)]
mod tests {
    use super::*;

    /// Binary-looking names are filtered out; text and config names are kept.
    #[test]
    fn test_binary_extension_skip() {
        for name in ["image.png", "archive.tar.gz"] {
            assert!(is_binary_extension(name));
        }
        for name in ["config.json", "CLAUDE.md"] {
            assert!(!is_binary_extension(name));
        }
    }

    /// AI-specific basenames are always priority; generic basenames only
    /// count when their immediate parent is a known config directory.
    #[test]
    fn test_priority_file_detection() {
        let priority = [
            ".cursorrules",
            "CLAUDE.md",
            "mcp.json",
            ".claude/settings.json",
            ".vscode/settings.json",
            ".roo/rules.md",
        ];
        for candidate in priority {
            assert!(is_priority_file(Path::new(candidate)));
        }

        let ordinary = ["README.md", "settings.json", "config.json"];
        for candidate in ordinary {
            assert!(!is_priority_file(Path::new(candidate)));
        }
    }

    /// Noise directories are skipped; source and editor config dirs are not.
    #[test]
    fn test_skip_dirs() {
        for dir in [".git", "node_modules", "target"] {
            assert!(should_skip_dir(dir));
        }
        for dir in ["src", ".vscode"] {
            assert!(!should_skip_dir(dir));
        }
    }

    /// Only the listed AI/editor directories count as known config dirs.
    #[test]
    fn test_known_config_dirs() {
        for dir in [".claude", ".vscode", ".cursor"] {
            assert!(is_known_config_dir(dir));
        }
        for dir in ["src", ".git"] {
            assert!(!is_known_config_dir(dir));
        }
    }
}