Skip to main content

tirith_core/
scan.rs

1use std::path::{Path, PathBuf};
2
3use crate::engine::{self, AnalysisContext};
4use crate::extract::ScanContext;
5use crate::tokenize::ShellType;
6use crate::verdict::{Finding, Severity};
7
8/// Configuration for a file scan operation.
9pub struct ScanConfig {
10    /// Path to scan (directory or single file).
11    pub path: PathBuf,
12    /// Recurse into subdirectories.
13    pub recursive: bool,
14    /// Severity threshold for CI failure.
15    pub fail_on: Severity,
16    /// Glob patterns to ignore.
17    pub ignore_patterns: Vec<String>,
18    /// Include only files matching these patterns (empty = include all).
19    pub include_patterns: Vec<String>,
20    /// Exclude files matching these patterns (applied after include).
21    pub exclude_patterns: Vec<String>,
22    /// Max files to scan (None = unlimited).
23    pub max_files: Option<usize>,
24}
25
26/// Result of a complete scan operation.
27pub struct ScanResult {
28    pub file_results: Vec<FileScanResult>,
29    pub scanned_count: usize,
30    pub skipped_count: usize,
31    pub truncated: bool,
32    pub truncation_reason: Option<String>,
33}
34
35/// Result of scanning a single file.
36pub struct FileScanResult {
37    pub path: PathBuf,
38    pub findings: Vec<Finding>,
39    pub is_config_file: bool,
40}
41
/// Basenames that mark a file as an AI-tooling config file regardless of
/// where it lives; such files are moved to the front of the scan order.
///
/// Generic names such as `settings.json` are deliberately absent: they only
/// count as priority when their parent directory is listed in
/// `PRIORITY_PARENT_DIRS`, which `is_priority_file` checks separately.
const PRIORITY_BASENAMES: &[&str] = &[
    // Editor / agent rule files
    ".cursorrules",
    ".cursorignore",
    ".clinerules",
    ".windsurfrules",
    // Agent instruction documents
    "CLAUDE.md",
    "AGENTS.md",
    "copilot-instructions.md",
    // MCP and container configuration
    "mcp.json",
    ".mcp.json",
    "mcp_settings.json",
    "devcontainer.json",
];
59
/// Directory names whose direct children are all treated as priority files,
/// whatever the child's own basename is.
const PRIORITY_PARENT_DIRS: &[&str] = &[
    ".claude",
    ".vscode",
    ".cursor",
    ".windsurf",
    ".cline",
    ".continue",
    ".github",
    ".devcontainer",
    ".roo",
];
72
73/// Run a file scan operation.
74///
75/// Detection is always free (ADR-13). `max_files` is a caller-provided safety
76/// cap (e.g. for resource-constrained CI), not a license gate.
77pub fn scan(config: &ScanConfig) -> ScanResult {
78    let mut files = collect_files(
79        &config.path,
80        config.recursive,
81        &config.ignore_patterns,
82        &config.include_patterns,
83        &config.exclude_patterns,
84    );
85
86    // Sort: known config files first, then lexicographic
87    files.sort_by(|a, b| {
88        let a_priority = is_priority_file(a);
89        let b_priority = is_priority_file(b);
90        match (a_priority, b_priority) {
91            (true, false) => std::cmp::Ordering::Less,
92            (false, true) => std::cmp::Ordering::Greater,
93            _ => a.cmp(b),
94        }
95    });
96
97    let mut truncated = false;
98    let mut truncation_reason = None;
99    let mut skipped_count = 0;
100
101    // Caller-provided safety cap; not a license gate.
102    if let Some(max) = config.max_files {
103        if files.len() > max {
104            skipped_count = files.len() - max;
105            files.truncate(max);
106            truncated = true;
107            truncation_reason = Some(format!(
108                "Scan capped at {max} files ({skipped_count} skipped)."
109            ));
110        }
111    }
112
113    let mut file_results = Vec::new();
114    for file_path in &files {
115        // Panic in any rule is bounded to its file; the rest of the walk
116        // continues. Single-file callers (CLI, MCP, `policy test`) bypass
117        // this guard on purpose so panics still surface honestly there.
118        match catch_panic_scanning(file_path, || scan_single_file(file_path)) {
119            Some(Some(result)) => file_results.push(result),
120            Some(None) | None => skipped_count += 1,
121        }
122    }
123
124    ScanResult {
125        scanned_count: file_results.len(),
126        skipped_count,
127        truncated,
128        truncation_reason,
129        file_results,
130    }
131}
132
133/// Scan a single file and return its results.
134pub fn scan_single_file(file_path: &Path) -> Option<FileScanResult> {
135    // 10 MiB cap — large enough for any realistic config/source file but
136    // small enough that a hostile `.git/objects/pack-*.pack` won't blow us up.
137    const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;
138
139    let metadata = match std::fs::metadata(file_path) {
140        Ok(m) => m,
141        Err(e) => {
142            eprintln!(
143                "tirith: scan: cannot read metadata for {}: {e}",
144                file_path.display()
145            );
146            return None;
147        }
148    };
149    if metadata.len() > MAX_FILE_SIZE {
150        eprintln!(
151            "tirith: scan: skipping {} ({}B exceeds {}B limit)",
152            file_path.display(),
153            metadata.len(),
154            MAX_FILE_SIZE
155        );
156        return None;
157    }
158
159    let raw_bytes = match std::fs::read(file_path) {
160        Ok(b) => b,
161        Err(e) => {
162            eprintln!("tirith: scan: cannot read {}: {e}", file_path.display());
163            return None;
164        }
165    };
166    let content = String::from_utf8_lossy(&raw_bytes).into_owned();
167
168    let is_config = is_priority_file(file_path);
169
170    let cwd = file_path
171        .parent()
172        .map(|p| p.display().to_string())
173        .filter(|s| !s.is_empty());
174    let ctx = AnalysisContext {
175        input: content,
176        shell: ShellType::Posix,
177        scan_context: ScanContext::FileScan,
178        raw_bytes: Some(raw_bytes),
179        interactive: false,
180        cwd: cwd.clone(),
181        file_path: Some(file_path.to_path_buf()),
182        repo_root: None,
183        is_config_override: false,
184        clipboard_html: None,
185    };
186
187    let verdict = engine::analyze(&ctx);
188
189    let policy = crate::policy::Policy::discover(cwd.as_deref());
190    let mut findings = verdict.findings;
191    engine::filter_findings_by_paranoia_vec(&mut findings, policy.paranoia);
192
193    Some(FileScanResult {
194        path: file_path.to_path_buf(),
195        findings,
196        is_config_file: is_config,
197    })
198}
199
/// Run `f` under `catch_unwind` on behalf of the directory-walk code path.
///
/// Returns `Some(value)` when `f` completes. If `f` panics, a skip line is
/// written to stderr and `None` is returned so the caller can count the file
/// as skipped and move on to the next one.
///
/// **Contract notes:**
/// - The default Rust panic hook runs *before* unwinding starts, so the
///   panic message (and backtrace, if enabled) is already on stderr by the
///   time the skip line is printed. No custom hook is installed here because
///   the hook is process-global state shared with every other caller.
/// - This only works in `panic = "unwind"` builds; with `panic = "abort"`
///   the process terminates and `catch_unwind` never gets to return.
/// - `AssertUnwindSafe` is needed because the closure type does not
///   implement `UnwindSafe` on its own. The assertion is sound as long as
///   nothing the closure captured is used again after a panic; keep it that
///   way if the closure ever starts capturing mutable state.
fn catch_panic_scanning<T>(file_path: &Path, f: impl FnOnce() -> T) -> Option<T> {
    let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(f));
    if outcome.is_err() {
        eprintln!(
            "tirith: scan: internal error scanning {} (skipped — see panic message above)",
            file_path.display()
        );
    }
    outcome.ok()
}
228
229/// Scan content from stdin (no file path).
230pub fn scan_stdin(content: &str, raw_bytes: &[u8]) -> FileScanResult {
231    let cwd = std::env::current_dir()
232        .ok()
233        .map(|p| p.display().to_string());
234    let ctx = AnalysisContext {
235        input: content.to_string(),
236        shell: ShellType::Posix,
237        scan_context: ScanContext::FileScan,
238        raw_bytes: Some(raw_bytes.to_vec()),
239        interactive: false,
240        cwd: cwd.clone(),
241        file_path: None,
242        repo_root: None,
243        is_config_override: false,
244        clipboard_html: None,
245    };
246
247    let verdict = engine::analyze(&ctx);
248
249    let policy = crate::policy::Policy::discover(cwd.as_deref());
250    let mut findings = verdict.findings;
251    engine::filter_findings_by_paranoia_vec(&mut findings, policy.paranoia);
252
253    FileScanResult {
254        path: PathBuf::from("<stdin>"),
255        findings,
256        is_config_file: false,
257    }
258}
259
260/// Check if a path matches a priority config file.
261/// Matches either by AI-specific basename or by being inside a known config directory.
262fn is_priority_file(path: &Path) -> bool {
263    let basename = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
264
265    // Direct AI-specific basename match
266    if PRIORITY_BASENAMES.contains(&basename) {
267        return true;
268    }
269
270    // Generic filenames are priority only inside known config dirs
271    if let Some(parent) = path.parent() {
272        let parent_name = parent.file_name().and_then(|n| n.to_str()).unwrap_or("");
273        if PRIORITY_PARENT_DIRS.contains(&parent_name) {
274            return true;
275        }
276    }
277
278    false
279}
280
281/// Collect files from a path (directory or single file).
282fn collect_files(
283    path: &Path,
284    recursive: bool,
285    ignore_patterns: &[String],
286    include_patterns: &[String],
287    exclude_patterns: &[String],
288) -> Vec<PathBuf> {
289    if path.is_file() {
290        return vec![path.to_path_buf()];
291    }
292
293    if !path.is_dir() {
294        eprintln!("tirith: scan: path does not exist: {}", path.display());
295        return vec![];
296    }
297
298    let mut files = Vec::new();
299    collect_files_recursive(
300        path,
301        path,
302        recursive,
303        ignore_patterns,
304        include_patterns,
305        exclude_patterns,
306        &mut files,
307    );
308    files
309}
310
311fn collect_files_recursive(
312    root: &Path,
313    dir: &Path,
314    recursive: bool,
315    ignore_patterns: &[String],
316    include_patterns: &[String],
317    exclude_patterns: &[String],
318    files: &mut Vec<PathBuf>,
319) {
320    let entries = match std::fs::read_dir(dir) {
321        Ok(e) => e,
322        Err(e) => {
323            eprintln!("tirith: scan: cannot read directory {}: {e}", dir.display());
324            return;
325        }
326    };
327
328    for entry in entries {
329        let entry = match entry {
330            Ok(e) => e,
331            Err(e) => {
332                eprintln!(
333                    "tirith: scan: error reading entry in {}: {e}",
334                    dir.display()
335                );
336                continue;
337            }
338        };
339        let path = entry.path();
340        let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
341
342        // Skip hidden dirs (except known config dirs) and common non-useful dirs
343        if path.is_dir() {
344            if should_skip_dir(name) && !is_known_config_dir(name) {
345                continue;
346            }
347            if recursive || is_known_config_dir(name) {
348                collect_files_recursive(
349                    root,
350                    &path,
351                    recursive,
352                    ignore_patterns,
353                    include_patterns,
354                    exclude_patterns,
355                    files,
356                );
357            }
358            continue;
359        }
360
361        // Skip binary/non-text files by extension
362        if is_binary_extension(name) {
363            continue;
364        }
365
366        // Apply ignore patterns against basename and relative path
367        let rel_path = path
368            .strip_prefix(root)
369            .ok()
370            .and_then(|p| p.to_str())
371            .unwrap_or(name);
372        if ignore_patterns
373            .iter()
374            .any(|pat| matches_ignore_pattern(name, pat) || matches_ignore_pattern(rel_path, pat))
375        {
376            continue;
377        }
378
379        // Apply include patterns with negation support.
380        // Patterns prefixed with `!` act as excludes within the include set.
381        if !include_patterns.is_empty() {
382            let mut included = false;
383            let mut negated = false;
384            let has_positive = include_patterns.iter().any(|p| !p.starts_with('!'));
385
386            for pat in include_patterns {
387                if let Some(stripped) = pat.strip_prefix('!') {
388                    // Negation: exclude from the include set
389                    if matches_ignore_pattern(name, stripped)
390                        || matches_ignore_pattern(rel_path, stripped)
391                    {
392                        negated = true;
393                    }
394                } else {
395                    // Positive: file must match at least one
396                    if matches_ignore_pattern(name, pat) || matches_ignore_pattern(rel_path, pat) {
397                        included = true;
398                    }
399                }
400            }
401
402            // A file passes include if:
403            // - No positive includes OR matches at least one positive include
404            // - AND does not match any negated include
405            if negated || (has_positive && !included) {
406                continue;
407            }
408        }
409
410        // Apply exclude patterns: skip matching files
411        if exclude_patterns
412            .iter()
413            .any(|pat| matches_ignore_pattern(name, pat) || matches_ignore_pattern(rel_path, pat))
414        {
415            continue;
416        }
417
418        files.push(path);
419    }
420}
421
/// Report whether a directory with this exact name is left out of the walk
/// (VCS metadata, dependency trees, build output and caches).
fn should_skip_dir(name: &str) -> bool {
    const SKIPPED_DIRS: &[&str] = &[
        ".git",
        "node_modules",
        "target",
        "__pycache__",
        ".tox",
        "dist",
        "build",
        ".next",
        "vendor",
        ".cache",
    ];
    SKIPPED_DIRS.contains(&name)
}
438
/// Report whether `name` is one of the AI/editor config directories that the
/// walk always enters, even when recursion is turned off.
fn is_known_config_dir(name: &str) -> bool {
    const CONFIG_DIRS: &[&str] = &[
        ".claude",
        ".vscode",
        ".cursor",
        ".windsurf",
        ".cline",
        ".continue",
        ".github",
        ".devcontainer",
        ".roo",
    ];
    CONFIG_DIRS.iter().any(|dir| *dir == name)
}
454
/// Report whether `name` ends in an extension on the skip list of image,
/// audio/video, archive, and compiled-artifact formats.
///
/// The comparison is case-insensitive and looks only at the end of the name,
/// so `archive.tar.gz` matches through `.gz`.
fn is_binary_extension(name: &str) -> bool {
    const BINARY_SUFFIXES: &[&str] = &[
        // Images
        ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".svg", ".webp",
        // Audio / video
        ".mp3", ".mp4", ".wav", ".avi", ".mov",
        // Archives
        ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar",
        // Executables, libraries and compiled artifacts
        ".exe", ".dll", ".so", ".dylib", ".o", ".a", ".wasm", ".pyc", ".class", ".jar",
    ];

    let lowered = name.to_lowercase();
    for suffix in BINARY_SUFFIXES {
        if lowered.ends_with(suffix) {
            return true;
        }
    }
    false
}
465
/// Match a file name or relative path against an ignore pattern.
///
/// Patterns without `*` are substring matches (kept for backwards
/// compatibility). In patterns with `*`, each `*` stands for any run of
/// characters, including none and including `/`, and the literal text around
/// the stars is anchored:
///
/// - text before the first `*` must be a prefix of `name`;
/// - text after the last `*` must be a suffix of `name`;
/// - text between two stars must appear in between, in order, without
///   overlapping the prefix, the suffix, or each other.
///
/// So `*.ext` is a suffix match, `prefix*` a prefix match, `*middle*` a
/// contains match, and `*` alone matches everything.
///
/// The suffix anchor applies to every wildcard pattern. A multi-star pattern
/// such as `*/*.rs` therefore rejects `src/main.rs.bak`, just as `*.rs`
/// rejects `main.rs.bak`.
pub fn matches_ignore_pattern(name: &str, pattern: &str) -> bool {
    if !pattern.contains('*') {
        // No wildcard: substring match (backwards compatible).
        return name.contains(pattern);
    }

    // The pattern contains at least one `*`, so splitting yields at least two
    // segments: a head (possibly empty), a tail (possibly empty), and any
    // number of middle segments left in the iterator.
    let mut segments = pattern.split('*');
    let head = segments.next().unwrap_or("");
    let tail = segments.next_back().unwrap_or("");

    let mut remaining = match name.strip_prefix(head) {
        Some(rest) => rest,
        None => return false,
    };

    // Taking the leftmost occurrence of each middle segment leaves the most
    // text available for the segments (and the tail) that follow.
    for segment in segments.filter(|s| !s.is_empty()) {
        match remaining.split_once(segment) {
            Some((_, rest)) => remaining = rest,
            None => return false,
        }
    }

    // Checking the tail against what is left, not against `name`, prevents
    // it from overlapping text already consumed by earlier segments.
    remaining.ends_with(tail)
}
512
513impl ScanResult {
514    /// Check if any finding meets or exceeds the given severity threshold.
515    pub fn has_findings_at_or_above(&self, threshold: Severity) -> bool {
516        self.file_results
517            .iter()
518            .flat_map(|r| &r.findings)
519            .any(|f| f.severity >= threshold)
520    }
521
522    /// Total number of findings across all files.
523    pub fn total_findings(&self) -> usize {
524        self.file_results.iter().map(|r| r.findings.len()).sum()
525    }
526}
527
#[cfg(test)]
mod tests {
    use super::*;

    /// Held by every test that swaps the process-global panic hook. Without
    /// it, a test panicking on another thread while the silent hook is
    /// installed would lose its output, and two concurrent swaps could
    /// restore each other's hook. A poisoned lock is still usable so that one
    /// failure does not cascade into the others.
    static PANIC_HOOK_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Creates a temp directory holding `a.md`, `b.test.md` and `c.rs`.
    fn include_fixture() -> tempfile::TempDir {
        let tmp = tempfile::tempdir().expect("create temp dir");
        for (file, body) in [
            ("a.md", "hello"),
            ("b.test.md", "world"),
            ("c.rs", "fn main() {}"),
        ] {
            std::fs::write(tmp.path().join(file), body).unwrap();
        }
        tmp
    }

    /// Runs a non-recursive `collect_files` over `dir` with only include
    /// patterns set and returns the basenames of the collected files.
    fn collected_basenames(dir: &Path, include: &[&str]) -> Vec<String> {
        let include: Vec<String> = include.iter().map(|pat| pat.to_string()).collect();
        collect_files(dir, false, &[], &include, &[])
            .iter()
            .filter_map(|p| p.file_name().and_then(|n| n.to_str()))
            .map(str::to_owned)
            .collect()
    }

    #[test]
    fn catch_panic_scanning_returns_some_on_clean_run() {
        let outcome = catch_panic_scanning(Path::new("dummy"), || 42_i32);
        assert_eq!(outcome, Some(42));
    }

    #[test]
    fn catch_panic_scanning_returns_none_on_panic() {
        let _lock = PANIC_HOOK_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        // The panic below is intentional, so silence the default hook for the
        // duration of the call and put the previous hook back right after,
        // while the lock is still held.
        let previous_hook = std::panic::take_hook();
        std::panic::set_hook(Box::new(|_| {}));
        let result: Option<i32> = catch_panic_scanning(Path::new("dummy"), || {
            panic!("simulated rule panic");
        });
        std::panic::set_hook(previous_hook);
        assert!(result.is_none(), "panic must produce None, got {result:?}");
    }

    #[test]
    fn test_binary_extension_skip() {
        for binary in ["image.png", "archive.tar.gz"] {
            assert!(is_binary_extension(binary));
        }
        for text in ["config.json", "CLAUDE.md"] {
            assert!(!is_binary_extension(text));
        }
    }

    #[test]
    fn test_priority_file_detection() {
        // AI-specific basenames count no matter where they live.
        for name in [".cursorrules", "CLAUDE.md", "mcp.json"] {
            assert!(is_priority_file(Path::new(name)));
        }
        assert!(!is_priority_file(Path::new("README.md")));

        // Generic names count only directly inside a known config dir.
        for name in ["settings.json", "config.json"] {
            assert!(!is_priority_file(Path::new(name)));
        }
        for nested in [
            ".claude/settings.json",
            ".vscode/settings.json",
            ".roo/rules.md",
        ] {
            assert!(is_priority_file(Path::new(nested)));
        }
    }

    #[test]
    fn test_skip_dirs() {
        for dir in [".git", "node_modules", "target"] {
            assert!(should_skip_dir(dir));
        }
        for dir in ["src", ".vscode"] {
            assert!(!should_skip_dir(dir));
        }
    }

    #[test]
    fn test_known_config_dirs() {
        for dir in [".claude", ".vscode", ".cursor"] {
            assert!(is_known_config_dir(dir));
        }
        for dir in ["src", ".git"] {
            assert!(!is_known_config_dir(dir));
        }
    }

    #[test]
    fn test_ignore_pattern_matching() {
        // (name, pattern, expected)
        let cases = [
            // Suffix glob
            ("test.log", "*.log", true),
            ("test.txt", "*.log", false),
            // Prefix glob
            ("test_output.txt", "test_*", true),
            ("my_test.txt", "test_*", false),
            // No wildcard: substring match (backward compatible)
            ("my_test_file.txt", "test", true),
            ("readme.md", "test", false),
            // Prefix + suffix glob
            ("test_file.log", "test_*.log", true),
            ("test_file.txt", "test_*.log", false),
            // Exact match
            ("Cargo.lock", "Cargo.lock", true),
            // Path-aware patterns (matched against relative paths)
            (".claude/settings.json", ".claude/*", true),
            ("src/main.rs", ".claude/*", false),
            ("docs/CLAUDE.md", "*/CLAUDE.md", true),
            ("README.md", "*/CLAUDE.md", false),
        ];
        for (name, pattern, expected) in cases {
            assert_eq!(
                matches_ignore_pattern(name, pattern),
                expected,
                "name {name:?} against pattern {pattern:?}"
            );
        }
    }

    #[test]
    fn test_variation_selector_visible_in_scan() {
        // "A" followed by U+FE0F (EF B8 8F in UTF-8), written into a fresh
        // temp directory that contains no local policy file.
        let tmp = tempfile::tempdir().expect("create temp dir");
        let target = tmp.path().join("test_vs.txt");
        std::fs::write(&target, b"A\xef\xb8\x8f").expect("write temp file");

        let scanned = scan_single_file(&target).expect("scan should succeed");

        // VariationSelector is Medium, so it must survive the default
        // paranoia filter.
        let policy = crate::policy::Policy::discover(Some(tmp.path().to_str().unwrap()));
        let mut findings = scanned.findings;
        crate::engine::filter_findings_by_paranoia_vec(&mut findings, policy.paranoia);

        let has_variation_selector = findings
            .iter()
            .any(|f| f.rule_id == crate::verdict::RuleId::VariationSelector);
        assert!(
            has_variation_selector,
            "VariationSelector should be visible in scan at default paranoia: {findings:?}"
        );
    }

    #[test]
    fn test_negated_include_patterns() {
        let tmp = include_fixture();

        // Include *.md, but carve *.test.md back out via negation.
        let names = collected_basenames(tmp.path(), &["*.md", "!*.test.md"]);

        assert!(
            names.iter().any(|n| n == "a.md"),
            "a.md should be included"
        );
        assert!(
            !names.iter().any(|n| n == "b.test.md"),
            "b.test.md should be excluded by negation"
        );
        assert!(
            !names.iter().any(|n| n == "c.rs"),
            "c.rs should not match *.md include"
        );
    }

    #[test]
    fn test_negation_only_include_patterns() {
        let tmp = include_fixture();

        // With negations only (no positive include), everything is kept
        // except what a negation matches.
        let names = collected_basenames(tmp.path(), &["!*.test.md"]);

        assert!(
            names.iter().any(|n| n == "a.md"),
            "a.md should be included"
        );
        assert!(
            !names.iter().any(|n| n == "b.test.md"),
            "b.test.md should be excluded by negation"
        );
        assert!(
            names.iter().any(|n| n == "c.rs"),
            "c.rs should be included (no positive filter)"
        );
    }
}