Skip to main content

torii_lib/vcs/
scanner.rs

// Sensitive data scanner — runs on staged content before every commit and can also audit existing history.
2use std::path::Path;
3use crate::error::Result;
4
/// A single sensitive-data hit reported by the scanner.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so callers can log,
/// duplicate and compare findings (e.g. in tests or when de-duplicating).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Finding {
    /// Path of the file the finding refers to, as reported by the git diff.
    pub file: String,
    /// 1-based line number of the matching line. `0` is used when the
    /// finding concerns the file as a whole rather than a specific line.
    pub line: usize,
    /// Human-readable name of the rule that matched.
    pub pattern_name: String,
    /// Text shown to the user: a masked excerpt of the matching line, or a
    /// warning message for whole-file findings.
    pub preview: String,
}
12
/// Signature shared by every built-in detector: it receives one line of text
/// and answers whether that line looks sensitive.
type LineDetector = fn(&str) -> bool;

/// A named built-in rule describing one kind of sensitive data.
struct Pattern {
    /// Label reported to the user when the rule fires.
    name: &'static str,
    /// Predicate evaluated against a single line; `true` means "matches".
    detect: LineDetector,
}
19
/// Redacts `value` so it can be shown in a finding preview.
///
/// Inputs of eight characters or fewer are replaced entirely by `*`. Longer
/// inputs keep their first four characters and have every remaining
/// character starred out. Lengths are counted in `char`s, so the result
/// always has as many characters as the input.
fn mask(value: &str) -> String {
    const KEEP_VISIBLE: usize = 4;
    const HIDE_ENTIRELY_UP_TO: usize = 8;

    let total = value.chars().count();
    if total <= HIDE_ENTIRELY_UP_TO {
        return "*".repeat(total);
    }

    let mut masked: String = value.chars().take(KEEP_VISIBLE).collect();
    masked.push_str(&"*".repeat(total - KEEP_VISIBLE));
    masked
}
31
/// Default upper bound, in bytes, on the size of a blob the scanner will read.
///
/// Secrets virtually never live in huge files (lockfiles, generated assets),
/// and loading a 500MB blob into memory can run the process out of memory
/// during `scan --history` on big repositories. The limit can be overridden
/// through the `TORII_SCAN_MAX_BYTES` environment variable.
const DEFAULT_MAX_BLOB_BYTES: usize = 5 * 1024 * 1024;

/// Returns the blob size limit currently in effect.
///
/// Reads `TORII_SCAN_MAX_BYTES` on every call. When the variable is unset,
/// is not valid Unicode, or does not parse as a `usize`, the built-in
/// default `DEFAULT_MAX_BLOB_BYTES` is returned instead.
fn max_blob_bytes() -> usize {
    match std::env::var("TORII_SCAN_MAX_BYTES") {
        Ok(raw) => raw.parse::<usize>().unwrap_or(DEFAULT_MAX_BLOB_BYTES),
        Err(_) => DEFAULT_MAX_BLOB_BYTES,
    }
}
44
45const PATTERNS: &[Pattern] = &[
46    Pattern {
47        name: "Private key (PEM)",
48        detect: |l| l.contains("-----BEGIN") && (
49            l.contains("PRIVATE KEY") ||
50            l.contains("RSA PRIVATE") ||
51            l.contains("EC PRIVATE")
52        ),
53    },
54    Pattern {
55        name: "JWT token",
56        detect: |l| {
57            // eyJ... base64 header — at least 3 segments
58            l.split_whitespace().any(|w| {
59                let w = w.trim_matches(|c: char| !c.is_alphanumeric() && c != '.' && c != '_' && c != '-');
60                let parts: Vec<&str> = w.split('.').collect();
61                parts.len() == 3
62                    && parts[0].starts_with("eyJ")
63                    && parts[0].len() > 10
64                    && parts[1].len() > 10
65            })
66        },
67    },
68    Pattern {
69        name: "AWS access key",
70        detect: |l| {
71            l.split_whitespace().any(|w| {
72                let w = w.trim_matches(|c: char| !c.is_alphanumeric());
73                (w.starts_with("AKIA") || w.starts_with("ASIA") || w.starts_with("AROA"))
74                    && w.len() == 20
75                    && w.chars().all(|c| c.is_ascii_uppercase() || c.is_ascii_digit())
76            })
77        },
78    },
79    Pattern {
80        name: "AWS secret key",
81        detect: |l| {
82            let lower = l.to_lowercase();
83            (lower.contains("aws_secret") || lower.contains("aws secret"))
84                && (l.contains('=') || l.contains(':'))
85                && l.len() > 40
86        },
87    },
88    Pattern {
89        name: "GitHub/GitLab token",
90        detect: |l| {
91            let trimmed = l.trim();
92            // Skip HTML/template lines — tokens inside HTML are demo content
93            if trimmed.starts_with('<') || trimmed.starts_with("//") || trimmed.starts_with("*") {
94                return false;
95            }
96            l.split_whitespace().any(|w| {
97                let w = w.trim_matches(|c: char| !c.is_alphanumeric() && c != '_' && c != '-');
98                // Skip obvious placeholder/example tokens
99                if w.ends_with("xxx") || w.ends_with("_xxx") || w.contains("xxxx") {
100                    return false;
101                }
102                // Skip bare prefixes used in docs (ghp_, glpat-, etc.) — real tokens are longer
103                let is_prefix_only = (w.starts_with("ghp_") && w.len() <= 5) ||
104                    (w.starts_with("gho_") && w.len() <= 5) ||
105                    (w.starts_with("ghs_") && w.len() <= 5) ||
106                    (w.starts_with("glpat-") && w.len() <= 7) ||
107                    (w.starts_with("glptt-") && w.len() <= 7) ||
108                    (w.starts_with("github_pat_") && w.len() <= 12);
109                if is_prefix_only { return false; }
110                (w.starts_with("ghp_") || w.starts_with("gho_") ||
111                w.starts_with("ghs_") || w.starts_with("github_pat_") ||
112                w.starts_with("glpat-") || w.starts_with("glptt-")) && w.len() > 20
113            })
114        },
115    },
116    Pattern {
117        name: "Generic API key / token",
118        detect: |l| {
119            let lower = l.to_lowercase();
120            let has_key_word =
121                lower.contains("api_key") || lower.contains("api_secret") ||
122                lower.contains("auth_token") || lower.contains("access_token") ||
123                lower.contains("secret_key") || lower.contains("private_key") ||
124                lower.contains("password") || lower.contains("passwd") ||
125                lower.contains("auth_token");
126            let has_assignment = l.contains('=') || l.contains(':');
127            let has_value = l.split(&['=', ':'][..])
128                .nth(1)
129                .map(|v| {
130                    let v = v.trim().trim_matches(|c: char| c == '"' || c == '\'' || c == '`');
131                    let vl = v.to_lowercase();
132                    // Real secrets: no spaces, no sentence punctuation, min length
133                    let looks_like_secret = v.len() >= 16
134                        && !v.contains(' ')
135                        && !v.contains('.')  // sentences have dots
136                        && !v.starts_with("${")
137                        && !v.starts_with("$(")
138                        && !v.starts_with("process.env")
139                        && !v.starts_with("env.")
140                        && !v.starts_with("os.environ")
141                        && !v.starts_with("<")
142                        // English placeholders
143                        && !vl.eq("your_secret_here")
144                        && !vl.eq("changeme")
145                        && !vl.eq("placeholder")
146                        && !vl.eq("todo")
147                        && !vl.starts_with("your_")
148                        && !vl.starts_with("my_")
149                        && !vl.contains("example")
150                        && !vl.contains("sample")
151                        && !vl.contains("replace")
152                        && !vl.contains("change_me")
153                        && !vl.contains("insert")
154                        // Spanish placeholders
155                        && !vl.starts_with("tu_")
156                        && !vl.starts_with("mi_")
157                        && !vl.contains("cambiar")
158                        && !vl.contains("reemplazar")
159                        && !vl.contains("ejemplo")
160                        && !vl.contains("aqui")
161                        && !vl.contains("pon_")
162                        && !vl.contains("escribe");
163                    looks_like_secret
164                })
165                .unwrap_or(false);
166            has_key_word && has_assignment && has_value
167        },
168    },
169    Pattern {
170        name: "Database connection string with credentials",
171        detect: |l| {
172            let lower = l.to_lowercase();
173            (lower.contains("postgresql://") || lower.contains("mysql://") ||
174             lower.contains("mongodb://") || lower.contains("redis://") ||
175             lower.contains("libsql://") || lower.contains("turso://"))
176                && l.contains('@')
177                && !l.contains("user:password@")
178                && !l.contains("user:pass@")
179                && !l.contains("<password>")
180        },
181    },
182    Pattern {
183        name: "Stripe key",
184        detect: |l| {
185            l.split_whitespace().any(|w| {
186                let w = w.trim_matches(|c: char| !c.is_alphanumeric() && c != '_');
187                (w.starts_with("sk_live_") || w.starts_with("pk_live_") ||
188                w.starts_with("rk_live_")) && w.len() > 16
189            })
190        },
191    },
192    Pattern {
193        name: "Twilio / SendGrid / Brevo key",
194        detect: |l| {
195            l.split_whitespace().any(|w| {
196                let w = w.trim_matches(|c: char| !c.is_alphanumeric() && c != '-');
197                // SG. prefix = SendGrid
198                (w.starts_with("SG.") && w.len() > 40) ||
199                // AC... = Twilio account SID
200                (w.starts_with("AC") && w.len() == 34 && w.chars().all(|c| c.is_ascii_alphanumeric()))
201            })
202        },
203    },
204];
205
/// Reports whether `path` names an example/sample/template file, i.e. one
/// that is expected to hold placeholder values and is safe to commit.
///
/// The comparison is case-insensitive. A path qualifies when it ends with
/// one of the known suffixes or contains `.example.` / `.sample.` anywhere.
fn is_example_file(path: &str) -> bool {
    const TRAILING: [&str; 7] = [
        ".example",
        ".sample",
        ".template",
        ".example.env",
        ".env.example",
        ".env.sample",
        ".env.template",
    ];
    const EMBEDDED: [&str; 2] = [".example.", ".sample."];

    let normalized = path.to_lowercase();
    let by_suffix = TRAILING.iter().any(|suffix| normalized.ends_with(*suffix));
    let by_infix = EMBEDDED.iter().any(|marker| normalized.contains(*marker));
    by_suffix || by_infix
}
219
220/// Files that are inherently sensitive and should never be committed
221fn is_sensitive_file(path: &str) -> bool {
222    let lower = path.to_lowercase();
223    let filename = lower.split('/').last().unwrap_or(&lower);
224
225    // Exact filenames
226    matches!(filename,
227        ".env" | ".envrc" | "secrets.json" | "secrets.yaml" | "secrets.yml" |
228        "credentials.json" | "credentials.yml" | "credentials.yaml" |
229        ".netrc" | ".npmrc" | ".pypirc"
230    )
231    // .env variants: .env.local, .env.production, etc.
232    || (filename.starts_with(".env.") && !is_example_file(path))
233    // Private key files
234    || lower.ends_with("_rsa")
235    || lower.ends_with("_ed25519")
236    || lower.ends_with("_ecdsa")
237    || lower.ends_with(".pem")
238    || lower.ends_with(".p12")
239    || lower.ends_with(".pfx")
240    || lower.ends_with(".key")
241    || lower.ends_with(".keystore")
242    // Auth files
243    || filename == "id_rsa"
244    || filename == "id_ed25519"
245    || filename == "id_ecdsa"
246}
247
/// Reports whether `path` should be excluded from content scanning because
/// it is binary-like, generated, or a translation resource.
///
/// Matching is case-insensitive. A path is skipped when it ends with one of
/// the listed extensions or contains one of the listed fragments (lockfile
/// names, or an `i18n` directory with either `/` or `\` separators).
fn should_skip_file(path: &str) -> bool {
    const SKIPPED_EXTENSIONS: [&str; 10] = [
        ".lock", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".wasm", ".pdf", ".zip",
    ];
    const SKIPPED_FRAGMENTS: [&str; 5] = [
        "bun.lock",
        "package-lock",
        "yarn.lock",
        "/i18n/",
        "\\i18n\\",
    ];

    let normalized = path.to_lowercase();
    if SKIPPED_EXTENSIONS.iter().any(|ext| normalized.ends_with(*ext)) {
        return true;
    }
    SKIPPED_FRAGMENTS.iter().any(|fragment| normalized.contains(*fragment))
}
267
268/// Return list of paths staged vs HEAD (used by hooks/size guard)
269pub fn staged_paths(repo_path: &Path) -> Result<Vec<String>> {
270    use git2::Repository;
271    let repo = Repository::discover(repo_path).map_err(crate::error::ToriiError::Git)?;
272    let index = repo.index().map_err(crate::error::ToriiError::Git)?;
273    let head_tree = repo.head().ok().and_then(|h| h.peel_to_tree().ok());
274    let diff = match &head_tree {
275        Some(tree) => repo.diff_tree_to_index(Some(tree), Some(&index), None),
276        None => repo.diff_tree_to_index(None, Some(&index), None),
277    }.map_err(crate::error::ToriiError::Git)?;
278    let mut out = Vec::new();
279    diff.foreach(&mut |delta, _| {
280        if let Some(p) = delta.new_file().path() {
281            out.push(p.to_string_lossy().to_string());
282        }
283        true
284    }, None, None, None).map_err(crate::error::ToriiError::Git)?;
285    Ok(out)
286}
287
288/// Scan staged content using user-defined regex rules from .toriignore.
289/// Returns findings — empty if no rules or no matches.
290pub fn scan_staged_with_custom(
291    repo_path: &Path,
292    rules: &[crate::toriignore::SecretRule],
293) -> Result<Vec<Finding>> {
294    use git2::Repository;
295    if rules.is_empty() { return Ok(Vec::new()); }
296
297    let mut findings = Vec::new();
298    let repo = Repository::discover(repo_path).map_err(crate::error::ToriiError::Git)?;
299    let index = repo.index().map_err(crate::error::ToriiError::Git)?;
300    let paths = staged_paths(repo_path)?;
301
302    for file_path in &paths {
303        let p = std::path::Path::new(file_path);
304        if is_example_file(file_path) || should_skip_file(file_path) { continue; }
305        let entry = match index.get_path(p, 0) { Some(e) => e, None => continue };
306        let blob = match repo.find_blob(entry.id) { Ok(b) => b, Err(_) => continue };
307        if blob.size() > max_blob_bytes() { continue; }
308        let content = String::from_utf8_lossy(blob.content()).to_string();
309
310        for (i, line) in content.lines().enumerate() {
311            let trimmed = line.trim();
312            // Same comment-skip as scan_staged — custom rules should not
313            // false-positive on documentation/comments that mention the
314            // very patterns they describe.
315            if trimmed.starts_with('#') || trimmed.starts_with("//") || trimmed.starts_with("/*") || trimmed.starts_with('*') {
316                continue;
317            }
318            for rule in rules {
319                if rule.regex.is_match(line) {
320                    findings.push(Finding {
321                        file: file_path.clone(),
322                        line: i + 1,
323                        pattern_name: format!("custom: {}", rule.name),
324                        preview: mask(line.trim()),
325                    });
326                    break;
327                }
328            }
329        }
330    }
331    Ok(findings)
332}
333
334/// Scan staged files in the git index for sensitive data.
335/// Returns a list of findings.
336pub fn scan_staged(repo_path: &Path) -> Result<Vec<Finding>> {
337    use git2::Repository;
338
339    let mut findings = Vec::new();
340
341    let repo = Repository::discover(repo_path)
342        .map_err(|e| crate::error::ToriiError::Git(e))?;
343    let index = repo.index()
344        .map_err(|e| crate::error::ToriiError::Git(e))?;
345
346    // Walk staged entries (index vs HEAD diff gives us changed files)
347    let head_tree = repo.head().ok()
348        .and_then(|h| h.peel_to_tree().ok());
349
350    let diff = match &head_tree {
351        Some(tree) => repo.diff_tree_to_index(Some(tree), Some(&index), None),
352        None => repo.diff_tree_to_index(None, Some(&index), None),
353    }.map_err(|e| crate::error::ToriiError::Git(e))?;
354
355    let mut staged_files: Vec<String> = Vec::new();
356    diff.foreach(
357        &mut |delta, _| {
358            if let Some(path) = delta.new_file().path() {
359                staged_files.push(path.to_string_lossy().to_string());
360            }
361            true
362        },
363        None, None, None,
364    ).map_err(|e| crate::error::ToriiError::Git(e))?;
365
366    for file_path in &staged_files {
367        let file_path_str = file_path.as_str();
368
369        if is_example_file(file_path_str) || should_skip_file(file_path_str) {
370            continue;
371        }
372
373        if is_sensitive_file(file_path_str) {
374            findings.push(Finding {
375                file: file_path.clone(),
376                line: 0,
377                pattern_name: "Sensitive file — should not be committed".to_string(),
378                preview: format!("⚠  {} should not be tracked by version control", file_path),
379            });
380            continue;
381        }
382
383        // Read staged content from index blob
384        let entry = index.get_path(std::path::Path::new(file_path_str), 0);
385        let content = match entry {
386            Some(e) => {
387                match repo.find_blob(e.id) {
388                    Ok(blob) => {
389                        if blob.size() > max_blob_bytes() { continue; }
390                        String::from_utf8_lossy(blob.content()).to_string()
391                    }
392                    Err(_) => continue,
393                }
394            }
395            None => continue,
396        };
397
398        for (line_num, line) in content.lines().enumerate() {
399            let trimmed = line.trim();
400            if trimmed.starts_with('#') || trimmed.starts_with("//") || trimmed.starts_with("/*") || trimmed.starts_with('*') {
401                continue;
402            }
403
404            for pattern in PATTERNS {
405                if (pattern.detect)(line) {
406                    let preview = mask(line.trim());
407                    findings.push(Finding {
408                        file: file_path.clone(),
409                        line: line_num + 1,
410                        pattern_name: pattern.name.to_string(),
411                        preview,
412                    });
413                    break;
414                }
415            }
416        }
417    }
418
419    Ok(findings)
420}
421
422/// Scan an entire git history for sensitive data (for migration use).
423/// Returns findings grouped by commit.
424pub fn scan_history(repo_path: &Path) -> Result<Vec<(String, Vec<Finding>)>> {
425    use git2::Repository;
426
427    let mut results = Vec::new();
428
429    let repo = Repository::discover(repo_path)
430        .map_err(|e| crate::error::ToriiError::Git(e))?;
431
432    // Walk all commits reachable from any reference
433    let mut revwalk = repo.revwalk()
434        .map_err(|e| crate::error::ToriiError::Git(e))?;
435    revwalk.push_glob("*").map_err(|e| crate::error::ToriiError::Git(e))?;
436
437    let commits: Vec<(git2::Oid, String)> = revwalk
438        .filter_map(|id| id.ok())
439        .filter_map(|id| {
440            repo.find_commit(id).ok().map(|c| {
441                let subject = c.summary().unwrap_or("").to_string();
442                (id, subject)
443            })
444        })
445        .collect();
446
447    println!("🔍 Scanning {} commits...", commits.len());
448
449    for (oid, subject) in &commits {
450        let commit = match repo.find_commit(*oid) {
451            Ok(c) => c,
452            Err(_) => continue,
453        };
454
455        // Get diff against first parent (or empty tree for root commits)
456        let commit_tree = match commit.tree() {
457            Ok(t) => t,
458            Err(_) => continue,
459        };
460        let parent_tree = commit.parent(0).ok().and_then(|p| p.tree().ok());
461
462        let diff = match repo.diff_tree_to_tree(
463            parent_tree.as_ref(),
464            Some(&commit_tree),
465            None,
466        ) {
467            Ok(d) => d,
468            Err(_) => continue,
469        };
470
471        let mut commit_findings = Vec::new();
472
473        // For each changed file, read its content from the commit tree
474        let mut changed_files: Vec<String> = Vec::new();
475        let _ = diff.foreach(
476            &mut |delta, _| {
477                if let Some(path) = delta.new_file().path() {
478                    changed_files.push(path.to_string_lossy().to_string());
479                }
480                true
481            },
482            None, None, None,
483        );
484
485        for file_path in &changed_files {
486            if is_example_file(file_path) || should_skip_file(file_path) {
487                continue;
488            }
489
490            // Read file content from this commit's tree
491            let entry = commit_tree.get_path(std::path::Path::new(file_path));
492            let content = match entry {
493                Ok(e) => match repo.find_blob(e.id()) {
494                    Ok(blob) => {
495                        if blob.size() > max_blob_bytes() { continue; }
496                        String::from_utf8_lossy(blob.content()).to_string()
497                    }
498                    Err(_) => continue,
499                },
500                Err(_) => continue,
501            };
502
503            for (line_num, line) in content.lines().enumerate() {
504                let trimmed = line.trim();
505                if trimmed.starts_with('#') || trimmed.starts_with("//") || trimmed.starts_with("/*") || trimmed.starts_with('*') {
506                    continue;
507                }
508
509                for pattern in PATTERNS {
510                    if (pattern.detect)(line) {
511                        commit_findings.push(Finding {
512                            file: file_path.clone(),
513                            line: line_num + 1,
514                            pattern_name: pattern.name.to_string(),
515                            preview: mask(line.trim()),
516                        });
517                        break;
518                    }
519                }
520            }
521        }
522
523        if !commit_findings.is_empty() {
524            results.push((
525                format!("{} — {}", &oid.to_string()[..8], subject),
526                commit_findings,
527            ));
528        }
529    }
530
531    Ok(results)
532}