1use std::path::Path;
2use std::sync::LazyLock;
3
4use crate::Result;
5use crate::config::{Config, SensitiveConfig};
6use crate::git;
7use crate::sensitive::{
8 SensitiveFinding, SensitiveReport, scan_diff_for_sensitive_content,
9 scan_diff_for_sensitive_content_with_options,
10};
11
/// How much of a changed file's text ended up in its [`FileContext`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TruncationMode {
    /// The whole file is included.
    Full,
    /// The top of the file plus windows around each changed hunk.
    Sections,
    /// Signature-like lines plus narrow windows around each changed hunk.
    Outline,
    /// Nothing was included (unreadable file or path outside the repository).
    Skipped,
}

impl std::fmt::Display for TruncationMode {
    /// Writes the lowercase label of the mode (`full`, `sections`, ...).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Full => "full",
            Self::Sections => "sections",
            Self::Outline => "outline",
            Self::Skipped => "skipped",
        };
        f.write_str(label)
    }
}
31
/// Text of one changed file, possibly reduced to fit the context budget.
#[derive(Debug, Clone)]
pub struct FileContext {
    /// Path of the file as passed in by the caller (joined onto the repository
    /// root when the file is read).
    pub path: String,
    /// File text after any truncation; empty when the file was skipped.
    pub content: String,
    /// Which reduction, if any, was applied to produce `content`.
    pub truncation_mode: TruncationMode,
}
39
/// Everything gathered about the pending change by [`gather_context`].
#[derive(Debug, Clone)]
pub struct CommitContext {
    /// Diff text as returned by `git::get_diff` for the configured source.
    pub diff: String,
    /// Recent commits as returned by `git::get_recent_commits`; empty when
    /// that lookup fails.
    pub recent_commits: Vec<String>,
    /// Current branch name, or `"unknown"` when it cannot be determined.
    pub branch: String,
    /// Excerpts of the changed files, produced by [`get_file_contents`].
    pub file_contents: Vec<FileContext>,
    /// Paths taken from the `diff --git` headers of `diff`.
    pub changed_files: Vec<String>,
    /// Result of the sensitive-content scan over `diff` and `changed_files`.
    pub sensitive_report: SensitiveReport,
    /// Copy of `sensitive_report.findings`.
    pub sensitive_findings: Vec<SensitiveFinding>,
    /// `true` when `sensitive_report` has at least one finding.
    pub has_sensitive_content: bool,
}
52
53static SKIP_PATTERNS: LazyLock<Vec<regex::Regex>> = LazyLock::new(|| {
56 [
57 r"\.lock$",
58 r"package-lock\.json$",
59 r"yarn\.lock$",
60 r"pnpm-lock\.yaml$",
61 r"bun\.lockb$",
62 r"Cargo\.lock$",
63 r"Gemfile\.lock$",
64 r"poetry\.lock$",
65 r"composer\.lock$",
66 r"go\.sum$",
67 r"\.min\.js$",
68 r"\.min\.css$",
69 r"\.map$",
70 r"\.bundle\.js$",
71 r"\.png$",
72 r"\.jpg$",
73 r"\.jpeg$",
74 r"\.gif$",
75 r"\.ico$",
76 r"\.woff2?$",
77 r"\.ttf$",
78 r"\.eot$",
79 r"(?:^|/)dist/",
80 r"(?:^|/)build/",
81 r"(?:^|/)node_modules/",
82 r"(?:^|/)\.next/",
83 r"(?:^|/)__pycache__/",
84 ]
85 .iter()
86 .map(|p| regex::Regex::new(p).unwrap())
87 .collect()
88});
89
90pub fn detect_sensitive_content(diff: &str, changed_files: &[String]) -> bool {
92 detect_sensitive_report(diff, changed_files, None).has_findings()
93}
94
95pub fn detect_sensitive_findings(diff: &str, changed_files: &[String]) -> Vec<SensitiveFinding> {
97 detect_sensitive_report(diff, changed_files, None).findings
98}
99
100pub fn detect_sensitive_report(
102 diff: &str,
103 changed_files: &[String],
104 sensitive: Option<&SensitiveConfig>,
105) -> SensitiveReport {
106 match sensitive {
107 Some(sensitive) => scan_diff_for_sensitive_content_with_options(
108 diff,
109 changed_files,
110 sensitive.enforcement,
111 &sensitive.allowlist,
112 ),
113 None => scan_diff_for_sensitive_content(diff, changed_files),
114 }
115}
116
117pub fn should_skip(file_path: &str) -> bool {
119 SKIP_PATTERNS.iter().any(|p| p.is_match(file_path))
120}
121
122pub fn filter_diff(diff: &str) -> String {
125 if diff.is_empty() {
126 return String::new();
127 }
128
129 let mut result = String::new();
130 let mut current_section = String::new();
131 let mut skip_current = false;
132
133 for line in diff.lines() {
134 if line.starts_with("diff --git ") {
135 if !skip_current && !current_section.is_empty() {
137 result.push_str(¤t_section);
138 }
139
140 current_section = String::new();
142 current_section.push_str(line);
143 current_section.push('\n');
144
145 skip_current = line
147 .rsplit_once(" b/")
148 .map(|(_, path)| should_skip(path))
149 .unwrap_or(false);
150 } else {
151 current_section.push_str(line);
152 current_section.push('\n');
153 }
154 }
155
156 if !skip_current && !current_section.is_empty() {
158 result.push_str(¤t_section);
159 }
160
161 result
162}
163
164static SIGNATURE_PATTERN: LazyLock<regex::Regex> = LazyLock::new(|| {
167 regex::Regex::new(
168 r"^(?:export\s+)?(?:default\s+)?(?:async\s+)?(?:function|class|interface|type|const|let|var|enum|abstract\s+class|public|private|protected|def |fn )\b",
169 )
170 .unwrap()
171});
172
/// Returns the destination path of every `diff --git a/<old> b/<new>` header
/// in `diff`, in order of appearance.
///
/// The path is the text after the last ` b/` on the header line, which is the
/// same rule `filter_diff` uses to find a section's path. Lines that are not
/// headers, or headers with an empty old or new path, are ignored.
pub fn extract_changed_file_paths(diff: &str) -> Vec<String> {
    diff.lines()
        .filter_map(|line| {
            let rest = line.strip_prefix("diff --git a/")?;
            let (old_path, new_path) = rest.rsplit_once(" b/")?;
            (!old_path.is_empty() && !new_path.is_empty()).then(|| new_path.to_owned())
        })
        .collect()
}
184
/// Collects the new-file start line (1-based, as written in the `@@` header)
/// of every hunk that `diff` contains for `file_path`.
///
/// A file's section is entered at a `diff --git` header whose path after the
/// last ` b/` equals `file_path` exactly. A substring test would also match
/// `lib/app.ts` when asked for `app.ts`, or `a.rs.bak` when asked for `a.rs`.
fn get_hunk_line_numbers(diff: &str, file_path: &str) -> Vec<usize> {
    /// Parses a hunk header of the form `@@ -<old>[,<len>] +<new>...` and
    /// returns `<new>`; any other line yields `None`.
    fn new_start(line: &str) -> Option<usize> {
        let rest = line.strip_prefix("@@ -")?;
        let (old_range, rest) = rest.split_once(' ')?;

        let is_number = |s: &str| !s.is_empty() && s.bytes().all(|b| b.is_ascii_digit());
        let old_range_ok = match old_range.split_once(',') {
            Some((start, len)) => is_number(start) && is_number(len),
            None => is_number(old_range),
        };
        if !old_range_ok {
            return None;
        }

        let rest = rest.strip_prefix('+')?;
        let digits = rest
            .find(|c: char| !c.is_ascii_digit())
            .map_or(rest, |end| &rest[..end]);
        // Empty or overflowing numbers fail to parse and are ignored.
        digits.parse().ok()
    }

    let mut lines = Vec::new();
    let mut in_file = false;

    for line in diff.lines() {
        if line.starts_with("diff --git") {
            in_file = line
                .rsplit_once(" b/")
                .is_some_and(|(_, path)| path == file_path);
            continue;
        }
        if !in_file {
            continue;
        }
        if let Some(n) = new_start(line) {
            lines.push(n);
        }
    }

    lines
}
205
206fn read_file_content(file_path: &str, repo_root: &Path, diff: &str) -> FileContext {
208 let full_path = repo_root.join(file_path);
209
210 if let (Ok(resolved), Ok(resolved_root)) = (full_path.canonicalize(), repo_root.canonicalize())
212 && !resolved.starts_with(&resolved_root)
213 {
214 return FileContext {
215 path: file_path.to_owned(),
216 content: String::new(),
217 truncation_mode: TruncationMode::Skipped,
218 };
219 }
220
221 let content = match std::fs::read_to_string(&full_path) {
222 Ok(c) => c,
223 Err(_) => {
224 return FileContext {
225 path: file_path.to_owned(),
226 content: String::new(),
227 truncation_mode: TruncationMode::Skipped,
228 };
229 }
230 };
231
232 let file_lines: Vec<&str> = content.lines().collect();
233 let line_count = file_lines.len();
234
235 if line_count <= 500 {
237 return FileContext {
238 path: file_path.to_owned(),
239 content,
240 truncation_mode: TruncationMode::Full,
241 };
242 }
243
244 let hunk_lines = get_hunk_line_numbers(diff, file_path);
245
246 if line_count <= 2000 {
248 let mut parts = Vec::new();
249 let header_end = 30.min(file_lines.len());
250 parts.push(file_lines[..header_end].join("\n"));
251
252 for &hunk_line in &hunk_lines {
253 let start = hunk_line.saturating_sub(25);
254 let end = (hunk_line + 25).min(file_lines.len());
255 parts.push(format!("\n... (line {}) ...\n", start + 1));
256 parts.push(file_lines[start..end].join("\n"));
257 }
258
259 return FileContext {
260 path: file_path.to_owned(),
261 content: parts.join("\n"),
262 truncation_mode: TruncationMode::Sections,
263 };
264 }
265
266 let mut parts: Vec<String> = Vec::new();
268 for line in &file_lines {
269 if SIGNATURE_PATTERN.is_match(line.trim()) {
270 parts.push(line.to_string());
271 }
272 }
273
274 for &hunk_line in &hunk_lines {
275 let start = hunk_line.saturating_sub(10);
276 let end = (hunk_line + 10).min(file_lines.len());
277 parts.push(format!("\n... (line {}) ...\n", start + 1));
278 parts.push(file_lines[start..end].join("\n"));
279 }
280
281 FileContext {
282 path: file_path.to_owned(),
283 content: parts.join("\n"),
284 truncation_mode: TruncationMode::Outline,
285 }
286}
287
288pub fn get_file_contents(
290 changed_files: &[String],
291 repo_root: &Path,
292 diff: &str,
293) -> Vec<FileContext> {
294 const TOTAL_BUDGET: usize = 30_000;
295 let mut results = Vec::new();
296 let mut total_chars = 0;
297
298 let mut files_with_size: Vec<_> = changed_files
300 .iter()
301 .filter(|f| !should_skip(f))
302 .map(|f| {
303 let size = repo_root
304 .join(f)
305 .metadata()
306 .map(|m| m.len() as usize)
307 .unwrap_or(0);
308 (f.as_str(), size)
309 })
310 .collect();
311 files_with_size.sort_by_key(|&(_, size)| size);
312
313 for (file, _) in files_with_size {
314 if total_chars >= TOTAL_BUDGET {
315 break;
316 }
317
318 let mut fc = read_file_content(file, repo_root, diff);
319 if fc.truncation_mode == TruncationMode::Skipped || fc.content.is_empty() {
320 continue;
321 }
322
323 let remaining = TOTAL_BUDGET - total_chars;
325 if fc.content.len() > remaining {
326 fc.content = format!(
327 "{}\n... (truncated to fit context budget)",
328 &fc.content[..remaining]
329 );
330 }
331
332 total_chars += fc.content.len();
333 results.push(fc);
334 }
335
336 results
337}
338
339pub fn gather_context(repo_root: &Path, config: &Config) -> Result<CommitContext> {
341 let diff = git::get_diff(config.diff_source, repo_root)?;
342 let recent_commits = git::get_recent_commits(repo_root, 10).unwrap_or_default();
343 let branch = git::get_branch_name(repo_root).unwrap_or_else(|_| "unknown".to_owned());
344 let changed_files = extract_changed_file_paths(&diff);
345 let sensitive_report = detect_sensitive_report(&diff, &changed_files, Some(&config.sensitive));
346 let sensitive_findings = sensitive_report.findings.clone();
347 let has_sensitive_content = sensitive_report.has_findings();
348 let file_contents = get_file_contents(&changed_files, repo_root, &diff);
349
350 Ok(CommitContext {
351 diff,
352 recent_commits,
353 branch,
354 file_contents,
355 changed_files,
356 sensitive_report,
357 sensitive_findings,
358 has_sensitive_content,
359 })
360}
361
#[cfg(test)]
mod tests {
    use super::*;

    // --- detect_sensitive_content: flagged by file name alone ---

    #[test]
    fn detects_env_file() {
        assert!(detect_sensitive_content("some diff", &[".env".to_owned()]));
    }

    #[test]
    fn detects_env_production() {
        assert!(detect_sensitive_content(
            "some diff",
            &[".env.production".to_owned()]
        ));
    }

    #[test]
    fn detects_nested_env_file() {
        assert!(detect_sensitive_content(
            "some diff",
            &["config/.env.local".to_owned()]
        ));
    }

    #[test]
    fn detects_credentials_json() {
        assert!(detect_sensitive_content(
            "some diff",
            &["credentials.json".to_owned()]
        ));
    }

    // --- detect_sensitive_content: flagged by added diff lines ---

    #[test]
    fn detects_api_key_in_added_lines() {
        let diff = "diff --git a/config.ts b/config.ts\n+const API_KEY = \"sk-proj-abcdefghijklmnopqrstuvwxyz1234567890\"";
        assert!(detect_sensitive_content(diff, &["config.ts".to_owned()]));
    }

    #[test]
    fn detects_secret_key_in_added_lines() {
        let diff = "+ SECRET_KEY: \"Alpha9981Zeta\"";
        assert!(detect_sensitive_content(diff, &["config.ts".to_owned()]));
    }

    #[test]
    fn detects_access_token_in_added_lines() {
        let diff = "+export const ACCESS_TOKEN = \"Alpha9981Zeta99\"";
        assert!(detect_sensitive_content(diff, &["auth.ts".to_owned()]));
    }

    #[test]
    fn detects_password_in_added_lines() {
        let diff = "+ DB_PASSWORD=Alpha9981Zeta";
        assert!(detect_sensitive_content(diff, &["config.ts".to_owned()]));
    }

    #[test]
    fn detects_sk_prefixed_keys() {
        let diff = "+ key: \"sk-proj-abcdefghijklmnopqrstuvwxyz1234567890\"";
        assert!(detect_sensitive_content(diff, &["config.ts".to_owned()]));
    }

    #[test]
    fn detects_ghp_tokens() {
        let diff = "+ GITHUB_TOKEN=ghp_abcdefghijklmnopqrstuvwxyz1234";
        assert!(detect_sensitive_content(diff, &["ci.yml".to_owned()]));
    }

    #[test]
    fn detects_aws_access_keys() {
        let diff = "+ aws_key = \"AKIAIOSFODNN7EXAMPLE\"";
        assert!(detect_sensitive_content(diff, &["config.ts".to_owned()]));
    }

    // --- detect_sensitive_content: inputs that must not be flagged ---

    #[test]
    fn ignores_removed_lines() {
        let diff = "- API_KEY = \"old-key\"";
        assert!(!detect_sensitive_content(diff, &["config.ts".to_owned()]));
    }

    #[test]
    fn ignores_diff_header_lines() {
        // A `+++` file header is not an added line, even if the name looks secret-like.
        let diff = "+++ b/API_KEY_handler.ts";
        assert!(!detect_sensitive_content(
            diff,
            &["API_KEY_handler.ts".to_owned()]
        ));
    }

    #[test]
    fn returns_false_for_normal_code() {
        let diff = "+ const result = await fetchData()";
        assert!(!detect_sensitive_content(diff, &["app.ts".to_owned()]));
    }

    // --- detect_sensitive_content: more sensitive file names ---

    #[test]
    fn detects_source_map_files() {
        assert!(detect_sensitive_content(
            "diff",
            &["bundle.js.map".to_owned()]
        ));
        assert!(detect_sensitive_content(
            "diff",
            &["styles.css.map".to_owned()]
        ));
        assert!(detect_sensitive_content(
            "diff",
            &["dist/app.map".to_owned()]
        ));
    }

    #[test]
    fn detects_private_key_files() {
        assert!(detect_sensitive_content("diff", &["server.pem".to_owned()]));
        assert!(detect_sensitive_content("diff", &["cert.p12".to_owned()]));
        assert!(detect_sensitive_content("diff", &["ssl.key".to_owned()]));
        assert!(detect_sensitive_content(
            "diff",
            &["app.keystore".to_owned()]
        ));
    }

    #[test]
    fn detects_ssh_private_keys() {
        assert!(detect_sensitive_content("diff", &["id_rsa".to_owned()]));
        assert!(detect_sensitive_content("diff", &["id_ed25519".to_owned()]));
        assert!(detect_sensitive_content(
            "diff",
            &[".ssh/config".to_owned()]
        ));
    }

    #[test]
    fn detects_htpasswd() {
        assert!(detect_sensitive_content("diff", &[".htpasswd".to_owned()]));
    }

    // --- should_skip ---

    #[test]
    fn skips_lock_files() {
        assert!(should_skip("package-lock.json"));
        assert!(should_skip("yarn.lock"));
        assert!(should_skip("Cargo.lock"));
        assert!(should_skip("bun.lockb"));
    }

    #[test]
    fn skips_minified_files() {
        assert!(should_skip("bundle.min.js"));
        assert!(should_skip("styles.min.css"));
    }

    #[test]
    fn skips_images_and_fonts() {
        assert!(should_skip("logo.png"));
        assert!(should_skip("icon.jpg"));
        assert!(should_skip("font.woff2"));
        assert!(should_skip("font.ttf"));
    }

    #[test]
    fn skips_dist_and_build() {
        assert!(should_skip("dist/bundle.js"));
        assert!(should_skip("build/output.js"));
        assert!(should_skip("node_modules/pkg/index.js"));
    }

    #[test]
    fn does_not_skip_source_files() {
        assert!(!should_skip("src/app.ts"));
        assert!(!should_skip("lib/utils.rs"));
        assert!(!should_skip("README.md"));
    }

    // --- extract_changed_file_paths ---

    #[test]
    fn extracts_file_paths_from_diff() {
        let diff = "diff --git a/src/app.ts b/src/app.ts\nindex abc..def 100644\n--- a/src/app.ts\n+++ b/src/app.ts\n@@ -1,3 +1,4 @@\n+import something\ndiff --git a/lib/utils.ts b/lib/utils.ts\n";
        let paths = extract_changed_file_paths(diff);
        assert_eq!(paths, vec!["src/app.ts", "lib/utils.ts"]);
    }

    // --- filter_diff ---

    #[test]
    fn filter_diff_removes_lock_files() {
        // The lock-file section sits between two source sections; only it is dropped.
        let diff = "\
diff --git a/src/main.rs b/src/main.rs
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,4 @@
+new line
diff --git a/package-lock.json b/package-lock.json
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,100 +1,200 @@
+huge lock file changes
diff --git a/src/utils.rs b/src/utils.rs
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -1,2 +1,3 @@
+another line
";
        let filtered = filter_diff(diff);
        assert!(filtered.contains("src/main.rs"), "should keep source files");
        assert!(
            filtered.contains("src/utils.rs"),
            "should keep source files"
        );
        assert!(
            !filtered.contains("package-lock.json"),
            "should remove lock files"
        );
    }

    #[test]
    fn filter_diff_removes_binary_and_minified() {
        let diff = "\
diff --git a/app.js b/app.js
+code
diff --git a/dist/bundle.min.js b/dist/bundle.min.js
+minified
diff --git a/logo.png b/logo.png
Binary files differ
";
        let filtered = filter_diff(diff);
        assert!(filtered.contains("app.js"));
        assert!(!filtered.contains("bundle.min.js"));
        assert!(!filtered.contains("logo.png"));
    }

    #[test]
    fn filter_diff_empty_input() {
        assert_eq!(filter_diff(""), "");
    }

    #[test]
    fn filter_diff_no_skippable_files() {
        // With nothing to drop, the diff comes back unchanged.
        let diff = "diff --git a/src/lib.rs b/src/lib.rs\n+code\n";
        let filtered = filter_diff(diff);
        assert_eq!(filtered, diff);
    }
}