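//! `opencodecommit/context.rs` — gathers the diff, file contents and sensitive-content
//! findings used as context for commit message generation.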

1use std::path::Path;
2use std::sync::LazyLock;
3
4use crate::Result;
5use crate::config::{Config, SensitiveConfig};
6use crate::git;
7use crate::sensitive::{
8    SensitiveFinding, SensitiveReport, scan_diff_for_sensitive_content,
9    scan_diff_for_sensitive_content_with_options,
10};
11
/// How much of a file's content was kept when it was read for context.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TruncationMode {
    /// The file content is included in full.
    Full,
    /// Only selected sections of the file are included.
    Sections,
    /// Only an outline of the file is included.
    Outline,
    /// The file was not included at all.
    Skipped,
}

impl std::fmt::Display for TruncationMode {
    /// Writes the lowercase label of the mode: `full`, `sections`, `outline` or `skipped`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Full => "full",
            Self::Sections => "sections",
            Self::Outline => "outline",
            Self::Skipped => "skipped",
        };
        f.write_str(label)
    }
}
31
/// File content with truncation metadata.
///
/// One entry describes a single changed file: where it lives, the text that was
/// kept for it, and how that text was reduced.
#[derive(Debug, Clone)]
pub struct FileContext {
    /// Path of the file as passed to the reader (it is joined onto the repository root).
    pub path: String,
    /// The text kept for this file; empty when the file was skipped.
    pub content: String,
    /// Which truncation strategy produced `content`.
    pub truncation_mode: TruncationMode,
}
39
/// Full context for commit message generation.
///
/// The field notes below describe how `gather_context` in this file fills the
/// struct; other constructors, if any, may differ.
#[derive(Debug, Clone)]
pub struct CommitContext {
    /// The diff text obtained from git for the configured diff source.
    pub diff: String,
    /// Recent commits as returned by git (up to 10 are requested); empty if the lookup failed.
    pub recent_commits: Vec<String>,
    /// Current branch name, or `"unknown"` if it could not be determined.
    pub branch: String,
    /// Contents of the changed files, possibly truncated.
    pub file_contents: Vec<FileContext>,
    /// Paths extracted from the `diff --git` headers of `diff`.
    pub changed_files: Vec<String>,
    /// Full result of the sensitive-content scan.
    pub sensitive_report: SensitiveReport,
    /// Copy of the findings held in `sensitive_report`.
    pub sensitive_findings: Vec<SensitiveFinding>,
    /// Whether `sensitive_report` reported any findings.
    pub has_sensitive_content: bool,
}
52
53// --- Skip patterns ---
54
55static SKIP_PATTERNS: LazyLock<Vec<regex::Regex>> = LazyLock::new(|| {
56    [
57        r"\.lock$",
58        r"package-lock\.json$",
59        r"yarn\.lock$",
60        r"pnpm-lock\.yaml$",
61        r"bun\.lockb$",
62        r"Cargo\.lock$",
63        r"Gemfile\.lock$",
64        r"poetry\.lock$",
65        r"composer\.lock$",
66        r"go\.sum$",
67        r"\.min\.js$",
68        r"\.min\.css$",
69        r"\.map$",
70        r"\.bundle\.js$",
71        r"\.png$",
72        r"\.jpg$",
73        r"\.jpeg$",
74        r"\.gif$",
75        r"\.ico$",
76        r"\.woff2?$",
77        r"\.ttf$",
78        r"\.eot$",
79        r"(?:^|/)dist/",
80        r"(?:^|/)build/",
81        r"(?:^|/)node_modules/",
82        r"(?:^|/)\.next/",
83        r"(?:^|/)__pycache__/",
84    ]
85    .iter()
86    .map(|p| regex::Regex::new(p).unwrap())
87    .collect()
88});
89
90/// Detect if the diff or changed files contain sensitive content.
91pub fn detect_sensitive_content(diff: &str, changed_files: &[String]) -> bool {
92    detect_sensitive_report(diff, changed_files, None).has_findings()
93}
94
95/// Return structured findings for sensitive content matches.
96pub fn detect_sensitive_findings(diff: &str, changed_files: &[String]) -> Vec<SensitiveFinding> {
97    detect_sensitive_report(diff, changed_files, None).findings
98}
99
100/// Return the full sensitive report for the provided diff.
101pub fn detect_sensitive_report(
102    diff: &str,
103    changed_files: &[String],
104    sensitive: Option<&SensitiveConfig>,
105) -> SensitiveReport {
106    match sensitive {
107        Some(sensitive) => scan_diff_for_sensitive_content_with_options(
108            diff,
109            changed_files,
110            sensitive.enforcement,
111            &sensitive.allowlist,
112        ),
113        None => scan_diff_for_sensitive_content(diff, changed_files),
114    }
115}
116
117/// Check if a file should be skipped for context reading.
118pub fn should_skip(file_path: &str) -> bool {
119    SKIP_PATTERNS.iter().any(|p| p.is_match(file_path))
120}
121
122/// Filter a unified diff, removing file sections that match skip patterns.
123/// Reuses the existing SKIP_PATTERNS (lock files, binaries, minified, generated).
124pub fn filter_diff(diff: &str) -> String {
125    if diff.is_empty() {
126        return String::new();
127    }
128
129    let mut result = String::new();
130    let mut current_section = String::new();
131    let mut skip_current = false;
132
133    for line in diff.lines() {
134        if line.starts_with("diff --git ") {
135            // Flush previous section if not skipped
136            if !skip_current && !current_section.is_empty() {
137                result.push_str(&current_section);
138            }
139
140            // Start new section
141            current_section = String::new();
142            current_section.push_str(line);
143            current_section.push('\n');
144
145            // Extract b/ path and check if we should skip
146            skip_current = line
147                .rsplit_once(" b/")
148                .map(|(_, path)| should_skip(path))
149                .unwrap_or(false);
150        } else {
151            current_section.push_str(line);
152            current_section.push('\n');
153        }
154    }
155
156    // Flush last section
157    if !skip_current && !current_section.is_empty() {
158        result.push_str(&current_section);
159    }
160
161    result
162}
163
164// --- Signature pattern for outline mode ---
165
166static SIGNATURE_PATTERN: LazyLock<regex::Regex> = LazyLock::new(|| {
167    regex::Regex::new(
168        r"^(?:export\s+)?(?:default\s+)?(?:async\s+)?(?:function|class|interface|type|const|let|var|enum|abstract\s+class|public|private|protected|def |fn )\b",
169    )
170    .unwrap()
171});
172
/// Extract changed file paths from a unified diff.
///
/// Every `diff --git a/<old> b/<new>` header contributes its `<new>` path, in the
/// order the headers appear. Lines that are not such a header are ignored.
///
/// The headers are parsed with plain string operations, so no regular expression
/// is compiled on each call.
pub fn extract_changed_file_paths(diff: &str) -> Vec<String> {
    diff.lines()
        .filter_map(new_path_from_diff_header)
        .map(str::to_owned)
        .collect()
}

/// Return the `<new>` path of a `diff --git a/<old> b/<new>` header line, if `line` is one.
fn new_path_from_diff_header(line: &str) -> Option<&str> {
    let rest = line.strip_prefix("diff --git a/")?;
    // Use the last " b/" separator that leaves both the old and the new path non-empty.
    rest.rmatch_indices(" b/")
        .find(|&(at, sep)| at > 0 && at + sep.len() < rest.len())
        .map(|(at, sep)| &rest[at + sep.len()..])
}
184
/// Extract hunk start line numbers for a specific file from a diff.
///
/// Returns, in diff order, the new-file start line (the number after `+` in each
/// `@@ -a,b +c,d @@` header) of every hunk in the section whose header names
/// `file_path` as its new path.
///
/// The new path is compared exactly, so `app.ts` does not pick up the hunks of
/// `lib/app.ts` or `app.ts.bak`.
fn get_hunk_line_numbers(diff: &str, file_path: &str) -> Vec<usize> {
    let mut starts = Vec::new();
    let mut in_file = false;

    for line in diff.lines() {
        if line.starts_with("diff --git") {
            // The new path is everything after the last " b/" on the header line.
            in_file = line
                .rsplit_once(" b/")
                .is_some_and(|(_, new_path)| new_path == file_path);
            continue;
        }
        if !in_file {
            continue;
        }
        if let Some(start) = parse_hunk_new_start(line) {
            starts.push(start);
        }
    }

    starts
}

/// Parse the new-file start line from a hunk header such as `@@ -12,3 +14,5 @@`.
///
/// Returns `None` when `line` is not a hunk header or the number does not fit in `usize`.
fn parse_hunk_new_start(line: &str) -> Option<usize> {
    fn is_number(text: &str) -> bool {
        !text.is_empty() && text.bytes().all(|b| b.is_ascii_digit())
    }

    let rest = line.strip_prefix("@@ -")?;
    let (old_range, new_range) = rest.split_once(" +")?;

    // The old range is `start` or `start,count`; both parts must be plain numbers.
    let old_range_ok = match old_range.split_once(',') {
        Some((start, count)) => is_number(start) && is_number(count),
        None => is_number(old_range),
    };
    if !old_range_ok {
        return None;
    }

    // Take the leading digits of the new range; an empty run fails to parse.
    let digits = new_range.bytes().take_while(u8::is_ascii_digit).count();
    new_range[..digits].parse().ok()
}
205
206/// Read a file with smart truncation.
207fn read_file_content(file_path: &str, repo_root: &Path, diff: &str) -> FileContext {
208    let full_path = repo_root.join(file_path);
209
210    // Guard against path traversal
211    if let (Ok(resolved), Ok(resolved_root)) = (full_path.canonicalize(), repo_root.canonicalize())
212        && !resolved.starts_with(&resolved_root)
213    {
214        return FileContext {
215            path: file_path.to_owned(),
216            content: String::new(),
217            truncation_mode: TruncationMode::Skipped,
218        };
219    }
220
221    let content = match std::fs::read_to_string(&full_path) {
222        Ok(c) => c,
223        Err(_) => {
224            return FileContext {
225                path: file_path.to_owned(),
226                content: String::new(),
227                truncation_mode: TruncationMode::Skipped,
228            };
229        }
230    };
231
232    let file_lines: Vec<&str> = content.lines().collect();
233    let line_count = file_lines.len();
234
235    // Full mode: ≤500 lines
236    if line_count <= 500 {
237        return FileContext {
238            path: file_path.to_owned(),
239            content,
240            truncation_mode: TruncationMode::Full,
241        };
242    }
243
244    let hunk_lines = get_hunk_line_numbers(diff, file_path);
245
246    // Sections mode: ≤2000 lines — header + context windows around hunks
247    if line_count <= 2000 {
248        let mut parts = Vec::new();
249        let header_end = 30.min(file_lines.len());
250        parts.push(file_lines[..header_end].join("\n"));
251
252        for &hunk_line in &hunk_lines {
253            let start = hunk_line.saturating_sub(25);
254            let end = (hunk_line + 25).min(file_lines.len());
255            parts.push(format!("\n... (line {}) ...\n", start + 1));
256            parts.push(file_lines[start..end].join("\n"));
257        }
258
259        return FileContext {
260            path: file_path.to_owned(),
261            content: parts.join("\n"),
262            truncation_mode: TruncationMode::Sections,
263        };
264    }
265
266    // Outline mode: >2000 lines — signatures + hunk windows
267    let mut parts: Vec<String> = Vec::new();
268    for line in &file_lines {
269        if SIGNATURE_PATTERN.is_match(line.trim()) {
270            parts.push(line.to_string());
271        }
272    }
273
274    for &hunk_line in &hunk_lines {
275        let start = hunk_line.saturating_sub(10);
276        let end = (hunk_line + 10).min(file_lines.len());
277        parts.push(format!("\n... (line {}) ...\n", start + 1));
278        parts.push(file_lines[start..end].join("\n"));
279    }
280
281    FileContext {
282        path: file_path.to_owned(),
283        content: parts.join("\n"),
284        truncation_mode: TruncationMode::Outline,
285    }
286}
287
288/// Read file contents for changed files with a total character budget.
289pub fn get_file_contents(
290    changed_files: &[String],
291    repo_root: &Path,
292    diff: &str,
293) -> Vec<FileContext> {
294    const TOTAL_BUDGET: usize = 30_000;
295    let mut results = Vec::new();
296    let mut total_chars = 0;
297
298    // Filter skipped files and sort by file size (smallest first)
299    let mut files_with_size: Vec<_> = changed_files
300        .iter()
301        .filter(|f| !should_skip(f))
302        .map(|f| {
303            let size = repo_root
304                .join(f)
305                .metadata()
306                .map(|m| m.len() as usize)
307                .unwrap_or(0);
308            (f.as_str(), size)
309        })
310        .collect();
311    files_with_size.sort_by_key(|&(_, size)| size);
312
313    for (file, _) in files_with_size {
314        if total_chars >= TOTAL_BUDGET {
315            break;
316        }
317
318        let mut fc = read_file_content(file, repo_root, diff);
319        if fc.truncation_mode == TruncationMode::Skipped || fc.content.is_empty() {
320            continue;
321        }
322
323        // Trim to fit budget
324        let remaining = TOTAL_BUDGET - total_chars;
325        if fc.content.len() > remaining {
326            fc.content = format!(
327                "{}\n... (truncated to fit context budget)",
328                &fc.content[..remaining]
329            );
330        }
331
332        total_chars += fc.content.len();
333        results.push(fc);
334    }
335
336    results
337}
338
339/// Gather full context for commit message generation.
340pub fn gather_context(repo_root: &Path, config: &Config) -> Result<CommitContext> {
341    let diff = git::get_diff(config.diff_source, repo_root)?;
342    let recent_commits = git::get_recent_commits(repo_root, 10).unwrap_or_default();
343    let branch = git::get_branch_name(repo_root).unwrap_or_else(|_| "unknown".to_owned());
344    let changed_files = extract_changed_file_paths(&diff);
345    let sensitive_report = detect_sensitive_report(&diff, &changed_files, Some(&config.sensitive));
346    let sensitive_findings = sensitive_report.findings.clone();
347    let has_sensitive_content = sensitive_report.has_findings();
348    let file_contents = get_file_contents(&changed_files, repo_root, &diff);
349
350    Ok(CommitContext {
351        diff,
352        recent_commits,
353        branch,
354        file_contents,
355        changed_files,
356        sensitive_report,
357        sensitive_findings,
358        has_sensitive_content,
359    })
360}
361
#[cfg(test)]
mod tests {
    use super::*;

    /// Run the sensitive-content detector on `diff` with `file` as the only changed file.
    fn flagged(diff: &str, file: &str) -> bool {
        detect_sensitive_content(diff, &[file.to_owned()])
    }

    /// Assert that every file name in `files` is flagged on its own, whatever the diff says.
    fn assert_all_flagged(diff: &str, files: &[&str]) {
        for file in files {
            assert!(flagged(diff, file));
        }
    }

    // --- detectSensitiveContent tests (ported from TS) ---

    #[test]
    fn detects_env_file() {
        assert_all_flagged("some diff", &[".env"]);
    }

    #[test]
    fn detects_env_production() {
        assert_all_flagged("some diff", &[".env.production"]);
    }

    #[test]
    fn detects_nested_env_file() {
        assert_all_flagged("some diff", &["config/.env.local"]);
    }

    #[test]
    fn detects_credentials_json() {
        assert_all_flagged("some diff", &["credentials.json"]);
    }

    #[test]
    fn detects_api_key_in_added_lines() {
        let diff = concat!(
            "diff --git a/config.ts b/config.ts\n",
            r#"+const API_KEY = "sk-proj-abcdefghijklmnopqrstuvwxyz1234567890""#,
        );
        assert!(flagged(diff, "config.ts"));
    }

    #[test]
    fn detects_secret_key_in_added_lines() {
        assert!(flagged(r#"+  SECRET_KEY: "Alpha9981Zeta""#, "config.ts"));
    }

    #[test]
    fn detects_access_token_in_added_lines() {
        assert!(flagged(
            r#"+export const ACCESS_TOKEN = "Alpha9981Zeta99""#,
            "auth.ts"
        ));
    }

    #[test]
    fn detects_password_in_added_lines() {
        assert!(flagged("+  DB_PASSWORD=Alpha9981Zeta", "config.ts"));
    }

    #[test]
    fn detects_sk_prefixed_keys() {
        assert!(flagged(
            r#"+  key: "sk-proj-abcdefghijklmnopqrstuvwxyz1234567890""#,
            "config.ts"
        ));
    }

    #[test]
    fn detects_ghp_tokens() {
        assert!(flagged(
            "+  GITHUB_TOKEN=ghp_abcdefghijklmnopqrstuvwxyz1234",
            "ci.yml"
        ));
    }

    #[test]
    fn detects_aws_access_keys() {
        assert!(flagged(r#"+  aws_key = "AKIAIOSFODNN7EXAMPLE""#, "config.ts"));
    }

    #[test]
    fn ignores_removed_lines() {
        assert!(!flagged(r#"-  API_KEY = "old-key""#, "config.ts"));
    }

    #[test]
    fn ignores_diff_header_lines() {
        assert!(!flagged("+++ b/API_KEY_handler.ts", "API_KEY_handler.ts"));
    }

    #[test]
    fn returns_false_for_normal_code() {
        assert!(!flagged("+  const result = await fetchData()", "app.ts"));
    }

    #[test]
    fn detects_source_map_files() {
        assert_all_flagged("diff", &["bundle.js.map", "styles.css.map", "dist/app.map"]);
    }

    #[test]
    fn detects_private_key_files() {
        assert_all_flagged(
            "diff",
            &["server.pem", "cert.p12", "ssl.key", "app.keystore"],
        );
    }

    #[test]
    fn detects_ssh_private_keys() {
        assert_all_flagged("diff", &["id_rsa", "id_ed25519", ".ssh/config"]);
    }

    #[test]
    fn detects_htpasswd() {
        assert_all_flagged("diff", &[".htpasswd"]);
    }

    // --- skip patterns ---

    #[test]
    fn skips_lock_files() {
        for path in ["package-lock.json", "yarn.lock", "Cargo.lock", "bun.lockb"] {
            assert!(should_skip(path));
        }
    }

    #[test]
    fn skips_minified_files() {
        for path in ["bundle.min.js", "styles.min.css"] {
            assert!(should_skip(path));
        }
    }

    #[test]
    fn skips_images_and_fonts() {
        for path in ["logo.png", "icon.jpg", "font.woff2", "font.ttf"] {
            assert!(should_skip(path));
        }
    }

    #[test]
    fn skips_dist_and_build() {
        for path in ["dist/bundle.js", "build/output.js", "node_modules/pkg/index.js"] {
            assert!(should_skip(path));
        }
    }

    #[test]
    fn does_not_skip_source_files() {
        for path in ["src/app.ts", "lib/utils.rs", "README.md"] {
            assert!(!should_skip(path));
        }
    }

    // --- extract_changed_file_paths ---

    #[test]
    fn extracts_file_paths_from_diff() {
        let diff = concat!(
            "diff --git a/src/app.ts b/src/app.ts\n",
            "index abc..def 100644\n",
            "--- a/src/app.ts\n",
            "+++ b/src/app.ts\n",
            "@@ -1,3 +1,4 @@\n",
            "+import something\n",
            "diff --git a/lib/utils.ts b/lib/utils.ts\n",
        );
        let paths = extract_changed_file_paths(diff);
        assert_eq!(paths, vec!["src/app.ts", "lib/utils.ts"]);
    }

    // --- filter_diff ---

    #[test]
    fn filter_diff_removes_lock_files() {
        let diff = concat!(
            "diff --git a/src/main.rs b/src/main.rs\n",
            "--- a/src/main.rs\n",
            "+++ b/src/main.rs\n",
            "@@ -1,3 +1,4 @@\n",
            "+new line\n",
            "diff --git a/package-lock.json b/package-lock.json\n",
            "--- a/package-lock.json\n",
            "+++ b/package-lock.json\n",
            "@@ -1,100 +1,200 @@\n",
            "+huge lock file changes\n",
            "diff --git a/src/utils.rs b/src/utils.rs\n",
            "--- a/src/utils.rs\n",
            "+++ b/src/utils.rs\n",
            "@@ -1,2 +1,3 @@\n",
            "+another line\n",
        );
        let filtered = filter_diff(diff);
        assert!(filtered.contains("src/main.rs"), "should keep source files");
        assert!(
            filtered.contains("src/utils.rs"),
            "should keep source files"
        );
        assert!(
            !filtered.contains("package-lock.json"),
            "should remove lock files"
        );
    }

    #[test]
    fn filter_diff_removes_binary_and_minified() {
        let diff = concat!(
            "diff --git a/app.js b/app.js\n",
            "+code\n",
            "diff --git a/dist/bundle.min.js b/dist/bundle.min.js\n",
            "+minified\n",
            "diff --git a/logo.png b/logo.png\n",
            "Binary files differ\n",
        );
        let filtered = filter_diff(diff);
        assert!(filtered.contains("app.js"));
        assert!(!filtered.contains("bundle.min.js"));
        assert!(!filtered.contains("logo.png"));
    }

    #[test]
    fn filter_diff_empty_input() {
        assert_eq!(filter_diff(""), "");
    }

    #[test]
    fn filter_diff_no_skippable_files() {
        let diff = "diff --git a/src/lib.rs b/src/lib.rs\n+code\n";
        assert_eq!(filter_diff(diff), diff);
    }
}