Skip to main content

tokmd_git/
lib.rs

1//! # tokmd-git
2//!
3//! **Tier 2 (Utilities)**
4//!
5//! Streaming git log adapter for tokmd analysis. Collects commit history
6//! without loading the entire history into memory.
7//!
8//! ## What belongs here
9//! * Git history collection
10//! * Commit parsing (timestamp, author, affected files)
11//! * Streaming interface
12//!
13//! ## What does NOT belong here
14//! * Analysis computation (use tokmd-analysis)
15//! * Git history modification
16//! * Complex git operations (use git2 crate directly if needed)
17
18use std::io::{BufRead, BufReader};
19use std::path::{Path, PathBuf};
20use std::process::Stdio;
21
22use anyhow::{Context, Result};
23pub use tokmd_types::CommitIntentKind;
24
25mod command;
26mod intent;
27mod refs;
28
29pub use command::git_cmd;
30pub use intent::classify_intent;
31pub use refs::{resolve_base_ref, rev_exists};
32
/// A single commit as parsed from `git log` output by [`collect_history`].
#[derive(Clone, Debug)]
pub struct GitCommit {
    /// Value of git's `%ct` placeholder (committer date, Unix seconds);
    /// `0` when the field could not be parsed as an integer.
    pub timestamp: i64,
    /// Value of git's `%ae` placeholder (author email).
    pub author: String,
    /// Value of git's `%H` placeholder (full commit hash); `None` when the
    /// field was empty.
    pub hash: Option<String>,
    /// Value of git's `%s` placeholder (commit subject line).
    pub subject: String,
    /// Paths reported by `--name-only` for this commit, possibly truncated to
    /// the per-commit file limit passed to [`collect_history`].
    pub files: Vec<String>,
}
41
/// Which git revision-range operator is used to join two refs.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum GitRangeMode {
    /// Two-dot syntax: `A..B` - commits in B but not A.
    #[default]
    TwoDot,
    /// Three-dot syntax: `A...B` - symmetric difference from merge-base.
    ThreeDot,
}

impl GitRangeMode {
    /// Build the range argument for a git command by joining `base` and
    /// `head` with this mode's operator (`..` or `...`).
    ///
    /// The refs are inserted verbatim; no validation or escaping is done.
    pub fn format(&self, base: &str, head: &str) -> String {
        let separator = match self {
            Self::TwoDot => "..",
            Self::ThreeDot => "...",
        };
        format!("{base}{separator}{head}")
    }
}
61
62pub fn git_available() -> bool {
63    git_cmd()
64        .arg("--version")
65        .stdout(Stdio::null())
66        .stderr(Stdio::null())
67        .status()
68        .map(|s| s.success())
69        .unwrap_or(false)
70}
71
72pub fn repo_root(path: &Path) -> Option<PathBuf> {
73    let output = git_cmd()
74        .arg("-C")
75        .arg(path)
76        .arg("rev-parse")
77        .arg("--show-toplevel")
78        .output()
79        .ok()?;
80    if !output.status.success() {
81        return None;
82    }
83    let root = String::from_utf8_lossy(&output.stdout).trim().to_string();
84    if root.is_empty() {
85        None
86    } else {
87        Some(PathBuf::from(root))
88    }
89}
90
91pub fn collect_history(
92    repo_root: &Path,
93    max_commits: Option<usize>,
94    max_commit_files: Option<usize>,
95) -> Result<Vec<GitCommit>> {
96    let mut child = git_cmd()
97        .arg("-C")
98        .arg(repo_root)
99        .arg("log")
100        .arg("--name-only")
101        .arg("--pretty=format:%ct|%ae|%H|%s")
102        .stdout(Stdio::piped())
103        .stderr(Stdio::null())
104        .spawn()
105        .context("Failed to spawn git log")?;
106
107    let stdout = child.stdout.take().context("Missing git log stdout")?;
108    let reader = BufReader::new(stdout);
109
110    let mut commits: Vec<GitCommit> = Vec::new();
111    let mut current: Option<GitCommit> = None;
112
113    for line in reader.lines() {
114        let line = line?;
115        if line.trim().is_empty() {
116            if let Some(commit) = current.take() {
117                commits.push(commit);
118                if max_commits.is_some_and(|limit| commits.len() >= limit) {
119                    break;
120                }
121            }
122            continue;
123        }
124
125        if current.is_none() {
126            let mut parts = line.splitn(4, '|');
127            let ts = parts.next().unwrap_or("0").parse::<i64>().unwrap_or(0);
128            let author = parts.next().unwrap_or("").to_string();
129            let hash_str = parts.next().unwrap_or("").to_string();
130            let subject = parts.next().unwrap_or("").to_string();
131            let hash = if hash_str.is_empty() {
132                None
133            } else {
134                Some(hash_str)
135            };
136            current = Some(GitCommit {
137                timestamp: ts,
138                author,
139                hash,
140                subject,
141                files: Vec::new(),
142            });
143            continue;
144        }
145
146        if let Some(commit) = current.as_mut()
147            && max_commit_files
148                .map(|limit| commit.files.len() < limit)
149                .unwrap_or(true)
150        {
151            commit.files.push(line.trim().to_string());
152        }
153    }
154
155    if let Some(commit) = current.take() {
156        commits.push(commit);
157    }
158
159    let status = child.wait()?;
160    if !status.success() {
161        return Err(anyhow::anyhow!("git log failed"));
162    }
163
164    Ok(commits)
165}
166
167/// Get the set of added line numbers per file between two refs.
168pub fn get_added_lines(
169    repo_root: &Path,
170    base: &str,
171    head: &str,
172    range_mode: GitRangeMode,
173) -> Result<std::collections::BTreeMap<PathBuf, std::collections::BTreeSet<usize>>> {
174    let range = range_mode.format(base, head);
175    let output = git_cmd()
176        .arg("-C")
177        .arg(repo_root)
178        .args(["diff", "--unified=0", &range])
179        .output()
180        .context("Failed to run git diff")?;
181
182    if !output.status.success() {
183        let stderr = String::from_utf8_lossy(&output.stderr);
184        return Err(anyhow::anyhow!("git diff failed: {}", stderr.trim()));
185    }
186
187    let stdout = String::from_utf8_lossy(&output.stdout);
188    let mut result: std::collections::BTreeMap<PathBuf, std::collections::BTreeSet<usize>> =
189        std::collections::BTreeMap::new();
190    let mut current_file: Option<PathBuf> = None;
191
192    for line in stdout.lines() {
193        if let Some(file_path) = line.strip_prefix("+++ b/") {
194            current_file = Some(PathBuf::from(file_path));
195            continue;
196        }
197
198        if line.starts_with("@@") {
199            let Some(file) = current_file.as_ref() else {
200                continue;
201            };
202
203            // Hunk header: @@ -a,b +c,d @@
204            // We care about +c,d
205            let parts: Vec<&str> = line.split_whitespace().collect();
206            if parts.len() < 3 {
207                continue;
208            }
209
210            let new_range = parts[2]; // +c,d
211            let range_str = new_range.strip_prefix('+').unwrap_or(new_range);
212            let range_parts: Vec<&str> = range_str.split(',').collect();
213
214            let start: usize = range_parts[0].parse().unwrap_or(0);
215            let count: usize = if range_parts.len() > 1 {
216                range_parts[1].parse().unwrap_or(1)
217            } else {
218                1
219            };
220
221            if count > 0 && start > 0 {
222                let set = result.entry(file.clone()).or_default();
223                for i in 0..count {
224                    set.insert(start + i);
225                }
226            }
227        }
228    }
229
230    Ok(result)
231}
232
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::{BTreeMap, BTreeSet};
    use std::process::Command;

    /// Build a git command that operates on `dir` via `-C`.
    fn test_git(dir: &Path) -> Command {
        let mut git = git_cmd();
        git.arg("-C");
        git.arg(dir);
        git
    }

    /// Run git with `args` in `dir`, panicking with its output on failure.
    fn run_git(dir: &Path, args: &[&str]) {
        let mut git = test_git(dir);
        git.args(args);
        let finished = git.output().unwrap();
        assert!(
            finished.status.success(),
            "git {:?} failed\nstdout: {}\nstderr: {}",
            args,
            String::from_utf8_lossy(&finished.stdout),
            String::from_utf8_lossy(&finished.stderr)
        );
    }

    /// Create a temporary repository on branch `main` with a test identity.
    fn init_repo() -> tempfile::TempDir {
        let repo = tempfile::tempdir().unwrap();
        let setup: [&[&str]; 3] = [
            &["init", "-b", "main"],
            &["config", "user.email", "test@test.com"],
            &["config", "user.name", "Test"],
        ];
        for args in setup {
            run_git(repo.path(), args);
        }
        repo
    }

    /// Stage everything in `dir` and commit it with `message`.
    fn commit_all(dir: &Path, message: &str) {
        run_git(dir, &["add", "."]);
        run_git(dir, &["commit", "-m", message]);
    }

    #[test]
    fn git_range_two_dot_format() {
        let rendered = GitRangeMode::TwoDot.format("main", "HEAD");
        assert_eq!(rendered, "main..HEAD");
    }

    #[test]
    fn git_range_three_dot_format() {
        let rendered = GitRangeMode::ThreeDot.format("main", "HEAD");
        assert_eq!(rendered, "main...HEAD");
    }

    #[test]
    fn git_range_default_is_two_dot() {
        assert_eq!(GitRangeMode::default(), GitRangeMode::TwoDot);
    }

    #[test]
    fn collect_history_preserves_commit_metadata_and_limits_files() {
        if !git_available() {
            return;
        }
        let repo = init_repo();
        let root = repo.path();

        for (name, body) in [("alpha.txt", "alpha\n"), ("beta.txt", "beta\n")] {
            std::fs::write(root.join(name), body).unwrap();
        }
        commit_all(root, "feat: add fixtures");

        let history = collect_history(root, None, Some(1)).unwrap();

        assert_eq!(history.len(), 1);
        let only = &history[0];
        assert_eq!(only.author, "test@test.com");
        assert_eq!(only.subject, "feat: add fixtures");
        assert_eq!(only.hash.as_deref().map(str::len), Some(40));
        assert_eq!(only.files.len(), 1);
        assert!(matches!(
            only.files[0].as_str(),
            "alpha.txt" | "beta.txt"
        ));
    }

    #[test]
    fn collect_history_respects_commit_and_file_limits() {
        if !git_available() {
            return;
        }
        let repo = init_repo();
        let root = repo.path();

        std::fs::write(root.join("first.txt"), "first\n").unwrap();
        commit_all(root, "chore: first");
        for (name, body) in [("second.txt", "second\n"), ("third.txt", "third\n")] {
            std::fs::write(root.join(name), body).unwrap();
        }
        commit_all(root, "fix: second");

        let history = collect_history(root, Some(1), Some(0)).unwrap();

        assert_eq!(history.len(), 1);
        assert_eq!(history[0].subject, "fix: second");
        assert!(history[0].files.is_empty());
    }

    #[test]
    fn get_added_lines_reports_new_line_numbers_per_file() {
        if !git_available() {
            return;
        }
        let repo = init_repo();
        let root = repo.path();
        let source = root.join("src/lib.rs");

        std::fs::create_dir(root.join("src")).unwrap();
        std::fs::write(&source, "fn a() {}\nfn d() {}\n").unwrap();
        commit_all(root, "base");
        run_git(root, &["tag", "base"]);

        std::fs::write(&source, "fn a() {}\nfn b() {}\nfn c() {}\nfn d() {}\n").unwrap();
        commit_all(root, "add middle functions");

        let added = get_added_lines(root, "base", "HEAD", GitRangeMode::TwoDot).unwrap();

        let expected = BTreeMap::from([(
            PathBuf::from("src/lib.rs"),
            BTreeSet::from([2_usize, 3_usize]),
        )]);
        assert_eq!(added, expected);
    }
}