sqry_core/git/mod.rs
1//! Git integration for change-aware index updates
2//!
3//! This module provides git-based change detection to enable 10-100x faster
4//! incremental index builds by processing only files that have changed since
5//! the last index build.
6//!
7//! # Architecture
8//!
9//! The module uses a trait-based design to support multiple backends:
10//! - `SubprocessGit`: Subprocess-based git command execution (current)
11//! - `NoGit`: Fallback when git is unavailable (always returns empty changes)
12//! - Future: `Git2Backend` for libgit2-based implementation (enterprise)
13//!
14//! # Security
15//!
16//! - All git commands use `Command::new("git")` with array arguments (no shell)
17//! - File paths are canonicalized and validated to remain under workspace root
18//! - Environment variables are validated and clamped to safe ranges
19//! - Git command output is limited to 10MB to prevent memory exhaustion
20//! - Timeouts enforce SIGTERM then SIGKILL process cleanup
21//!
22//! # Example
23//!
24//! ```no_run
25//! use sqry_core::git::{GitChangeTracker, ChangeSet};
26//! use std::path::Path;
27//!
28//! let workspace = Path::new("/path/to/repo");
29//! let mut tracker = GitChangeTracker::new(workspace)?;
30//!
31//! // Detect changes since last indexed commit
32//! let baseline = Some("abc123");
33//! let (changes, new_head) = tracker.detect_changes(baseline)?;
34//!
35//! println!("Changed files: {}", changes.total());
36//! println!("New HEAD: {:?}", new_head);
37//! # Ok::<(), Box<dyn std::error::Error>>(())
38//! ```
39
40use std::path::{Path, PathBuf};
41use std::result::Result as StdResult;
42
43mod nogit;
44mod parser;
45pub mod recency;
46mod subprocess;
47mod worktree;
48
49pub use nogit::NoGit;
50pub use parser::{parse_diff_name_status, parse_porcelain};
51pub use recency::RecencyIndex;
52pub use subprocess::{SubprocessGit, max_git_output_size};
53pub use worktree::WorktreeManager;
54
/// Result type for git operations
///
/// Shorthand for [`std::result::Result`] with the error type fixed to
/// [`GitError`], so signatures in this module can simply write `Result<T>`.
pub type Result<T> = StdResult<T, GitError>;
57
/// Errors that can occur during git operations
///
/// The `Display` text of every variant comes from its `#[error(...)]`
/// attribute. For [`GitError::OutputExceededLimit`] that text is a fixed
/// one-liner without the sizes; call [`GitError::detailed_message`] to get the
/// limit, the observed size, and remediation hints.
#[derive(Debug, thiserror::Error)]
pub enum GitError {
    /// Git binary not found in PATH
    #[error("Git binary not found in PATH")]
    NotFound,

    /// Directory is not a git repository
    ///
    /// Carries the path that was checked.
    #[error("Not a git repository: {0}")]
    NotARepo(PathBuf),

    /// Git command timed out
    ///
    /// The payload is rendered as a millisecond count in the message.
    #[error("Git command timed out after {0}ms")]
    Timeout(u64),

    /// Git command failed with non-zero exit
    ///
    /// Both captured streams are included verbatim in the `Display` output.
    #[error("Git command failed: {message}\nstdout: {stdout}\nstderr: {stderr}")]
    CommandFailed {
        /// Error message describing the failure
        message: String,
        /// Standard output from the git command
        stdout: String,
        /// Standard error from the git command
        stderr: String,
    },

    /// Failed to parse git output
    ///
    /// The payload is a free-form description of what could not be parsed.
    #[error("Failed to parse git output: {0}")]
    InvalidOutput(String),

    /// Git output exceeded configured limit (P1-17)
    ///
    /// This error occurs when a git command produces more output than the
    /// configured limit (default 10MB, range 1MB-100MB via `SQRY_GIT_MAX_OUTPUT_SIZE`).
    ///
    /// # Security
    ///
    /// This protects against `DoS` attacks from malicious repositories with
    /// arbitrarily large git diffs or status output.
    ///
    /// # Resolution
    ///
    /// 1. Investigate the large output: `git diff --stat`
    /// 2. Check for accidentally committed binaries or vendored dependencies
    /// 3. If legitimate, increase the limit: `export SQRY_GIT_MAX_OUTPUT_SIZE=<bytes>`
    #[error("Git output exceeded configured limit")]
    OutputExceededLimit {
        /// Configured limit in bytes
        limit_bytes: usize,
        /// Actual output size in bytes (conservative estimate if truncated)
        actual_bytes: usize,
    },

    /// IO error occurred
    ///
    /// `#[from]` provides the `From<std::io::Error>` conversion, so `?` on an
    /// IO result inside a function returning [`Result`] yields this variant.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Feature not supported by this backend
    ///
    /// The payload names or describes the unsupported feature.
    #[error("Feature not supported: {0}")]
    NotSupported(String),
}
119
120impl GitError {
121 /// Calculate suggested new limit (2× actual, rounded up to nearest MB)
122 #[must_use]
123 pub fn suggested_limit(&self) -> usize {
124 match self {
125 GitError::OutputExceededLimit { actual_bytes, .. } => {
126 let suggested = actual_bytes * 2;
127 // Round up to nearest MB
128 ((suggested / (1024 * 1024)) + 1) * (1024 * 1024)
129 }
130 _ => 0,
131 }
132 }
133
134 /// Get detailed error message with suggestions (P1-17)
135 ///
136 /// For `OutputExceededLimit`, returns a detailed message with:
137 /// - Current limit in MB
138 /// - Actual output size in MB
139 /// - Suggested new limit (2× actual, rounded up)
140 /// - Investigation steps
141 #[must_use]
142 pub fn detailed_message(&self) -> String {
143 match self {
144 GitError::OutputExceededLimit {
145 limit_bytes,
146 actual_bytes,
147 } => {
148 let limit_mb = bytes_to_mb(*limit_bytes);
149 let actual_mb = bytes_to_mb(*actual_bytes);
150 let suggested = actual_bytes * 2;
151 let suggested_limit = ((suggested / (1024 * 1024)) + 1) * (1024 * 1024);
152 let suggested_mb = bytes_to_mb(suggested_limit);
153
154 format!(
155 "Git output exceeded configured limit\n \
156 Limit: {limit_mb:.1} MB (set via SQRY_GIT_MAX_OUTPUT_SIZE)\n \
157 Actual: >{actual_mb:.1} MB\n\n \
158 Suggestions:\n \
159 - Increase limit: export SQRY_GIT_MAX_OUTPUT_SIZE={suggested_limit} # {suggested_mb:.0} MB\n \
160 - Investigate large diffs: git diff --stat\n \
161 - Check for accidentally committed binaries"
162 )
163 }
164 other => format!("{other}"),
165 }
166 }
167}
168
/// Converts a byte count into megabytes (1 MB = 1024 × 1024 bytes) for
/// human-readable messages.
#[inline]
#[allow(clippy::cast_precision_loss)] // value is only displayed to humans, so f64 rounding is acceptable
fn bytes_to_mb(bytes: usize) -> f64 {
    const BYTES_PER_MB: f64 = 1024.0 * 1024.0;

    let byte_count = bytes as f64;
    byte_count / BYTES_PER_MB
}
174
/// A group of file changes reported by git, bucketed by kind of change.
///
/// **Path Semantics**: every path is relative to the repository root, never
/// absolute. Example: `src/main.rs`, not `/home/user/project/src/main.rs`.
///
/// **Security**: callers must canonicalize these paths and verify they stay
/// under the workspace root before using them, to prevent path traversal
/// attacks.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ChangeSet {
    /// Newly added files
    pub added: Vec<PathBuf>,

    /// Files whose content changed
    pub modified: Vec<PathBuf>,

    /// Files that were removed
    pub deleted: Vec<PathBuf>,

    /// Renamed files as (`old_path`, `new_path`) pairs
    pub renamed: Vec<(PathBuf, PathBuf)>,
}

impl ChangeSet {
    /// Builds a change set with no entries in any category.
    #[must_use]
    pub fn new() -> Self {
        Self {
            added: Vec::new(),
            modified: Vec::new(),
            deleted: Vec::new(),
            renamed: Vec::new(),
        }
    }

    /// Counts the entries across all four categories.
    ///
    /// A rename counts once, not once per side of the pair.
    #[must_use]
    pub fn total(&self) -> usize {
        let per_category = [
            self.added.len(),
            self.modified.len(),
            self.deleted.len(),
            self.renamed.len(),
        ];
        per_category.iter().sum()
    }

    /// Reports whether every category is empty.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.added.is_empty()
            && self.modified.is_empty()
            && self.deleted.is_empty()
            && self.renamed.is_empty()
    }
}
216
/// Git backend abstraction for change detection
///
/// This trait provides a consistent interface for different git
/// backend implementations (subprocess, libgit2, etc.).
///
/// Implementations must be `Send + Sync`. Every method takes the repository
/// (or workspace) path explicitly, and all methods take `&self`.
pub trait GitBackend: Send + Sync {
    /// Check if the path is a git repository
    ///
    /// Returns `Ok(true)` if path is a git repo, `Ok(false)` otherwise.
    /// Returns `Err` for permission errors or IO failures to surface
    /// them instead of silently falling back.
    ///
    /// # Errors
    ///
    /// Propagates [`GitError`] when repository detection fails (for example,
    /// when the directory cannot be accessed or git is unavailable).
    fn is_repo(&self, root: &Path) -> Result<bool>;

    /// Get the repository root path
    ///
    /// This handles git worktrees correctly (where `.git` is a file
    /// pointing to the actual git directory).
    ///
    /// Returns the canonicalized absolute path to the repository root.
    ///
    /// # Errors
    ///
    /// Returns [`GitError`] when git metadata cannot be inspected or when the
    /// path cannot be canonicalized.
    fn repo_root(&self, root: &Path) -> Result<PathBuf>;

    /// Get the current HEAD commit SHA
    ///
    /// Returns `Ok(Some(sha))` for repos with commits.
    /// Returns `Ok(None)` for newly initialized repos without commits.
    /// Returns `Err` for permission errors or command failures.
    ///
    /// # Errors
    ///
    /// Returns [`GitError`] when invoking git fails or when the output cannot
    /// be parsed.
    fn head(&self, root: &Path) -> Result<Option<String>>;

    /// Get uncommitted changes (index + working tree)
    ///
    /// Returns a tuple of `(ChangeSet, current_head)` to avoid race
    /// conditions between querying changes and querying HEAD separately.
    ///
    /// # Arguments
    ///
    /// * `root` - Repository root path
    /// * `include_untracked` - Whether to include untracked files
    ///
    /// # Returns
    ///
    /// * `ChangeSet` - Files changed in index and working tree
    /// * `Option<String>` - Current HEAD commit SHA (None if no commits)
    ///
    /// # Errors
    ///
    /// Returns [`GitError`] when git commands fail, time out, or output parsing
    /// detects malformed entries.
    fn uncommitted(
        &self,
        root: &Path,
        include_untracked: bool,
    ) -> Result<(ChangeSet, Option<String>)>;

    /// Get changes since a baseline commit
    ///
    /// Returns a tuple of `(ChangeSet, current_head)` to avoid race
    /// conditions between diff and HEAD query.
    ///
    /// # Arguments
    ///
    /// * `root` - Repository root path
    /// * `baseline` - Baseline commit SHA to compare against
    /// * `rename_similarity` - Rename detection threshold (0-100)
    ///
    /// # Returns
    ///
    /// * `ChangeSet` - Files changed between baseline and HEAD
    /// * `Option<String>` - Current HEAD commit SHA (None if no commits)
    ///
    /// # Errors
    ///
    /// Returns `Err` if baseline commit doesn't exist (e.g., shallow clone
    /// where baseline was pruned). Caller should fall back to hash-based
    /// change detection in that case.
    fn since(
        &self,
        root: &Path,
        baseline: &str,
        rename_similarity: u8,
    ) -> Result<(ChangeSet, Option<String>)>;

    /// Get backend-specific capabilities
    ///
    /// This allows backends to advertise optional features like
    /// blame, time-travel indexing, etc. Used for enterprise features.
    ///
    /// The provided implementation returns `GitCapabilities::default()`, i.e.
    /// every capability flag is `false`; backends override this method to
    /// advertise what they support.
    fn capabilities(&self) -> GitCapabilities {
        GitCapabilities::default()
    }
}
319
/// Backend-specific capabilities
///
/// Allows different git backends to advertise their supported features.
/// For example, git2 backend can provide blame info, subprocess cannot.
///
/// The derived `Default` leaves every flag `false`, which is what
/// [`GitBackend::capabilities`] returns unless a backend overrides it.
#[derive(Debug, Clone, Default)]
pub struct GitCapabilities {
    /// Whether this backend supports blame overlays
    pub supports_blame: bool,

    /// Whether this backend supports time-travel indexing
    pub supports_time_travel: bool,

    /// Whether this backend supports historical indexing
    pub supports_history_index: bool,
}
335
/// High-level facade for git change tracking
///
/// This provides a convenient API over the `GitBackend` trait with
/// caching and error handling.
pub struct GitChangeTracker {
    // Backend chosen in `new` (subprocess git or the `NoGit` fallback).
    backend: Box<dyn GitBackend>,
    // Path given to `new`; forwarded unchanged to every backend call.
    root: PathBuf,
    // HEAD most recently reported by the backend; written by
    // `detect_changes` and `head`, and served from `head` when it is `Some`.
    cached_head: Option<String>,
}
345
346impl GitChangeTracker {
347 /// Create a new git change tracker
348 ///
349 /// Automatically selects the appropriate backend based on the
350 /// `SQRY_GIT_BACKEND` environment variable:
351 /// - `auto` (default): Use subprocess git, fall back to `NoGit`
352 /// - `subprocess`: Force subprocess git (fail if git not found)
353 /// - `none`: Always use `NoGit` (disable git integration)
354 ///
355 /// # Errors
356 ///
357 /// Returns `GitError::NotFound` if git binary is not in PATH and
358 /// backend is set to `subprocess`.
359 ///
360 /// Returns `GitError::NotARepo` if path is not a git repository
361 /// (only when backend is `subprocess` or `auto` with git available).
362 pub fn new(root: &Path) -> Result<Self> {
363 let backend_type = std::env::var("SQRY_GIT_BACKEND").unwrap_or_else(|_| "auto".to_string());
364
365 let backend: Box<dyn GitBackend> = match backend_type.as_str() {
366 "subprocess" => {
367 let subprocess = SubprocessGit::new();
368 if !subprocess.is_repo(root)? {
369 return Err(GitError::NotARepo(root.to_path_buf()));
370 }
371 Box::new(subprocess)
372 }
373 "none" => Box::new(NoGit),
374 _ => {
375 let subprocess = SubprocessGit::new();
376 match subprocess.is_repo(root) {
377 Ok(true) => Box::new(subprocess),
378 Ok(false) => return Err(GitError::NotARepo(root.to_path_buf())),
379 Err(GitError::NotFound) => Box::new(NoGit),
380 Err(e) => return Err(e),
381 }
382 }
383 };
384
385 Ok(Self {
386 backend,
387 root: root.to_path_buf(),
388 cached_head: None,
389 })
390 }
391
392 /// Detect changes since last indexed commit
393 ///
394 /// If `baseline` is `None`, returns uncommitted changes only.
395 /// If `baseline` is `Some(commit_sha)`, returns changes since that commit.
396 ///
397 /// # Returns
398 ///
399 /// * `ChangeSet` - Files changed
400 /// * `Option<String>` - Current HEAD commit SHA (for updating baseline)
401 ///
402 /// # Configuration
403 ///
404 /// Respects these environment variables:
405 /// - `SQRY_GIT_INCLUDE_UNTRACKED`: Include untracked files (default: 1)
406 /// - `SQRY_GIT_RENAME_SIMILARITY`: Rename detection threshold 0-100 (default: 50)
407 ///
408 /// # Errors
409 ///
410 /// Returns [`GitError`] when the underlying backend encounters IO failures,
411 /// the repository is not available, or git command output is malformed.
412 pub fn detect_changes(
413 &mut self,
414 baseline: Option<&str>,
415 ) -> Result<(ChangeSet, Option<String>)> {
416 let include_untracked = std::env::var("SQRY_GIT_INCLUDE_UNTRACKED")
417 .ok()
418 .and_then(|v| v.parse::<u8>().ok())
419 != Some(0);
420
421 let rename_similarity = std::env::var("SQRY_GIT_RENAME_SIMILARITY")
422 .ok()
423 .and_then(|v| v.parse::<u8>().ok())
424 .map_or(50, |v| v.clamp(0, 100));
425
426 if let Some(baseline_sha) = baseline {
427 let (changes, new_head) =
428 self.backend
429 .since(&self.root, baseline_sha, rename_similarity)?;
430 self.cached_head.clone_from(&new_head);
431 Ok((changes, new_head))
432 } else {
433 let (changes, new_head) = self.backend.uncommitted(&self.root, include_untracked)?;
434 self.cached_head.clone_from(&new_head);
435 Ok((changes, new_head))
436 }
437 }
438
439 /// Get the current HEAD commit SHA
440 ///
441 /// Uses cached value if available, otherwise queries git.
442 ///
443 /// # Errors
444 ///
445 /// Returns [`GitError`] when requesting the HEAD from the backend fails.
446 pub fn head(&mut self) -> Result<Option<String>> {
447 if let Some(ref head) = self.cached_head {
448 return Ok(Some(head.clone()));
449 }
450
451 let head = self.backend.head(&self.root)?;
452 self.cached_head.clone_from(&head);
453 Ok(head)
454 }
455
456 /// Get the repository root path
457 ///
458 /// # Errors
459 ///
460 /// Returns [`GitError`] when the backend cannot determine or canonicalize
461 /// the root directory.
462 pub fn repo_root(&self) -> Result<PathBuf> {
463 self.backend.repo_root(&self.root)
464 }
465
466 /// Get backend capabilities
467 #[must_use]
468 pub fn capabilities(&self) -> GitCapabilities {
469 self.backend.capabilities()
470 }
471}
472
#[cfg(test)]
mod tests {
    use super::*;

    /// `total` and `is_empty` must account for every change category.
    #[test]
    fn test_changeset_total() {
        let empty = ChangeSet::new();
        assert_eq!(empty.total(), 0);
        assert!(empty.is_empty());

        let populated = ChangeSet {
            added: vec![PathBuf::from("a.rs")],
            modified: vec![PathBuf::from("b.rs")],
            deleted: vec![PathBuf::from("c.rs")],
            renamed: vec![(PathBuf::from("d.rs"), PathBuf::from("e.rs"))],
        };

        assert_eq!(populated.total(), 4);
        assert!(!populated.is_empty());
    }

    /// Two independently built sets compare equal until one of them diverges.
    #[test]
    fn test_changeset_equality() {
        let with_single_addition = || ChangeSet {
            added: vec![PathBuf::from("a.rs")],
            ..ChangeSet::default()
        };

        let left = with_single_addition();
        let mut right = with_single_addition();
        assert_eq!(left, right);

        right.modified.push(PathBuf::from("b.rs"));
        assert_ne!(left, right);
    }
}