Skip to main content

sqry_core/git/
mod.rs

1//! Git integration for change-aware index updates
2//!
3//! This module provides git-based change detection to enable 10-100x faster
4//! incremental index builds by processing only files that have changed since
5//! the last index build.
6//!
7//! # Architecture
8//!
9//! The module uses a trait-based design to support multiple backends:
10//! - `SubprocessGit`: Subprocess-based git command execution (current)
11//! - `NoGit`: Fallback when git is unavailable (always returns empty changes)
12//! - Future: `Git2Backend` for libgit2-based implementation (enterprise)
13//!
14//! # Security
15//!
16//! - All git commands use `Command::new("git")` with array arguments (no shell)
17//! - File paths are canonicalized and validated to remain under workspace root
18//! - Environment variables are validated and clamped to safe ranges
19//! - Git command output is limited to 10MB to prevent memory exhaustion
20//! - Timeouts enforce SIGTERM then SIGKILL process cleanup
21//!
22//! # Example
23//!
24//! ```no_run
25//! use sqry_core::git::{GitChangeTracker, ChangeSet};
26//! use std::path::Path;
27//!
28//! let workspace = Path::new("/path/to/repo");
29//! let mut tracker = GitChangeTracker::new(workspace)?;
30//!
31//! // Detect changes since last indexed commit
32//! let baseline = Some("abc123");
33//! let (changes, new_head) = tracker.detect_changes(baseline)?;
34//!
35//! println!("Changed files: {}", changes.total());
36//! println!("New HEAD: {:?}", new_head);
37//! # Ok::<(), Box<dyn std::error::Error>>(())
38//! ```
39
40use std::path::{Path, PathBuf};
41use std::result::Result as StdResult;
42
43mod nogit;
44mod parser;
45pub mod recency;
46mod subprocess;
47mod worktree;
48
49pub use nogit::NoGit;
50pub use parser::{parse_diff_name_status, parse_porcelain};
51pub use recency::RecencyIndex;
52pub use subprocess::{SubprocessGit, max_git_output_size};
53pub use worktree::WorktreeManager;
54
/// Result type for git operations
///
/// Shorthand for `std::result::Result<T, GitError>`, so every fallible
/// function in this module reports failures as a [`GitError`].
pub type Result<T> = StdResult<T, GitError>;
57
58/// Errors that can occur during git operations
59#[derive(Debug, thiserror::Error)]
60pub enum GitError {
61    /// Git binary not found in PATH
62    #[error("Git binary not found in PATH")]
63    NotFound,
64
65    /// Directory is not a git repository
66    #[error("Not a git repository: {0}")]
67    NotARepo(PathBuf),
68
69    /// Git command timed out
70    #[error("Git command timed out after {0}ms")]
71    Timeout(u64),
72
73    /// Git command failed with non-zero exit
74    #[error("Git command failed: {message}\nstdout: {stdout}\nstderr: {stderr}")]
75    CommandFailed {
76        /// Error message describing the failure
77        message: String,
78        /// Standard output from the git command
79        stdout: String,
80        /// Standard error from the git command
81        stderr: String,
82    },
83
84    /// Failed to parse git output
85    #[error("Failed to parse git output: {0}")]
86    InvalidOutput(String),
87
88    /// Git output exceeded configured limit (P1-17)
89    ///
90    /// This error occurs when a git command produces more output than the
91    /// configured limit (default 10MB, range 1MB-100MB via `SQRY_GIT_MAX_OUTPUT_SIZE`).
92    ///
93    /// # Security
94    ///
95    /// This protects against `DoS` attacks from malicious repositories with
96    /// arbitrarily large git diffs or status output.
97    ///
98    /// # Resolution
99    ///
100    /// 1. Investigate the large output: `git diff --stat`
101    /// 2. Check for accidentally committed binaries or vendored dependencies
102    /// 3. If legitimate, increase the limit: `export SQRY_GIT_MAX_OUTPUT_SIZE=<bytes>`
103    #[error("Git output exceeded configured limit")]
104    OutputExceededLimit {
105        /// Configured limit in bytes
106        limit_bytes: usize,
107        /// Actual output size in bytes (conservative estimate if truncated)
108        actual_bytes: usize,
109    },
110
111    /// IO error occurred
112    #[error("IO error: {0}")]
113    Io(#[from] std::io::Error),
114
115    /// Feature not supported by this backend
116    #[error("Feature not supported: {0}")]
117    NotSupported(String),
118}
119
120impl GitError {
121    /// Calculate suggested new limit (2× actual, rounded up to nearest MB)
122    #[must_use]
123    pub fn suggested_limit(&self) -> usize {
124        match self {
125            GitError::OutputExceededLimit { actual_bytes, .. } => {
126                let suggested = actual_bytes * 2;
127                // Round up to nearest MB
128                ((suggested / (1024 * 1024)) + 1) * (1024 * 1024)
129            }
130            _ => 0,
131        }
132    }
133
134    /// Get detailed error message with suggestions (P1-17)
135    ///
136    /// For `OutputExceededLimit`, returns a detailed message with:
137    /// - Current limit in MB
138    /// - Actual output size in MB
139    /// - Suggested new limit (2× actual, rounded up)
140    /// - Investigation steps
141    #[must_use]
142    pub fn detailed_message(&self) -> String {
143        match self {
144            GitError::OutputExceededLimit {
145                limit_bytes,
146                actual_bytes,
147            } => {
148                let limit_mb = bytes_to_mb(*limit_bytes);
149                let actual_mb = bytes_to_mb(*actual_bytes);
150                let suggested = actual_bytes * 2;
151                let suggested_limit = ((suggested / (1024 * 1024)) + 1) * (1024 * 1024);
152                let suggested_mb = bytes_to_mb(suggested_limit);
153
154                format!(
155                    "Git output exceeded configured limit\n  \
156                     Limit: {limit_mb:.1} MB (set via SQRY_GIT_MAX_OUTPUT_SIZE)\n  \
157                     Actual: >{actual_mb:.1} MB\n\n  \
158                     Suggestions:\n  \
159                     - Increase limit: export SQRY_GIT_MAX_OUTPUT_SIZE={suggested_limit}  # {suggested_mb:.0} MB\n  \
160                     - Investigate large diffs: git diff --stat\n  \
161                     - Check for accidentally committed binaries"
162                )
163            }
164            other => format!("{other}"),
165        }
166    }
167}
168
/// Convert a byte count into mebibytes (1 MB = 1024 × 1024 bytes) for display.
#[inline]
#[allow(clippy::cast_precision_loss)] // MB conversion uses human-readable floats; loss is acceptable
fn bytes_to_mb(bytes: usize) -> f64 {
    const BYTES_PER_MB: f64 = 1024.0 * 1024.0;

    let as_float = bytes as f64;
    as_float / BYTES_PER_MB
}
174
/// A batch of file changes reported by git, grouped by kind of change
///
/// **Path Semantics**: every path is relative to the repository root, never
/// absolute. Example: `src/main.rs` not `/home/user/project/src/main.rs`
///
/// **Security**: callers must canonicalize these paths and verify they stay
/// under the workspace root before using them, to prevent path traversal
/// attacks.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ChangeSet {
    /// Newly created files
    pub added: Vec<PathBuf>,

    /// Existing files whose content changed
    pub modified: Vec<PathBuf>,

    /// Files that were removed
    pub deleted: Vec<PathBuf>,

    /// Renames, stored as (`old_path`, `new_path`) pairs
    pub renamed: Vec<(PathBuf, PathBuf)>,
}

impl ChangeSet {
    /// Build a change set with no entries in any category
    #[must_use]
    pub fn new() -> Self {
        Self {
            added: Vec::new(),
            modified: Vec::new(),
            deleted: Vec::new(),
            renamed: Vec::new(),
        }
    }

    /// Number of recorded changes across all categories
    ///
    /// A rename counts once, even though it carries two paths.
    #[must_use]
    pub fn total(&self) -> usize {
        let per_category = [
            self.added.len(),
            self.modified.len(),
            self.deleted.len(),
            self.renamed.len(),
        ];
        per_category.iter().sum()
    }

    /// Whether every category is empty
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.added.is_empty()
            && self.modified.is_empty()
            && self.deleted.is_empty()
            && self.renamed.is_empty()
    }
}
216
/// Git backend abstraction for change detection
///
/// This trait provides a consistent interface for different git
/// backend implementations (subprocess, libgit2, etc.).
///
/// Implementors must be `Send + Sync`, so a backend can be shared across
/// threads. Every method takes the path to operate on as an argument rather
/// than reading it from `self`.
pub trait GitBackend: Send + Sync {
    /// Check if the path is a git repository
    ///
    /// Returns `Ok(true)` if path is a git repo, `Ok(false)` otherwise.
    /// Returns `Err` for permission errors or IO failures to surface
    /// them instead of silently falling back.
    ///
    /// # Errors
    ///
    /// Propagates [`GitError`] when repository detection fails (for example,
    /// when the directory cannot be accessed or git is unavailable).
    fn is_repo(&self, root: &Path) -> Result<bool>;

    /// Get the repository root path
    ///
    /// This handles git worktrees correctly (where `.git` is a file
    /// pointing to the actual git directory).
    ///
    /// Returns the canonicalized absolute path to the repository root.
    ///
    /// # Errors
    ///
    /// Returns [`GitError`] when git metadata cannot be inspected or when the
    /// path cannot be canonicalized.
    fn repo_root(&self, root: &Path) -> Result<PathBuf>;

    /// Get the current HEAD commit SHA
    ///
    /// Returns `Ok(Some(sha))` for repos with commits.
    /// Returns `Ok(None)` for newly initialized repos without commits.
    /// Returns `Err` for permission errors or command failures.
    ///
    /// # Errors
    ///
    /// Returns [`GitError`] when invoking git fails or when the output cannot
    /// be parsed.
    fn head(&self, root: &Path) -> Result<Option<String>>;

    /// Get uncommitted changes (index + working tree)
    ///
    /// Returns a tuple of `(ChangeSet, current_head)` to avoid race
    /// conditions between querying changes and querying HEAD separately.
    ///
    /// # Arguments
    ///
    /// * `root` - Repository root path
    /// * `include_untracked` - Whether to include untracked files
    ///
    /// # Returns
    ///
    /// * `ChangeSet` - Files changed in index and working tree
    /// * `Option<String>` - Current HEAD commit SHA (None if no commits)
    ///
    /// # Errors
    ///
    /// Returns [`GitError`] when git commands fail, time out, or output parsing
    /// detects malformed entries.
    fn uncommitted(
        &self,
        root: &Path,
        include_untracked: bool,
    ) -> Result<(ChangeSet, Option<String>)>;

    /// Get changes since a baseline commit
    ///
    /// Returns a tuple of `(ChangeSet, current_head)` to avoid race
    /// conditions between diff and HEAD query.
    ///
    /// # Arguments
    ///
    /// * `root` - Repository root path
    /// * `baseline` - Baseline commit SHA to compare against
    /// * `rename_similarity` - Rename detection threshold (0-100)
    ///
    /// # Returns
    ///
    /// * `ChangeSet` - Files changed between baseline and HEAD
    /// * `Option<String>` - Current HEAD commit SHA (None if no commits)
    ///
    /// # Errors
    ///
    /// Returns `Err` if baseline commit doesn't exist (e.g., shallow clone
    /// where baseline was pruned). Caller should fall back to hash-based
    /// change detection.
    fn since(
        &self,
        root: &Path,
        baseline: &str,
        rename_similarity: u8,
    ) -> Result<(ChangeSet, Option<String>)>;

    /// Get backend-specific capabilities
    ///
    /// This allows backends to advertise optional features like
    /// blame, time-travel indexing, etc. Used for enterprise features.
    ///
    /// The provided implementation returns `GitCapabilities::default()`;
    /// backends override it to report the features they support.
    fn capabilities(&self) -> GitCapabilities {
        GitCapabilities::default()
    }
}
319
/// Optional features a git backend can advertise
///
/// Each flag describes one capability. The derived `Default` sets every flag
/// to `false`, i.e. "nothing beyond basic change detection". For example, a
/// git2 backend can provide blame info, while the subprocess backend cannot.
#[derive(Clone, Debug, Default)]
pub struct GitCapabilities {
    /// `true` when the backend can produce blame overlays
    pub supports_blame: bool,

    /// `true` when the backend can perform time-travel indexing
    pub supports_time_travel: bool,

    /// `true` when the backend can perform historical indexing
    pub supports_history_index: bool,
}
335
336/// High-level facade for git change tracking
337///
338/// This provides a convenient API over the `GitBackend` trait with
339/// caching and error handling.
340pub struct GitChangeTracker {
341    backend: Box<dyn GitBackend>,
342    root: PathBuf,
343    cached_head: Option<String>,
344}
345
346impl GitChangeTracker {
347    /// Create a new git change tracker
348    ///
349    /// Automatically selects the appropriate backend based on the
350    /// `SQRY_GIT_BACKEND` environment variable:
351    /// - `auto` (default): Use subprocess git, fall back to `NoGit`
352    /// - `subprocess`: Force subprocess git (fail if git not found)
353    /// - `none`: Always use `NoGit` (disable git integration)
354    ///
355    /// # Errors
356    ///
357    /// Returns `GitError::NotFound` if git binary is not in PATH and
358    /// backend is set to `subprocess`.
359    ///
360    /// Returns `GitError::NotARepo` if path is not a git repository
361    /// (only when backend is `subprocess` or `auto` with git available).
362    pub fn new(root: &Path) -> Result<Self> {
363        let backend_type = std::env::var("SQRY_GIT_BACKEND").unwrap_or_else(|_| "auto".to_string());
364
365        let backend: Box<dyn GitBackend> = match backend_type.as_str() {
366            "subprocess" => {
367                let subprocess = SubprocessGit::new();
368                if !subprocess.is_repo(root)? {
369                    return Err(GitError::NotARepo(root.to_path_buf()));
370                }
371                Box::new(subprocess)
372            }
373            "none" => Box::new(NoGit),
374            _ => {
375                let subprocess = SubprocessGit::new();
376                match subprocess.is_repo(root) {
377                    Ok(true) => Box::new(subprocess),
378                    Ok(false) => return Err(GitError::NotARepo(root.to_path_buf())),
379                    Err(GitError::NotFound) => Box::new(NoGit),
380                    Err(e) => return Err(e),
381                }
382            }
383        };
384
385        Ok(Self {
386            backend,
387            root: root.to_path_buf(),
388            cached_head: None,
389        })
390    }
391
392    /// Detect changes since last indexed commit
393    ///
394    /// If `baseline` is `None`, returns uncommitted changes only.
395    /// If `baseline` is `Some(commit_sha)`, returns changes since that commit.
396    ///
397    /// # Returns
398    ///
399    /// * `ChangeSet` - Files changed
400    /// * `Option<String>` - Current HEAD commit SHA (for updating baseline)
401    ///
402    /// # Configuration
403    ///
404    /// Respects these environment variables:
405    /// - `SQRY_GIT_INCLUDE_UNTRACKED`: Include untracked files (default: 1)
406    /// - `SQRY_GIT_RENAME_SIMILARITY`: Rename detection threshold 0-100 (default: 50)
407    ///
408    /// # Errors
409    ///
410    /// Returns [`GitError`] when the underlying backend encounters IO failures,
411    /// the repository is not available, or git command output is malformed.
412    pub fn detect_changes(
413        &mut self,
414        baseline: Option<&str>,
415    ) -> Result<(ChangeSet, Option<String>)> {
416        let include_untracked = std::env::var("SQRY_GIT_INCLUDE_UNTRACKED")
417            .ok()
418            .and_then(|v| v.parse::<u8>().ok())
419            != Some(0);
420
421        let rename_similarity = std::env::var("SQRY_GIT_RENAME_SIMILARITY")
422            .ok()
423            .and_then(|v| v.parse::<u8>().ok())
424            .map_or(50, |v| v.clamp(0, 100));
425
426        if let Some(baseline_sha) = baseline {
427            let (changes, new_head) =
428                self.backend
429                    .since(&self.root, baseline_sha, rename_similarity)?;
430            self.cached_head.clone_from(&new_head);
431            Ok((changes, new_head))
432        } else {
433            let (changes, new_head) = self.backend.uncommitted(&self.root, include_untracked)?;
434            self.cached_head.clone_from(&new_head);
435            Ok((changes, new_head))
436        }
437    }
438
439    /// Get the current HEAD commit SHA
440    ///
441    /// Uses cached value if available, otherwise queries git.
442    ///
443    /// # Errors
444    ///
445    /// Returns [`GitError`] when requesting the HEAD from the backend fails.
446    pub fn head(&mut self) -> Result<Option<String>> {
447        if let Some(ref head) = self.cached_head {
448            return Ok(Some(head.clone()));
449        }
450
451        let head = self.backend.head(&self.root)?;
452        self.cached_head.clone_from(&head);
453        Ok(head)
454    }
455
456    /// Get the repository root path
457    ///
458    /// # Errors
459    ///
460    /// Returns [`GitError`] when the backend cannot determine or canonicalize
461    /// the root directory.
462    pub fn repo_root(&self) -> Result<PathBuf> {
463        self.backend.repo_root(&self.root)
464    }
465
466    /// Get backend capabilities
467    #[must_use]
468    pub fn capabilities(&self) -> GitCapabilities {
469        self.backend.capabilities()
470    }
471}
472
#[cfg(test)]
mod tests {
    use super::*;

    /// `total` counts one entry per change in every category and `is_empty`
    /// follows it.
    #[test]
    fn test_changeset_total() {
        let mut set = ChangeSet::new();
        assert!(set.is_empty());
        assert_eq!(set.total(), 0);

        set.added.push(PathBuf::from("a.rs"));
        set.modified.push(PathBuf::from("b.rs"));
        set.deleted.push(PathBuf::from("c.rs"));
        let rename = (PathBuf::from("d.rs"), PathBuf::from("e.rs"));
        set.renamed.push(rename);

        assert!(!set.is_empty());
        assert_eq!(set.total(), 4);
    }

    /// Two change sets compare equal exactly when all categories match.
    #[test]
    fn test_changeset_equality() {
        let mut left = ChangeSet::new();
        let mut right = ChangeSet::new();
        left.added.push(PathBuf::from("a.rs"));
        right.added.push(PathBuf::from("a.rs"));

        assert_eq!(left, right);

        right.modified.push(PathBuf::from("b.rs"));
        assert_ne!(left, right);
    }
}