Skip to main content

cc_audit/remote/
clone.rs

1use super::error::RemoteError;
2use std::io::Write;
3use std::path::{Path, PathBuf};
4use std::process::{Command, Stdio};
5use std::sync::LazyLock;
6use std::time::Duration;
7use tempfile::{NamedTempFile, TempDir};
8
9static TOKEN_URL_PATTERN: LazyLock<regex::Regex> = LazyLock::new(|| {
10    regex::Regex::new(r"https://[^@\s]+@").expect("TOKEN_URL_PATTERN is a valid regex literal")
11});
12
13static BEARER_PATTERN: LazyLock<regex::Regex> = LazyLock::new(|| {
14    regex::Regex::new(r"Bearer\s+\S+").expect("BEARER_PATTERN is a valid regex literal")
15});
16
17/// Result of a successful clone operation
18pub struct ClonedRepo {
19    /// Path to the cloned repository
20    pub path: PathBuf,
21    /// Original repository URL
22    pub url: String,
23    /// Git ref that was checked out
24    pub git_ref: String,
25    /// Commit SHA of the checked out ref
26    pub commit_sha: Option<String>,
27    /// Temporary directory handle (dropped when ClonedRepo is dropped)
28    _temp_dir: TempDir,
29}
30
31impl ClonedRepo {
32    /// Get the path to the cloned repository
33    pub fn path(&self) -> &Path {
34        &self.path
35    }
36}
37
/// Git repository cloner with security measures.
///
/// Configured through the `with_*` builder methods and used via `clone`.
pub struct GitCloner {
    /// Token used to authenticate against private repositories, if any
    auth_token: Option<String>,
    /// How long a clone may run, in seconds
    timeout_secs: u64,
    /// Upper bound on repository size in MB (0 = unlimited)
    max_size_mb: u64,
}
47
48impl Default for GitCloner {
49    fn default() -> Self {
50        Self::new()
51    }
52}
53
54impl GitCloner {
55    /// Create a new GitCloner with default settings
56    pub fn new() -> Self {
57        Self {
58            auth_token: None,
59            timeout_secs: 300, // 5 minutes
60            max_size_mb: 0,    // unlimited
61        }
62    }
63
64    /// Set authentication token for private repositories
65    pub fn with_auth_token(mut self, token: Option<String>) -> Self {
66        self.auth_token = token;
67        self
68    }
69
70    /// Set clone timeout in seconds
71    pub fn with_timeout(mut self, secs: u64) -> Self {
72        self.timeout_secs = secs;
73        self
74    }
75
76    /// Set maximum repository size in MB
77    pub fn with_max_size(mut self, mb: u64) -> Self {
78        self.max_size_mb = mb;
79        self
80    }
81
82    /// Clone a repository with security measures
83    ///
84    /// Security measures:
85    /// - Uses shallow clone (depth=1)
86    /// - Disables git hooks (template and local)
87    /// - Uses temporary directory that is automatically cleaned up
88    /// - Token is passed via GIT_ASKPASS (not embedded in URL)
89    /// - Clone has configurable timeout
90    pub fn clone(&self, url: &str, git_ref: &str) -> Result<ClonedRepo, RemoteError> {
91        // Validate URL format
92        self.validate_url(url)?;
93
94        // Check if git is available
95        self.check_git_available()?;
96
97        // Create temporary directory
98        let temp_dir = TempDir::new().map_err(|e| RemoteError::TempDir(e.to_string()))?;
99        let repo_path = temp_dir.path().to_path_buf();
100
101        // Execute git clone with security measures (token via env, not URL)
102        self.execute_clone(url, &repo_path, git_ref)?;
103
104        // Get commit SHA
105        let commit_sha = self.get_commit_sha(&repo_path).ok();
106
107        Ok(ClonedRepo {
108            path: repo_path,
109            url: url.to_string(),
110            git_ref: git_ref.to_string(),
111            commit_sha,
112            _temp_dir: temp_dir,
113        })
114    }
115
116    /// Validate the repository URL format
117    fn validate_url(&self, url: &str) -> Result<(), RemoteError> {
118        // Check for basic URL structure
119        if !url.starts_with("https://") && !url.starts_with("git@") {
120            return Err(RemoteError::InvalidUrl(format!(
121                "URL must start with https:// or git@: {}",
122                url
123            )));
124        }
125
126        // Check for GitHub URL format
127        if url.starts_with("https://github.com/") || url.starts_with("git@github.com:") {
128            // Valid GitHub URL
129            return Ok(());
130        }
131
132        // Allow other HTTPS URLs but warn about non-GitHub sources
133        if url.starts_with("https://") {
134            return Ok(());
135        }
136
137        Err(RemoteError::InvalidUrl(format!(
138            "Unsupported URL format: {}",
139            url
140        )))
141    }
142
143    /// Check if git command is available
144    fn check_git_available(&self) -> Result<(), RemoteError> {
145        Command::new("git")
146            .arg("--version")
147            .output()
148            .map_err(|_| RemoteError::GitNotFound)?;
149        Ok(())
150    }
151
152    /// Create a temporary GIT_ASKPASS script that returns the token.
153    /// This is more secure than embedding the token in the URL because:
154    /// - Token is not visible in process list (ps aux)
155    /// - Token is not logged in git error messages
156    /// - Script is automatically cleaned up
157    fn create_askpass_script(&self) -> Result<Option<NamedTempFile>, RemoteError> {
158        let Some(ref token) = self.auth_token else {
159            return Ok(None);
160        };
161
162        let mut script = NamedTempFile::new().map_err(|e| RemoteError::TempDir(e.to_string()))?;
163
164        // Write a shell script that outputs the token
165        // The script receives the prompt as an argument but we ignore it
166        writeln!(script, "#!/bin/sh").map_err(|e| RemoteError::TempDir(e.to_string()))?;
167        writeln!(script, "echo '{}'", token.replace('\'', "'\"'\"'"))
168            .map_err(|e| RemoteError::TempDir(e.to_string()))?;
169
170        // Make the script executable (Unix only)
171        #[cfg(unix)]
172        {
173            use std::os::unix::fs::PermissionsExt;
174            let path = script.path();
175            std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o700))
176                .map_err(|e| RemoteError::TempDir(e.to_string()))?;
177        }
178
179        Ok(Some(script))
180    }
181
182    /// Sanitize error messages to remove any potential token leakage.
183    fn sanitize_error_message(&self, message: &str) -> String {
184        let mut sanitized = message.to_string();
185
186        // Remove any token-like patterns from error messages
187        if let Some(ref token) = self.auth_token {
188            sanitized = sanitized.replace(token, "[REDACTED]");
189        }
190
191        // Remove patterns that look like tokens embedded in URLs
192        // Pattern: https://TOKEN@github.com or similar
193        sanitized = TOKEN_URL_PATTERN
194            .replace_all(&sanitized, "https://[REDACTED]@")
195            .to_string();
196
197        // Also redact Bearer tokens
198        sanitized = BEARER_PATTERN
199            .replace_all(&sanitized, "Bearer [REDACTED]")
200            .to_string();
201
202        sanitized
203    }
204
205    /// Execute git clone command with security measures and timeout.
206    fn execute_clone(&self, url: &str, path: &Path, git_ref: &str) -> Result<(), RemoteError> {
207        // Create askpass script for secure token handling
208        let askpass_script = self.create_askpass_script()?;
209
210        // Build the git clone command with security measures
211        let mut cmd = Command::new("git");
212
213        // Disable hooks for security
214        cmd.env("GIT_TEMPLATE_DIR", "");
215
216        // Set up authentication via GIT_ASKPASS if we have a token
217        if let Some(ref script) = askpass_script {
218            cmd.env("GIT_ASKPASS", script.path());
219            // Disable terminal prompts to force use of ASKPASS
220            cmd.env("GIT_TERMINAL_PROMPT", "0");
221        }
222
223        // Clone with shallow depth
224        cmd.args([
225            "clone",
226            "--depth",
227            "1",
228            "--single-branch",
229            "--no-tags",
230            "-c",
231            "core.hooksPath=/dev/null",
232            "-c",
233            "advice.detachedHead=false",
234        ]);
235
236        // Add branch/ref if not HEAD
237        if git_ref != "HEAD" && !git_ref.is_empty() {
238            cmd.args(["--branch", git_ref]);
239        }
240
241        cmd.arg(url);
242        cmd.arg(path);
243
244        // Execute with timeout using a child process
245        cmd.stdout(Stdio::piped());
246        cmd.stderr(Stdio::piped());
247
248        let mut child = cmd.spawn().map_err(|e| RemoteError::CloneFailed {
249            url: url.to_string(),
250            message: self.sanitize_error_message(&e.to_string()),
251        })?;
252
253        // Wait with timeout
254        let timeout = Duration::from_secs(self.timeout_secs);
255        let start = std::time::Instant::now();
256
257        loop {
258            match child.try_wait() {
259                Ok(Some(status)) => {
260                    // Process finished
261                    let output =
262                        child
263                            .wait_with_output()
264                            .map_err(|e| RemoteError::CloneFailed {
265                                url: url.to_string(),
266                                message: self.sanitize_error_message(&e.to_string()),
267                            })?;
268
269                    if !status.success() {
270                        let stderr = String::from_utf8_lossy(&output.stderr);
271                        let sanitized_stderr = self.sanitize_error_message(&stderr);
272
273                        // Check for common error patterns
274                        if stderr.contains("Repository not found") || stderr.contains("404") {
275                            return Err(RemoteError::NotFound(url.to_string()));
276                        }
277
278                        if stderr.contains("Authentication failed")
279                            || stderr.contains("could not read Username")
280                        {
281                            return Err(RemoteError::AuthRequired(url.to_string()));
282                        }
283
284                        return Err(RemoteError::CloneFailed {
285                            url: url.to_string(),
286                            message: sanitized_stderr,
287                        });
288                    }
289
290                    return Ok(());
291                }
292                Ok(None) => {
293                    // Process still running, check timeout
294                    if start.elapsed() > timeout {
295                        // Kill the process
296                        let _ = child.kill();
297                        return Err(RemoteError::CloneFailed {
298                            url: url.to_string(),
299                            message: format!("Clone timed out after {} seconds", self.timeout_secs),
300                        });
301                    }
302                    // Sleep briefly before checking again
303                    std::thread::sleep(Duration::from_millis(100));
304                }
305                Err(e) => {
306                    return Err(RemoteError::CloneFailed {
307                        url: url.to_string(),
308                        message: self.sanitize_error_message(&e.to_string()),
309                    });
310                }
311            }
312        }
313    }
314
315    /// Get the commit SHA of HEAD
316    fn get_commit_sha(&self, path: &Path) -> Result<String, RemoteError> {
317        let output = Command::new("git")
318            .args(["rev-parse", "HEAD"])
319            .current_dir(path)
320            .output()
321            .map_err(|e| RemoteError::CloneFailed {
322                url: "".to_string(),
323                message: e.to_string(),
324            })?;
325
326        if output.status.success() {
327            Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
328        } else {
329            Err(RemoteError::CloneFailed {
330                url: "".to_string(),
331                message: "Failed to get commit SHA".to_string(),
332            })
333        }
334    }
335}
336
/// Parse GitHub URL to extract owner and repo name.
///
/// Supported forms:
/// - HTTPS: `https://github.com/owner/repo` or `https://github.com/owner/repo.git`
/// - SSH: `git@github.com:owner/repo.git`
///
/// Path segments after the repository name are ignored. A single trailing
/// `.git` is removed from the repository segment.
///
/// Returns `None` when the URL is not a GitHub URL in one of these forms, or
/// when the owner or repository segment is missing or empty.
pub fn parse_github_url(url: &str) -> Option<(String, String)> {
    let path = url
        .strip_prefix("https://github.com/")
        .or_else(|| url.strip_prefix("git@github.com:"))?;

    let mut segments = path.split('/');
    let owner = segments.next()?;
    let repo = segments.next()?;

    // Strip the suffix from the repo segment only, so a trailing slash or
    // extra path segments do not prevent it from being recognised.
    let repo = repo.strip_suffix(".git").unwrap_or(repo);

    if owner.is_empty() || repo.is_empty() {
        return None;
    }

    Some((owner.to_string(), repo.to_string()))
}
361
#[cfg(test)]
mod tests {
    use super::*;

    /// Build the `(owner, repo)` result the URL parser is expected to return.
    fn owner_repo(owner: &str, repo: &str) -> Option<(String, String)> {
        Some((owner.to_string(), repo.to_string()))
    }

    /// HTTPS GitHub URLs parse with and without the `.git` suffix.
    #[test]
    fn test_parse_github_url_https() {
        let inputs = [
            "https://github.com/owner/repo",
            "https://github.com/owner/repo.git",
        ];
        for input in inputs {
            assert_eq!(parse_github_url(input), owner_repo("owner", "repo"));
        }
    }

    /// SSH GitHub URLs parse to the same owner/repo pair.
    #[test]
    fn test_parse_github_url_ssh() {
        assert_eq!(
            parse_github_url("git@github.com:owner/repo.git"),
            owner_repo("owner", "repo")
        );
    }

    /// Non-GitHub hosts and non-URLs yield `None`.
    #[test]
    fn test_parse_github_url_invalid() {
        for input in ["https://gitlab.com/owner/repo", "not-a-url"] {
            assert!(parse_github_url(input).is_none());
        }
    }

    /// Any HTTPS URL passes validation, GitHub or not.
    #[test]
    fn test_validate_url_https() {
        let cloner = GitCloner::new();
        for accepted in ["https://github.com/owner/repo", "https://example.com/repo"] {
            assert!(cloner.validate_url(accepted).is_ok());
        }
    }

    /// Plain HTTP and FTP URLs are rejected.
    #[test]
    fn test_validate_url_invalid() {
        let cloner = GitCloner::new();
        for rejected in [
            "http://github.com/owner/repo",
            "ftp://github.com/owner/repo",
        ] {
            assert!(cloner.validate_url(rejected).is_err());
        }
    }

    /// The configured token and URL-embedded credentials are both redacted.
    #[test]
    fn test_sanitize_error_message() {
        let cloner = GitCloner::new().with_auth_token(Some("ghp_secret123".to_string()));

        // Test direct token replacement
        let direct = cloner.sanitize_error_message("failed with ghp_secret123 in message");
        assert_eq!(direct, "failed with [REDACTED] in message");

        // Test URL token pattern
        let from_url = cloner.sanitize_error_message("failed: https://token123@github.com/repo");
        assert!(from_url.contains("[REDACTED]"));
        assert!(!from_url.contains("token123"));
    }

    /// URL-embedded credentials are redacted even when no token is configured.
    #[test]
    fn test_sanitize_error_message_no_token() {
        let cloner = GitCloner::new();

        let cleaned = cloner.sanitize_error_message("failed: https://sometoken@github.com/repo");
        assert!(cleaned.contains("[REDACTED]"));
    }

    /// `Bearer` credentials are redacted.
    #[test]
    fn test_sanitize_bearer_token() {
        let cloner = GitCloner::new();

        let cleaned = cloner.sanitize_error_message("Authorization: Bearer ghp_secret123456");
        assert!(!cleaned.contains("ghp_secret123456"));
        assert!(cleaned.contains("[REDACTED]"));
    }

    /// With a token configured, an existing owner-executable script is produced.
    #[cfg(unix)]
    #[test]
    fn test_create_askpass_script() {
        use std::os::unix::fs::PermissionsExt;

        let cloner = GitCloner::new().with_auth_token(Some("test_token".to_string()));
        let script = cloner
            .create_askpass_script()
            .unwrap()
            .expect("a token is configured, so a script must be produced");

        // Verify script exists and is executable
        let script_path = script.path();
        assert!(script_path.exists());

        let mode = std::fs::metadata(script_path).unwrap().permissions().mode();
        assert_eq!(mode & 0o700, 0o700);
    }

    /// Without a token no script is created.
    #[test]
    fn test_create_askpass_script_no_token() {
        let script = GitCloner::new().create_askpass_script().unwrap();
        assert!(script.is_none());
    }

    /// The timeout builder stores the given value.
    #[test]
    fn test_cloner_with_timeout() {
        let GitCloner { timeout_secs, .. } = GitCloner::new().with_timeout(60);
        assert_eq!(timeout_secs, 60);
    }

    /// The size-limit builder stores the given value.
    #[test]
    fn test_cloner_with_max_size() {
        let GitCloner { max_size_mb, .. } = GitCloner::new().with_max_size(100);
        assert_eq!(max_size_mb, 100);
    }
}