Skip to main content

thoughts_tool/
repo_identity.rs

1//! Canonical repository identity normalization.
2//!
3//! This module provides `RepoIdentity` as the single source of truth for repository identity,
4//! enabling consistent URL normalization across SSH, HTTPS, and various git hosting formats.
5
6use anyhow::{Result, bail};
7
/// Maximum allowed subgroup nesting depth (GitLab supports up to 20 levels).
///
/// `RepoIdentity::parse` rejects any URL whose path contains more than
/// `MAX_SUBGROUP_DEPTH + 1` non-empty segments (presumably the extra slot
/// accounts for the repository name itself).
const MAX_SUBGROUP_DEPTH: usize = 20;
10
/// Normalized identity of a repository, derived from a git remote URL.
///
/// Different spellings of the same remote (SSH vs. HTTPS, with or without a
/// trailing `.git`) parse to the same identity, which makes this type suitable
/// for matching and deduplicating repositories.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct RepoIdentity {
    /// Lowercased host name, for example `"github.com"`.
    pub host: String,
    /// Slash-joined organization path, for example `"org"` or, for GitLab
    /// subgroups, `"group/subgroup"`.
    pub org_path: String,
    /// Repository name without a `.git` suffix or trailing slash.
    pub repo: String,
}
24
/// Hashable lookup key used to match and deduplicate repository identities.
///
/// Every field is stored lowercased so that comparisons are case-insensitive.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct RepoIdentityKey {
    /// Lowercased host name.
    pub host: String,
    /// Lowercased organization path.
    pub org_path: String,
    /// Lowercased repository name.
    pub repo: String,
}
34
35impl RepoIdentity {
36    /// Parse a git URL into a RepoIdentity.
37    ///
38    /// Supported formats:
39    /// - SSH scp-like: `git@github.com:org/repo.git`
40    /// - SSH with port: `ssh://git@host:2222/org/repo.git`
41    /// - HTTPS: `https://github.com/org/repo` or `https://github.com/org/repo.git`
42    /// - GitLab subgroups: `https://gitlab.com/a/b/c/repo.git`
43    /// - Azure DevOps: `https://dev.azure.com/org/proj/_git/repo`
44    ///
45    /// # Errors
46    /// Returns an error if the URL cannot be parsed or has invalid structure.
47    pub fn parse(url: &str) -> Result<Self> {
48        let url = url.trim();
49
50        // Determine URL type and extract host + path
51        let (host, path) = if url.starts_with("git@") {
52            // SSH scp-like: git@host:path
53            parse_scp_url(url)?
54        } else if url.starts_with("ssh://") {
55            // SSH with scheme: ssh://[user@]host[:port]/path
56            parse_ssh_scheme_url(url)?
57        } else if url.starts_with("https://") || url.starts_with("http://") {
58            // HTTPS/HTTP: scheme://[user@]host[:port]/path
59            parse_https_url(url)?
60        } else {
61            bail!("Unsupported URL format: {}", url);
62        };
63
64        // Normalize path: remove trailing slashes and .git suffix
65        let path = path
66            .trim_end_matches('/')
67            .trim_end_matches(".git")
68            .trim_end_matches('/');
69
70        // Split path into segments and validate
71        let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
72
73        if segments.is_empty() {
74            bail!("URL has no path segments: {}", url);
75        }
76
77        // Check for invalid segments
78        for seg in &segments {
79            if *seg == "." || *seg == ".." {
80                bail!("Invalid path segment '{}' in URL: {}", seg, url);
81            }
82        }
83
84        if segments.len() > MAX_SUBGROUP_DEPTH + 1 {
85            bail!(
86                "Path has too many segments ({}, max {}): {}",
87                segments.len(),
88                MAX_SUBGROUP_DEPTH + 1,
89                url
90            );
91        }
92
93        // Handle Azure DevOps special case: org/proj/_git/repo
94        let (org_path, repo) = if let Some(git_idx) = segments.iter().position(|s| *s == "_git") {
95            if git_idx + 1 >= segments.len() {
96                bail!("Azure DevOps URL missing repo after _git: {}", url);
97            }
98            let org_segments = &segments[..git_idx];
99            let repo = segments[git_idx + 1];
100            (org_segments.join("/"), repo.to_string())
101        } else if segments.len() == 1 {
102            // Single segment: treat as repo with empty org (unusual but valid for some hosts)
103            (String::new(), segments[0].to_string())
104        } else {
105            // Standard case: all but last segment is org_path, last is repo
106            let org_segments = &segments[..segments.len() - 1];
107            let repo = segments[segments.len() - 1];
108            (org_segments.join("/"), repo.to_string())
109        };
110
111        Ok(Self {
112            host: host.to_lowercase(),
113            org_path,
114            repo,
115        })
116    }
117
118    /// Get the canonical key for identity-based lookups.
119    ///
120    /// All fields are lowercased for case-insensitive matching.
121    pub fn canonical_key(&self) -> RepoIdentityKey {
122        RepoIdentityKey {
123            host: self.host.to_lowercase(),
124            org_path: self.org_path.to_lowercase(),
125            repo: self.repo.to_lowercase(),
126        }
127    }
128}
129
130/// Parse SSH scp-like URL: `git@host:path` or `user@host:path`
131fn parse_scp_url(url: &str) -> Result<(String, String)> {
132    // Format: [user@]host:path
133    let without_user = url.find('@').map(|i| &url[i + 1..]).unwrap_or(url);
134
135    let colon_pos = without_user
136        .find(':')
137        .ok_or_else(|| anyhow::anyhow!("Invalid scp-like URL (missing colon): {}", url))?;
138
139    let host = &without_user[..colon_pos];
140    let path = &without_user[colon_pos + 1..];
141
142    if host.is_empty() {
143        bail!("Empty host in URL: {}", url);
144    }
145
146    Ok((host.to_string(), path.to_string()))
147}
148
149/// Parse SSH scheme URL: `ssh://[user@]host[:port]/path`
150fn parse_ssh_scheme_url(url: &str) -> Result<(String, String)> {
151    let without_scheme = url
152        .strip_prefix("ssh://")
153        .ok_or_else(|| anyhow::anyhow!("Not an SSH URL: {}", url))?;
154
155    // Strip userinfo if present
156    let without_user = without_scheme
157        .find('@')
158        .map(|i| &without_scheme[i + 1..])
159        .unwrap_or(without_scheme);
160
161    // Find the first slash (separates host[:port] from path)
162    let slash_pos = without_user
163        .find('/')
164        .ok_or_else(|| anyhow::anyhow!("SSH URL missing path: {}", url))?;
165
166    let host_port = &without_user[..slash_pos];
167    let path = &without_user[slash_pos + 1..];
168
169    // Extract host (strip port if present)
170    let host = host_port
171        .split(':')
172        .next()
173        .ok_or_else(|| anyhow::anyhow!("Empty host in URL: {}", url))?;
174
175    if host.is_empty() {
176        bail!("Empty host in URL: {}", url);
177    }
178
179    Ok((host.to_string(), path.to_string()))
180}
181
182/// Parse HTTPS/HTTP URL: `scheme://[user@]host[:port]/path`
183fn parse_https_url(url: &str) -> Result<(String, String)> {
184    let scheme_end = url
185        .find("://")
186        .ok_or_else(|| anyhow::anyhow!("Invalid URL (missing ://): {}", url))?;
187
188    let without_scheme = &url[scheme_end + 3..];
189
190    // Strip userinfo if present
191    let without_user = without_scheme
192        .find('@')
193        .map(|i| &without_scheme[i + 1..])
194        .unwrap_or(without_scheme);
195
196    // Find the first slash (separates host[:port] from path)
197    let slash_pos = without_user
198        .find('/')
199        .ok_or_else(|| anyhow::anyhow!("URL missing path: {}", url))?;
200
201    let host_port = &without_user[..slash_pos];
202    let path = &without_user[slash_pos + 1..];
203
204    // Extract host (strip port if present)
205    let host = host_port
206        .split(':')
207        .next()
208        .ok_or_else(|| anyhow::anyhow!("Empty host in URL: {}", url))?;
209
210    if host.is_empty() {
211        bail!("Empty host in URL: {}", url);
212    }
213
214    Ok((host.to_string(), path.to_string()))
215}
216
217/// Split `url` into (base_url, optional_subpath) using a last-colon heuristic.
218///
219/// Treats it as `URL:subpath` only if the base portion parses as a valid `RepoIdentity`.
220/// This avoids confusing `host:port` for a subpath delimiter.
221///
222/// # Examples
223/// ```ignore
224/// // No subpath
225/// parse_url_and_subpath("git@github.com:org/repo.git")
226///   => ("git@github.com:org/repo.git", None)
227///
228/// // With subpath
229/// parse_url_and_subpath("git@github.com:org/repo.git:docs/api")
230///   => ("git@github.com:org/repo.git", Some("docs/api"))
231///
232/// // SSH with port (port is NOT a subpath)
233/// parse_url_and_subpath("ssh://git@host:2222/org/repo.git")
234///   => ("ssh://git@host:2222/org/repo.git", None)
235///
236/// // SSH with port AND subpath
237/// parse_url_and_subpath("ssh://git@host:2222/org/repo.git:docs/api")
238///   => ("ssh://git@host:2222/org/repo.git", Some("docs/api"))
239/// ```
240pub fn parse_url_and_subpath(url: &str) -> (String, Option<String>) {
241    // Strategy: find the rightmost colon and check if the left side parses as a valid URL.
242    // If it does, the right side is a subpath. If not, there's no subpath.
243
244    // Handle scheme-based URLs: ssh://, https://, http://
245    // For these, we need to be careful about host:port patterns
246
247    let url = url.trim();
248
249    // Try splitting from the right
250    if let Some(colon_pos) = url.rfind(':') {
251        let potential_base = &url[..colon_pos];
252        let potential_subpath = &url[colon_pos + 1..];
253
254        // Don't split if subpath is empty
255        if potential_subpath.is_empty() {
256            return (url.to_string(), None);
257        }
258
259        // Don't split if subpath looks like a port (all digits)
260        if potential_subpath.chars().all(|c| c.is_ascii_digit()) {
261            return (url.to_string(), None);
262        }
263
264        // Don't split if potential_base is empty or just a scheme
265        if potential_base.is_empty() || potential_base.ends_with("//") {
266            return (url.to_string(), None);
267        }
268
269        // Try parsing the base as a RepoIdentity
270        if RepoIdentity::parse(potential_base).is_ok() {
271            return (
272                potential_base.to_string(),
273                Some(potential_subpath.to_string()),
274            );
275        }
276    }
277
278    (url.to_string(), None)
279}
280
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `url` (panicking on failure) and checks every identity field.
    fn assert_identity(url: &str, host: &str, org_path: &str, repo: &str) {
        let id = RepoIdentity::parse(url).unwrap();
        assert_eq!(id.host, host);
        assert_eq!(id.org_path, org_path);
        assert_eq!(id.repo, repo);
    }

    /// Runs `parse_url_and_subpath` on `input` and checks both halves.
    fn assert_split(input: &str, base: &str, subpath: Option<&str>) {
        let (url, sub) = parse_url_and_subpath(input);
        assert_eq!(url, base);
        assert_eq!(sub, subpath.map(|s| s.to_string()));
    }

    /// Parses `url` (panicking on failure) and returns its canonical key.
    fn key_of(url: &str) -> RepoIdentityKey {
        RepoIdentity::parse(url).unwrap().canonical_key()
    }

    // ===== RepoIdentity::parse tests =====

    #[test]
    fn test_parse_ssh_scp_basic() {
        assert_identity("git@github.com:org/repo.git", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_ssh_scp_no_git_suffix() {
        assert_identity("git@github.com:org/repo", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_https_basic() {
        assert_identity("https://github.com/org/repo", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_https_with_git_suffix() {
        assert_identity("https://github.com/org/repo.git", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_https_trailing_slash() {
        assert_identity("https://github.com/org/repo/", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_ssh_with_port() {
        assert_identity(
            "ssh://git@host.example.com:2222/org/repo.git",
            "host.example.com",
            "org",
            "repo",
        );
    }

    #[test]
    fn test_parse_gitlab_subgroups() {
        assert_identity(
            "https://gitlab.com/group/subgroup/team/repo.git",
            "gitlab.com",
            "group/subgroup/team",
            "repo",
        );
    }

    #[test]
    fn test_parse_gitlab_deep_subgroups() {
        assert_identity(
            "https://gitlab.com/a/b/c/d/e/repo.git",
            "gitlab.com",
            "a/b/c/d/e",
            "repo",
        );
    }

    #[test]
    fn test_parse_azure_devops() {
        assert_identity(
            "https://dev.azure.com/myorg/myproj/_git/myrepo",
            "dev.azure.com",
            "myorg/myproj",
            "myrepo",
        );
    }

    #[test]
    fn test_parse_host_case_normalized() {
        // Only the host is lowercased; org_path and repo preserve case.
        assert_identity("https://GitHub.COM/Org/Repo", "github.com", "Org", "Repo");
    }

    #[test]
    fn test_parse_http_scheme() {
        assert_identity("http://github.com/org/repo", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_rejects_invalid_segments() {
        for url in ["https://github.com/../repo", "https://github.com/./repo"] {
            assert!(RepoIdentity::parse(url).is_err());
        }
    }

    #[test]
    fn test_parse_rejects_unsupported_scheme() {
        for url in ["ftp://github.com/org/repo", "org/repo"] {
            assert!(RepoIdentity::parse(url).is_err());
        }
    }

    // ===== canonical_key tests =====

    #[test]
    fn test_canonical_key_equality_across_schemes() {
        assert_eq!(
            key_of("git@github.com:User/Repo.git"),
            key_of("https://github.com/user/repo")
        );
    }

    #[test]
    fn test_canonical_key_different_repos() {
        assert_ne!(
            key_of("git@github.com:org/repo-a.git"),
            key_of("git@github.com:org/repo-b.git")
        );
    }

    #[test]
    fn test_canonical_key_different_orgs() {
        assert_ne!(
            key_of("git@github.com:alice/utils.git"),
            key_of("git@github.com:bob/utils.git")
        );
    }

    // ===== parse_url_and_subpath tests =====

    #[test]
    fn test_subpath_none_basic() {
        assert_split(
            "git@github.com:user/repo.git",
            "git@github.com:user/repo.git",
            None,
        );
    }

    #[test]
    fn test_subpath_present() {
        assert_split(
            "git@github.com:user/repo.git:docs/api",
            "git@github.com:user/repo.git",
            Some("docs/api"),
        );
    }

    #[test]
    fn test_subpath_https_none() {
        assert_split(
            "https://github.com/user/repo",
            "https://github.com/user/repo",
            None,
        );
    }

    #[test]
    fn test_subpath_ssh_port_not_confused() {
        // A port must never be reported as a subpath.
        assert_split(
            "ssh://git@host:2222/org/repo.git",
            "ssh://git@host:2222/org/repo.git",
            None,
        );
    }

    #[test]
    fn test_subpath_ssh_port_with_actual_subpath() {
        assert_split(
            "ssh://git@host:2222/org/repo.git:docs/api",
            "ssh://git@host:2222/org/repo.git",
            Some("docs/api"),
        );
    }

    #[test]
    fn test_subpath_empty_subpath_ignored() {
        assert_split(
            "git@github.com:user/repo.git:",
            "git@github.com:user/repo.git:",
            None,
        );
    }
}
454        let (url, sub) = parse_url_and_subpath("git@github.com:user/repo.git:");
455        assert_eq!(url, "git@github.com:user/repo.git:");
456        assert_eq!(sub, None);
457    }
458}