Skip to main content

thoughts_tool/
repo_identity.rs

//! Canonical repository identity normalization.
//!
//! This module provides `RepoIdentity` as the single source of truth for repository identity,
//! enabling consistent URL normalization across SSH, HTTPS, and various git hosting formats.
5
6use anyhow::Result;
7use anyhow::bail;
8
/// Upper bound on the number of organization/subgroup segments accepted in a repository path.
///
/// The value mirrors GitLab, which supports up to 20 levels of nested groups. A path may
/// therefore contain at most `MAX_SUBGROUP_DEPTH + 1` segments (the groups plus the repo).
const MAX_SUBGROUP_DEPTH: usize = 20;
11
/// The normalized identity of a repository, as derived from a git remote URL.
///
/// Different spellings of the same remote (SSH vs. HTTPS, with or without a `.git`
/// suffix or trailing slash) reduce to the same `RepoIdentity`, which makes it suitable
/// for matching and deduplicating repositories.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RepoIdentity {
    /// Host name, e.g. `"github.com"`. `RepoIdentity::parse` stores it lowercased.
    pub host: String,
    /// Everything between the host and the repository name, joined with `/`.
    ///
    /// A single segment for most hosts (`"org"`), several for GitLab subgroups
    /// (`"group/subgroup"`), and empty when the URL path is just the repository name.
    pub org_path: String,
    /// Repository name, without a `.git` suffix or trailing slash.
    pub repo: String,
}
25
/// Hashable lookup key derived from a `RepoIdentity`.
///
/// `RepoIdentity::canonical_key` fills every field with its lowercased form, so two keys
/// compare equal whenever the underlying identities differ only by letter case. The type
/// derives `Hash` and `Eq`, which lets it serve as a `HashMap`/`HashSet` key.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct RepoIdentityKey {
    /// Host name component of the key.
    pub host: String,
    /// Organization / subgroup path component of the key.
    pub org_path: String,
    /// Repository name component of the key.
    pub repo: String,
}
35
36impl RepoIdentity {
37    /// Parse a git URL into a RepoIdentity.
38    ///
39    /// Supported formats:
40    /// - SSH scp-like: `git@github.com:org/repo.git`
41    /// - SSH with port: `ssh://git@host:2222/org/repo.git`
42    /// - HTTPS: `https://github.com/org/repo` or `https://github.com/org/repo.git`
43    /// - GitLab subgroups: `https://gitlab.com/a/b/c/repo.git`
44    /// - Azure DevOps: `https://dev.azure.com/org/proj/_git/repo`
45    ///
46    /// # Errors
47    /// Returns an error if the URL cannot be parsed or has invalid structure.
48    pub fn parse(url: &str) -> Result<Self> {
49        let url = url.trim();
50
51        // Determine URL type and extract host + path
52        let (host, path) = if url.starts_with("git@") {
53            // SSH scp-like: git@host:path
54            parse_scp_url(url)?
55        } else if url.starts_with("ssh://") {
56            // SSH with scheme: ssh://[user@]host[:port]/path
57            parse_ssh_scheme_url(url)?
58        } else if url.starts_with("https://") || url.starts_with("http://") {
59            // HTTPS/HTTP: scheme://[user@]host[:port]/path
60            parse_https_url(url)?
61        } else {
62            bail!("Unsupported URL format: {}", url);
63        };
64
65        // Normalize path: remove trailing slashes and .git suffix
66        let path = path
67            .trim_end_matches('/')
68            .trim_end_matches(".git")
69            .trim_end_matches('/');
70
71        // Split path into segments and validate
72        let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
73
74        if segments.is_empty() {
75            bail!("URL has no path segments: {}", url);
76        }
77
78        // Check for invalid segments
79        for seg in &segments {
80            if *seg == "." || *seg == ".." {
81                bail!("Invalid path segment '{}' in URL: {}", seg, url);
82            }
83        }
84
85        if segments.len() > MAX_SUBGROUP_DEPTH + 1 {
86            bail!(
87                "Path has too many segments ({}, max {}): {}",
88                segments.len(),
89                MAX_SUBGROUP_DEPTH + 1,
90                url
91            );
92        }
93
94        // Handle Azure DevOps special case: org/proj/_git/repo
95        let (org_path, repo) = if let Some(git_idx) = segments.iter().position(|s| *s == "_git") {
96            if git_idx + 1 >= segments.len() {
97                bail!("Azure DevOps URL missing repo after _git: {}", url);
98            }
99            let org_segments = &segments[..git_idx];
100            let repo = segments[git_idx + 1];
101            (org_segments.join("/"), repo.to_string())
102        } else if segments.len() == 1 {
103            // Single segment: treat as repo with empty org (unusual but valid for some hosts)
104            (String::new(), segments[0].to_string())
105        } else {
106            // Standard case: all but last segment is org_path, last is repo
107            let org_segments = &segments[..segments.len() - 1];
108            let repo = segments[segments.len() - 1];
109            (org_segments.join("/"), repo.to_string())
110        };
111
112        Ok(Self {
113            host: host.to_lowercase(),
114            org_path,
115            repo,
116        })
117    }
118
119    /// Get the canonical key for identity-based lookups.
120    ///
121    /// All fields are lowercased for case-insensitive matching.
122    pub fn canonical_key(&self) -> RepoIdentityKey {
123        RepoIdentityKey {
124            host: self.host.to_lowercase(),
125            org_path: self.org_path.to_lowercase(),
126            repo: self.repo.to_lowercase(),
127        }
128    }
129}
130
131/// Parse SSH scp-like URL: `git@host:path` or `user@host:path`
132fn parse_scp_url(url: &str) -> Result<(String, String)> {
133    // Format: [user@]host:path
134    let without_user = url.find('@').map(|i| &url[i + 1..]).unwrap_or(url);
135
136    let colon_pos = without_user
137        .find(':')
138        .ok_or_else(|| anyhow::anyhow!("Invalid scp-like URL (missing colon): {}", url))?;
139
140    let host = &without_user[..colon_pos];
141    let path = &without_user[colon_pos + 1..];
142
143    if host.is_empty() {
144        bail!("Empty host in URL: {}", url);
145    }
146
147    Ok((host.to_string(), path.to_string()))
148}
149
150/// Parse SSH scheme URL: `ssh://[user@]host[:port]/path`
151fn parse_ssh_scheme_url(url: &str) -> Result<(String, String)> {
152    let without_scheme = url
153        .strip_prefix("ssh://")
154        .ok_or_else(|| anyhow::anyhow!("Not an SSH URL: {}", url))?;
155
156    // Strip userinfo if present
157    let without_user = without_scheme
158        .find('@')
159        .map(|i| &without_scheme[i + 1..])
160        .unwrap_or(without_scheme);
161
162    // Find the first slash (separates host[:port] from path)
163    let slash_pos = without_user
164        .find('/')
165        .ok_or_else(|| anyhow::anyhow!("SSH URL missing path: {}", url))?;
166
167    let host_port = &without_user[..slash_pos];
168    let path = &without_user[slash_pos + 1..];
169
170    // Extract host (strip port if present)
171    let host = host_port
172        .split(':')
173        .next()
174        .ok_or_else(|| anyhow::anyhow!("Empty host in URL: {}", url))?;
175
176    if host.is_empty() {
177        bail!("Empty host in URL: {}", url);
178    }
179
180    Ok((host.to_string(), path.to_string()))
181}
182
183/// Parse HTTPS/HTTP URL: `scheme://[user@]host[:port]/path`
184fn parse_https_url(url: &str) -> Result<(String, String)> {
185    let scheme_end = url
186        .find("://")
187        .ok_or_else(|| anyhow::anyhow!("Invalid URL (missing ://): {}", url))?;
188
189    let without_scheme = &url[scheme_end + 3..];
190
191    // Strip userinfo if present
192    let without_user = without_scheme
193        .find('@')
194        .map(|i| &without_scheme[i + 1..])
195        .unwrap_or(without_scheme);
196
197    // Find the first slash (separates host[:port] from path)
198    let slash_pos = without_user
199        .find('/')
200        .ok_or_else(|| anyhow::anyhow!("URL missing path: {}", url))?;
201
202    let host_port = &without_user[..slash_pos];
203    let path = &without_user[slash_pos + 1..];
204
205    // Extract host (strip port if present)
206    let host = host_port
207        .split(':')
208        .next()
209        .ok_or_else(|| anyhow::anyhow!("Empty host in URL: {}", url))?;
210
211    if host.is_empty() {
212        bail!("Empty host in URL: {}", url);
213    }
214
215    Ok((host.to_string(), path.to_string()))
216}
217
218/// Split `url` into (base_url, optional_subpath) using a last-colon heuristic.
219///
220/// Treats it as `URL:subpath` only if the base portion parses as a valid `RepoIdentity`.
221/// This avoids confusing `host:port` for a subpath delimiter.
222///
223/// # Examples
224/// ```ignore
225/// // No subpath
226/// parse_url_and_subpath("git@github.com:org/repo.git")
227///   => ("git@github.com:org/repo.git", None)
228///
229/// // With subpath
230/// parse_url_and_subpath("git@github.com:org/repo.git:docs/api")
231///   => ("git@github.com:org/repo.git", Some("docs/api"))
232///
233/// // SSH with port (port is NOT a subpath)
234/// parse_url_and_subpath("ssh://git@host:2222/org/repo.git")
235///   => ("ssh://git@host:2222/org/repo.git", None)
236///
237/// // SSH with port AND subpath
238/// parse_url_and_subpath("ssh://git@host:2222/org/repo.git:docs/api")
239///   => ("ssh://git@host:2222/org/repo.git", Some("docs/api"))
240/// ```
241pub fn parse_url_and_subpath(url: &str) -> (String, Option<String>) {
242    // Strategy: find the rightmost colon and check if the left side parses as a valid URL.
243    // If it does, the right side is a subpath. If not, there's no subpath.
244
245    // Handle scheme-based URLs: ssh://, https://, http://
246    // For these, we need to be careful about host:port patterns
247
248    let url = url.trim();
249
250    // Try splitting from the right
251    if let Some(colon_pos) = url.rfind(':') {
252        let potential_base = &url[..colon_pos];
253        let potential_subpath = &url[colon_pos + 1..];
254
255        // Don't split if subpath is empty
256        if potential_subpath.is_empty() {
257            return (url.to_string(), None);
258        }
259
260        // Don't split if subpath looks like a port (all digits)
261        if potential_subpath.chars().all(|c| c.is_ascii_digit()) {
262            return (url.to_string(), None);
263        }
264
265        // Don't split if potential_base is empty or just a scheme
266        if potential_base.is_empty() || potential_base.ends_with("//") {
267            return (url.to_string(), None);
268        }
269
270        // Try parsing the base as a RepoIdentity
271        if RepoIdentity::parse(potential_base).is_ok() {
272            return (
273                potential_base.to_string(),
274                Some(potential_subpath.to_string()),
275            );
276        }
277    }
278
279    (url.to_string(), None)
280}
281
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `url` and check all three components of the resulting identity.
    #[track_caller]
    fn assert_identity(url: &str, host: &str, org_path: &str, repo: &str) {
        let id = RepoIdentity::parse(url).unwrap();
        assert_eq!(id.host, host);
        assert_eq!(id.org_path, org_path);
        assert_eq!(id.repo, repo);
    }

    /// Parse `url` and return its canonical key.
    #[track_caller]
    fn key_of(url: &str) -> RepoIdentityKey {
        RepoIdentity::parse(url).unwrap().canonical_key()
    }

    /// Split `input` and check both the base URL and the optional subpath.
    #[track_caller]
    fn assert_split(input: &str, base: &str, subpath: Option<&str>) {
        let (url, sub) = parse_url_and_subpath(input);
        assert_eq!(url, base);
        assert_eq!(sub.as_deref(), subpath);
    }

    // ===== RepoIdentity::parse tests =====

    #[test]
    fn test_parse_ssh_scp_basic() {
        assert_identity("git@github.com:org/repo.git", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_ssh_scp_no_git_suffix() {
        assert_identity("git@github.com:org/repo", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_https_basic() {
        assert_identity("https://github.com/org/repo", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_https_with_git_suffix() {
        assert_identity("https://github.com/org/repo.git", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_https_trailing_slash() {
        assert_identity("https://github.com/org/repo/", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_ssh_with_port() {
        assert_identity(
            "ssh://git@host.example.com:2222/org/repo.git",
            "host.example.com",
            "org",
            "repo",
        );
    }

    #[test]
    fn test_parse_gitlab_subgroups() {
        assert_identity(
            "https://gitlab.com/group/subgroup/team/repo.git",
            "gitlab.com",
            "group/subgroup/team",
            "repo",
        );
    }

    #[test]
    fn test_parse_gitlab_deep_subgroups() {
        assert_identity(
            "https://gitlab.com/a/b/c/d/e/repo.git",
            "gitlab.com",
            "a/b/c/d/e",
            "repo",
        );
    }

    #[test]
    fn test_parse_azure_devops() {
        assert_identity(
            "https://dev.azure.com/myorg/myproj/_git/myrepo",
            "dev.azure.com",
            "myorg/myproj",
            "myrepo",
        );
    }

    #[test]
    fn test_parse_host_case_normalized() {
        // Only the host is lowercased; org_path and repo preserve case.
        assert_identity("https://GitHub.COM/Org/Repo", "github.com", "Org", "Repo");
    }

    #[test]
    fn test_parse_http_scheme() {
        assert_identity("http://github.com/org/repo", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_rejects_invalid_segments() {
        for url in ["https://github.com/../repo", "https://github.com/./repo"] {
            assert!(RepoIdentity::parse(url).is_err());
        }
    }

    #[test]
    fn test_parse_rejects_unsupported_scheme() {
        for url in ["ftp://github.com/org/repo", "org/repo"] {
            assert!(RepoIdentity::parse(url).is_err());
        }
    }

    // ===== canonical_key tests =====

    #[test]
    fn test_canonical_key_equality_across_schemes() {
        assert_eq!(
            key_of("git@github.com:User/Repo.git"),
            key_of("https://github.com/user/repo")
        );
    }

    #[test]
    fn test_canonical_key_different_repos() {
        assert_ne!(
            key_of("git@github.com:org/repo-a.git"),
            key_of("git@github.com:org/repo-b.git")
        );
    }

    #[test]
    fn test_canonical_key_different_orgs() {
        assert_ne!(
            key_of("git@github.com:alice/utils.git"),
            key_of("git@github.com:bob/utils.git")
        );
    }

    // ===== parse_url_and_subpath tests =====

    #[test]
    fn test_subpath_none_basic() {
        assert_split(
            "git@github.com:user/repo.git",
            "git@github.com:user/repo.git",
            None,
        );
    }

    #[test]
    fn test_subpath_present() {
        assert_split(
            "git@github.com:user/repo.git:docs/api",
            "git@github.com:user/repo.git",
            Some("docs/api"),
        );
    }

    #[test]
    fn test_subpath_https_none() {
        assert_split(
            "https://github.com/user/repo",
            "https://github.com/user/repo",
            None,
        );
    }

    #[test]
    fn test_subpath_ssh_port_not_confused() {
        // Port should NOT be treated as subpath
        assert_split(
            "ssh://git@host:2222/org/repo.git",
            "ssh://git@host:2222/org/repo.git",
            None,
        );
    }

    #[test]
    fn test_subpath_ssh_port_with_actual_subpath() {
        assert_split(
            "ssh://git@host:2222/org/repo.git:docs/api",
            "ssh://git@host:2222/org/repo.git",
            Some("docs/api"),
        );
    }

    #[test]
    fn test_subpath_empty_subpath_ignored() {
        // A trailing colon with nothing after it leaves the input untouched.
        assert_split(
            "git@github.com:user/repo.git:",
            "git@github.com:user/repo.git:",
            None,
        );
    }
}