Skip to main content

thoughts_tool/
repo_identity.rs

1//! Canonical repository identity normalization.
2//!
3//! This module provides `RepoIdentity` as the single source of truth for repository identity,
4//! enabling consistent URL normalization across SSH, HTTPS, and various git hosting formats.
5
6use anyhow::Result;
7use anyhow::bail;
8
/// Upper bound on the number of organization/subgroup segments accepted in a
/// repository path (GitLab supports up to 20 levels of nesting).
///
/// A parsed path may hold this many group segments plus one repository segment.
const MAX_SUBGROUP_DEPTH: usize = 20;
11
/// Normalized identity of a repository, derived from a git remote URL.
///
/// Different spellings of the same remote (SSH vs. HTTPS, optional `.git`
/// suffix, trailing slash) reduce to the same host / organization path /
/// repository triple, which makes the value suitable for matching and
/// deduplicating remotes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RepoIdentity {
    /// Lowercased host name, for example `"github.com"`.
    pub host: String,
    /// Everything between the host and the repository name, joined with `/`.
    ///
    /// A single segment such as `"org"`, several segments for nested groups
    /// such as `"group/subgroup"`, or empty when the URL path holds only the
    /// repository name.
    pub org_path: String,
    /// Repository name with any `.git` suffix and trailing slash removed.
    pub repo: String,
}
25
/// Hashable lookup key derived from a repository identity.
///
/// `RepoIdentity::canonical_key` fills every field with its lowercased form,
/// so keys produced that way compare and hash case-insensitively.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct RepoIdentityKey {
    /// Host name component of the key.
    pub host: String,
    /// Organization / group path component of the key.
    pub org_path: String,
    /// Repository name component of the key.
    pub repo: String,
}
35
36impl RepoIdentity {
37    /// Parse a git URL into a `RepoIdentity`.
38    ///
39    /// Supported formats:
40    /// - SSH scp-like: `git@github.com:org/repo.git`
41    /// - SSH with port: `ssh://git@host:2222/org/repo.git`
42    /// - HTTPS: `https://github.com/org/repo` or `https://github.com/org/repo.git`
43    /// - GitLab subgroups: `https://gitlab.com/a/b/c/repo.git`
44    /// - Azure DevOps: `https://dev.azure.com/org/proj/_git/repo`
45    ///
46    /// # Errors
47    /// Returns an error if the URL cannot be parsed or has invalid structure.
48    pub fn parse(url: &str) -> Result<Self> {
49        let url = url.trim();
50
51        // Determine URL type and extract host + path
52        let (host, path) = if url.starts_with("git@") {
53            // SSH scp-like: git@host:path
54            parse_scp_url(url)?
55        } else if url.starts_with("ssh://") {
56            // SSH with scheme: ssh://[user@]host[:port]/path
57            parse_ssh_scheme_url(url)?
58        } else if url.starts_with("https://") || url.starts_with("http://") {
59            // HTTPS/HTTP: scheme://[user@]host[:port]/path
60            parse_https_url(url)?
61        } else {
62            bail!("Unsupported URL format: {url}");
63        };
64
65        // Normalize path: remove trailing slashes and .git suffix
66        let path = path
67            .trim_end_matches('/')
68            .trim_end_matches(".git")
69            .trim_end_matches('/');
70
71        // Split path into segments and validate
72        let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
73
74        if segments.is_empty() {
75            bail!("URL has no path segments: {url}");
76        }
77
78        // Check for invalid segments
79        for seg in &segments {
80            if *seg == "." || *seg == ".." {
81                bail!("Invalid path segment '{seg}' in URL: {url}");
82            }
83        }
84
85        if segments.len() > MAX_SUBGROUP_DEPTH + 1 {
86            bail!(
87                "Path has too many segments ({}, max {}): {}",
88                segments.len(),
89                MAX_SUBGROUP_DEPTH + 1,
90                url
91            );
92        }
93
94        // Handle Azure DevOps special case: org/proj/_git/repo
95        let (org_path, repo) = if let Some(git_idx) = segments.iter().position(|s| *s == "_git") {
96            if git_idx + 1 >= segments.len() {
97                bail!("Azure DevOps URL missing repo after _git: {url}");
98            }
99            let org_segments = &segments[..git_idx];
100            let repo = segments[git_idx + 1];
101            (org_segments.join("/"), repo.to_string())
102        } else if segments.len() == 1 {
103            // Single segment: treat as repo with empty org (unusual but valid for some hosts)
104            (String::new(), segments[0].to_string())
105        } else {
106            // Standard case: all but last segment is org_path, last is repo
107            let org_segments = &segments[..segments.len() - 1];
108            let repo = segments[segments.len() - 1];
109            (org_segments.join("/"), repo.to_string())
110        };
111
112        Ok(Self {
113            host: host.to_lowercase(),
114            org_path,
115            repo,
116        })
117    }
118
119    /// Get the canonical key for identity-based lookups.
120    ///
121    /// All fields are lowercased for case-insensitive matching.
122    pub fn canonical_key(&self) -> RepoIdentityKey {
123        RepoIdentityKey {
124            host: self.host.to_lowercase(),
125            org_path: self.org_path.to_lowercase(),
126            repo: self.repo.to_lowercase(),
127        }
128    }
129}
130
131/// Parse SSH scp-like URL: `git@host:path` or `user@host:path`
132fn parse_scp_url(url: &str) -> Result<(String, String)> {
133    // Format: [user@]host:path
134    let without_user = url.find('@').map_or(url, |i| &url[i + 1..]);
135
136    let colon_pos = without_user
137        .find(':')
138        .ok_or_else(|| anyhow::anyhow!("Invalid scp-like URL (missing colon): {url}"))?;
139
140    let host = &without_user[..colon_pos];
141    let path = &without_user[colon_pos + 1..];
142
143    if host.is_empty() {
144        bail!("Empty host in URL: {url}");
145    }
146
147    Ok((host.to_string(), path.to_string()))
148}
149
150/// Parse SSH scheme URL: `ssh://[user@]host[:port]/path`
151fn parse_ssh_scheme_url(url: &str) -> Result<(String, String)> {
152    let without_scheme = url
153        .strip_prefix("ssh://")
154        .ok_or_else(|| anyhow::anyhow!("Not an SSH URL: {url}"))?;
155
156    // Strip userinfo if present
157    let without_user = without_scheme
158        .find('@')
159        .map_or(without_scheme, |i| &without_scheme[i + 1..]);
160
161    // Find the first slash (separates host[:port] from path)
162    let slash_pos = without_user
163        .find('/')
164        .ok_or_else(|| anyhow::anyhow!("SSH URL missing path: {url}"))?;
165
166    let host_port = &without_user[..slash_pos];
167    let path = &without_user[slash_pos + 1..];
168
169    // Extract host (strip port if present)
170    let host = host_port
171        .split(':')
172        .next()
173        .ok_or_else(|| anyhow::anyhow!("Empty host in URL: {url}"))?;
174
175    if host.is_empty() {
176        bail!("Empty host in URL: {url}");
177    }
178
179    Ok((host.to_string(), path.to_string()))
180}
181
182/// Parse HTTPS/HTTP URL: `scheme://[user@]host[:port]/path`
183fn parse_https_url(url: &str) -> Result<(String, String)> {
184    let scheme_end = url
185        .find("://")
186        .ok_or_else(|| anyhow::anyhow!("Invalid URL (missing ://): {url}"))?;
187
188    let without_scheme = &url[scheme_end + 3..];
189
190    // Strip userinfo if present
191    let without_user = without_scheme
192        .find('@')
193        .map_or(without_scheme, |i| &without_scheme[i + 1..]);
194
195    // Find the first slash (separates host[:port] from path)
196    let slash_pos = without_user
197        .find('/')
198        .ok_or_else(|| anyhow::anyhow!("URL missing path: {url}"))?;
199
200    let host_port = &without_user[..slash_pos];
201    let path = &without_user[slash_pos + 1..];
202
203    // Extract host (strip port if present)
204    let host = host_port
205        .split(':')
206        .next()
207        .ok_or_else(|| anyhow::anyhow!("Empty host in URL: {url}"))?;
208
209    if host.is_empty() {
210        bail!("Empty host in URL: {url}");
211    }
212
213    Ok((host.to_string(), path.to_string()))
214}
215
216/// Split `url` into (`base_url`, `optional_subpath`) using a last-colon heuristic.
217///
218/// Treats it as `URL:subpath` only if the base portion parses as a valid `RepoIdentity`.
219/// This avoids confusing `host:port` for a subpath delimiter.
220///
221/// # Examples
222/// ```ignore
223/// // No subpath
224/// parse_url_and_subpath("git@github.com:org/repo.git")
225///   => ("git@github.com:org/repo.git", None)
226///
227/// // With subpath
228/// parse_url_and_subpath("git@github.com:org/repo.git:docs/api")
229///   => ("git@github.com:org/repo.git", Some("docs/api"))
230///
231/// // SSH with port (port is NOT a subpath)
232/// parse_url_and_subpath("ssh://git@host:2222/org/repo.git")
233///   => ("ssh://git@host:2222/org/repo.git", None)
234///
235/// // SSH with port AND subpath
236/// parse_url_and_subpath("ssh://git@host:2222/org/repo.git:docs/api")
237///   => ("ssh://git@host:2222/org/repo.git", Some("docs/api"))
238/// ```
239pub fn parse_url_and_subpath(url: &str) -> (String, Option<String>) {
240    // Strategy: find the rightmost colon and check if the left side parses as a valid URL.
241    // If it does, the right side is a subpath. If not, there's no subpath.
242
243    // Handle scheme-based URLs: ssh://, https://, http://
244    // For these, we need to be careful about host:port patterns
245
246    let url = url.trim();
247
248    // Try splitting from the right
249    if let Some(colon_pos) = url.rfind(':') {
250        let potential_base = &url[..colon_pos];
251        let potential_subpath = &url[colon_pos + 1..];
252
253        // Don't split if subpath is empty
254        if potential_subpath.is_empty() {
255            return (url.to_string(), None);
256        }
257
258        // Don't split if subpath looks like a port (all digits)
259        if potential_subpath.chars().all(|c| c.is_ascii_digit()) {
260            return (url.to_string(), None);
261        }
262
263        // Don't split if potential_base is empty or just a scheme
264        if potential_base.is_empty() || potential_base.ends_with("//") {
265            return (url.to_string(), None);
266        }
267
268        // Try parsing the base as a RepoIdentity
269        if RepoIdentity::parse(potential_base).is_ok() {
270            return (
271                potential_base.to_string(),
272                Some(potential_subpath.to_string()),
273            );
274        }
275    }
276
277    (url.to_string(), None)
278}
279
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `url` and check all three identity components.
    #[track_caller]
    fn assert_identity(url: &str, host: &str, org_path: &str, repo: &str) {
        let id = RepoIdentity::parse(url).unwrap();
        assert_eq!(id.host, host);
        assert_eq!(id.org_path, org_path);
        assert_eq!(id.repo, repo);
    }

    /// Run `parse_url_and_subpath` on `input` and check both halves of the result.
    #[track_caller]
    fn assert_split(input: &str, expected_url: &str, expected_subpath: Option<&str>) {
        let (url, sub) = parse_url_and_subpath(input);
        assert_eq!(url, expected_url);
        assert_eq!(sub, expected_subpath.map(str::to_string));
    }

    // ===== RepoIdentity::parse tests =====

    #[test]
    fn test_parse_ssh_scp_basic() {
        assert_identity("git@github.com:org/repo.git", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_ssh_scp_no_git_suffix() {
        assert_identity("git@github.com:org/repo", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_https_basic() {
        assert_identity("https://github.com/org/repo", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_https_with_git_suffix() {
        assert_identity("https://github.com/org/repo.git", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_https_trailing_slash() {
        assert_identity("https://github.com/org/repo/", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_ssh_with_port() {
        assert_identity(
            "ssh://git@host.example.com:2222/org/repo.git",
            "host.example.com",
            "org",
            "repo",
        );
    }

    #[test]
    fn test_parse_gitlab_subgroups() {
        assert_identity(
            "https://gitlab.com/group/subgroup/team/repo.git",
            "gitlab.com",
            "group/subgroup/team",
            "repo",
        );
    }

    #[test]
    fn test_parse_gitlab_deep_subgroups() {
        assert_identity(
            "https://gitlab.com/a/b/c/d/e/repo.git",
            "gitlab.com",
            "a/b/c/d/e",
            "repo",
        );
    }

    #[test]
    fn test_parse_azure_devops() {
        assert_identity(
            "https://dev.azure.com/myorg/myproj/_git/myrepo",
            "dev.azure.com",
            "myorg/myproj",
            "myrepo",
        );
    }

    #[test]
    fn test_parse_host_case_normalized() {
        // Only the host is lowercased; org_path and repo preserve case.
        assert_identity("https://GitHub.COM/Org/Repo", "github.com", "Org", "Repo");
    }

    #[test]
    fn test_parse_http_scheme() {
        assert_identity("http://github.com/org/repo", "github.com", "org", "repo");
    }

    #[test]
    fn test_parse_rejects_invalid_segments() {
        for url in ["https://github.com/../repo", "https://github.com/./repo"] {
            assert!(RepoIdentity::parse(url).is_err());
        }
    }

    #[test]
    fn test_parse_rejects_unsupported_scheme() {
        for url in ["ftp://github.com/org/repo", "org/repo"] {
            assert!(RepoIdentity::parse(url).is_err());
        }
    }

    // ===== canonical_key tests =====

    #[test]
    fn test_canonical_key_equality_across_schemes() {
        let ssh_key = RepoIdentity::parse("git@github.com:User/Repo.git")
            .unwrap()
            .canonical_key();
        let https_key = RepoIdentity::parse("https://github.com/user/repo")
            .unwrap()
            .canonical_key();

        assert_eq!(ssh_key, https_key);
    }

    #[test]
    fn test_canonical_key_different_repos() {
        let first = RepoIdentity::parse("git@github.com:org/repo-a.git").unwrap();
        let second = RepoIdentity::parse("git@github.com:org/repo-b.git").unwrap();

        assert_ne!(first.canonical_key(), second.canonical_key());
    }

    #[test]
    fn test_canonical_key_different_orgs() {
        let first = RepoIdentity::parse("git@github.com:alice/utils.git").unwrap();
        let second = RepoIdentity::parse("git@github.com:bob/utils.git").unwrap();

        assert_ne!(first.canonical_key(), second.canonical_key());
    }

    // ===== parse_url_and_subpath tests =====

    #[test]
    fn test_subpath_none_basic() {
        assert_split(
            "git@github.com:user/repo.git",
            "git@github.com:user/repo.git",
            None,
        );
    }

    #[test]
    fn test_subpath_present() {
        assert_split(
            "git@github.com:user/repo.git:docs/api",
            "git@github.com:user/repo.git",
            Some("docs/api"),
        );
    }

    #[test]
    fn test_subpath_https_none() {
        assert_split(
            "https://github.com/user/repo",
            "https://github.com/user/repo",
            None,
        );
    }

    #[test]
    fn test_subpath_ssh_port_not_confused() {
        // Port should NOT be treated as subpath
        assert_split(
            "ssh://git@host:2222/org/repo.git",
            "ssh://git@host:2222/org/repo.git",
            None,
        );
    }

    #[test]
    fn test_subpath_ssh_port_with_actual_subpath() {
        assert_split(
            "ssh://git@host:2222/org/repo.git:docs/api",
            "ssh://git@host:2222/org/repo.git",
            Some("docs/api"),
        );
    }

    #[test]
    fn test_subpath_empty_subpath_ignored() {
        assert_split(
            "git@github.com:user/repo.git:",
            "git@github.com:user/repo.git:",
            None,
        );
    }
}