Skip to main content

trusty_common/
github_path.rs

1//! Shared GitHub `owner/repo` path derivation (issue #1220).
2//!
3//! Why: two trusty-* subsystems need the canonical `owner/repo` identity of a
4//! project, derived from its git origin remote: trusty-mpm's managed-session
5//! workspace root (`~/trusty-mpm-projects/<owner>/<repo>/…`, #1220) and
6//! trusty-memory's palace-ID derivation (#1217). Before this module each crate
7//! re-implemented git-URL parsing; centralising it in `trusty-common` gives both
8//! one tested seam and guarantees they agree on what `<owner>/<repo>` means for
9//! a given remote. Unlike trusty-memory's `owner_repo_from_git_remote` (which
10//! collapses to a single storage-safe `owner-repo` token), this module keeps
11//! `owner` and `repo` as SEPARATE components because #1220 maps them onto two
12//! nested filesystem path segments.
13//!
14//! What: [`GithubPath`] is the parsed `{ owner, repo }` pair; [`parse_github_path`]
15//! turns a git remote URL into one (pure, no I/O); [`derive_github_path`] runs
16//! `git config --get remote.origin.url` in a directory and parses the result
17//! (the only I/O entry point). Both components are slugified for filesystem
18//! safety — lower-cased, non-alphanumerics collapsed to `-`, trailing `.git`
19//! stripped — so the result is always two clean path segments.
20//!
21//! Test: `parse_*` unit tests cover SSH/HTTPS, with/without `.git`, trailing
22//! slashes, nested groups, owner-less and empty inputs; `derive_*` is covered by
23//! the `derive_github_path_reads_origin` test against a real temp git repo.
24
25use std::path::Path;
26use std::process::Command;
27
/// A parsed, slugified GitHub-style project identity.
///
/// Why: #1220's workspace-root convention nests sessions under
/// `~/trusty-mpm-projects/<owner>/<repo>/`, so callers need the owner and the
/// repo as two independent, filesystem-safe segments rather than one fused
/// token. A named struct (instead of a tuple) keeps call sites readable and
/// leaves room for extra fields later.
/// What: the slugified `owner` and `repo` path segments. [`parse_github_path`]
/// never returns an empty segment (it yields `None` when no repo slug survives
/// and substitutes `"unknown-owner"` for a missing owner); the fields are
/// public, so values built by hand are not validated. The type derives `Hash`
/// alongside `Eq` so identities can be used as `HashMap`/`HashSet` keys.
/// Test: every `parse_*` test asserts the two fields independently.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct GithubPath {
    /// Slugified repository owner (GitHub user or org), e.g. `bobmatnyc`.
    pub owner: String,
    /// Slugified repository name, e.g. `trusty-tools`.
    pub repo: String,
}

impl GithubPath {
    /// Render the identity as the relative path `<owner>/<repo>`.
    ///
    /// Why: the workspace-root builder joins this onto the configured root; a
    /// single accessor keeps the segment order (`owner` first, `repo` second)
    /// in one place so callers cannot swap them by accident.
    /// What: returns `owner`, a literal `/`, then `repo`. The separator is
    /// always `/`, never the platform separator.
    /// Test: `github_path_rel_joins_owner_repo`.
    pub fn rel_path(&self) -> String {
        format!("{}/{}", self.owner, self.repo)
    }
}
61
/// Slugify a single path component for filesystem safety.
///
/// Why: owners and repo names can contain mixed case, underscores, and other
/// punctuation; reducing each to a kebab-case token keeps the derived
/// workspace path predictable.
/// What: trims surrounding whitespace, ASCII-lower-cases, drops one trailing
/// `.git`, then keeps every run of `[a-z0-9]` and joins the runs with a single
/// `-`. Anything else (including non-ASCII letters) acts purely as a
/// separator, so the result never starts or ends with `-` and is empty when no
/// `[a-z0-9]` character survives. Per the original note this is meant to
/// mirror trusty-memory's `slugify_string`; that cannot be confirmed from this
/// file.
/// Test: exercised indirectly by every `parse_*` test (case, underscores,
/// `.git`, punctuation).
fn slugify_component(input: &str) -> String {
    let normalized = input.trim().to_ascii_lowercase();
    let base = normalized
        .strip_suffix(".git")
        .unwrap_or(normalized.as_str());

    // Every character outside [a-z0-9] is a separator; empty pieces come from
    // leading, trailing, or repeated separators and are simply discarded.
    base.split(|ch: char| !(ch.is_ascii_lowercase() || ch.is_ascii_digit()))
        .filter(|run| !run.is_empty())
        .collect::<Vec<&str>>()
        .join("-")
}
99
/// Strip a URL scheme (`https://`, `ssh://`, `git://`, …) if present.
///
/// Why: path extraction only needs the host-and-path portion, and the colon in
/// a scheme's `://` must not be mistaken for the SSH host delimiter.
/// What: returns everything after the first `://` found in `url`; the text in
/// front of it is not validated as a scheme. Inputs without `://` (SSH
/// scp-syntax like `git@host:owner/repo`) are returned unchanged.
/// Test: covered by `parse_https_*` (scheme present) and `parse_ssh_*` (absent).
fn strip_scheme(url: &str) -> &str {
    url.split_once("://").map_or(url, |(_scheme, rest)| rest)
}
113
/// Reduce a host-prefixed locator to just its path portion.
///
/// Why: both `host/owner/repo` (URL) and `host:owner/repo` (SSH scp-syntax)
/// carry the host as a leading component that must be dropped before the
/// trailing `owner/repo` segments are taken.
/// What: finds whichever of `:` or `/` occurs first and returns the text after
/// it. That is the SSH split when the colon comes first and the "drop the
/// first `/`-delimited segment" case when the slash does. A locator containing
/// neither character is returned unchanged.
/// Test: covered by `parse_ssh_github`, `parse_https_github_*`.
fn host_relative_path(locator: &str) -> &str {
    // Both delimiters are single-byte ASCII, so `idx + 1` is always a valid
    // char boundary.
    match locator.find(|ch: char| ch == ':' || ch == '/') {
        Some(idx) => &locator[idx + 1..],
        None => locator,
    }
}
132
133/// Fallback owner used when a remote exposes a repo segment but no owner.
134///
135/// Why: #1220 nests sessions two levels deep (`<owner>/<repo>`); an owner-less
136/// remote (e.g. `git@host:repo.git`) would otherwise yield a one-level path and
137/// break the convention. A stable sentinel keeps the two-segment shape.
138/// What: `"unknown-owner"` — already slug-safe.
139/// Test: `parse_repo_only_uses_unknown_owner`.
140pub const UNKNOWN_OWNER: &str = "unknown-owner";
141
142/// Parse a git remote URL into a [`GithubPath`] (`{ owner, repo }`).
143///
144/// Why: the canonical identity of a hosted project is the `owner/repo` path in
145/// its remote URL, not the local directory name. Parsing it purely (no I/O) makes
146/// every URL-shape branch deterministically unit-testable.
147/// What: handles the three canonical shapes — SSH (`git@github.com:owner/repo.git`),
148/// HTTPS (`https://github.com/owner/repo(.git)`), and scp-less host paths — by
149/// stripping the scheme + host, trimming a trailing `.git`/slashes, splitting on
150/// `/`, and taking the final two segments as `(owner, repo)`. Each segment is
151/// slugified. When only one trailing segment is parseable (no owner) the repo is
152/// kept and `owner` falls back to [`UNKNOWN_OWNER`] so the result is always a
153/// two-segment path. Returns `None` only when no non-empty `repo` slug can be
154/// produced (empty input, host-only URL).
155/// Test: `parse_ssh_github`, `parse_https_github_with_and_without_dot_git`,
156/// `parse_non_github_host`, `parse_trailing_slash`, `parse_nested_group_takes_last_two`,
157/// `parse_repo_only_uses_unknown_owner`, `parse_empty_returns_none`.
158pub fn parse_github_path(url: &str) -> Option<GithubPath> {
159    let trimmed = url.trim();
160    if trimmed.is_empty() {
161        return None;
162    }
163
164    let path = host_relative_path(strip_scheme(trimmed));
165    let path = path.trim_end_matches('/');
166    let path = path.strip_suffix(".git").unwrap_or(path);
167    let path = path.trim_end_matches('/');
168
169    let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
170    let (owner_raw, repo_raw) = match segments.as_slice() {
171        [.., owner, repo] => (Some(*owner), *repo),
172        [repo] => (None, *repo),
173        _ => return None,
174    };
175
176    let repo = slugify_component(repo_raw);
177    if repo.is_empty() {
178        return None;
179    }
180
181    let owner = owner_raw
182        .map(slugify_component)
183        .filter(|s| !s.is_empty())
184        .unwrap_or_else(|| UNKNOWN_OWNER.to_string());
185
186    Some(GithubPath { owner, repo })
187}
188
189/// Derive a [`GithubPath`] from the git origin remote of a directory.
190///
191/// Why: the only I/O entry point — the workspace-root builder needs the
192/// `owner/repo` identity of the repo a session targets, which lives in its
193/// `remote.origin.url`. Shelling out to `git config` (rather than reading
194/// `.git/config`) transparently handles worktrees, where `.git` is a file
195/// pointing at the parent repo.
196/// What: runs `git -C <dir> config --get remote.origin.url`; on success parses
197/// the URL via [`parse_github_path`]. Returns `None` when git is absent, `dir` is
198/// not a repo, there is no origin remote, or the URL has no parseable identity.
199/// Best-effort, no network.
200/// Test: `derive_github_path_reads_origin` (temp repo with an origin remote);
201/// `derive_github_path_none_outside_repo`.
202pub fn derive_github_path(dir: &Path) -> Option<GithubPath> {
203    let output = Command::new("git")
204        .arg("-C")
205        .arg(dir)
206        .arg("config")
207        .arg("--get")
208        .arg("remote.origin.url")
209        .output()
210        .ok()?;
211    if !output.status.success() {
212        return None;
213    }
214    let url = String::from_utf8_lossy(&output.stdout);
215    parse_github_path(url.trim())
216}
217
#[cfg(test)]
mod tests {
    use super::*;

    /// Borrow both segments as a tuple so each test can compare owner and repo
    /// in a single assertion.
    fn pair(gp: &GithubPath) -> (&str, &str) {
        (gp.owner.as_str(), gp.repo.as_str())
    }

    /// Why: SSH scp-syntax is the most common GitHub remote; owner and repo
    /// must come out as two slugified segments.
    /// Test: itself.
    #[test]
    fn parse_ssh_github() {
        let gp = parse_github_path("git@github.com:bobmatnyc/trusty-tools.git").expect("parsed");
        assert_eq!(pair(&gp), ("bobmatnyc", "trusty-tools"));
    }

    /// Why: HTTPS remotes appear with and without the trailing `.git`; both
    /// spellings must yield the same two segments.
    /// Test: itself.
    #[test]
    fn parse_https_github_with_and_without_dot_git() {
        let with = parse_github_path("https://github.com/bobmatnyc/trusty-tools.git").unwrap();
        let without = parse_github_path("https://github.com/bobmatnyc/trusty-tools").unwrap();
        assert_eq!(with, without);
        assert_eq!(pair(&with), ("bobmatnyc", "trusty-tools"));
    }

    /// Why: non-GitHub hosts still expose `owner/repo`; the host is irrelevant
    /// and mixed case / underscores must be normalised in both segments.
    /// Test: itself.
    #[test]
    fn parse_non_github_host() {
        let gp = parse_github_path("git@gitlab.example.com:Acme/Cool_App.git").unwrap();
        assert_eq!(pair(&gp), ("acme", "cool-app"));
    }

    /// Why: a trailing slash must not produce an empty repo segment.
    /// Test: itself.
    #[test]
    fn parse_trailing_slash() {
        let gp = parse_github_path("https://github.com/bobmatnyc/trusty-tools/").unwrap();
        assert_eq!(pair(&gp), ("bobmatnyc", "trusty-tools"));
    }

    /// Why: nested group paths (GitLab subgroups) must resolve to the final
    /// two segments — the immediate group and the repo.
    /// Test: itself.
    #[test]
    fn parse_nested_group_takes_last_two() {
        let gp = parse_github_path("https://gitlab.com/acme/team/widget.git").unwrap();
        assert_eq!(pair(&gp), ("team", "widget"));
    }

    /// Why: an owner-less remote must still yield a two-segment path so the
    /// `<owner>/<repo>` convention holds; owner falls back to the sentinel.
    /// Test: itself.
    #[test]
    fn parse_repo_only_uses_unknown_owner() {
        let gp = parse_github_path("git@host:repo.git").unwrap();
        assert_eq!(pair(&gp), (UNKNOWN_OWNER, "repo"));
    }

    /// Why: empty and host-only inputs have no extractable identity and must
    /// return `None` so the caller can fall back to a slug/default.
    /// Test: itself.
    #[test]
    fn parse_empty_returns_none() {
        for input in ["", "   ", "https://github.com/"] {
            assert_eq!(parse_github_path(input), None);
        }
    }

    /// Why: callers join the identity onto a root; `rel_path` must emit
    /// `<owner>/<repo>` in exactly that order.
    /// Test: itself.
    #[test]
    fn github_path_rel_joins_owner_repo() {
        let owner = String::from("bobmatnyc");
        let repo = String::from("trusty-tools");
        let gp = GithubPath { owner, repo };
        assert_eq!(gp.rel_path(), "bobmatnyc/trusty-tools");
    }

    /// Why: the I/O entry point must read a real `remote.origin.url` and parse
    /// it; a throwaway temp repo keeps the test off the network.
    /// Test: itself (returns early without asserting if `git init` fails on
    /// the runner).
    #[test]
    fn derive_github_path_reads_origin() {
        let tmp = tempfile::TempDir::new().expect("tempdir");
        let repo_dir = tmp.path();
        // Runs `git -C <repo_dir> <args…>` and reports whether it succeeded;
        // a spawn failure counts as "not successful".
        let run_git = |args: &[&str]| -> bool {
            let outcome = Command::new("git")
                .arg("-C")
                .arg(repo_dir)
                .args(args)
                .output();
            match outcome {
                Ok(out) => out.status.success(),
                Err(_) => false,
            }
        };

        if !run_git(&["init"]) {
            // No usable git on this runner — nothing to assert.
            return;
        }
        // The outcome of `remote add` is deliberately not checked here; a
        // failure shows up in the `expect` below.
        let _ = run_git(&[
            "remote",
            "add",
            "origin",
            "git@github.com:bobmatnyc/trusty-tools.git",
        ]);

        let gp = derive_github_path(repo_dir).expect("derived from origin");
        assert_eq!(pair(&gp), ("bobmatnyc", "trusty-tools"));
    }

    /// Why: a directory that is not a git repo must yield `None`, not panic,
    /// so the caller can fall back cleanly.
    /// Test: itself.
    #[test]
    fn derive_github_path_none_outside_repo() {
        let tmp = tempfile::TempDir::new().expect("tempdir");
        let derived = derive_github_path(tmp.path());
        assert_eq!(derived, None);
    }
}