Skip to main content

aube_lockfile/
source.rs

1use std::path::{Path, PathBuf};
2
/// Non-registry source for a locked package.
///
/// When a package comes from a local path (via `file:` or `link:` in
/// `package.json`) it doesn't have a tarball URL or integrity hash, so we
/// record the source separately and let the linker materialize it
/// on-the-fly.
///
/// Despite the `Local` in the name, the remote non-registry sources
/// (`Git`, `RemoteTarball`) are modeled by this enum as well.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LocalSource {
    /// `file:<dir>` — a directory on disk whose contents should be
    /// hardlink-copied into the virtual store like a normal package.
    /// Path is stored relative to the project root.
    Directory(PathBuf),
    /// `file:<tarball>` — a `.tgz` on disk, extracted into the virtual
    /// store the same way we extract registry tarballs.
    Tarball(PathBuf),
    /// `link:<dir>` — a plain symlink into `node_modules/<name>`, never
    /// materialized into the virtual store. Transitive deps are the
    /// target's responsibility.
    Link(PathBuf),
    /// `portal:<dir>` — a Yarn Berry package portal. The target is a
    /// package on disk, but unlike `link:` its dependencies are still
    /// modeled in the lockfile graph.
    Portal(PathBuf),
    /// `exec:<script>` — a Yarn Berry generator script. The script is
    /// executed at fetch time and writes the package files into a
    /// generated build directory.
    Exec(PathBuf),
    /// `git+https://`, `git+ssh://`, `github:user/repo`, etc. — a
    /// remote git repo. Cloned at fetch time and imported like a
    /// `file:` directory. `url` is the normalized clone URL (what
    /// gets passed to `git clone`). `committish` is the user-written
    /// ref after `#` (branch, tag, or commit; `None` means HEAD).
    /// `resolved` is the 40-char commit SHA that `git ls-remote`
    /// pinned the ref to — the lockfile records this so repeat
    /// installs reproduce bit-for-bit.
    Git(GitSource),
    /// `https://example.com/pkg.tgz` — a remote tarball URL. Fetched
    /// once at resolve time so the resolver can read the enclosed
    /// `package.json` for version + transitive deps and pin the
    /// sha512 integrity. `integrity` stays empty on freshly-parsed
    /// specifiers and is filled in by the resolver after download.
    RemoteTarball(RemoteTarballSource),
}
46
/// A remote tarball dependency spec. See [`LocalSource::RemoteTarball`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteTarballSource {
    /// The tarball URL. `LocalSource::parse` stores the specifier
    /// verbatim here; it is also what `specifier()` emits and what
    /// `dep_path()` hashes.
    pub url: String,
    /// Integrity string for the downloaded bytes. Empty on a
    /// freshly-parsed specifier; filled in after download (see the
    /// variant docs on [`LocalSource::RemoteTarball`]).
    pub integrity: String,
    /// Always `false` when produced by `LocalSource::parse`.
    /// NOTE(review): presumably set by the resolver when the tarball
    /// is an archive served by a git host — confirm against the code
    /// that sets it.
    pub git_hosted: bool,
}
54
/// A git dependency spec. See [`LocalSource::Git`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GitSource {
    /// Normalized clone URL, as returned by `parse_git_spec`.
    pub url: String,
    /// The ref the user wrote after `#` (branch, tag, or commit).
    /// `None` when the specifier carried no ref.
    pub committish: Option<String>,
    /// The commit the ref was pinned to. `LocalSource::parse` leaves
    /// this empty; an empty string is the sentinel meaning "not yet
    /// resolved".
    pub resolved: String,
    /// SHA-512 SRI of the hosted tarball bytes when the git source was
    /// fetched through a codeload-style archive. Plain git-clone sources
    /// leave this unset because git object IDs verify the checkout.
    pub integrity: Option<String>,
    /// pnpm `&path:/sub/dir` selector — when set, only this
    /// subdirectory of the cloned repo is treated as the package
    /// root. Stored without leading slash so dep_path hashes are
    /// stable regardless of whether the user wrote `path:/x` or
    /// `path:x`.
    pub subpath: Option<String>,
}
72
/// Whether two git commit identifiers refer to the same commit.
///
/// Surrounding whitespace is ignored on both sides (git plumbing output
/// routinely carries a trailing newline) and the comparison is ASCII
/// case-insensitive. The identifiers match when either:
///
/// - they are equal after trimming, or
/// - one is a full 40-char hex SHA and the other is an abbreviated hex
///   SHA of at least 7 chars that is a prefix of it.
///
/// Two *abbreviated* SHAs of different lengths never match, even when
/// one prefixes the other: without a full SHA on one side there is no
/// way to know they identify the same object.
pub fn git_commits_match(left: &str, right: &str) -> bool {
    // Trim BEFORE the equality check so `"<sha>\n"` matches `"<sha>"`.
    let left = left.trim();
    let right = right.trim();
    if left.eq_ignore_ascii_case(right) {
        return true;
    }

    let (short, full) = if left.len() < right.len() {
        (left, right)
    } else {
        (right, left)
    };
    if full.len() != 40 || !(7..40).contains(&short.len()) {
        return false;
    }

    let is_hex = |s: &str| s.bytes().all(|b| b.is_ascii_hexdigit());
    if !is_hex(short) || !is_hex(full) {
        return false;
    }

    // Both sides are pure ASCII here, so byte slicing cannot split a
    // code point and no lowercase copies are needed.
    full.as_bytes()[..short.len()].eq_ignore_ascii_case(short.as_bytes())
}
90
91impl LocalSource {
92    /// The original path (relative to the project root) the user wrote
93    /// in `package.json`. `None` for non-path sources like git.
94    pub fn path(&self) -> Option<&Path> {
95        match self {
96            LocalSource::Directory(p)
97            | LocalSource::Tarball(p)
98            | LocalSource::Link(p)
99            | LocalSource::Portal(p)
100            | LocalSource::Exec(p) => Some(p),
101            LocalSource::Git(_) | LocalSource::RemoteTarball(_) => None,
102        }
103    }
104
105    /// The protocol kind (`"file"` / `"link"` / `"git"` / `"url"`).
106    pub fn kind_str(&self) -> &'static str {
107        match self {
108            LocalSource::Directory(_) | LocalSource::Tarball(_) => "file",
109            LocalSource::Link(_) => "link",
110            LocalSource::Portal(_) => "portal",
111            LocalSource::Exec(_) => "exec",
112            LocalSource::Git(_) => "git",
113            LocalSource::RemoteTarball(_) => "url",
114        }
115    }
116
117    /// The path as a POSIX-style string with forward-slash separators.
118    /// `Path::display()` and `to_string_lossy()` honor the host's
119    /// separator (backslash on Windows), which would make `dep_path`
120    /// hashes and lockfile `specifier:` strings non-portable: the
121    /// same `file:./some/dir` would render as `some\dir` on Windows
122    /// and `some/dir` on Unix, producing two different hashes for
123    /// the same logical target. Always rendering with `/` keeps
124    /// lockfiles cross-platform identical.
125    pub fn path_posix(&self) -> String {
126        self.path()
127            .map(|p| p.to_string_lossy().replace('\\', "/"))
128            .unwrap_or_default()
129    }
130
131    /// Canonical specifier string as pnpm writes it in the `packages:`
132    /// and `snapshots:` keys (post-`<name>@` part). For `file:` /
133    /// `link:` this is `file:./vendor/foo` / `link:../sibling`. For
134    /// `git`, pnpm uses the resolved form `<url>#<commit>` (no
135    /// `git+` prefix) because the lockfile pins to the exact commit
136    /// regardless of what the user wrote. Always emits POSIX
137    /// separators so the resulting lockfile is portable.
138    pub fn specifier(&self) -> String {
139        match self {
140            LocalSource::Git(g) => match &g.subpath {
141                Some(sub) => format!("{}#{}&path:/{}", g.url, g.resolved, sub),
142                None => format!("{}#{}", g.url, g.resolved),
143            },
144            LocalSource::RemoteTarball(t) => t.url.clone(),
145            _ => format!("{}:{}", self.kind_str(), self.path_posix()),
146        }
147    }
148
149    /// Internal FS-safe dep_path used as the key in
150    /// `LockfileGraph.packages` and as the `.aube/` subdir name.
151    ///
152    /// Distinct paths must map to distinct keys (otherwise the
153    /// linker would silently mix files between two local packages),
154    /// and the result must be a single filesystem component — no
155    /// `/`, `\`, `:`, or `..`. Ad-hoc character substitution trips
156    /// over cases like `../vendor` vs `__/vendor` or `a.b` vs `a_b`
157    /// collapsing to the same string, so we hash the raw path bytes
158    /// and suffix the first 16 hex chars (64 bits — more than enough
159    /// to avoid collisions inside a single project).
160    ///
161    /// The hash input is the POSIX-form path string so a checked-in
162    /// lockfile resolves to the same key regardless of which
163    /// platform ran `aube install`.
164    pub fn dep_path(&self, name: &str) -> String {
165        use sha2::{Digest, Sha256};
166        let mut hasher = Sha256::new();
167        match self {
168            LocalSource::Git(g) => {
169                hasher.update(g.url.as_bytes());
170                hasher.update(b"#");
171                hasher.update(g.resolved.as_bytes());
172                if let Some(sub) = &g.subpath {
173                    hasher.update(b"&path:/");
174                    hasher.update(sub.as_bytes());
175                }
176            }
177            LocalSource::RemoteTarball(t) => {
178                hasher.update(t.url.as_bytes());
179            }
180            _ => hasher.update(self.path_posix().as_bytes()),
181        }
182        let digest = hasher.finalize();
183        let short: String = digest.iter().take(8).map(|b| format!("{b:02x}")).collect();
184        format!("{name}@{}+{short}", self.kind_str())
185    }
186
187    /// Classify a user-written `file:` / `link:` specifier against the
188    /// project root. Returns `None` if `spec` isn't a local specifier.
189    /// Resolves the target path relative to `project_root`; a `file:`
190    /// target that resolves to a `.tgz` / `.tar.gz` on disk is treated
191    /// as a tarball, anything else as a directory.
192    pub fn parse(spec: &str, project_root: &Path) -> Option<Self> {
193        // Check git first so URLs like `https://host/user/repo.git`
194        // aren't swallowed by the broader bare-http tarball check
195        // below.
196        if let Some((url, committish, subpath)) = parse_git_spec(spec) {
197            // `resolved` is filled in by the resolver after running
198            // `git ls-remote`. A lockfile round-trip that never
199            // re-resolves will leave this empty, which is the sentinel
200            // the resolver checks for before calling ls-remote.
201            return Some(LocalSource::Git(GitSource {
202                url,
203                committish,
204                resolved: String::new(),
205                integrity: None,
206                subpath,
207            }));
208        }
209        // Any remaining bare `http(s)://` URL is a remote tarball.
210        // npm semantics treat *all* non-git HTTP URLs in a dependency
211        // value as tarball URLs, so services that serve tarballs from
212        // URLs without a `.tgz` extension (pkg.pr.new, GitHub
213        // codeload, etc.) classify correctly here.
214        if Self::looks_like_remote_tarball_url(spec) {
215            return Some(LocalSource::RemoteTarball(RemoteTarballSource {
216                url: spec.to_string(),
217                integrity: String::new(),
218                git_hosted: false,
219            }));
220        }
221        let (kind, rest) = if let Some(r) = spec.strip_prefix("file:") {
222            ("file", r)
223        } else if let Some(r) = spec.strip_prefix("link:") {
224            ("link", r)
225        } else if let Some(r) = spec.strip_prefix("portal:") {
226            ("portal", r)
227        } else if let Some(r) = spec.strip_prefix("exec:") {
228            return Some(LocalSource::Exec(PathBuf::from(r)));
229        } else {
230            return None;
231        };
232        let rel = PathBuf::from(rest);
233        let abs = project_root.join(&rel);
234        if kind == "link" {
235            return Some(LocalSource::Link(rel));
236        }
237        if kind == "portal" {
238            return Some(LocalSource::Portal(rel));
239        }
240        if abs.is_file() && Self::path_looks_like_tarball(&rel) {
241            return Some(LocalSource::Tarball(rel));
242        }
243        Some(LocalSource::Directory(rel))
244    }
245
246    /// Whether a specifier looks like a direct HTTP(S) URL that should
247    /// be fetched as a tarball. Per npm semantics, *any* `http://` or
248    /// `https://` URL in a dependency value is a tarball URL — services
249    /// like pkg.pr.new, GitHub codeload, and private registries with
250    /// auth-token query strings serve tarballs from URLs that don't
251    /// carry a `.tgz` extension. Git URLs must already have been
252    /// ruled out by the caller (see [`parse_git_spec`]) so a
253    /// `.git`-suffixed URL doesn't get misclassified here.
254    pub fn looks_like_remote_tarball_url(spec: &str) -> bool {
255        spec.starts_with("https://") || spec.starts_with("http://")
256    }
257
258    pub fn path_looks_like_tarball(path: &Path) -> bool {
259        let name = match path.file_name().and_then(|n| n.to_str()) {
260            Some(n) => n,
261            None => return false,
262        };
263        let lower = name.to_ascii_lowercase();
264        lower.ends_with(".tgz") || lower.ends_with(".tar.gz")
265    }
266}
267
268/// Parse a git dependency specifier into `(clone_url, committish)`.
269///
270/// Recognized forms:
271/// - `git+https://host/user/repo.git[#ref]`
272/// - `git+ssh://git@host/user/repo.git[#ref]`
273/// - `git://host/user/repo.git[#ref]`
274/// - `https://host/user/repo.git[#ref]` (only when ending in `.git`)
275/// - `user@host:path[.git][#ref]` (scp-form, only for github.com / gitlab.com /
276///   bitbucket.org — matches pnpm 11 behavior, where unknown SCP hosts are
277///   treated as local paths) → `ssh://user@host/path[.git]`
278/// - `github:user/repo[#ref]` → `https://github.com/user/repo.git`
279/// - `gitlab:user/repo[#ref]` → `https://gitlab.com/user/repo.git`
280/// - `bitbucket:user/repo[#ref]` → `https://bitbucket.org/user/repo.git`
281/// - `user/repo[#ref]` (bare GitHub shorthand, npm/pnpm compat)
282///   → `https://github.com/user/repo.git`
283///
284/// Returns `None` for any specifier that doesn't look like a git URL,
285/// so the caller can fall through to other protocol parsers.
286pub fn parse_git_spec(spec: &str) -> Option<(String, Option<String>, Option<String>)> {
287    let (body, committish, subpath) = match spec.find('#') {
288        Some(idx) => {
289            let (c, s) = parse_git_fragment(&spec[idx + 1..]);
290            (&spec[..idx], c, s)
291        }
292        None => (spec, None, None),
293    };
294    let is_bare_transport = body.starts_with("https://")
295        || body.starts_with("http://")
296        || body.starts_with("ssh://")
297        || body.starts_with("file://");
298    let url = if let Some(rest) = body.strip_prefix("git+") {
299        // `git+` explicitly tags the URL as git, so the `.git`
300        // suffix is optional (GitHub/GitLab accept both forms).
301        rest.to_string()
302    } else if body.starts_with("git://") {
303        body.to_string()
304    } else if let Some(scp) = parse_scp_url(body) {
305        scp
306    } else if let Some(path) = body.strip_prefix("github:") {
307        format!("https://github.com/{path}.git")
308    } else if let Some(path) = body.strip_prefix("gitlab:") {
309        format!("https://gitlab.com/{path}.git")
310    } else if let Some(path) = body.strip_prefix("bitbucket:") {
311        format!("https://bitbucket.org/{path}.git")
312    } else if is_bare_transport && body.ends_with(".git") {
313        body.to_string()
314    } else if is_bare_transport
315        && committish
316            .as_deref()
317            .is_some_and(|c| c.len() == 40 && c.chars().all(|ch| ch.is_ascii_hexdigit()))
318    {
319        // Lockfile round-trip form: `specifier()` writes the stored
320        // URL verbatim plus `#<sha>`. URLs that dropped the `git+`
321        // prefix (and happen to lack `.git`) are disambiguated from
322        // plain tarball URLs by the 40-hex committish suffix.
323        body.to_string()
324    } else if is_bare_github_shorthand(body) {
325        // npm/pnpm bare GitHub shorthand: `user/repo` expands to
326        // `github:user/repo`. Placed last so all explicit URL/scheme
327        // forms above shadow it.
328        format!("https://github.com/{body}.git")
329    } else {
330        return None;
331    };
332    Some((url, committish, subpath))
333}
334
/// Whether `body` has the bare GitHub shorthand shape `owner/repo`.
///
/// Requirements: exactly one `/`; both segments non-empty and made up
/// solely of ASCII alphanumerics plus `_`, `.`, `-`; and `owner` must
/// not start with `.`, which rejects single-component relative paths
/// (`./repo`, `../repo`). The character set also excludes scoped npm
/// names (`@scope/pkg`) and multi-segment file paths. Other URL/SCP
/// forms are ruled out by placement order in `parse_git_spec`.
fn is_bare_github_shorthand(body: &str) -> bool {
    // A second `/` would land in `repo` and fail the character check,
    // so "exactly one slash" needs no separate test.
    fn is_name_segment(segment: &str) -> bool {
        !segment.is_empty()
            && segment
                .bytes()
                .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'_' | b'.' | b'-'))
    }

    match body.split_once('/') {
        Some((owner, repo)) => {
            !owner.starts_with('.') && is_name_segment(owner) && is_name_segment(repo)
        }
        None => false,
    }
}
355
/// A git URL that maps to one of the three "hosted" providers npm /
/// pnpm both special-case (github / gitlab / bitbucket). For these
/// hosts a public read can be served as a flat HTTPS tarball over
/// `codeload.github.com` (or each host's equivalent), bypassing `git`
/// entirely. The lockfile's stored URL is canonical-identity only —
/// pnpm and npm both re-derive the fetch URL from `(host, owner,
/// repo)` on every install rather than dialing whatever scheme
/// happens to be in `resolved:`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HostedGit {
    /// Which provider hosts the repository.
    pub host: HostedGitHost,
    /// First path segment of the repository URL.
    pub owner: String,
    /// Second path segment of the repository URL, without `.git`.
    pub repo: String,
}

/// The providers recognized as "hosted" git.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HostedGitHost {
    GitHub,
    GitLab,
    Bitbucket,
}

impl HostedGit {
    /// `https://<host>/<owner>/<repo>.git` — the form `git fetch`
    /// can dial without an SSH key. Used as the runtime fetch URL when
    /// the lockfile's stored URL is `git+ssh://git@…` (npm canonical
    /// identity) but the actual install host has no SSH configured.
    pub fn https_url(&self) -> String {
        format!(
            "https://{}/{}/{}.git",
            self.host.host_domain(),
            self.owner,
            self.repo
        )
    }

    /// Flat HTTPS tarball URL for the given commit, e.g.
    /// `https://codeload.github.com/<owner>/<repo>/tar.gz/<sha>` (or
    /// the GitLab / Bitbucket equivalent). The SHA is lowercased in
    /// the result.
    ///
    /// Returns `None` unless `committish` is a 40-char hex SHA, since
    /// the codeload path can't be verified after extraction without
    /// `.git/` metadata. Branch / tag names round-trip through
    /// `git ls-remote` to get pinned to a SHA first.
    pub fn tarball_url(&self, committish: &str) -> Option<String> {
        let is_full_sha =
            committish.len() == 40 && committish.bytes().all(|b| b.is_ascii_hexdigit());
        if !is_full_sha {
            return None;
        }

        let sha = committish.to_ascii_lowercase();
        let Self { host, owner, repo } = self;
        let url = match host {
            HostedGitHost::GitHub => {
                format!("https://codeload.github.com/{owner}/{repo}/tar.gz/{sha}")
            }
            HostedGitHost::GitLab => {
                format!("https://gitlab.com/{owner}/{repo}/-/archive/{sha}/{repo}-{sha}.tar.gz")
            }
            HostedGitHost::Bitbucket => {
                format!("https://bitbucket.org/{owner}/{repo}/get/{sha}.tar.gz")
            }
        };
        Some(url)
    }
}

impl HostedGitHost {
    /// Reverse of [`HostedGitHost::host_domain`]: exact,
    /// case-sensitive match on the domain; `None` for anything else.
    fn from_domain(domain: &str) -> Option<Self> {
        [Self::GitHub, Self::GitLab, Self::Bitbucket]
            .iter()
            .copied()
            .find(|host| host.host_domain() == domain)
    }

    /// The provider's canonical domain name.
    pub fn host_domain(self) -> &'static str {
        match self {
            Self::GitHub => "github.com",
            Self::GitLab => "gitlab.com",
            Self::Bitbucket => "bitbucket.org",
        }
    }
}
434
435/// Parse a clone URL — in any form `parse_git_spec` accepts as input
436/// or produces as output — into its `(host, owner, repo)` components,
437/// when the host is one of the three providers npm / pnpm route
438/// through HTTPS tarballs. Returns `None` for any other host (including
439/// self-hosted GitLab / Gitea / Bitbucket Data Center): those still
440/// need a real `git clone` because no codeload-style HTTP archive is
441/// available.
442///
443/// Accepts:
444/// - `https://github.com/owner/repo[.git]`
445/// - `git+https://github.com/owner/repo[.git]`
446/// - `git://github.com/owner/repo[.git]`
447/// - `ssh://git@github.com/owner/repo[.git]`
448/// - `git+ssh://git@github.com/owner/repo[.git]` (npm canonical lockfile form)
449/// - `git@github.com:owner/repo[.git]` (scp shorthand, in case a caller
450///   parses raw lockfile fields without going through `parse_git_spec`)
451pub fn parse_hosted_git(url: &str) -> Option<HostedGit> {
452    let body = url.strip_prefix("git+").unwrap_or(url);
453    let after_scheme = if let Some(rest) = body.strip_prefix("https://") {
454        rest
455    } else if let Some(rest) = body.strip_prefix("http://") {
456        rest
457    } else if let Some(rest) = body.strip_prefix("ssh://") {
458        rest
459    } else if let Some(rest) = body.strip_prefix("git://") {
460        rest
461    } else {
462        // scp shorthand `user@host:path` — not produced by parse_git_spec
463        // but accepted defensively in case a raw lockfile string ever
464        // bypasses it.
465        let scp_path = parse_scp_url(body)?;
466        return parse_hosted_git(&scp_path);
467    };
468    // Strip optional `user@` (always `git@` for hosted forms).
469    let host_and_path = match after_scheme.split_once('@') {
470        Some((_, rest)) => rest,
471        None => after_scheme,
472    };
473    let (host, path) = host_and_path.split_once('/')?;
474    let host = HostedGitHost::from_domain(host)?;
475    // Take exactly two path segments: owner and repo. Anything beyond
476    // (subgroup-style GitLab paths) doesn't have a stable HTTPS tarball
477    // form on the three providers we care about, so refuse and let the
478    // caller fall back to clone.
479    let mut segs = path.splitn(3, '/');
480    let owner = segs.next()?;
481    let repo = segs.next()?;
482    if owner.is_empty() || repo.is_empty() || segs.next().is_some() {
483        return None;
484    }
485    let repo = repo
486        .strip_suffix(".git")
487        .unwrap_or(repo)
488        .trim_end_matches('/');
489    if repo.is_empty() {
490        return None;
491    }
492    Some(HostedGit {
493        host,
494        owner: owner.to_string(),
495        repo: repo.to_string(),
496    })
497}
498
/// Rewrite an scp-style git address `user@host:path` as
/// `ssh://user@host/path`.
///
/// Returns `None` when `body`:
/// - contains `://` (it is already a URL),
/// - has no `:` or no `@` before the first `:`,
/// - has an empty user or an empty path,
/// - has a path starting with `/`, or
/// - names a host other than github.com / gitlab.com / bitbucket.org.
///
/// pnpm 11 only resolves SCP-form as hosted Git for those three
/// providers; other hosts (e.g. `git@example.com:foo/bar.git`) are
/// treated as local paths, and `host:path` without a user errors.
fn parse_scp_url(body: &str) -> Option<String> {
    if body.contains("://") {
        return None;
    }
    let (login, path) = body.split_once(':')?;
    let (user, host) = login.split_once('@')?;

    // Matching the host against the literal allow-list also rules out
    // empty hosts and hosts containing `/` or a second `@`.
    let accepted = !user.is_empty()
        && !path.is_empty()
        && !path.starts_with('/')
        && matches!(host, "github.com" | "gitlab.com" | "bitbucket.org");

    accepted.then(|| format!("ssh://{user}@{host}/{path}"))
}
526
527/// Normalize git URL fragments used by npm-compatible lockfiles.
528///
529/// Plain git accepts `#<ref>`, while npm and Yarn Berry also write
530/// key/value fragments such as `#commit=<sha>` for pinned git deps.
531/// Downstream code passes this value directly to `git ls-remote` and
532/// `git checkout`, so strip the selector key here and keep only the
533/// actual ref name or SHA.
534pub(crate) fn normalize_git_fragment(fragment: &str) -> Option<String> {
535    parse_git_fragment(fragment).0
536}
537
/// Parse a git URL fragment into `(committish, subpath)`.
///
/// The fragment is a `&`-separated list of selectors. Each selector is
/// either a bare ref, a `key=value` pair (the form npm / Yarn Berry
/// write), or — for the recognized keys only — a `key:value` pair (the
/// pnpm/hosted-git-info form, e.g. `<ref>&path:/sub/dir`).
///
/// - `commit` takes precedence over every other ref selector.
/// - `tag`, `head`, `branch` and bare refs share one slot.
/// - Within each slot the first non-empty value wins.
/// - `path` yields the subpath, returned without leading slashes so
///   the caller can join it onto a clone dir without tripping the
///   absolute-path branch of `Path::join`.
/// - Unknown selectors and selectors with empty values are ignored.
pub(crate) fn parse_git_fragment(fragment: &str) -> (Option<String>, Option<String>) {
    // Keys that may use `:` instead of `=`. Anything else containing a
    // colon — a tag like `release:2026-01`, or `semver:^1.0.0` — stays
    // a literal ref so `ls-remote` surfaces an explicit error rather
    // than silently HEAD-ing.
    const COLON_KEYS: [&str; 5] = ["commit", "tag", "head", "branch", "path"];

    let mut pinned_commit: Option<&str> = None;
    let mut named_ref: Option<&str> = None;
    let mut subpath: Option<&str> = None;

    for selector in fragment.split('&').filter(|s| !s.is_empty()) {
        let (key, value) = match selector.split_once('=') {
            Some(pair) => pair,
            None => match selector.split_once(':') {
                Some((k, v)) if COLON_KEYS.contains(&k) => (k, v),
                _ => ("", selector),
            },
        };
        if value.is_empty() {
            continue;
        }

        match key {
            "commit" => pinned_commit = pinned_commit.or(Some(value)),
            "tag" | "head" | "branch" | "" => named_ref = named_ref.or(Some(value)),
            // First acceptable `path` wins, like the selectors above.
            "path" if subpath.is_none() => {
                // Strip leading slashes (pnpm writes `path:/sub`) and
                // refuse any empty, `.` or `..` component. Without
                // this, a crafted spec like `&path:/../../etc` would
                // let the resolver and installer escape the clone dir
                // and import an arbitrary host directory into the
                // store.
                let candidate = value.trim_start_matches('/');
                let stays_inside = !candidate.is_empty()
                    && candidate
                        .split('/')
                        .all(|part| !part.is_empty() && part != "." && part != "..");
                if stays_inside {
                    subpath = Some(candidate);
                }
            }
            _ => {}
        }
    }

    (
        pinned_commit.or(named_ref).map(str::to_owned),
        subpath.map(str::to_owned),
    )
}
608
609#[cfg(test)]
610mod tests {
611    use super::*;
612
613    #[test]
614    fn matches_https_tgz() {
615        assert!(LocalSource::looks_like_remote_tarball_url(
616            "https://example.com/pkg-1.0.0.tgz"
617        ));
618    }
619
620    #[test]
621    fn matches_http_tar_gz() {
622        assert!(LocalSource::looks_like_remote_tarball_url(
623            "http://example.com/pkg-1.0.0.tar.gz"
624        ));
625    }
626
627    #[test]
628    fn strips_fragment_before_suffix_check() {
629        assert!(LocalSource::looks_like_remote_tarball_url(
630            "https://example.com/pkg-1.0.0.tgz#sha512-abc"
631        ));
632    }
633
634    #[test]
635    fn strips_query_string_before_suffix_check() {
636        // Auth-token URLs from private registries (JFrog, Nexus,
637        // CodeArtifact, …) routinely trail `?token=…` after the
638        // filename. Must still classify as a tarball URL.
639        assert!(LocalSource::looks_like_remote_tarball_url(
640            "https://registry.example.com/pkg/-/pkg-1.0.0.tgz?token=abc"
641        ));
642        assert!(LocalSource::looks_like_remote_tarball_url(
643            "https://example.com/pkg-1.0.0.tar.gz?v=2&signed=1"
644        ));
645    }
646
647    #[test]
648    fn matches_bare_http_url_without_tarball_suffix() {
649        // pkg.pr.new serves tarballs from URLs without a `.tgz`
650        // extension; npm treats all non-git http(s) URLs as tarball
651        // URLs, so these must classify as remote tarballs.
652        assert!(LocalSource::looks_like_remote_tarball_url(
653            "https://pkg.pr.new/lunariajs/lunaria/@lunariajs/core@904b935"
654        ));
655        assert!(LocalSource::looks_like_remote_tarball_url(
656            "https://codeload.github.com/user/repo/tar.gz/main"
657        ));
658    }
659
660    #[test]
661    fn git_commits_match_only_allows_full_sha_prefix_pairs() {
662        let full = "abcdef0123456789abcdef0123456789abcdef01";
663        assert!(git_commits_match(full, "abcdef0"));
664        assert!(git_commits_match("abcdef0", full));
665        assert!(git_commits_match(full, full));
666        assert!(!git_commits_match("abcdef0", "abcdef012"));
667        assert!(!git_commits_match(full, "abcdef1"));
668        assert!(!git_commits_match("main", full));
669    }
670
671    #[test]
672    fn rejects_non_http_schemes() {
673        assert!(!LocalSource::looks_like_remote_tarball_url(
674            "ftp://example.com/pkg.tgz"
675        ));
676        assert!(!LocalSource::looks_like_remote_tarball_url(
677            "git://example.com/repo.git"
678        ));
679    }
680
681    #[test]
682    fn parse_classifies_bare_http_url_as_remote_tarball() {
683        use std::path::Path;
684        let parsed = LocalSource::parse(
685            "https://pkg.pr.new/lunariajs/lunaria/@lunariajs/core@904b935",
686            Path::new(""),
687        );
688        assert!(matches!(parsed, Some(LocalSource::RemoteTarball(_))));
689    }
690
691    #[test]
692    fn parse_prefers_git_over_tarball_for_dot_git_url() {
693        use std::path::Path;
694        let parsed = LocalSource::parse("https://github.com/user/repo.git", Path::new(""));
695        assert!(matches!(parsed, Some(LocalSource::Git(_))));
696    }
697
698    #[test]
699    fn parse_classifies_exec_as_local_source() {
700        let parsed = LocalSource::parse("exec:./scripts/generate.js", Path::new(""));
701        assert_eq!(
702            parsed,
703            Some(LocalSource::Exec(PathBuf::from("./scripts/generate.js")))
704        );
705    }
706
#[test]
fn git_plus_https_without_dot_git_roundtrips_via_lockfile_form() {
    let sha = "abcdef0123456789abcdef0123456789abcdef01";

    // Step 1: the user-written spec is `git+https://…/repo` with no
    // `.git` suffix, no committish and no subpath.
    let first_pass = parse_git_spec("git+https://host/user/repo").unwrap();
    assert_eq!(first_pass.0, "https://host/user/repo");
    assert_eq!(first_pass.1, None);
    assert_eq!(first_pass.2, None);

    // Step 2: once the ref is resolved, the serializer emits
    // `<url>#<sha>` — the form that lands in the lockfile's importer
    // `version:` field.
    let resolved_source = LocalSource::Git(GitSource {
        url: first_pass.0.clone(),
        committish: None,
        resolved: sha.to_owned(),
        integrity: None,
        subpath: None,
    });
    let written = resolved_source.specifier();
    assert_eq!(written, format!("https://host/user/repo#{sha}"));

    // Step 3: parsing the lockfile form again must still succeed. The
    // URL is bare, but the 40-hex committish suffix marks it as git.
    let second_pass = parse_git_spec(&written).unwrap();
    assert_eq!(second_pass.0, "https://host/user/repo");
    assert_eq!(second_pass.1.as_deref(), Some(sha));
    assert_eq!(second_pass.2, None);
}
736
#[test]
fn bare_https_without_dot_git_and_no_committish_is_not_git() {
    // With neither a `.git` suffix nor a SHA fragment, an `https://…`
    // URL is ambiguous (it might be a tarball), so the git parser must
    // decline it.
    let outcome = parse_git_spec("https://example.com/pkg");
    assert!(outcome.is_none());
}
743
#[test]
fn github_shorthand_expands_and_roundtrips() {
    // `github:user/repo` expands to the full https clone URL with a
    // `.git` suffix; only the URL component is checked here.
    let expanded = parse_git_spec("github:user/repo").unwrap().0;
    assert_eq!(expanded, "https://github.com/user/repo.git");
}
749
#[test]
fn bare_user_repo_expands_to_github() {
    // A bare `user/repo` spec is treated as GitHub shorthand, with no
    // committish and no subpath attached.
    let parts = parse_git_spec("kevva/is-negative").unwrap();
    assert_eq!(parts.0, "https://github.com/kevva/is-negative.git");
    assert!(parts.1.is_none());
    assert!(parts.2.is_none());
}
757
#[test]
fn bare_user_repo_with_committish_preserved() {
    // The `#<ref>` fragment on a bare `user/repo` shorthand must
    // survive the expansion to a GitHub URL.
    let parts = parse_git_spec("kevva/is-negative#v1.0.0").unwrap();
    assert_eq!(parts.0, "https://github.com/kevva/is-negative.git");
    assert_eq!(parts.1.as_deref(), Some("v1.0.0"));
}
764
#[test]
fn bare_scope_pkg_is_not_git_shorthand() {
    // `@scope/pkg` is an npm registry name; despite having the
    // `<a>/<b>` shape it must not be read as GitHub shorthand.
    let outcome = parse_git_spec("@types/node");
    assert!(outcome.is_none());
}
770
#[test]
fn bare_relative_path_is_not_git_shorthand() {
    // One-component relative paths would split into owner="." or
    // owner="..", so the "owner starts with `.`" guard is what rejects
    // them.
    for spec in ["./repo", "../repo"] {
        assert!(parse_git_spec(spec).is_none());
    }
    // Longer relative paths are also caught by the rule that a
    // shorthand contains exactly one `/`.
    for spec in ["./local/path", "../local/path"] {
        assert!(parse_git_spec(spec).is_none());
    }
}
782
#[test]
fn bare_path_with_extra_slashes_is_not_git_shorthand() {
    // GitHub shorthand is strictly `user/repo`; a spec with more than
    // one `/` is a filesystem-style path and must be rejected.
    let outcome = parse_git_spec("path/with/slashes/extra");
    assert!(outcome.is_none());
}
789
#[test]
fn bare_scp_form_unknown_host_is_not_github_shorthand() {
    // `user@host:repo.git` is scp syntax, which is accepted or refused
    // before the bare-shorthand branch; that branch must not claim it.
    let outcome = parse_git_spec("user@host:repo.git");
    assert!(outcome.is_none());
}
796
#[test]
fn scp_form_recognized() {
    // scp-style `git@github.com:owner/repo.git` is rewritten into an
    // `ssh://` URL, and no committish is invented.
    let parts = parse_git_spec("git@github.com:EthanHenrickson/math-mcp.git").unwrap();
    assert_eq!(parts.0, "ssh://git@github.com/EthanHenrickson/math-mcp.git");
    assert!(parts.1.is_none());
}
804
#[test]
fn scp_form_with_ref_recognized() {
    // Same scp-style rewrite as above, but the `#<ref>` fragment must
    // be carried through as the committish.
    let parts = parse_git_spec("git@github.com:EthanHenrickson/math-mcp.git#0.1.5").unwrap();
    assert_eq!(parts.0, "ssh://git@github.com/EthanHenrickson/math-mcp.git");
    assert_eq!(parts.1.as_deref(), Some("0.1.5"));
}
812
#[test]
fn scp_form_bitbucket_recognized() {
    // Bitbucket scp-style specs get the same `ssh://` rewrite.
    let rewritten = parse_git_spec("git@bitbucket.org:pnpmjs/git-resolver.git")
        .unwrap()
        .0;
    assert_eq!(rewritten, "ssh://git@bitbucket.org/pnpmjs/git-resolver.git");
}
818
#[test]
fn scp_form_unknown_host_rejected() {
    // Mirrors pnpm 11: `user@unknown-host:path` is considered a local
    // path rather than a git remote, so the git parser returns `None`.
    let unknown_host_specs = [
        "git@example.com:org/repo.git",
        "alice@host.example.com:org/repo.git",
    ];
    for spec in unknown_host_specs {
        assert!(parse_git_spec(spec).is_none());
    }
}
825
#[test]
fn scp_form_without_user_rejected() {
    // Mirrors pnpm 11, which reports bare `host:path` (no `user@`) as
    // unsupported.
    let outcome = parse_git_spec("github.com:user/repo.git");
    assert!(outcome.is_none());
}
831
#[test]
fn commit_selector_fragment_normalizes_to_sha() {
    // A `#commit=<sha>` fragment should be reduced to the bare SHA.
    let sha = "abcdef0123456789abcdef0123456789abcdef01";
    let spec = format!("https://host/user/repo.git#commit={sha}");
    let (clone_url, pinned, _) = parse_git_spec(&spec).unwrap();
    assert_eq!(clone_url, "https://host/user/repo.git");
    assert_eq!(pinned.as_deref(), Some(sha));
}
840
#[test]
fn named_selector_fragment_normalizes_to_ref() {
    // A `#tag=<name>` fragment should be reduced to the bare ref name,
    // and the `git+` scheme prefix dropped from the URL.
    let parts = parse_git_spec("git+https://host/user/repo#tag=v1.2.3").unwrap();
    assert_eq!(parts.0, "https://host/user/repo");
    assert_eq!(parts.1.as_deref(), Some("v1.2.3"));
}
847
#[test]
fn pnpm_path_subpath_extracted_from_fragment() {
    // pnpm's `<url>#<ref>&path:/<subdir>` form picks a subdirectory of
    // the cloned repo as the package root; the leading `/` is not part
    // of the extracted subpath.
    let parts = parse_git_spec("github:org/dep#v0.1.4&path:/packages/special").unwrap();
    assert_eq!(parts.0, "https://github.com/org/dep.git");
    assert_eq!(parts.1.as_deref(), Some("v0.1.4"));
    assert_eq!(parts.2.as_deref(), Some("packages/special"));
}
858
#[test]
fn path_subpath_roundtrips_via_specifier() {
    let sha = "abcdef0123456789abcdef0123456789abcdef01";

    // Serialize a resolved git source that has a subpath…
    let with_subpath = LocalSource::Git(GitSource {
        url: String::from("https://github.com/org/dep.git"),
        committish: None,
        resolved: String::from(sha),
        integrity: None,
        subpath: Some(String::from("packages/special")),
    });
    let written = with_subpath.specifier();
    let expected_spec = format!("https://github.com/org/dep.git#{sha}&path:/packages/special");
    assert_eq!(written, expected_spec);

    // …and confirm that parsing the serialized form recovers the URL,
    // the SHA (as committish) and the subpath.
    let reparsed = parse_git_spec(&written).unwrap();
    assert_eq!(reparsed.0, "https://github.com/org/dep.git");
    assert_eq!(reparsed.1.as_deref(), Some(sha));
    assert_eq!(reparsed.2.as_deref(), Some("packages/special"));
}
879
#[test]
fn parse_hosted_git_recognizes_canonical_forms() {
    // Every spelling below names the same (github.com, owner, repo)
    // triple. They must all collapse to one `HostedGit` value so the
    // runtime fetch URL is independent of the scheme recorded in the
    // lockfile.
    let equivalent_specs = [
        "https://github.com/owner/repo.git",
        "https://github.com/owner/repo",
        "http://github.com/owner/repo.git",
        "git+https://github.com/owner/repo.git",
        "git+https://github.com/owner/repo",
        "git://github.com/owner/repo.git",
        "ssh://git@github.com/owner/repo.git",
        "git+ssh://git@github.com/owner/repo.git",
        "git@github.com:owner/repo.git",
    ];
    let canonical = HostedGit {
        host: HostedGitHost::GitHub,
        owner: String::from("owner"),
        repo: String::from("repo"),
    };
    for spec in equivalent_specs {
        let recognized = parse_hosted_git(spec);
        assert_eq!(
            recognized.as_ref(),
            Some(&canonical),
            "spec {spec} should map to canonical HostedGit",
        );
    }
}
908
#[test]
fn parse_hosted_git_returns_none_for_non_hosted() {
    // Self-hosted GitLab / Gitea and arbitrary hosts have no codeload
    // template, so the codeload fast path must not apply to them. The
    // last two entries are github.com URLs that do not have the
    // `<owner>/<repo>` shape.
    let unsupported_specs = [
        "https://example.com/owner/repo.git",
        "ssh://git@gitea.internal/owner/repo.git",
        "git+ssh://git@gitlab.example.com/group/sub/repo.git",
        "https://github.com/owner/repo/sub",
        "https://github.com/owner",
    ];
    for spec in unsupported_specs {
        let recognized = parse_hosted_git(spec);
        assert!(
            recognized.is_none(),
            "spec {spec} must not match a hosted provider",
        );
    }
}
926
#[test]
fn hosted_tarball_url_only_for_full_sha() {
    let sha = "abcdef0123456789abcdef0123456789abcdef01";
    let hosted = HostedGit {
        host: HostedGitHost::GitHub,
        owner: String::from("o"),
        repo: String::from("r"),
    };

    // A full 40-char SHA yields the codeload tarball URL.
    let fast_path = hosted.tarball_url(sha);
    assert_eq!(
        fast_path.as_deref(),
        Some("https://codeload.github.com/o/r/tar.gz/abcdef0123456789abcdef0123456789abcdef01"),
    );

    // Branches, tags and abbreviated SHAs are excluded from the fast
    // path: codeload would accept them, but the wrapper-dir name
    // varies and a non-SHA committish can't be verified after
    // extraction.
    for committish in ["main", "v1.2.3", "abcdef0"] {
        assert!(hosted.tarball_url(committish).is_none());
    }
}
946
#[test]
fn hosted_tarball_url_per_provider() {
    let sha = "abcdef0123456789abcdef0123456789abcdef01";
    // Both providers are exercised with the same owner/repo pair; only
    // the host differs.
    let hosted_on = |host| HostedGit {
        host,
        owner: String::from("g"),
        repo: String::from("r"),
    };

    // GitLab: only the prefix and the suffix of the archive URL are
    // pinned here.
    let gitlab_url = hosted_on(HostedGitHost::GitLab).tarball_url(sha).unwrap();
    assert!(gitlab_url.starts_with("https://gitlab.com/g/r/-/archive/"));
    assert!(gitlab_url.ends_with("/r-abcdef0123456789abcdef0123456789abcdef01.tar.gz"));

    // Bitbucket: the whole URL is pinned.
    let bitbucket_url = hosted_on(HostedGitHost::Bitbucket)
        .tarball_url(sha)
        .unwrap();
    assert_eq!(
        bitbucket_url,
        "https://bitbucket.org/g/r/get/abcdef0123456789abcdef0123456789abcdef01.tar.gz",
    );
}
971
#[test]
fn hosted_https_url_normalizes() {
    // A `git+ssh://` spec for a hosted repo must report its plain
    // `https://` clone URL.
    let hosted = parse_hosted_git("git+ssh://git@github.com/owner/repo.git").unwrap();
    let https = hosted.https_url();
    assert_eq!(https, "https://github.com/owner/repo.git");
}
977
#[test]
fn path_traversal_components_in_subpath_are_rejected() {
    // A crafted spec with `..` or `.` components in its subpath could
    // escape the clone dir at install time. The parser is expected to
    // discard such a subpath entirely (the spec itself still parses),
    // so neither resolver nor installer ever sees it. The final case
    // checks an empty component (`//`) gets the same treatment.
    let hostile_specs = [
        "github:org/dep#main&path:/../../etc",
        "github:org/dep#main&path:/packages/../../../etc",
        "github:org/dep#main&path:/./packages/foo",
        "github:org/dep#main&path:/packages//foo",
    ];
    for spec in hostile_specs {
        let parsed = parse_git_spec(spec).unwrap();
        assert_eq!(parsed.2, None, "spec should drop subpath: {spec}");
    }
}
994
#[test]
fn dep_path_distinguishes_subpaths_under_same_commit() {
    // Two packages cut from the same repo and commit but living in
    // different subdirectories need distinct dep_paths; otherwise the
    // linker would collapse them into one.
    let sha = "abcdef0123456789abcdef0123456789abcdef01";
    let source_at = |subdir: &str| {
        LocalSource::Git(GitSource {
            url: String::from("https://example.com/r.git"),
            committish: None,
            resolved: String::from(sha),
            integrity: None,
            subpath: Some(String::from(subdir)),
        })
    };
    let a = source_at("packages/a");
    let b = source_at("packages/b");
    assert_ne!(a.dep_path("dep"), b.dep_path("dep"));
}
1017}