Skip to main content

lex_extension_host/resolve/
fetcher.rs

1//! [`Fetcher`] trait — the contract per-transport network resolvers
2//! implement.
3//!
4//! The host owns the cache (content-hashed lookup, TTL bookkeeping,
5//! `~/.cache/lex/labels/` layout); the fetcher only knows how to
6//! fetch one URI's contents to a directory. This split keeps the
7//! per-transport implementation small — the git fetcher only needs to
8//! shell out to `git clone`, not understand lex's cache layout.
9//!
10//! ## Transports vs. URL templates
11//!
12//! The model (specified in `comms/specs/proposals/extending-lex-stores.lex`)
13//! decomposes the resolver into three real *transports* and N *URL
14//! templates*:
15//!
16//! - **Transports** carry the actual data movement. Three ship today:
17//!   - `path:` — built-in local filesystem read, implemented in
18//!     [`super::path`]. Special-cased upstream of registry dispatch
19//!     (no [`Fetcher`] impl, no cache); listed here for completeness
20//!     of the transport set.
21//!   - `https:` — HTTPS GET of a tarball/zip, implemented as the
22//!     [`HttpsFetcher`] [`Fetcher`] in this module.
23//!   - `git:` / `git+ssh:` — git clone, implemented as the
24//!     [`GitFetcher`] [`Fetcher`] in this module. Accepts any URL
25//!     form `git clone` accepts; the `git+ssh:` scheme is retained
26//!     for backwards compatibility and dispatched to the same fetcher.
27//! - **URL templates** are forge-shorthands that expand into a
28//!   transport URI before registry dispatch. They live in
29//!   [`super::template`] and have no `Fetcher` impl — they're pure
30//!   functions over URIs. `github:owner/repo` and `gitlab:owner/repo`
31//!   are the two templates shipped today.
32//!
//! Implementation status: both network transports ship today (`path:`
//! is the built-in third and never touches the network). `https:`
//! uses ureq + tar + zip extraction (see [`HttpsFetcher`]); `git:` /
//! `git+ssh:` shell out to `git clone --depth=1` (see [`GitFetcher`]).
36
37use std::path::Path;
38use std::process::Command;
39
40#[cfg(feature = "https-fetcher")]
41use std::io::Read;
42
43use super::uri::ParsedUri;
44
45/// Per-transport network resolver. Implementations fetch the URI's
46/// contents into a caller-provided destination directory.
47///
48/// ## Contract
49///
50/// - **`dest` is an empty directory the caller owns.** The fetcher
51///   writes the schema files (or a subdirectory if the URI's
52///   `subdir` knob is set) directly into `dest`. Cache layout,
53///   content hashing, and TTL bookkeeping are the host's
54///   responsibility; the fetcher just fetches.
55/// - **Honour `uri.subdir` if present.** After extracting a tarball
56///   or cloning a repo, copy the contents of `uri.subdir/` (relative
57///   to the fetched root) into `dest`, not the whole repo. The
58///   schema loader scans `dest` directly — it doesn't descend.
59/// - **Return [`FetchError`] variants the host can surface.** Keep
60///   the per-fetcher error type small; specific causes (HTTP status
61///   code, git error code) go in the `Other` variant's message.
62pub trait Fetcher: Send + Sync {
63    /// Fetch `uri`'s contents into `dest`. `dest` is guaranteed to
64    /// exist and be empty when this is called.
65    fn fetch(&self, uri: &ParsedUri, dest: &Path) -> Result<(), FetchError>;
66
67    /// URI schemes this fetcher handles. Typically a single-element
68    /// slice (one fetcher per scheme), but a fetcher can claim
69    /// multiple schemes if its implementation is shared — e.g.,
70    /// [`GitFetcher`] claims both `git` and `git+ssh` because the
71    /// underlying `git clone` accepts both URL forms.
72    ///
73    /// Returned as `&'static [&'static str]` so the
74    /// [`super::registry::FetcherRegistry`] can build its scheme map
75    /// without allocating.
76    fn schemes(&self) -> &'static [&'static str];
77
78    /// True when `rev` is an immutable reference (Git tag, content
79    /// hash, SHA). Drives cache TTL: immutable refs cache
80    /// indefinitely; mutable refs (branches, `None`) have a 24-hour
81    /// TTL after which the cache invalidates and the next resolve
82    /// re-fetches.
83    ///
84    /// Default: `false` for any input. Fetchers should override
85    /// when they can confidently distinguish — e.g., [`GitFetcher`]
86    /// would return `true` for `rev` matching `^[0-9a-f]{7,40}$`
87    /// (SHA-ish) or `^v?\d+\.\d+`-ish (tag heuristic). Returning
88    /// `false` from a default-impl-using fetcher is always safe
89    /// (cache invalidates more often than necessary; never less).
90    fn is_immutable_rev(&self, _rev: Option<&str>) -> bool {
91        false
92    }
93}
94
/// Failure modes a [`Fetcher`] can report. The top-level resolve API
/// wraps these in [`super::ResolveError::Fetch`].
#[non_exhaustive]
#[derive(Debug)]
pub enum FetchError {
    /// Placeholder returned by pre-implementation stubs: the fetcher
    /// for `scheme` does not exist (yet, or in this build). Real
    /// fetchers never produce it.
    Unimplemented { scheme: String, message: String },
    /// The network layer failed — timeout, DNS, connection refused,
    /// and similar.
    Network { message: String },
    /// The remote side answered with a non-success status (HTTP
    /// 4xx/5xx, git permission denied, …).
    UpstreamStatus { status: String, message: String },
    /// The downloaded archive could not be unpacked (corrupt tarball,
    /// unrecognised format, …).
    Extract { message: String },
    /// A local write performed by the fetcher failed (disk full,
    /// permission denied on the cache dir, …).
    Io(std::io::Error),
    /// A per-fetcher condition none of the variants above describes.
    /// Use sparingly; a recurring condition deserves its own typed
    /// variant.
    Other { message: String },
}
119
120impl std::fmt::Display for FetchError {
121    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
122        match self {
123            FetchError::Unimplemented { scheme, message } => {
124                write!(f, "`{scheme}:` resolver not implemented: {message}")
125            }
126            FetchError::Network { message } => write!(f, "network error: {message}"),
127            FetchError::UpstreamStatus { status, message } => {
128                write!(f, "upstream returned {status}: {message}")
129            }
130            FetchError::Extract { message } => write!(f, "archive extraction failed: {message}"),
131            FetchError::Io(e) => write!(f, "fetcher io error: {e}"),
132            FetchError::Other { message } => write!(f, "{message}"),
133        }
134    }
135}
136
137impl std::error::Error for FetchError {
138    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
139        match self {
140            FetchError::Io(e) => Some(e),
141            _ => None,
142        }
143    }
144}
145
146impl From<std::io::Error> for FetchError {
147    fn from(e: std::io::Error) -> Self {
148        FetchError::Io(e)
149    }
150}
151
/// Transport for `https:` URIs that point at a tarball or zip archive.
///
/// One HTTPS GET is issued against the URI body; the response is
/// expected to be a `tar.gz` or `zip` archive, which is then unpacked
/// into the destination directory. `uri.subdir` is honoured, which
/// covers archives that wrap everything in a top-level directory (the
/// GitHub tarball API does) as well as archives that ship schemas next
/// to unrelated content.
///
/// The `github:` and `gitlab:` URL templates expand into this
/// transport whenever their `via` knob selects https — the default;
/// see [`super::template`].
///
/// Authentication is meant to work through an optional `Authorization`
/// (or arbitrary) header pass-through with `${ENV_VAR}` interpolation.
/// Wiring that header through from `lex-config` is still a follow-up
/// (issue #651), so at present the fetcher reads no headers from
/// configuration.
///
/// Implementation notes:
///
/// - Synchronous, via `ureq`, so tokio stays off the resolver boot
///   path. `rustls` + `webpki-roots` mean HTTPS works without an
///   OS-provided OpenSSL.
/// - Responses are capped at 256 MiB so a pathological server cannot
///   OOM the host.
/// - Path-traversal defence: archive members with absolute paths or
///   `..` components are rejected, and symlinks are skipped.
///
/// See `comms/specs/proposals/extending-lex-stores.lex` §3.2 and §6.2.
#[derive(Debug, Default, Clone, Copy)]
pub struct HttpsFetcher;
179
/// Upper bound on the size of a downloaded archive. 256 MiB is far
/// more than any plausible schema bundle needs; anything larger is
/// almost certainly the wrong artifact behind the wrong URI.
#[cfg(feature = "https-fetcher")]
const HTTPS_RESPONSE_CAP_BYTES: u64 = 256 * 1024 * 1024;

/// Upper bound on how much of an error response body is read. The
/// body only feeds a diagnostic string, so it has no reason to be
/// large — and a hostile or broken server answering 500 with a 1 GiB
/// body must not be able to exhaust memory through the error path.
#[cfg(feature = "https-fetcher")]
const HTTPS_ERROR_BODY_CAP_BYTES: u64 = 64 * 1024;

/// Connect timeout applied to each request. Resolution happens at
/// boot, so a stalled server must not be able to hang it forever.
#[cfg(feature = "https-fetcher")]
const HTTPS_CONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);

/// Read timeout for the HTTPS agent — generous enough for slow tarball
/// fetches over flaky links, tight enough that a wedged connection
/// does not sit forever.
///
/// NOTE(review): this value is handed to ureq's
/// `AgentBuilder::timeout_read`, which ureq documents as a timeout on
/// individual socket reads rather than a whole-request deadline. If an
/// overall DNS-to-EOF bound is intended, confirm whether
/// `AgentBuilder::timeout` should be used instead.
#[cfg(feature = "https-fetcher")]
const HTTPS_READ_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(120);
203
#[cfg(feature = "https-fetcher")]
impl Fetcher for HttpsFetcher {
    /// Download the archive at `https:<uri.body>` and unpack it (or
    /// only its `uri.subdir`) into `dest`.
    ///
    /// # Errors
    ///
    /// - Whatever [`map_ureq_error`] produces when the request itself
    ///   fails (`UpstreamStatus` for HTTP error statuses, `Network`
    ///   for transport failures).
    /// - [`FetchError::Network`] when reading the response body fails
    ///   part-way through the download.
    /// - [`FetchError::Io`] when creating, writing, or rewinding the
    ///   local tempfile fails.
    /// - [`FetchError::Extract`] when the body exceeds
    ///   `HTTPS_RESPONSE_CAP_BYTES` or the archive cannot be unpacked.
    fn fetch(&self, uri: &ParsedUri, dest: &Path) -> Result<(), FetchError> {
        use std::io::{Seek, Write};

        // ParsedUri::body for `https:` includes the leading `//`
        // (`//api.github.com/...`); ureq wants the full URL with
        // scheme, so reconstruct.
        let url = format!("https:{}", uri.body);

        let agent = ureq::AgentBuilder::new()
            .timeout_connect(HTTPS_CONNECT_TIMEOUT)
            .timeout_read(HTTPS_READ_TIMEOUT)
            .build();

        let response = agent
            .get(&url)
            .set(
                "User-Agent",
                "lex-extension-host (https://github.com/lex-fmt/lex)",
            )
            .call()
            .map_err(map_ureq_error)?;

        // Copy the header out before `into_reader` consumes the response.
        let content_type = response.header("Content-Type").map(|s| s.to_string());
        let format = super::extract::detect_format(content_type.as_deref(), &uri.body);

        // Stream the body to a tempfile rather than buffering the whole
        // archive in memory: bundles are typically KB-MB, but the cap
        // is 256 MiB. `zip::ZipArchive` needs `Read + Seek`, which a
        // File provides; `tar::Archive` accepts it too.
        //
        // Reading one byte past the cap lets us distinguish "exactly at
        // the cap" from "over the cap" without draining the socket.
        let mut body = response.into_reader().take(HTTPS_RESPONSE_CAP_BYTES + 1);
        let mut temp = tempfile::tempfile()?;

        // The copy loop is spelled out (instead of `std::io::copy`) so
        // the two failure sources stay distinguishable: a failed socket
        // read is a network problem, a failed tempfile write is local
        // IO. `std::io::copy` would report both as the same io::Error.
        let mut chunk = vec![0u8; 64 * 1024];
        let mut written: u64 = 0;
        loop {
            let n = match body.read(&mut chunk) {
                Ok(0) => break,
                Ok(n) => n,
                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                Err(e) => {
                    return Err(FetchError::Network {
                        message: format!("reading response body failed: {e}"),
                    });
                }
            };
            temp.write_all(&chunk[..n])?;
            // usize -> u64 never truncates on supported targets.
            written += n as u64;
        }

        if written > HTTPS_RESPONSE_CAP_BYTES {
            return Err(FetchError::Extract {
                message: format!("response exceeded {HTTPS_RESPONSE_CAP_BYTES}-byte cap"),
            });
        }
        temp.rewind()?;

        super::extract::extract_archive_into(temp, format, dest, uri.subdir.as_deref())
            .map_err(map_extract_error)?;

        Ok(())
    }

    /// This fetcher serves the `https` scheme only.
    fn schemes(&self) -> &'static [&'static str] {
        &["https"]
    }
}
256
257/// Stub HttpsFetcher impl for builds that disable the `https-fetcher`
258/// feature (notably wasm32-unknown-unknown, where `ring`'s `getrandom`
259/// dep doesn't compile). Returns [`FetchError::Unimplemented`] so the
260/// trait shape stays uniform across feature variants — callers don't
261/// need to special-case "is this build's HttpsFetcher real?"
262#[cfg(not(feature = "https-fetcher"))]
263impl Fetcher for HttpsFetcher {
264    fn fetch(&self, _uri: &ParsedUri, _dest: &Path) -> Result<(), FetchError> {
265        Err(FetchError::Unimplemented {
266            scheme: "https".into(),
267            message: "https: fetcher disabled at build time (the `https-fetcher` feature on lex-extension-host wasn't enabled — common for wasm targets where the underlying TLS chain doesn't compile)".into(),
268        })
269    }
270
271    fn schemes(&self) -> &'static [&'static str] {
272        &["https"]
273    }
274}
275
/// Translate a `ureq` failure into the fetcher's error vocabulary.
///
/// - `ureq::Error::Status` becomes [`FetchError::UpstreamStatus`]: the
///   status code is rendered as text and the (capped) response body
///   becomes the message, or `<empty body>` when nothing was received.
/// - `ureq::Error::Transport` becomes [`FetchError::Network`] carrying
///   the transport error's display text.
#[cfg(feature = "https-fetcher")]
fn map_ureq_error(e: ureq::Error) -> FetchError {
    match e {
        ureq::Error::Status(code, response) => {
            use std::io::Read as _;

            // Cap the error-body read at 64 KiB. An unbounded read
            // would let a misbehaving server bypass
            // HTTPS_RESPONSE_CAP_BYTES (which only guards the success
            // path) and exhaust memory on the error diagnostic.
            let mut raw = Vec::new();
            // Best effort: a failed read still leaves whatever arrived
            // before the failure in `raw`, which is the most useful
            // diagnostic we can offer.
            let _ = response
                .into_reader()
                .take(HTTPS_ERROR_BODY_CAP_BYTES)
                .read_to_end(&mut raw);

            // Decode lossily rather than with `read_to_string`: strict
            // decoding discards the entire body when it is not valid
            // UTF-8 — which also happens whenever the byte cap cuts a
            // multi-byte character in half — and the caller would then
            // see `<empty body>` for a body that was not empty.
            let message = if raw.is_empty() {
                "<empty body>".to_string()
            } else {
                String::from_utf8_lossy(&raw).into_owned()
            };

            FetchError::UpstreamStatus {
                status: code.to_string(),
                message,
            }
        }
        ureq::Error::Transport(t) => FetchError::Network {
            message: t.to_string(),
        },
    }
}
305
/// Convert an extraction failure into a [`FetchError`]. An underlying
/// IO error keeps its identity as [`FetchError::Io`]; every other
/// extraction failure is flattened to its display text inside
/// [`FetchError::Extract`].
#[cfg(feature = "https-fetcher")]
fn map_extract_error(e: super::extract::ExtractError) -> FetchError {
    use super::extract::ExtractError;
    match e {
        // Routed through `From<std::io::Error>`, i.e. `FetchError::Io`.
        ExtractError::Io(inner) => inner.into(),
        non_io => {
            let message = format!("{non_io}");
            FetchError::Extract { message }
        }
    }
}
316
/// Transport backed by the `git` command-line client.
///
/// A fetch runs `git clone --depth=1` into the destination directory.
/// `uri.rev`, when set, is passed as `--branch` (a branch or tag
/// name); `uri.subdir`, when set, selects the subdirectory of the repo
/// that becomes the schema root. `.git/` is not kept after the clone —
/// the cache holds schema content only.
///
/// Both the `git:` and `git+ssh:` schemes are served (see
/// [`Self::schemes`]). Supported URL forms:
///
/// - `git:https://host/path/repo.git` — the body is the complete URL
///   and is handed to `git clone` unchanged.
/// - `git:git@host:owner/repo.git` — the body is the scp-like URL and
///   is handed to `git clone` unchanged.
/// - `git:file:///path/to/bare` — the body is a `file://` URL for a
///   local bare repo (used by tests, and as the local-mirror escape
///   hatch).
/// - `git+ssh://git@host/path/repo.git` — the body is
///   `//git@host/...`; the fetcher rebuilds `git+ssh://...` for the
///   clone command (git accepts the `git+ssh` scheme directly).
///
/// ## Why shell out
///
/// Spec §6.3 has the full reasoning. In short: libgit2's credential
/// coverage has gaps that matter (macOS keychain integration, SAML
/// SSO, Kerberos), so a libgit2-backed fetcher would split users into
/// "private repos that work" and "private repos that don't" with no
/// clear story for either. Shelling out inherits everything
/// `git clone` honours on the command line: the SSH agent, OS keychain
/// helpers (osxkeychain, libsecret, GCM, GCMcore),
/// `gh auth setup-git`, and `gitconfig`-declared SSO providers. Lex
/// itself exposes no credential knob.
///
/// ## Constraints
///
/// `git` has to be on `PATH`. When the binary is missing the fetcher
/// returns [`FetchError::Other`] with a message telling the user to
/// install git or to fall back to a [`path:`-scheme] / `--ext-schema`
/// local schema. Spec §6.3 explains why git is not bundled.
///
/// ## Errors
///
/// Git's stderr is sorted into typed variants for the host's
/// diagnostic surface:
///
/// - [`FetchError::Network`] — connectivity failures (DNS, connection
///   refused/timeout, unreachable).
/// - [`FetchError::UpstreamStatus`] — auth-shaped failures (permission
///   denied, authentication failed, repository not found — which the
///   github/gitlab APIs also use to signal "private repo, not
///   authorised").
/// - [`FetchError::Other`] — anything else (unknown ref, corrupted
///   upstream, etc.), carrying git's raw stderr text.
///
/// ## Interaction with URL templates
///
/// The `github:` and `gitlab:` URL templates expand into this
/// transport when their `via` knob is `"git"` (the private-repo path).
/// Their default is `via = "https"`, which goes through the
/// [`HttpsFetcher`] tarball API instead. See [`super::template`].
///
/// [`path:`-scheme]: super::path
#[derive(Debug, Default, Clone, Copy)]
pub struct GitFetcher;
381
382impl Fetcher for GitFetcher {
383    fn fetch(&self, uri: &ParsedUri, dest: &Path) -> Result<(), FetchError> {
384        let url = reconstruct_git_url(&uri.scheme, &uri.body);
385
386        // Normalize subdir (`/labels/`, `labels/`, `/labels` → `labels`).
387        // Empty after trim → treat as no subdir.
388        let subdir = uri
389            .subdir
390            .as_deref()
391            .map(|s| s.trim_matches('/').to_string())
392            .filter(|s| !s.is_empty());
393
394        // Clone into a hidden subdirectory of dest, then promote the
395        // desired contents (whole repo, or `subdir/` if set) up to
396        // dest. Cloning into a subdirectory of dest avoids needing a
397        // tempfile dep (the issue spec calls for std-only deps) and
398        // avoids needing write access to dest's parent. The `.lex-` /
399        // dot prefix means we won't shadow any real file the schema
400        // ships.
401        let clone_dir = dest.join(".lex-git-clone");
402
403        // Validate subdir upfront so we don't bother cloning if it
404        // would escape the clone root. Rejects `..` components,
405        // absolute paths, and platform prefixes — the symlink check
406        // happens post-clone (we need the on-disk tree to inspect).
407        if let Some(sub) = subdir.as_deref() {
408            validate_subdir(sub)?;
409        }
410
411        let mut cmd = Command::new("git");
412        cmd.arg("clone").arg("--depth=1");
413        if let Some(rev) = uri.rev.as_deref().filter(|s| !s.is_empty()) {
414            // `--branch` accepts arbitrary user input (the rev). git
415            // does not interpret its `--branch` argument as a flag —
416            // it expects a ref name — but the rev still flows from a
417            // namespace config the user wrote, so terminate option
418            // parsing before any user-controlled positional with
419            // `--` below.
420            cmd.arg("--branch").arg(rev);
421        }
422        // `--` terminates option parsing so a URL starting with `-`
423        // can't be mistaken for a flag (option-injection defence; the
424        // URL flows from user config). Same reasoning for the
425        // clone-dir positional, though that one is host-controlled.
426        cmd.arg("--").arg(&url).arg(&clone_dir);
427        // Suppress interactive credential prompts; if the user's
428        // credential helper can't satisfy the request non-interactively
429        // we want a clean error rather than a hung boot path.
430        cmd.env("GIT_TERMINAL_PROMPT", "0");
431
432        let output = cmd.output().map_err(|e| {
433            if e.kind() == std::io::ErrorKind::NotFound {
434                FetchError::Other {
435                    message: "git binary not in PATH; install git, or use a `path:` URI / `--ext-schema` flag for a local schema".into(),
436                }
437            } else {
438                FetchError::Io(e)
439            }
440        })?;
441
442        if !output.status.success() {
443            // Best-effort cleanup of the partial clone dir before
444            // surfacing the error.
445            let _ = std::fs::remove_dir_all(&clone_dir);
446            let stderr = String::from_utf8_lossy(&output.stderr).into_owned();
447            return Err(classify_git_clone_error(&stderr));
448        }
449
450        // Source of the content we keep is either `<clone>/<subdir>`
451        // or `<clone>` directly. `safe_subdir_join` walks the path
452        // component by component, refusing to follow any component
453        // that's a symlink on disk — a repo-shipped symlink at the
454        // subdir root would otherwise let us read/copy arbitrary
455        // filesystem paths into the cache.
456        let source = match subdir.as_deref() {
457            Some(sub) => match safe_subdir_join(&clone_dir, sub) {
458                Ok(p) => {
459                    if !p.is_dir() {
460                        let _ = std::fs::remove_dir_all(&clone_dir);
461                        return Err(FetchError::Other {
462                            message: format!(
463                                "subdir `{sub}` not found in cloned repo (clone succeeded but the path doesn't exist)"
464                            ),
465                        });
466                    }
467                    p
468                }
469                Err(e) => {
470                    let _ = std::fs::remove_dir_all(&clone_dir);
471                    return Err(e);
472                }
473            },
474            None => clone_dir.clone(),
475        };
476
477        // Copy contents into dest. Skip `.git` (the cache only holds
478        // schema content) and skip the `.lex-git-clone` directory
479        // itself (we're walking it as a source, but for the no-subdir
480        // case it's literally a sibling of where we're writing, so
481        // exclude it to avoid recursive copying).
482        copy_dir_contents(&source, dest, &clone_dir).map_err(FetchError::Io)?;
483
484        // Clean up the clone dir; we've copied what we need.
485        std::fs::remove_dir_all(&clone_dir).map_err(FetchError::Io)?;
486
487        Ok(())
488    }
489
490    fn schemes(&self) -> &'static [&'static str] {
491        &["git", "git+ssh"]
492    }
493
494    fn is_immutable_rev(&self, rev: Option<&str>) -> bool {
495        is_immutable_git_rev(rev)
496    }
497}
498
/// Build the URL handed to `git clone` from a parsed URI's
/// `(scheme, body)` pair.
///
/// - `git+ssh:<body>` (where body is `//user@host/path`) is
///   reassembled as `git+ssh:<body>`; git accepts `git+ssh://...` as a
///   synonym for `ssh://...` since 2.x.
/// - `git:<body>` yields `<body>` untouched — the body already is the
///   verbatim URL (`https://...`, `git@host:path`, `file:///...`,
///   etc.).
///
/// Any other scheme is treated like `git:`. The registry only routes
/// `git:` and `git+ssh:` here, but a custom registry might route
/// something else, so the fallback is deliberately permissive.
fn reconstruct_git_url(scheme: &str, body: &str) -> String {
    if scheme == "git+ssh" {
        let mut url = String::with_capacity("git+ssh:".len() + body.len());
        url.push_str("git+ssh:");
        url.push_str(body);
        url
    } else {
        body.to_owned()
    }
}
516
517/// Reject `subdir` values that escape the clone root *lexically*
518/// (before any disk lookup): `..` components, absolute paths, and
519/// platform prefixes (Windows drive letters, UNC roots). The
520/// post-clone path-component walk in [`safe_subdir_join`] catches the
521/// symlink-escape case; this is the first line of defence and runs
522/// pre-clone so a hostile subdir doesn't even cost us a network round
523/// trip.
524fn validate_subdir(subdir: &str) -> Result<(), FetchError> {
525    use std::path::Component;
526    let path = Path::new(subdir);
527    if path.is_absolute() {
528        return Err(FetchError::Other {
529            message: format!(
530                "subdir `{subdir}` is absolute; subdir must be a relative path within the cloned repo"
531            ),
532        });
533    }
534    for component in path.components() {
535        match component {
536            Component::ParentDir => {
537                return Err(FetchError::Other {
538                    message: format!(
539                        "subdir `{subdir}` contains `..`; refusing to escape the clone root"
540                    ),
541                });
542            }
543            Component::Prefix(_) | Component::RootDir => {
544                return Err(FetchError::Other {
545                    message: format!(
546                        "subdir `{subdir}` is rooted (absolute path or platform prefix); refusing"
547                    ),
548                });
549            }
550            Component::Normal(_) | Component::CurDir => {}
551        }
552    }
553    Ok(())
554}
555
556/// Join `clone_dir + subdir` while refusing to traverse a symlink at
557/// any intermediate component. The lexical [`validate_subdir`] is the
558/// belt; this is the suspenders for the on-disk side — a repo-shipped
559/// symlinked directory (`git checkout` happily restores symlinks from
560/// the tree object) at any point in the subdir path would otherwise
561/// let the post-clone copy read/write filesystem locations outside the
562/// clone root.
563///
564/// Walks each `Normal` component, accumulating the path, and rejects
565/// the first one whose `symlink_metadata` reports a symlink. Missing
566/// intermediate components terminate the walk early — the caller
567/// surfaces "subdir not found" against the joined path.
568fn safe_subdir_join(clone_dir: &Path, subdir: &str) -> Result<std::path::PathBuf, FetchError> {
569    use std::path::Component;
570    // validate_subdir is the lexical pre-flight; call it again here
571    // so this helper is safe to use standalone (defence in depth).
572    validate_subdir(subdir)?;
573
574    let mut accumulated = clone_dir.to_path_buf();
575    for component in Path::new(subdir).components() {
576        if let Component::Normal(name) = component {
577            accumulated.push(name);
578            match std::fs::symlink_metadata(&accumulated) {
579                Ok(meta) if meta.file_type().is_symlink() => {
580                    return Err(FetchError::Other {
581                        message: format!(
582                            "subdir component `{}` is a symlink in the cloned repo; refusing to follow",
583                            accumulated
584                                .strip_prefix(clone_dir)
585                                .unwrap_or(&accumulated)
586                                .display()
587                        ),
588                    });
589                }
590                Ok(_) | Err(_) => {
591                    // Non-symlink → keep walking. Missing entry →
592                    // stop; the caller handles "subdir not found"
593                    // against the final joined path.
594                }
595            }
596        }
597    }
598    Ok(accumulated)
599}
600
601/// Classify git's stderr into a typed [`FetchError`]. The heuristics
602/// are conservative — when we can't recognise the failure shape we
603/// fall through to [`FetchError::Other`] with the raw stderr so the
604/// user sees git's own message verbatim.
605fn classify_git_clone_error(stderr: &str) -> FetchError {
606    let lower = stderr.to_ascii_lowercase();
607    if lower.contains("could not resolve host")
608        || lower.contains("could not connect")
609        || lower.contains("connection refused")
610        || lower.contains("connection timed out")
611        || lower.contains("network is unreachable")
612        || lower.contains("no route to host")
613    {
614        FetchError::Network {
615            message: stderr.trim().to_string(),
616        }
617    } else if lower.contains("permission denied")
618        || lower.contains("authentication failed")
619        || lower.contains("could not read username")
620        || lower.contains("access denied")
621        || lower.contains("repository not found")
622    {
623        // Note: `could not read from remote repository` is intentionally
624        // NOT in this list. Git emits that line on most clone failures
625        // (auth, missing repo, wrong endpoint, etc.) so it's too broad
626        // to disambiguate. The auth-shaped failures all surface a more
627        // specific marker above; everything else falls through to
628        // FetchError::Other with git's raw stderr.
629        FetchError::UpstreamStatus {
630            status: "auth".into(),
631            message: stderr.trim().to_string(),
632        }
633    } else {
634        FetchError::Other {
635            message: stderr.trim().to_string(),
636        }
637    }
638}
639
/// Heuristic: does `rev` look like a git reference that will never
/// move? The result drives the cache TTL — immutable refs are cached
/// indefinitely, mutable ones expire after 24 hours.
///
/// Two shapes count as immutable:
///
/// - **SHA-shaped** — 7 to 40 characters, all lowercase hex. Covers
///   short SHAs (`abc1234`) and full 40-character SHAs. Uppercase hex
///   is rejected on purpose: git emits lowercase, and accepting
///   uppercase would widen the false-positive surface for branch names
///   that happen to look like hex.
/// - **Tag-shaped** — an optional `v`, then `<digits>.<digit…>`
///   (`v1.2`, `1.2`, `v0.14.0`, `1.2.3-rc4`, …). Requiring two
///   dot-separated numeric parts rules out single-number "branches"
///   such as `1` while keeping the usual semver-style tag shape.
///
/// Everything else — branch names, `None` — is reported as mutable.
fn is_immutable_git_rev(rev: Option<&str>) -> bool {
    let Some(rev) = rev else { return false };

    // SHA: 7-40 lowercase hex digits and nothing else.
    let sha_shaped = (7..=40).contains(&rev.len())
        && rev
            .bytes()
            .all(|b| b.is_ascii_digit() || (b'a'..=b'f').contains(&b));
    if sha_shaped {
        return true;
    }

    // Tag: only the first two dot-separated parts are examined; any
    // further components are allowed and ignored.
    let version = rev.strip_prefix('v').unwrap_or(rev);
    let mut parts = version.split('.');
    match (parts.next(), parts.next()) {
        (Some(major), Some(minor)) => {
            let major_is_number = !major.is_empty() && major.bytes().all(|b| b.is_ascii_digit());
            // The second part only has to *start* with a digit; tag
            // names can carry arbitrary decoration after it
            // (`1.2-pre` → `2-pre`).
            let minor_starts_with_digit =
                matches!(minor.as_bytes().first(), Some(b) if b.is_ascii_digit());
            major_is_number && minor_starts_with_digit
        }
        _ => false,
    }
}
687
688/// Recursively copy contents of `src` into `dest`. Skips:
689///
690/// - The top-level `.git` directory (we don't need git's index /
691///   objects in the cache — only the schema content).
692/// - Anything matching `skip_path` (used to skip the
693///   `.lex-git-clone/` directory itself when `src` is its sibling, so
694///   the copy doesn't recursively follow into the source-of-truth).
695/// - Symlinks (same trust-surface reasoning as the extract module —
696///   archive/repo-shipped symlinks expand what the schema loader
697///   trusts).
698/// - Special files (sockets, FIFOs).
699fn copy_dir_contents(src: &Path, dest: &Path, skip_path: &Path) -> std::io::Result<()> {
700    for entry in std::fs::read_dir(src)? {
701        let entry = entry?;
702        let src_path = entry.path();
703        if src_path == skip_path {
704            continue;
705        }
706        let name = entry.file_name();
707        if name == ".git" {
708            continue;
709        }
710        let dest_path = dest.join(&name);
711        let file_type = entry.file_type()?;
712        if file_type.is_symlink() {
713            continue;
714        }
715        if file_type.is_dir() {
716            std::fs::create_dir_all(&dest_path)?;
717            copy_dir_contents_no_skip(&src_path, &dest_path)?;
718        } else if file_type.is_file() {
719            std::fs::copy(&src_path, &dest_path)?;
720        }
721        // Anything else (sockets, FIFOs, etc.) — skip.
722    }
723    Ok(())
724}
725
/// Recursive copy of everything under `src` into `dest`, without the
/// name-based exclusions that apply at the top level.
///
/// A `.git` deeper in the tree is regular content, not metadata, and
/// `.lex-git-clone` can only be a sibling at the outermost level, so
/// neither is filtered here. Symlinks are still skipped, as is
/// anything that is neither a directory nor a regular file.
///
/// `dest` must already exist; nested directories are created on the
/// way down.
///
/// # Errors
///
/// Stops at, and returns, the first I/O error encountered.
fn copy_dir_contents_no_skip(src: &Path, dest: &Path) -> std::io::Result<()> {
    std::fs::read_dir(src)?.try_for_each(|item| -> std::io::Result<()> {
        let item = item?;
        let from = item.path();
        let to = dest.join(item.file_name());

        match item.file_type()? {
            // Symlinks are dropped, never followed.
            kind if kind.is_symlink() => {}
            kind if kind.is_dir() => {
                std::fs::create_dir_all(&to)?;
                copy_dir_contents_no_skip(&from, &to)?;
            }
            kind if kind.is_file() => {
                std::fs::copy(&from, &to)?;
            }
            // Special files (sockets, FIFOs, ...) are ignored.
            _ => {}
        }
        Ok(())
    })
}
748
#[cfg(test)]
mod git_helper_tests {
    //! Unit tests for the git fetcher's helper functions: the
    //! immutable-rev heuristic, clone-URL reconstruction, clone-error
    //! classification, and subdir validation / joining.

    use super::*;

    // ---- shared helpers ----

    /// Shorthand for querying the heuristic with a present rev.
    fn immutable(rev: &str) -> bool {
        is_immutable_git_rev(Some(rev))
    }

    /// Assert that `err` is the `Network` variant.
    #[track_caller]
    fn expect_network(err: &FetchError) {
        assert!(matches!(err, FetchError::Network { .. }), "got: {err:?}");
    }

    /// Assert that `err` is the `UpstreamStatus` variant.
    #[track_caller]
    fn expect_upstream_status(err: &FetchError) {
        assert!(
            matches!(err, FetchError::UpstreamStatus { .. }),
            "got: {err:?}"
        );
    }

    /// Assert that `err` is the `Other` variant.
    #[track_caller]
    fn expect_other(err: &FetchError) {
        assert!(matches!(err, FetchError::Other { .. }), "got: {err:?}");
    }

    /// Create a directory symlink at `link` pointing to `target`,
    /// using whichever platform API is available.
    #[track_caller]
    fn make_dir_symlink(target: &Path, link: &Path) {
        #[cfg(unix)]
        std::os::unix::fs::symlink(target, link).unwrap();
        #[cfg(windows)]
        std::os::windows::fs::symlink_dir(target, link).unwrap();
        #[cfg(not(any(unix, windows)))]
        let _ = (target, link);
    }

    // ---- is_immutable_git_rev ----

    #[test]
    fn immutable_rev_full_sha() {
        let full_sha = "a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0";
        assert!(immutable(full_sha));
    }

    #[test]
    fn immutable_rev_short_sha_seven_chars() {
        assert!(immutable("a1b2c3d"));
    }

    #[test]
    fn immutable_rev_rejects_sha_below_seven_chars() {
        assert!(!immutable("abc123"));
    }

    #[test]
    fn immutable_rev_rejects_uppercase_hex() {
        // Only lowercase hex counts — an uppercase-hex-shaped branch
        // name must not be mistaken for a SHA.
        assert!(!immutable("ABC1234"));
    }

    #[test]
    fn immutable_rev_semver_tag_with_v_prefix() {
        assert!(immutable("v1.2.0"));
        assert!(immutable("v0.14.0"));
    }

    #[test]
    fn immutable_rev_semver_tag_without_v_prefix() {
        assert!(immutable("1.2"));
        assert!(immutable("1.2.3"));
    }

    #[test]
    fn immutable_rev_semver_tag_with_decoration() {
        // Decorations after the version (`-rc4`, `-pre`, etc.) are
        // accepted by the heuristic.
        assert!(immutable("v1.2.3-rc4"));
        assert!(immutable("1.2-pre"));
    }

    #[test]
    fn immutable_rev_rejects_single_digit_branch_lookalike() {
        // A lone major number has no minor component.
        assert!(!immutable("1"));
        assert!(!immutable("v1"));
    }

    #[test]
    fn immutable_rev_rejects_branch_names() {
        assert!(!immutable("main"));
        assert!(!immutable("master"));
        assert!(!immutable("feature/foo"));
        assert!(!immutable("release-2026-05"));
    }

    #[test]
    fn immutable_rev_rejects_none() {
        assert!(!is_immutable_git_rev(None));
    }

    #[test]
    fn immutable_rev_rejects_empty_string() {
        assert!(!immutable(""));
    }

    // ---- reconstruct_git_url ----

    #[test]
    fn reconstruct_url_git_scheme_passes_body_verbatim() {
        // For the plain `git` scheme the body already is the clone
        // URL, whatever form it takes.
        for body in [
            "https://host/path/repo.git",
            "git@host:owner/repo.git",
            "file:///tmp/bare",
        ] {
            assert_eq!(reconstruct_git_url("git", body), body);
        }
    }

    #[test]
    fn reconstruct_url_git_ssh_scheme_rebuilds_full_url() {
        // ParsedUri::parse("git+ssh://git@host/path.git") gives
        // body = "//git@host/path.git"; the fetcher reconstructs
        // the full URL by prepending the scheme.
        let rebuilt = reconstruct_git_url("git+ssh", "//git@host/path.git");
        assert_eq!(rebuilt, "git+ssh://git@host/path.git");
    }

    // ---- classify_git_clone_error ----

    #[test]
    fn classify_dns_failure_is_network() {
        let stderr = "fatal: unable to access 'https://nonexistent.example/r.git/': Could not resolve host: nonexistent.example";
        expect_network(&classify_git_clone_error(stderr));
    }

    #[test]
    fn classify_connection_refused_is_network() {
        let stderr = "fatal: unable to access 'https://localhost:1/r.git/': Failed to connect to localhost port 1: Connection refused";
        expect_network(&classify_git_clone_error(stderr));
    }

    #[test]
    fn classify_auth_failure_is_upstream_status() {
        let stderr = "git@github.com: Permission denied (publickey).\nfatal: Could not read from remote repository.";
        expect_upstream_status(&classify_git_clone_error(stderr));
    }

    #[test]
    fn classify_repository_not_found_is_upstream_status() {
        // GitHub's "private repo without auth" surfaces as
        // "Repository not found" — semantically an auth failure (the
        // public can't see it).
        let stderr = "remote: Repository not found.\nfatal: repository 'https://github.com/private/secret.git/' not found";
        expect_upstream_status(&classify_git_clone_error(stderr));
    }

    #[test]
    fn classify_unknown_ref_falls_through_to_other() {
        let stderr = "warning: Could not find remote branch nonexistent to clone.\nfatal: Remote branch nonexistent not found in upstream origin";
        expect_other(&classify_git_clone_error(stderr));
    }

    // ---- validate_subdir ----

    #[test]
    fn validate_subdir_rejects_parent_dir() {
        expect_other(&validate_subdir("../escape").unwrap_err());
    }

    #[test]
    fn validate_subdir_rejects_parent_dir_in_middle() {
        expect_other(&validate_subdir("safe/../escape").unwrap_err());
    }

    #[test]
    fn validate_subdir_rejects_absolute_path() {
        expect_other(&validate_subdir("/etc/passwd").unwrap_err());
    }

    #[test]
    fn validate_subdir_accepts_normal_relative_path() {
        for subdir in ["labels", "src/labels", "./labels", "a/b/c"] {
            validate_subdir(subdir).unwrap();
        }
    }

    // ---- safe_subdir_join ----

    #[test]
    fn safe_subdir_join_rejects_traversal_lexically_before_disk_lookup() {
        // The clone dir doesn't need to exist — the lexical check
        // fires first.
        let missing_clone_dir = Path::new("/nonexistent");
        expect_other(&safe_subdir_join(missing_clone_dir, "../escape").unwrap_err());
    }

    #[test]
    fn safe_subdir_join_refuses_symlink_at_subdir_root() {
        let base = tempfile::tempdir().unwrap();
        // Create a real "labels" target outside the base.
        let outside = tempfile::tempdir().unwrap();
        std::fs::create_dir(outside.path().join("real")).unwrap();
        // Inside the clone dir, ship a `labels` symlink pointing
        // outside. A repo-shipped symlink that git checkout restored
        // is the exact attack we're defending against.
        make_dir_symlink(outside.path(), &base.path().join("labels"));

        let err = safe_subdir_join(base.path(), "labels").unwrap_err();
        match err {
            FetchError::Other { message } => assert!(
                message.contains("symlink"),
                "error should mention symlink, got: {message}"
            ),
            other => panic!("expected Other(symlink), got: {other:?}"),
        }
    }

    #[test]
    fn safe_subdir_join_refuses_symlink_at_intermediate_component() {
        let base = tempfile::tempdir().unwrap();
        let outside = tempfile::tempdir().unwrap();
        std::fs::create_dir(outside.path().join("labels")).unwrap();
        // `src` is a symlink, `src/labels` is the requested subdir.
        // The intermediate component must be caught.
        make_dir_symlink(outside.path(), &base.path().join("src"));

        expect_other(&safe_subdir_join(base.path(), "src/labels").unwrap_err());
    }

    #[test]
    fn safe_subdir_join_accepts_normal_path_with_real_directories() {
        let base = tempfile::tempdir().unwrap();
        let expected = base.path().join("src/labels");
        std::fs::create_dir_all(&expected).unwrap();

        let joined = safe_subdir_join(base.path(), "src/labels").unwrap();
        assert_eq!(joined, expected);
    }

    #[test]
    fn safe_subdir_join_accepts_path_with_missing_tail() {
        // When the subdir doesn't exist, the walk terminates early
        // and the caller surfaces "subdir not found" against the
        // joined path. No error from safe_subdir_join itself.
        let base = tempfile::tempdir().unwrap();
        let expected = base.path().join("does/not/exist");

        let joined = safe_subdir_join(base.path(), "does/not/exist").unwrap();
        assert_eq!(joined, expected);
    }
}