lex_extension_host/resolve/fetcher.rs
1//! [`Fetcher`] trait — the contract per-transport network resolvers
2//! implement.
3//!
4//! The host owns the cache (content-hashed lookup, TTL bookkeeping,
5//! `~/.cache/lex/labels/` layout); the fetcher only knows how to
6//! fetch one URI's contents to a directory. This split keeps the
7//! per-transport implementation small — the git fetcher only needs to
8//! shell out to `git clone`, not understand lex's cache layout.
9//!
10//! ## Transports vs. URL templates
11//!
12//! The model (specified in `comms/specs/proposals/extending-lex-stores.lex`)
13//! decomposes the resolver into three real *transports* and N *URL
14//! templates*:
15//!
16//! - **Transports** carry the actual data movement. Three ship today:
17//! - `path:` — built-in local filesystem read, implemented in
18//! [`super::path`]. Special-cased upstream of registry dispatch
19//! (no [`Fetcher`] impl, no cache); listed here for completeness
20//! of the transport set.
21//! - `https:` — HTTPS GET of a tarball/zip, implemented as the
22//! [`HttpsFetcher`] [`Fetcher`] in this module.
23//! - `git:` / `git+ssh:` — git clone, implemented as the
24//! [`GitFetcher`] [`Fetcher`] in this module. Accepts any URL
25//! form `git clone` accepts; the `git+ssh:` scheme is retained
26//! for backwards compatibility and dispatched to the same fetcher.
27//! - **URL templates** are forge-shorthands that expand into a
28//! transport URI before registry dispatch. They live in
29//! [`super::template`] and have no `Fetcher` impl — they're pure
30//! functions over URIs. `github:owner/repo` and `gitlab:owner/repo`
31//! are the two templates shipped today.
32//!
//! Implementation status: both network transports ship today. `https:`
//! uses ureq + tar + zip extraction (see [`HttpsFetcher`]); `git:` /
//! `git+ssh:` shell out to `git clone --depth=1` (see [`GitFetcher`]).
36
37use std::path::Path;
38use std::process::Command;
39
40#[cfg(feature = "https-fetcher")]
41use std::io::Read;
42
43use super::uri::ParsedUri;
44
45/// Per-transport network resolver. Implementations fetch the URI's
46/// contents into a caller-provided destination directory.
47///
48/// ## Contract
49///
50/// - **`dest` is an empty directory the caller owns.** The fetcher
51/// writes the schema files (or a subdirectory if the URI's
52/// `subdir` knob is set) directly into `dest`. Cache layout,
53/// content hashing, and TTL bookkeeping are the host's
54/// responsibility; the fetcher just fetches.
55/// - **Honour `uri.subdir` if present.** After extracting a tarball
56/// or cloning a repo, copy the contents of `uri.subdir/` (relative
57/// to the fetched root) into `dest`, not the whole repo. The
58/// schema loader scans `dest` directly — it doesn't descend.
59/// - **Return [`FetchError`] variants the host can surface.** Keep
60/// the per-fetcher error type small; specific causes (HTTP status
61/// code, git error code) go in the `Other` variant's message.
62pub trait Fetcher: Send + Sync {
63 /// Fetch `uri`'s contents into `dest`. `dest` is guaranteed to
64 /// exist and be empty when this is called.
65 fn fetch(&self, uri: &ParsedUri, dest: &Path) -> Result<(), FetchError>;
66
67 /// URI schemes this fetcher handles. Typically a single-element
68 /// slice (one fetcher per scheme), but a fetcher can claim
69 /// multiple schemes if its implementation is shared — e.g.,
70 /// [`GitFetcher`] claims both `git` and `git+ssh` because the
71 /// underlying `git clone` accepts both URL forms.
72 ///
73 /// Returned as `&'static [&'static str]` so the
74 /// [`super::registry::FetcherRegistry`] can build its scheme map
75 /// without allocating.
76 fn schemes(&self) -> &'static [&'static str];
77
78 /// True when `rev` is an immutable reference (Git tag, content
79 /// hash, SHA). Drives cache TTL: immutable refs cache
80 /// indefinitely; mutable refs (branches, `None`) have a 24-hour
81 /// TTL after which the cache invalidates and the next resolve
82 /// re-fetches.
83 ///
84 /// Default: `false` for any input. Fetchers should override
85 /// when they can confidently distinguish — e.g., [`GitFetcher`]
86 /// would return `true` for `rev` matching `^[0-9a-f]{7,40}$`
87 /// (SHA-ish) or `^v?\d+\.\d+`-ish (tag heuristic). Returning
88 /// `false` from a default-impl-using fetcher is always safe
89 /// (cache invalidates more often than necessary; never less).
90 fn is_immutable_rev(&self, _rev: Option<&str>) -> bool {
91 false
92 }
93}
94
/// Failure modes reported by a [`Fetcher`]. The top-level resolve API
/// wraps these in [`super::ResolveError::Fetch`].
#[derive(Debug)]
#[non_exhaustive]
pub enum FetchError {
    /// No working fetcher exists for `scheme` in this build. Returned by
    /// stand-in implementations rather than by a functioning fetcher —
    /// for example the [`HttpsFetcher`] stub compiled when the
    /// `https-fetcher` feature is off.
    Unimplemented { scheme: String, message: String },
    /// The network layer failed: timeout, DNS, connection refused, and
    /// similar.
    Network { message: String },
    /// The remote side answered, but with a non-success status (HTTP
    /// 4xx/5xx, git permission denied, …).
    UpstreamStatus { status: String, message: String },
    /// The downloaded archive could not be unpacked (corrupt tarball,
    /// unrecognised format, …).
    Extract { message: String },
    /// A local filesystem operation performed by the fetcher failed
    /// (out of disk, permission denied on the cache dir, …).
    Io(std::io::Error),
    /// A fetcher-specific condition that none of the typed variants
    /// describes. Use sparingly: a recurring condition deserves its own
    /// typed variant.
    Other { message: String },
}
119
120impl std::fmt::Display for FetchError {
121 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
122 match self {
123 FetchError::Unimplemented { scheme, message } => {
124 write!(f, "`{scheme}:` resolver not implemented: {message}")
125 }
126 FetchError::Network { message } => write!(f, "network error: {message}"),
127 FetchError::UpstreamStatus { status, message } => {
128 write!(f, "upstream returned {status}: {message}")
129 }
130 FetchError::Extract { message } => write!(f, "archive extraction failed: {message}"),
131 FetchError::Io(e) => write!(f, "fetcher io error: {e}"),
132 FetchError::Other { message } => write!(f, "{message}"),
133 }
134 }
135}
136
137impl std::error::Error for FetchError {
138 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
139 match self {
140 FetchError::Io(e) => Some(e),
141 _ => None,
142 }
143 }
144}
145
146impl From<std::io::Error> for FetchError {
147 fn from(e: std::io::Error) -> Self {
148 FetchError::Io(e)
149 }
150}
151
/// Transport for `https:` URIs that point at a tarball or zip archive.
/// It issues one HTTPS GET for the URI body, expects a `tar.gz` or `zip`
/// archive back, and unpacks it into the destination directory.
/// `uri.subdir` is honoured, which covers archives that wrap everything
/// in a top-level directory (the GitHub tarball API does this) as well
/// as archives that ship schemas next to unrelated content.
///
/// The `github:` and `gitlab:` URL templates expand into this transport
/// when their `via` knob selects https, which is the default — see
/// [`super::template`].
///
/// Authentication is designed as an optional `Authorization` (or
/// arbitrary) header pass-through with `${ENV_VAR}` interpolation.
/// Wiring that header through from `lex-config` is a follow-up (see
/// issue #651); until then the fetcher reads no headers from
/// configuration.
///
/// Implementation notes:
///
/// - The request is synchronous (`ureq`), keeping tokio off the resolver
///   boot path. TLS comes from `rustls` + `webpki-roots`, so HTTPS works
///   without an OS-provided OpenSSL.
/// - Responses are capped at 256 MiB so a pathological server can't
///   OOM us.
/// - Path-traversal defence: archive members with absolute paths or
///   `..` components are rejected, and symlinks are skipped.
///
/// See `comms/specs/proposals/extending-lex-stores.lex` §3.2 and §6.2.
#[derive(Debug, Default, Clone, Copy)]
pub struct HttpsFetcher;
179
/// Hard cap on archive size. 256 MiB is generous for any plausible
/// schema bundle; a tarball larger than this is almost certainly the
/// wrong artifact pointed at the wrong URI.
#[cfg(feature = "https-fetcher")]
const HTTPS_RESPONSE_CAP_BYTES: u64 = 256 * 1024 * 1024;

/// Hard cap on error-response bodies. Error bodies are only ever folded
/// into a diagnostic string, so they never need to be large; a hostile
/// or misbehaving server answering a 500 with a 1 GiB body must not be
/// able to OOM us through the error path either.
#[cfg(feature = "https-fetcher")]
const HTTPS_ERROR_BODY_CAP_BYTES: u64 = 64 * 1024;

/// Per-request connect timeout. The resolver runs at boot; a stalled
/// server shouldn't be able to hang it indefinitely.
#[cfg(feature = "https-fetcher")]
const HTTPS_CONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);

/// Socket read timeout, handed to ureq's `AgentBuilder::timeout_read`.
/// It bounds each *individual* read on the connection, so a wedged
/// connection that stops delivering bytes is dropped after this long.
///
/// It is not a wall-clock limit on the whole request: a server that
/// keeps trickling data resets the clock with every read. The total
/// transfer is bounded by `HTTPS_RESPONSE_CAP_BYTES` instead; ureq's
/// separate `AgentBuilder::timeout` would be the knob for an overall
/// deadline.
#[cfg(feature = "https-fetcher")]
const HTTPS_READ_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(120);
203
#[cfg(feature = "https-fetcher")]
impl Fetcher for HttpsFetcher {
    /// Download the archive behind `uri` and unpack it (or its
    /// `uri.subdir`) into `dest`.
    ///
    /// # Errors
    ///
    /// - Whatever `map_ureq_error` produces when the request itself
    ///   fails.
    /// - [`FetchError::Io`] when the temp file cannot be created,
    ///   written, or rewound.
    /// - [`FetchError::Extract`] when the body exceeds
    ///   `HTTPS_RESPONSE_CAP_BYTES`.
    /// - Whatever `map_extract_error` produces when extraction fails.
    fn fetch(&self, uri: &ParsedUri, dest: &Path) -> Result<(), FetchError> {
        use std::io::Seek;

        // For `https:` the parsed body keeps its leading `//`
        // (`//api.github.com/...`), whereas ureq needs a complete URL,
        // so put the scheme back in front.
        let url = format!("https:{}", uri.body);

        let agent = ureq::AgentBuilder::new()
            .timeout_connect(HTTPS_CONNECT_TIMEOUT)
            .timeout_read(HTTPS_READ_TIMEOUT)
            .build();
        let request = agent.get(&url).set(
            "User-Agent",
            "lex-extension-host (https://github.com/lex-fmt/lex)",
        );
        let response = request.call().map_err(map_ureq_error)?;

        // The header has to be copied out before `into_reader` consumes
        // the response below.
        let content_type = response.header("Content-Type").map(str::to_owned);
        let format = super::extract::detect_format(content_type.as_deref(), &uri.body);

        // Spool the body to an anonymous temp file instead of holding the
        // archive in memory: bundles are normally KB-MB, but the cap is
        // 256 MiB and resident memory should stay bounded even then.
        // Reading one byte past the cap is what makes an oversized body
        // detectable. A `File` also gives `zip::ZipArchive` the
        // `Read + Seek` it needs; `tar::Archive` only needs `Read`.
        let mut body = response.into_reader().take(HTTPS_RESPONSE_CAP_BYTES + 1);
        let mut spool = tempfile::tempfile()?;
        let received = std::io::copy(&mut body, &mut spool)?;
        if received > HTTPS_RESPONSE_CAP_BYTES {
            return Err(FetchError::Extract {
                message: format!("response exceeded {HTTPS_RESPONSE_CAP_BYTES}-byte cap"),
            });
        }
        spool.rewind()?;

        super::extract::extract_archive_into(spool, format, dest, uri.subdir.as_deref())
            .map_err(map_extract_error)?;

        Ok(())
    }

    /// This fetcher serves the `https` scheme only.
    fn schemes(&self) -> &'static [&'static str] {
        &["https"]
    }
}
256
257/// Stub HttpsFetcher impl for builds that disable the `https-fetcher`
258/// feature (notably wasm32-unknown-unknown, where `ring`'s `getrandom`
259/// dep doesn't compile). Returns [`FetchError::Unimplemented`] so the
260/// trait shape stays uniform across feature variants — callers don't
261/// need to special-case "is this build's HttpsFetcher real?"
262#[cfg(not(feature = "https-fetcher"))]
263impl Fetcher for HttpsFetcher {
264 fn fetch(&self, _uri: &ParsedUri, _dest: &Path) -> Result<(), FetchError> {
265 Err(FetchError::Unimplemented {
266 scheme: "https".into(),
267 message: "https: fetcher disabled at build time (the `https-fetcher` feature on lex-extension-host wasn't enabled — common for wasm targets where the underlying TLS chain doesn't compile)".into(),
268 })
269 }
270
271 fn schemes(&self) -> &'static [&'static str] {
272 &["https"]
273 }
274}
275
/// Translate a `ureq::Error` into the [`FetchError`] the host surfaces.
///
/// - A status error (the server answered with a non-success code)
///   becomes [`FetchError::UpstreamStatus`], carrying the numeric code
///   and up to `HTTPS_ERROR_BODY_CAP_BYTES` of the response body.
/// - A transport error becomes [`FetchError::Network`] with ureq's own
///   description.
#[cfg(feature = "https-fetcher")]
fn map_ureq_error(e: ureq::Error) -> FetchError {
    match e {
        ureq::Error::Status(code, response) => {
            use std::io::Read as _;

            // Cap the error-body read at 64 KiB. The success-path cap
            // (HTTPS_RESPONSE_CAP_BYTES) does not apply here, and ureq's
            // `Response::into_string` is avoided, so a misbehaving server
            // can't exhaust memory with a giant 4xx/5xx body.
            //
            // Read raw bytes and decode lossily rather than using
            // `read_to_string`: that call leaves the buffer untouched
            // when the data is not valid UTF-8. That includes the case
            // where the cap cuts a multi-byte character in half, and
            // would misreport a real body as empty. `read_to_end` also
            // keeps whatever arrived before a mid-body read error, which
            // is the most useful best-effort diagnostic.
            let mut raw = Vec::new();
            let _ = response
                .into_reader()
                .take(HTTPS_ERROR_BODY_CAP_BYTES)
                .read_to_end(&mut raw);
            let body = String::from_utf8_lossy(&raw);

            FetchError::UpstreamStatus {
                status: code.to_string(),
                message: if body.is_empty() {
                    "<empty body>".into()
                } else {
                    body.into_owned()
                },
            }
        }
        ureq::Error::Transport(t) => FetchError::Network {
            message: t.to_string(),
        },
    }
}
305
/// Convert an extraction failure into a [`FetchError`]. IO failures keep
/// their `std::io::Error` and become [`FetchError::Io`]; every other
/// extraction failure is flattened to its display text inside
/// [`FetchError::Extract`].
#[cfg(feature = "https-fetcher")]
fn map_extract_error(e: super::extract::ExtractError) -> FetchError {
    match e {
        super::extract::ExtractError::Io(source) => FetchError::from(source),
        non_io => {
            let message = non_io.to_string();
            FetchError::Extract { message }
        }
    }
}
316
/// Transport backed by the `git` command line. A repository is fetched
/// with `git clone --depth=1` into the destination directory; `uri.rev`
/// is passed as `--branch` (a branch or tag name) and `uri.subdir`
/// selects a subdirectory of the repo to act as the schema root. The
/// `.git/` directory is not kept — the cache only holds schema content.
///
/// Both the `git:` and `git+ssh:` schemes are claimed (see
/// [`Self::schemes`]). Supported URL forms:
///
/// - `git:https://host/path/repo.git` — the body is a complete URL and
///   goes to `git clone` untouched.
/// - `git:git@host:owner/repo.git` — the body is an scp-like URL, also
///   passed through untouched.
/// - `git:file:///path/to/bare` — the body is a `file://` URL naming a
///   local bare repo (used by tests, and as the local-mirror escape
///   hatch).
/// - `git+ssh://git@host/path/repo.git` — the body is `//git@host/...`,
///   so the fetcher rebuilds `git+ssh://...` for the clone command (git
///   accepts the `git+ssh` scheme directly).
///
/// ## Why shell out
///
/// The full argument is in spec §6.3. In short, libgit2's credential
/// coverage has gaps that matter in practice (macOS keychain
/// integration, SAML SSO, Kerberos). A libgit2-backed fetcher would
/// split users into "private repos that work" and "private repos that
/// don't", with no clear story to offer them. Shelling out inherits
/// everything `git clone` honours on the command line: SSH agent, OS
/// keychain helpers (osxkeychain, libsecret, GCM, GCMcore),
/// `gh auth setup-git`, and `gitconfig`-declared SSO providers. Lex
/// itself has no credential knob.
///
/// ## Constraints
///
/// `git` has to be on `PATH`. When the binary is missing the fetcher
/// returns [`FetchError::Other`] with a message telling the user to
/// install git or to fall back to a [`path:`-scheme] / `--ext-schema`
/// local schema. Spec §6.3 explains why git is not bundled.
///
/// ## Errors
///
/// Git's stderr is sorted into typed variants for the host's diagnostic
/// surface:
///
/// - [`FetchError::Network`] — connectivity failures (DNS, connection
///   refused/timeout, unreachable).
/// - [`FetchError::UpstreamStatus`] — auth-shaped failures (permission
///   denied, authentication failed, repository not found; the
///   github/gitlab APIs also use the last one as their
///   "private repo, not authorised" signal).
/// - [`FetchError::Other`] — everything else (unknown ref, corrupted
///   upstream, etc.), carrying the raw stderr text.
///
/// ## Interaction with URL templates
///
/// The `github:` and `gitlab:` URL templates expand into this transport
/// when their `via` knob is `"git"` (the private-repo path). Their
/// default is `via = "https"`, which goes through the [`HttpsFetcher`]
/// tarball API instead. See [`super::template`].
///
/// [`path:`-scheme]: super::path
#[derive(Debug, Default, Clone, Copy)]
pub struct GitFetcher;
381
382impl Fetcher for GitFetcher {
383 fn fetch(&self, uri: &ParsedUri, dest: &Path) -> Result<(), FetchError> {
384 let url = reconstruct_git_url(&uri.scheme, &uri.body);
385
386 // Normalize subdir (`/labels/`, `labels/`, `/labels` → `labels`).
387 // Empty after trim → treat as no subdir.
388 let subdir = uri
389 .subdir
390 .as_deref()
391 .map(|s| s.trim_matches('/').to_string())
392 .filter(|s| !s.is_empty());
393
394 // Clone into a hidden subdirectory of dest, then promote the
395 // desired contents (whole repo, or `subdir/` if set) up to
396 // dest. Cloning into a subdirectory of dest avoids needing a
397 // tempfile dep (the issue spec calls for std-only deps) and
398 // avoids needing write access to dest's parent. The `.lex-` /
399 // dot prefix means we won't shadow any real file the schema
400 // ships.
401 let clone_dir = dest.join(".lex-git-clone");
402
403 // Validate subdir upfront so we don't bother cloning if it
404 // would escape the clone root. Rejects `..` components,
405 // absolute paths, and platform prefixes — the symlink check
406 // happens post-clone (we need the on-disk tree to inspect).
407 if let Some(sub) = subdir.as_deref() {
408 validate_subdir(sub)?;
409 }
410
411 let mut cmd = Command::new("git");
412 cmd.arg("clone").arg("--depth=1");
413 if let Some(rev) = uri.rev.as_deref().filter(|s| !s.is_empty()) {
414 // `--branch` accepts arbitrary user input (the rev). git
415 // does not interpret its `--branch` argument as a flag —
416 // it expects a ref name — but the rev still flows from a
417 // namespace config the user wrote, so terminate option
418 // parsing before any user-controlled positional with
419 // `--` below.
420 cmd.arg("--branch").arg(rev);
421 }
422 // `--` terminates option parsing so a URL starting with `-`
423 // can't be mistaken for a flag (option-injection defence; the
424 // URL flows from user config). Same reasoning for the
425 // clone-dir positional, though that one is host-controlled.
426 cmd.arg("--").arg(&url).arg(&clone_dir);
427 // Suppress interactive credential prompts; if the user's
428 // credential helper can't satisfy the request non-interactively
429 // we want a clean error rather than a hung boot path.
430 cmd.env("GIT_TERMINAL_PROMPT", "0");
431
432 let output = cmd.output().map_err(|e| {
433 if e.kind() == std::io::ErrorKind::NotFound {
434 FetchError::Other {
435 message: "git binary not in PATH; install git, or use a `path:` URI / `--ext-schema` flag for a local schema".into(),
436 }
437 } else {
438 FetchError::Io(e)
439 }
440 })?;
441
442 if !output.status.success() {
443 // Best-effort cleanup of the partial clone dir before
444 // surfacing the error.
445 let _ = std::fs::remove_dir_all(&clone_dir);
446 let stderr = String::from_utf8_lossy(&output.stderr).into_owned();
447 return Err(classify_git_clone_error(&stderr));
448 }
449
450 // Source of the content we keep is either `<clone>/<subdir>`
451 // or `<clone>` directly. `safe_subdir_join` walks the path
452 // component by component, refusing to follow any component
453 // that's a symlink on disk — a repo-shipped symlink at the
454 // subdir root would otherwise let us read/copy arbitrary
455 // filesystem paths into the cache.
456 let source = match subdir.as_deref() {
457 Some(sub) => match safe_subdir_join(&clone_dir, sub) {
458 Ok(p) => {
459 if !p.is_dir() {
460 let _ = std::fs::remove_dir_all(&clone_dir);
461 return Err(FetchError::Other {
462 message: format!(
463 "subdir `{sub}` not found in cloned repo (clone succeeded but the path doesn't exist)"
464 ),
465 });
466 }
467 p
468 }
469 Err(e) => {
470 let _ = std::fs::remove_dir_all(&clone_dir);
471 return Err(e);
472 }
473 },
474 None => clone_dir.clone(),
475 };
476
477 // Copy contents into dest. Skip `.git` (the cache only holds
478 // schema content) and skip the `.lex-git-clone` directory
479 // itself (we're walking it as a source, but for the no-subdir
480 // case it's literally a sibling of where we're writing, so
481 // exclude it to avoid recursive copying).
482 copy_dir_contents(&source, dest, &clone_dir).map_err(FetchError::Io)?;
483
484 // Clean up the clone dir; we've copied what we need.
485 std::fs::remove_dir_all(&clone_dir).map_err(FetchError::Io)?;
486
487 Ok(())
488 }
489
490 fn schemes(&self) -> &'static [&'static str] {
491 &["git", "git+ssh"]
492 }
493
494 fn is_immutable_rev(&self, rev: Option<&str>) -> bool {
495 is_immutable_git_rev(rev)
496 }
497}
498
/// Build the URL handed to `git clone` from a parsed URI's
/// `(scheme, body)` pair.
///
/// - `git+ssh:<body>`, where the body is `//user@host/path`, is
///   reassembled into `git+ssh:<body>`; git treats `git+ssh://...` as a
///   synonym for `ssh://...`.
/// - `git:<body>` yields `<body>` unchanged: the body already is the
///   verbatim URL (`https://...`, `git@host:path`, `file:///...`, etc.).
/// - Any other scheme is handled like `git:`. The registry only routes
///   `git:` and `git+ssh:` here, but a custom registry could route
///   something else, so the fallback stays defensive.
fn reconstruct_git_url(scheme: &str, body: &str) -> String {
    if scheme == "git+ssh" {
        return ["git+ssh:", body].concat();
    }
    body.to_owned()
}
516
517/// Reject `subdir` values that escape the clone root *lexically*
518/// (before any disk lookup): `..` components, absolute paths, and
519/// platform prefixes (Windows drive letters, UNC roots). The
520/// post-clone path-component walk in [`safe_subdir_join`] catches the
521/// symlink-escape case; this is the first line of defence and runs
522/// pre-clone so a hostile subdir doesn't even cost us a network round
523/// trip.
524fn validate_subdir(subdir: &str) -> Result<(), FetchError> {
525 use std::path::Component;
526 let path = Path::new(subdir);
527 if path.is_absolute() {
528 return Err(FetchError::Other {
529 message: format!(
530 "subdir `{subdir}` is absolute; subdir must be a relative path within the cloned repo"
531 ),
532 });
533 }
534 for component in path.components() {
535 match component {
536 Component::ParentDir => {
537 return Err(FetchError::Other {
538 message: format!(
539 "subdir `{subdir}` contains `..`; refusing to escape the clone root"
540 ),
541 });
542 }
543 Component::Prefix(_) | Component::RootDir => {
544 return Err(FetchError::Other {
545 message: format!(
546 "subdir `{subdir}` is rooted (absolute path or platform prefix); refusing"
547 ),
548 });
549 }
550 Component::Normal(_) | Component::CurDir => {}
551 }
552 }
553 Ok(())
554}
555
556/// Join `clone_dir + subdir` while refusing to traverse a symlink at
557/// any intermediate component. The lexical [`validate_subdir`] is the
558/// belt; this is the suspenders for the on-disk side — a repo-shipped
559/// symlinked directory (`git checkout` happily restores symlinks from
560/// the tree object) at any point in the subdir path would otherwise
561/// let the post-clone copy read/write filesystem locations outside the
562/// clone root.
563///
564/// Walks each `Normal` component, accumulating the path, and rejects
565/// the first one whose `symlink_metadata` reports a symlink. Missing
566/// intermediate components terminate the walk early — the caller
567/// surfaces "subdir not found" against the joined path.
568fn safe_subdir_join(clone_dir: &Path, subdir: &str) -> Result<std::path::PathBuf, FetchError> {
569 use std::path::Component;
570 // validate_subdir is the lexical pre-flight; call it again here
571 // so this helper is safe to use standalone (defence in depth).
572 validate_subdir(subdir)?;
573
574 let mut accumulated = clone_dir.to_path_buf();
575 for component in Path::new(subdir).components() {
576 if let Component::Normal(name) = component {
577 accumulated.push(name);
578 match std::fs::symlink_metadata(&accumulated) {
579 Ok(meta) if meta.file_type().is_symlink() => {
580 return Err(FetchError::Other {
581 message: format!(
582 "subdir component `{}` is a symlink in the cloned repo; refusing to follow",
583 accumulated
584 .strip_prefix(clone_dir)
585 .unwrap_or(&accumulated)
586 .display()
587 ),
588 });
589 }
590 Ok(_) | Err(_) => {
591 // Non-symlink → keep walking. Missing entry →
592 // stop; the caller handles "subdir not found"
593 // against the final joined path.
594 }
595 }
596 }
597 }
598 Ok(accumulated)
599}
600
601/// Classify git's stderr into a typed [`FetchError`]. The heuristics
602/// are conservative — when we can't recognise the failure shape we
603/// fall through to [`FetchError::Other`] with the raw stderr so the
604/// user sees git's own message verbatim.
605fn classify_git_clone_error(stderr: &str) -> FetchError {
606 let lower = stderr.to_ascii_lowercase();
607 if lower.contains("could not resolve host")
608 || lower.contains("could not connect")
609 || lower.contains("connection refused")
610 || lower.contains("connection timed out")
611 || lower.contains("network is unreachable")
612 || lower.contains("no route to host")
613 {
614 FetchError::Network {
615 message: stderr.trim().to_string(),
616 }
617 } else if lower.contains("permission denied")
618 || lower.contains("authentication failed")
619 || lower.contains("could not read username")
620 || lower.contains("access denied")
621 || lower.contains("repository not found")
622 {
623 // Note: `could not read from remote repository` is intentionally
624 // NOT in this list. Git emits that line on most clone failures
625 // (auth, missing repo, wrong endpoint, etc.) so it's too broad
626 // to disambiguate. The auth-shaped failures all surface a more
627 // specific marker above; everything else falls through to
628 // FetchError::Other with git's raw stderr.
629 FetchError::UpstreamStatus {
630 status: "auth".into(),
631 message: stderr.trim().to_string(),
632 }
633 } else {
634 FetchError::Other {
635 message: stderr.trim().to_string(),
636 }
637 }
638}
639
/// Heuristic: does `rev` look like a git reference that never moves?
/// The answer feeds the cache TTL — immutable refs are cached
/// indefinitely, mutable ones expire after 24 hours.
///
/// Two shapes count as immutable:
///
/// - **SHA-shaped** — 7 to 40 lowercase hex characters and nothing
///   else, which covers short SHAs (`abc1234`) up to the full 40-char
///   form. Uppercase hex is deliberately excluded: git emits lowercase,
///   and accepting uppercase would widen the false-positive surface for
///   branch names that merely look like hex.
/// - **Tag-shaped** — an optional `v`, then `<digits>.<digit>…`
///   (`v1.2`, `1.2`, `v0.14.0`, `1.2.3-rc4`, …). Requiring a second
///   numeric component keeps out bare-number "branches" such as `1`
///   while still matching the usual semver-style tags.
///
/// Anything else — branch names, the empty string, `None` — yields
/// `false`, so the cache treats it as mutable and applies the TTL.
fn is_immutable_git_rev(rev: Option<&str>) -> bool {
    let raw = match rev {
        Some(text) => text.as_bytes(),
        None => return false,
    };

    // SHA shape: length within 7..=40, every byte a lowercase hex digit.
    let sha_shaped = matches!(raw.len(), 7..=40)
        && raw
            .iter()
            .all(|b| b.is_ascii_digit() || (b'a'..=b'f').contains(b));
    if sha_shaped {
        return true;
    }

    // Tag shape: drop one leading `v` if present, then look only at the
    // first two dot-separated components; further components are
    // unconstrained.
    let version = match raw.split_first() {
        Some((&b'v', rest)) => rest,
        _ => raw,
    };
    let mut components = version.split(|&b| b == b'.');
    match (components.next(), components.next()) {
        (Some(major), Some(minor)) => {
            // The major part must be purely numeric. The minor part only
            // has to *start* with a digit, because tag names can carry
            // arbitrary decoration (`1.2-pre` → minor = `2-pre`).
            !major.is_empty()
                && major.iter().all(u8::is_ascii_digit)
                && matches!(minor.first(), Some(b) if b.is_ascii_digit())
        }
        _ => false,
    }
}
687
688/// Recursively copy contents of `src` into `dest`. Skips:
689///
690/// - The top-level `.git` directory (we don't need git's index /
691/// objects in the cache — only the schema content).
692/// - Anything matching `skip_path` (used to skip the
693/// `.lex-git-clone/` directory itself when `src` is its sibling, so
694/// the copy doesn't recursively follow into the source-of-truth).
695/// - Symlinks (same trust-surface reasoning as the extract module —
696/// archive/repo-shipped symlinks expand what the schema loader
697/// trusts).
698/// - Special files (sockets, FIFOs).
699fn copy_dir_contents(src: &Path, dest: &Path, skip_path: &Path) -> std::io::Result<()> {
700 for entry in std::fs::read_dir(src)? {
701 let entry = entry?;
702 let src_path = entry.path();
703 if src_path == skip_path {
704 continue;
705 }
706 let name = entry.file_name();
707 if name == ".git" {
708 continue;
709 }
710 let dest_path = dest.join(&name);
711 let file_type = entry.file_type()?;
712 if file_type.is_symlink() {
713 continue;
714 }
715 if file_type.is_dir() {
716 std::fs::create_dir_all(&dest_path)?;
717 copy_dir_contents_no_skip(&src_path, &dest_path)?;
718 } else if file_type.is_file() {
719 std::fs::copy(&src_path, &dest_path)?;
720 }
721 // Anything else (sockets, FIFOs, etc.) — skip.
722 }
723 Ok(())
724}
725
/// Recursive worker for levels below the top of the tree. It
/// deliberately does not repeat the top-level exclusions: a `.git`
/// found deeper in the tree is ordinary content rather than metadata,
/// and `.lex-git-clone` only matters at the outermost level. Symlinks
/// and special files are still left out at every depth.
fn copy_dir_contents_no_skip(src: &Path, dest: &Path) -> std::io::Result<()> {
    for entry in std::fs::read_dir(src)? {
        let entry = entry?;

        let kind = entry.file_type()?;
        if kind.is_symlink() {
            continue;
        }

        let from = entry.path();
        let to = dest.join(entry.file_name());
        if kind.is_dir() {
            std::fs::create_dir_all(&to)?;
            copy_dir_contents_no_skip(&from, &to)?;
        } else if kind.is_file() {
            std::fs::copy(&from, &to)?;
        }
        // Anything else (sockets, FIFOs, …) is skipped.
    }
    Ok(())
}
748
#[cfg(test)]
mod git_helper_tests {
    //! Unit tests for the git fetcher's helper functions, grouped by
    //! helper: `is_immutable_git_rev`, `reconstruct_git_url`,
    //! `classify_git_clone_error`, `validate_subdir`, and
    //! `safe_subdir_join`. Each helper is called directly with literal
    //! inputs; only the `safe_subdir_join` cases touch the filesystem,
    //! and only inside temporary directories.

    use super::*;

    /// Creates a directory symlink at `link` that points at `target`,
    /// via the platform-specific std API. Panics if the symlink cannot
    /// be created.
    fn plant_dir_symlink(target: &Path, link: &Path) {
        #[cfg(unix)]
        std::os::unix::fs::symlink(target, link).unwrap();
        #[cfg(windows)]
        std::os::windows::fs::symlink_dir(target, link).unwrap();
    }

    // ---- is_immutable_git_rev ----

    #[test]
    fn immutable_rev_full_sha() {
        let full_sha = "a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0";
        assert!(is_immutable_git_rev(Some(full_sha)));
    }

    #[test]
    fn immutable_rev_short_sha_seven_chars() {
        let short_sha = "a1b2c3d";
        assert!(is_immutable_git_rev(Some(short_sha)));
    }

    #[test]
    fn immutable_rev_rejects_sha_below_seven_chars() {
        let six_hex_chars = "abc123";
        assert!(!is_immutable_git_rev(Some(six_hex_chars)));
    }

    #[test]
    fn immutable_rev_rejects_uppercase_hex() {
        // Only lowercase hex counts as a SHA: a branch name that merely
        // looks like uppercase hex must not be treated as immutable.
        let uppercase_hex = "ABC1234";
        assert!(!is_immutable_git_rev(Some(uppercase_hex)));
    }

    #[test]
    fn immutable_rev_semver_tag_with_v_prefix() {
        for rev in ["v1.2.0", "v0.14.0"] {
            assert!(is_immutable_git_rev(Some(rev)), "rev: {rev}");
        }
    }

    #[test]
    fn immutable_rev_semver_tag_without_v_prefix() {
        for rev in ["1.2", "1.2.3"] {
            assert!(is_immutable_git_rev(Some(rev)), "rev: {rev}");
        }
    }

    #[test]
    fn immutable_rev_semver_tag_with_decoration() {
        // A version may be followed by a decoration such as `-rc4` or
        // `-pre`; the heuristic still treats those as tags.
        for rev in ["v1.2.3-rc4", "1.2-pre"] {
            assert!(is_immutable_git_rev(Some(rev)), "rev: {rev}");
        }
    }

    #[test]
    fn immutable_rev_rejects_single_digit_branch_lookalike() {
        // A lone major number (with or without `v`) has no minor
        // component, so it does not qualify.
        for rev in ["1", "v1"] {
            assert!(!is_immutable_git_rev(Some(rev)), "rev: {rev}");
        }
    }

    #[test]
    fn immutable_rev_rejects_branch_names() {
        for rev in ["main", "master", "feature/foo", "release-2026-05"] {
            assert!(!is_immutable_git_rev(Some(rev)), "rev: {rev}");
        }
    }

    #[test]
    fn immutable_rev_rejects_none() {
        let absent: Option<&str> = None;
        assert!(!is_immutable_git_rev(absent));
    }

    #[test]
    fn immutable_rev_rejects_empty_string() {
        let empty = "";
        assert!(!is_immutable_git_rev(Some(empty)));
    }

    // ---- reconstruct_git_url ----

    #[test]
    fn reconstruct_url_git_scheme_passes_body_verbatim() {
        // Under the `git` scheme the body already is the clone URL, so
        // the output must equal the input for every URL form.
        let bodies = [
            "https://host/path/repo.git",
            "git@host:owner/repo.git",
            "file:///tmp/bare",
        ];
        for body in bodies {
            assert_eq!(reconstruct_git_url("git", body), body);
        }
    }

    #[test]
    fn reconstruct_url_git_ssh_scheme_rebuilds_full_url() {
        // Parsing "git+ssh://git@host/path.git" leaves
        // body = "//git@host/path.git"; the scheme has to be put back
        // in front to recover the full URL.
        let rebuilt = reconstruct_git_url("git+ssh", "//git@host/path.git");
        assert_eq!(rebuilt, "git+ssh://git@host/path.git");
    }

    // ---- classify_git_clone_error ----

    #[test]
    fn classify_dns_failure_is_network() {
        let stderr = "fatal: unable to access 'https://nonexistent.example/r.git/': Could not resolve host: nonexistent.example";
        let err = classify_git_clone_error(stderr);
        assert!(matches!(err, FetchError::Network { .. }), "got: {err:?}");
    }

    #[test]
    fn classify_connection_refused_is_network() {
        let stderr = "fatal: unable to access 'https://localhost:1/r.git/': Failed to connect to localhost port 1: Connection refused";
        let err = classify_git_clone_error(stderr);
        assert!(matches!(err, FetchError::Network { .. }), "got: {err:?}");
    }

    #[test]
    fn classify_auth_failure_is_upstream_status() {
        let stderr = "git@github.com: Permission denied (publickey).\nfatal: Could not read from remote repository.";
        let err = classify_git_clone_error(stderr);
        assert!(
            matches!(err, FetchError::UpstreamStatus { .. }),
            "got: {err:?}"
        );
    }

    #[test]
    fn classify_repository_not_found_is_upstream_status() {
        // GitHub answers "Repository not found" for a private repo
        // accessed without credentials, so this is effectively an auth
        // failure and is classified the same way.
        let stderr = "remote: Repository not found.\nfatal: repository 'https://github.com/private/secret.git/' not found";
        let err = classify_git_clone_error(stderr);
        assert!(
            matches!(err, FetchError::UpstreamStatus { .. }),
            "got: {err:?}"
        );
    }

    #[test]
    fn classify_unknown_ref_falls_through_to_other() {
        // A missing branch is neither a network nor an upstream-status
        // failure; it lands in the catch-all variant.
        let stderr = "warning: Could not find remote branch nonexistent to clone.\nfatal: Remote branch nonexistent not found in upstream origin";
        let err = classify_git_clone_error(stderr);
        assert!(matches!(err, FetchError::Other { .. }), "got: {err:?}");
    }

    // ---- validate_subdir ----

    #[test]
    fn validate_subdir_rejects_parent_dir() {
        let outcome = validate_subdir("../escape");
        let err = outcome.unwrap_err();
        assert!(matches!(err, FetchError::Other { .. }), "got: {err:?}");
    }

    #[test]
    fn validate_subdir_rejects_parent_dir_in_middle() {
        let outcome = validate_subdir("safe/../escape");
        let err = outcome.unwrap_err();
        assert!(matches!(err, FetchError::Other { .. }), "got: {err:?}");
    }

    #[test]
    fn validate_subdir_rejects_absolute_path() {
        let outcome = validate_subdir("/etc/passwd");
        let err = outcome.unwrap_err();
        assert!(matches!(err, FetchError::Other { .. }), "got: {err:?}");
    }

    #[test]
    fn validate_subdir_accepts_normal_relative_path() {
        for subdir in ["labels", "src/labels", "./labels", "a/b/c"] {
            validate_subdir(subdir).unwrap();
        }
    }

    // ---- safe_subdir_join ----

    #[test]
    fn safe_subdir_join_rejects_traversal_lexically_before_disk_lookup() {
        // The clone dir deliberately does not exist: the traversal must
        // be rejected without consulting the filesystem.
        let missing_clone_dir = Path::new("/nonexistent");
        let err = safe_subdir_join(missing_clone_dir, "../escape").unwrap_err();
        assert!(matches!(err, FetchError::Other { .. }), "got: {err:?}");
    }

    #[test]
    fn safe_subdir_join_refuses_symlink_at_subdir_root() {
        let clone_root = tempfile::tempdir().unwrap();
        // A real directory tree living outside the clone root.
        let elsewhere = tempfile::tempdir().unwrap();
        std::fs::create_dir(elsewhere.path().join("real")).unwrap();
        // The clone ships `labels` as a symlink that escapes the clone
        // root — a repo-supplied symlink restored by git checkout is
        // precisely the attack being defended against.
        plant_dir_symlink(elsewhere.path(), &clone_root.path().join("labels"));

        let err = safe_subdir_join(clone_root.path(), "labels").unwrap_err();
        let message = match err {
            FetchError::Other { message } => message,
            other => panic!("expected Other(symlink), got: {other:?}"),
        };
        assert!(
            message.contains("symlink"),
            "error should mention symlink, got: {message}"
        );
    }

    #[test]
    fn safe_subdir_join_refuses_symlink_at_intermediate_component() {
        let clone_root = tempfile::tempdir().unwrap();
        let elsewhere = tempfile::tempdir().unwrap();
        std::fs::create_dir(elsewhere.path().join("labels")).unwrap();
        // Here the requested subdir is `src/labels` and it is `src`,
        // the intermediate component, that is the symlink. It must be
        // caught too.
        plant_dir_symlink(elsewhere.path(), &clone_root.path().join("src"));

        let err = safe_subdir_join(clone_root.path(), "src/labels").unwrap_err();
        assert!(matches!(err, FetchError::Other { .. }), "got: {err:?}");
    }

    #[test]
    fn safe_subdir_join_accepts_normal_path_with_real_directories() {
        let clone_root = tempfile::tempdir().unwrap();
        let expected = clone_root.path().join("src/labels");
        std::fs::create_dir_all(&expected).unwrap();

        let joined = safe_subdir_join(clone_root.path(), "src/labels").unwrap();
        assert_eq!(joined, expected);
    }

    #[test]
    fn safe_subdir_join_accepts_path_with_missing_tail() {
        // A subdir that does not exist is not an error here: the joined
        // path is returned as-is and reporting "subdir not found" is
        // left to the caller.
        let clone_root = tempfile::tempdir().unwrap();
        let expected = clone_root.path().join("does/not/exist");

        let joined = safe_subdir_join(clone_root.path(), "does/not/exist").unwrap();
        assert_eq!(joined, expected);
    }
}