Skip to main content

pond/
config.rs

1//! Configuration loading: the `[embeddings]`, `[sources]`, `[storage]`, and
2//! `[creds.*]` blocks.
3//!
4//! pond ships built-in defaults, so an instance with no `config.toml` still
5//! works. `pond config schema` emits [`DEFAULT_CONFIG_TOML`], the
6//! fully-annotated example. Loading layers `config.toml` under the `POND_*`
7//! env mirror via figment, so every command also works with no config file
8//! at all (spec.md#storage-configless) - URLs + env vars are sufficient.
9
10use std::{
11    collections::BTreeMap,
12    path::{Path, PathBuf},
13};
14
15use anyhow::{Context, Result, anyhow, bail};
16use figment::{
17    Figment,
18    providers::{Env, Format, Toml},
19};
20use serde::{Deserialize, Deserializer, Serialize, de};
21use serde_json::Value;
22use url::Url;
23
/// Parse a human-readable byte size such as `"128 MiB"`, `"1 GiB"`,
/// `"500 KiB"`, or a bare byte count.
///
/// Accepted units (case-insensitive, optional whitespace around them):
/// - none or `B`: raw bytes
/// - SI: `K`/`KB`, `M`/`MB`, `G`/`GB`, `T`/`TB` (powers of 1000)
/// - binary: `KiB`, `MiB`, `GiB`, `TiB` (powers of 1024)
///
/// The numeric part may be fractional (`"1.5 GiB"`); the product is truncated
/// toward zero to a whole number of bytes.
///
/// # Errors
///
/// Returns a human-readable message when the input is empty, the numeric part
/// does not parse or is negative, the unit is unknown, or the result does not
/// fit in `usize` (Lance's cache APIs take `usize`).
fn parse_byte_size(raw: &str) -> Result<usize, String> {
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return Err("byte-size value is empty".to_owned());
    }
    // Everything before the first ASCII letter is the number; the remainder
    // is the unit. No letter at all means a bare byte count.
    let split = trimmed
        .find(|c: char| c.is_ascii_alphabetic())
        .unwrap_or(trimmed.len());
    let (number, unit) = trimmed.split_at(split);
    let number: f64 = number
        .trim()
        .parse()
        .map_err(|_| format!("byte-size value {raw:?} is not a number"))?;
    if !number.is_finite() || number < 0.0 {
        return Err(format!("byte-size value {raw:?} must be non-negative"));
    }
    let multiplier: f64 = match unit.trim().to_ascii_lowercase().as_str() {
        "" | "b" => 1.0,
        "k" | "kb" => 1_000.0,
        "kib" => 1_024.0,
        "m" | "mb" => 1_000_000.0,
        "mib" => 1_048_576.0,
        "g" | "gb" => 1_000_000_000.0,
        "gib" => 1_073_741_824.0,
        "t" | "tb" => 1_000_000_000_000.0,
        "tib" => 1_099_511_627_776.0,
        other => {
            return Err(format!(
                "byte-size unit {other:?} not recognized (try MiB / GiB)"
            ));
        }
    };
    let bytes = number * multiplier;
    // Compare against 2^BITS, which is exactly representable as an f64.
    // `usize::MAX as f64` rounds UP to that same value on 64-bit targets, so
    // a `>` test against it would let 2^64 through and the cast below would
    // silently saturate to `usize::MAX` instead of reporting the overflow.
    let limit = 2f64.powi(usize::BITS as i32);
    if !bytes.is_finite() || bytes >= limit {
        return Err(format!("byte-size value {raw:?} overflows usize"));
    }
    // In range and non-negative: the cast only drops the fractional part.
    Ok(bytes as usize)
}
65
66/// Accept string / integer / float / bool and stringify. The env mirror
67/// parses values TOML-ishly, so `POND_CREDS_X_SECRET_ACCESS_KEY=12345`
68/// arrives as a number; these fields are strings no matter how they scan.
69fn lenient_string<'de, D>(deserializer: D) -> Result<Option<String>, D::Error>
70where
71    D: Deserializer<'de>,
72{
73    #[derive(Deserialize)]
74    #[serde(untagged)]
75    enum Repr {
76        Text(String),
77        Int(i64),
78        Float(f64),
79        Bool(bool),
80    }
81    Ok(
82        Option::<Repr>::deserialize(deserializer)?.map(|repr| match repr {
83            Repr::Text(value) => value,
84            Repr::Int(value) => value.to_string(),
85            Repr::Float(value) => value.to_string(),
86            Repr::Bool(value) => value.to_string(),
87        }),
88    )
89}
90
91fn deserialize_byte_size_opt<'de, D>(deserializer: D) -> Result<Option<usize>, D::Error>
92where
93    D: Deserializer<'de>,
94{
95    #[derive(Deserialize)]
96    #[serde(untagged)]
97    enum Repr {
98        Bytes(u64),
99        Text(String),
100    }
101    let repr: Option<Repr> = Option::deserialize(deserializer)?;
102    match repr {
103        None => Ok(None),
104        Some(Repr::Bytes(value)) => usize::try_from(value).map(Some).map_err(de::Error::custom),
105        Some(Repr::Text(value)) => parse_byte_size(&value).map(Some).map_err(de::Error::custom),
106    }
107}
108
109/// True when the URL is on the local filesystem. Mirrors Lance's
110/// `ObjectStore::is_local` (lance-io/src/object_store.rs:541): the `file` and
111/// `file+uring` schemes are local; everything else (incl. `memory://`) is not.
112pub fn is_local(url: &Url) -> bool {
113    matches!(url.scheme(), "file" | "file+uring")
114}
115
116/// Extract the filesystem `PathBuf` for local URLs. `None` for remote.
117pub fn local_path(url: &Url) -> Option<PathBuf> {
118    if is_local(url) {
119        url.to_file_path().ok()
120    } else {
121        None
122    }
123}
124
125/// URI string for a child of this location (typically one Lance dataset under
126/// the data dir). Trims a single trailing slash on the base, then concatenates
127/// with a `/` separator. This keeps `Dataset::open` / `Dataset::write` happy
128/// on both filesystem and object-store backends - they want the URI form, not
129/// a `url::Url`.
130pub fn child_uri(base: &Url, suffix: &str) -> String {
131    // For local URLs we strip the `file://` prefix so log lines and error
132    // messages render as plain paths (`/srv/pond/sessions.lance`), matching
133    // what pond used to emit before the URL migration.
134    if let Some(path) = local_path(base) {
135        return path.join(suffix).display().to_string();
136    }
137    format!("{}/{suffix}", base.as_str().trim_end_matches('/'))
138}
139
140/// Render a `Url` for human-readable log/diagnostic output: local URLs come
141/// back as plain paths (no `file://` prefix, `$HOME` contracted to `~`);
142/// remote URLs stay verbatim.
143pub fn display(url: &Url) -> String {
144    if let Some(path) = local_path(url) {
145        contract_home(&path).display().to_string()
146    } else {
147        url.to_string()
148    }
149}
150
151/// Build a `Url` from a filesystem path. Convenience for tests and for
152/// callers that hold a `PathBuf` already. The path must be
153/// absolute (`url::Url::from_file_path` is a hard requirement on Unix); a
154/// relative path gets canonicalized via `std::path::absolute` first.
155pub fn url_for_path(path: impl AsRef<Path>) -> Result<Url> {
156    let path = path.as_ref();
157    let absolute = if path.is_absolute() {
158        path.to_path_buf()
159    } else {
160        std::path::absolute(path)
161            .with_context(|| format!("failed to absolutize {}", path.display()))?
162    };
163    Url::from_file_path(&absolute).map_err(|()| {
164        anyhow!(
165            "failed to convert path {} into a file:// URL",
166            absolute.display()
167        )
168    })
169}
170
/// Default `config.toml` body emitted by `pond config schema`.
///
/// Every line of the body is either blank or a `#` comment, so the template
/// sets nothing by itself: pond ships built-in defaults, the file is purely a
/// discoverable, annotated example, and pond still works with no
/// `config.toml` on disk. Users uncomment and edit the sections they want to
/// override.
pub const DEFAULT_CONFIG_TOML: &str = "\
# pond configuration.
#
# pond ships built-in defaults, so every setting here is optional - delete this
# file and pond still works. Uncomment and edit to override.

# Where pond looks for source data to import. One entry per adapter type
# (`claude-code`, `codex-cli`, ...). `pond sync` with no arguments syncs every
# entry; `pond sync <adapter>` syncs just one. With an empty `[sources]`,
# `pond sync` runs an interactive discovery against the known default paths
# and writes the picks back here.
#
# Future wrap: pond is single-namespace in v1 (spec.md#wire-namespace-resolution); `[sources]` is
# flat here. When multi-namespace pond lands, source registration becomes
# per-tenant under `[namespaces.<ns>.sources.<adapter>]`. Pre-v1 the schema
# is breakable; the rename is operationally free until a real second tenant
# exists.
#
# [sources.claude-code]
# enabled = true
# path = \"~/.claude/projects\"
#
# [sources.codex-cli]
# enabled = true
# path = \"~/.codex/sessions\"
#
# Set `enabled = false` to keep the section but skip it on `pond sync`;
# re-enable via `pond sync <adapter>`.

# Embeddings. Search runs hybrid (vector + FTS) whenever the store has any
# vectors, and FTS-only otherwise - the model loads lazily on the first hybrid
# query, so there's no cost on FTS-only corpora. `model` selects the
# HuggingFace XLM-RoBERTa model; `dim` declares its output width and is baked
# into the messages.vector schema on table creation - it must equal the
# model's hidden_size and be a multiple of 8 (IVF_PQ subspace stride).
#
# Common pairings:
#   model = \"intfloat/multilingual-e5-small\"   dim = 384   (default)
#   model = \"intfloat/multilingual-e5-base\"    dim = 768
#   model = \"intfloat/multilingual-e5-large\"   dim = 1024
#
# A different-dim model needs a fresh data dir; pond enforces this at the
# schema boundary.
#
# [embeddings]
# model = \"intfloat/multilingual-e5-small\"
# dim = 384

# Search tuning. Leave unset for Lance defaults; set when tuning IVF_PQ recall
# against a corpus.
#
# [search]
# nprobes = 16
# refine_factor = 2

# Storage maintenance. Tunes the compaction + cleanup pass that runs inside
# `pond sync` and `pond index optimize`.
#
# - `compaction_fragment_cap` is the per-task fragment-count backstop: a
#   planned compaction task touching at least this many fragments always runs
#   even when the write-amplification veto would skip it. Default 64; 0
#   disables the veto and runs every task Lance plans.
# - `cleanup_older_than` is the manifest-retention window for the safe cleanup
#   pass. Accepts `Ns` / `Nm` / `Nh` / `Nd` (default `1d`, floor `1h` - it is
#   what protects in-flight readers). Versions older than this are reclaimed
#   by Lance's OCC-coordinated GC.
# - `index_lag_threshold` is the minimum unindexed-fragment count before a
#   per-intent append/rebuild runs in `pond index optimize`; the brute-force
#   fallback keeps queries correct while fragments accumulate. Default 4.
#
# [maintenance]
# compaction_fragment_cap = 64
# cleanup_older_than = \"1d\"
# index_lag_threshold = 4

# Long-running process caps. Both accept either a plain byte count or a
# humansize-style suffix (\"128 MiB\", \"1 GiB\"). Both are optional - leave
# unset to let pond pick the backend-aware default:
#   local FS  : index_cache = 256 MiB, metadata_cache = 128 MiB
#   remote    : index_cache = 2 GiB,   metadata_cache = 512 MiB
# Lance's library defaults (6 GiB / 1 GiB) are too generous for a per-session
# `pond mcp` process; tightening them is what keeps RSS under the 500 MiB target
# without measurable latency regressions on typical agent-history corpora.
#
# [runtime]
# index_cache_bytes    = \"256 MiB\"
# metadata_cache_bytes = \"128 MiB\"

# Storage address and credentials (spec.md#storage-url-grammar).
#
# `path` is the default destination used when `--storage-path` (env
# `POND_STORAGE_PATH`) is not passed. Absent = the platform-local data dir.
# Addresses are URLs; the `s3+https` form carries the endpoint, bucket, and
# prefix in one token:
#
#   /abs/path or ~/path                  local filesystem
#   s3://bucket/prefix                   AWS S3 (ambient credential chain)
#   s3+https://host/bucket/prefix        S3-compatible endpoint (Hetzner, R2, B2, MinIO)
#   gs://bucket/prefix                   Google Cloud Storage
#   az://account/container/prefix        Azure Blob
#
# Credentials live in `[creds.<name>]` sets and bind to URLs by `scope`
# prefix - longest match wins (spec.md#creds-scope-match); a set without
# `scope` matches any URL. With no matching set, the standard cloud SDK
# chain applies (AWS_* env, shared credentials file, instance metadata).
# Secrets never go in URLs or CLI flags; besides inline values,
# `access_key_id_file` / `secret_access_key_file` read a file and
# `secret_access_key_command` runs a command (e.g. `op read ...`). `extra`
# holds verbatim `object_store` options pond has not typed.
#
# Every field mirrors to env: `POND_STORAGE_PATH`, `POND_CREDS_<NAME>_<FIELD>`
# (set names are lowercase alphanumeric, so the env grammar is unambiguous).
# Precedence: CLI flag > POND_* env > this file > ambient cloud chain.
# Probe a destination end-to-end with `pond storage check`.
#
# Future wrap: pond is single-namespace in v1 (spec.md#wire-namespace-resolution);
# `[storage]` is flat here on the assumption of one bucket per pond. When
# multi-namespace pond lands this becomes `[namespaces.<ns>.storage]`.
#
# [storage]
# path = \"s3+https://nbg1.your-objectstorage.com/my-pond\"
#
# [creds.default]
# access_key_id     = \"...\"
# secret_access_key = \"...\"
";
300
/// Top-level `config.toml` shape.
///
/// Every section carries `#[serde(default)]`, so a section that is missing
/// from the input falls back to its `Default` value; unknown top-level keys
/// are rejected (`deny_unknown_fields`).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct Config {
    /// `[embeddings]`: embedding model id and vector dimension.
    #[serde(default)]
    pub embeddings: EmbeddingsConfig,
    /// `[search]`: optional vector-query tuning knobs.
    #[serde(default)]
    pub search: SearchConfig,
    /// `[maintenance]`: compaction / cleanup / index-lag knobs.
    #[serde(default)]
    pub maintenance: MaintenanceConfig,
    /// `[runtime]`: cache-size caps for long-running processes.
    #[serde(default)]
    pub runtime: RuntimeConfig,
    /// `[sources.<adapter>]` map: per-adapter config blobs the matching
    /// factory deserializes inside its `open()`. The shape is adapter-defined
    /// (filesystem adapters expect `{ path = "..." }`; API-backed adapters
    /// expect endpoint + auth keys), so this layer stays opaque. Empty by
    /// default; `pond sync` runs discovery into this map on first use.
    #[serde(default)]
    pub sources: BTreeMap<String, Value>,
    /// `[storage]`: the default destination URL (spec.md#storage-url-grammar).
    /// A `path` of `None` inside it means the platform-local data dir.
    #[serde(default)]
    pub storage: StorageConfig,
    /// `[creds.<name>]`: URL-scoped credential sets. Every storage URL
    /// resolves its own set by longest-prefix `scope` match
    /// (spec.md#creds-scope-match); the resolver lives in `pond::substrate`.
    #[serde(default)]
    pub creds: BTreeMap<String, CredsSet>,
}
330
/// `[storage]`: the single default destination. Typed so the legacy
/// passthrough map (ENV-style `object_store` keys) fails loudly with the
/// rewrite recipe instead of silently changing meaning: any key other than
/// `path` is rejected by `deny_unknown_fields`.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct StorageConfig {
    /// Default destination address (a URL or filesystem path, see
    /// spec.md#storage-url-grammar). `None` = the platform-local data dir.
    #[serde(default)]
    pub path: Option<String>,
}
340
/// One `[creds.<name>]` set. All fields optional; validation enforces at most
/// one variant per logical secret. `extra` carries verbatim `object_store`
/// options pond has not typed (redaction in `pond config show` still applies
/// to its keys by name).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct CredsSet {
    /// URL prefix this set binds to. `None` = the catch-all set (at most one).
    #[serde(default)]
    pub scope: Option<String>,
    // Key / region fields are `lenient_string`: the env mirror parses values
    // TOML-ishly, so an all-digit key or region arrives as a number and must
    // still land in these String fields.
    /// Inline access key id. Mutually exclusive with `access_key_id_file`.
    #[serde(default, deserialize_with = "lenient_string")]
    pub access_key_id: Option<String>,
    /// File to read the access key id from, as an alternative to the inline
    /// value.
    #[serde(default)]
    pub access_key_id_file: Option<PathBuf>,
    /// Inline secret access key. At most one of this,
    /// `secret_access_key_file`, and `secret_access_key_command` may be set.
    #[serde(default, deserialize_with = "lenient_string")]
    pub secret_access_key: Option<String>,
    /// File to read the secret access key from.
    #[serde(default)]
    pub secret_access_key_file: Option<PathBuf>,
    /// Command to run to obtain the secret access key (e.g. `op read ...`).
    #[serde(default)]
    pub secret_access_key_command: Option<String>,
    /// Region for this credential set. NOTE(review): presumably forwarded to
    /// the object-store client - confirm against the resolver.
    #[serde(default, deserialize_with = "lenient_string")]
    pub region: Option<String>,
    /// Addressing-style override. NOTE(review): presumably maps to the
    /// `object_store` option of the same name - confirm against the resolver.
    #[serde(default)]
    pub virtual_hosted_style_request: Option<bool>,
    /// Verbatim `object_store` options pond has not typed. Has no env-mirror
    /// form; set it in the file.
    #[serde(default)]
    pub extra: BTreeMap<String, String>,
}
371
/// `[runtime]`: long-running process caps. Both knobs accept either a plain
/// byte count or a `humansize`-style suffix (`"128 MiB"`, `"1 GiB"`). Both are
/// optional - `None` lets `pond::substrate` pick the backend-aware default
/// (local FS gets a tight cap; object stores stay near Lance's defaults).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields, default)]
pub struct RuntimeConfig {
    /// Index cache cap in bytes; parsed by `deserialize_byte_size_opt`.
    #[serde(default, deserialize_with = "deserialize_byte_size_opt")]
    pub index_cache_bytes: Option<usize>,
    /// Metadata cache cap in bytes; parsed by `deserialize_byte_size_opt`.
    #[serde(default, deserialize_with = "deserialize_byte_size_opt")]
    pub metadata_cache_bytes: Option<usize>,
}
384
/// `[search]`: optional Lance vector-query tuning knobs. Both default to
/// `None`, which leaves the choice to Lance's own defaults.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct SearchConfig {
    /// `nprobes` setting for vector queries. NOTE(review): presumably the
    /// number of IVF partitions probed per query - confirm against the
    /// search call site.
    #[serde(default)]
    pub nprobes: Option<usize>,
    /// `refine_factor` setting for vector queries. NOTE(review): presumably
    /// Lance's re-ranking multiplier - confirm against the search call site.
    #[serde(default)]
    pub refine_factor: Option<u32>,
}
394
/// `[maintenance]`: storage-maintenance knobs shared by `pond sync` and
/// `pond index optimize`. All optional - omit and pond falls back to the
/// in-process defaults in `pond::substrate` (`DEFAULT_COMPACTION_FRAGMENT_CAP`,
/// `default_cleanup_older_than`, and the `index_lag_threshold` initializer).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct MaintenanceConfig {
    /// Sub-target fragment count past which the compaction phase runs (it also
    /// runs once those fragments hold a whole target fragment's worth of rows).
    /// Default 64 stops the automated sync re-compacting the trailing fragment
    /// every pass; 0 compacts every pass.
    #[serde(default)]
    pub compaction_fragment_cap: Option<usize>,
    /// Manifest-retention window for the safe cleanup pass. Accepts
    /// `Ns`/`Nm`/`Nh`/`Nd` (default `1d`). Versions older than this are
    /// reclaimed by Lance's OCC-coordinated GC (`delete_unverified=false`),
    /// which never races a concurrent writer on any backend. Kept as the raw
    /// string here; parsing happens outside this struct.
    #[serde(default)]
    pub cleanup_older_than: Option<String>,
    /// Minimum unindexed-fragment count below which `optimize_table_indices`
    /// skips the per-intent append/rebuild path; the brute-force fallback
    /// keeps queries correct while fragments accumulate. Default 4 trades a
    /// little query latency on cold fragments for far fewer remote index
    /// commits during high-rate ingest.
    #[serde(default)]
    pub index_lag_threshold: Option<usize>,
}
422
/// `[embeddings]`: model selector and vector dimension. There is no master
/// switch - the search path always runs hybrid when vectors exist in the
/// store and FTS-only when they don't (`has_embeddings()` is the only gate);
/// the candle/Metal model is `LazyEmbedder`-loaded on the first query that
/// actually needs it. `model` and `dim` are installed into the process at
/// startup via `embed::init_model_id` / `sessions::init_embedding_dim`, so
/// swapping models for a one-off experiment is a temporary config file - no
/// CLI flag and no per-call-site plumbing.
///
/// The container-level `#[serde(default)]` fills any omitted field from the
/// hand-written `Default` impl rather than from the field type's own default.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(deny_unknown_fields, default)]
pub struct EmbeddingsConfig {
    /// The embedding model id (spec.md#search): any XLM-RoBERTa model loadable
    /// by `candle-transformers`. Defaults to `intfloat/multilingual-e5-small`.
    pub model: String,
    /// Output dimension of `model`. Must equal the model's `hidden_size` and
    /// be divisible by 8 (the IVF_PQ subspace stride; see `embed::index_params`).
    /// Defaults to 384 (e5-small). Set to 768 for e5-base, 1024 for e5-large.
    pub dim: usize,
}
442
443impl Default for EmbeddingsConfig {
444    fn default() -> Self {
445        Self {
446            model: crate::embed::DEFAULT_MODEL_ID.to_owned(),
447            dim: crate::sessions::DEFAULT_EMBEDDING_DIM,
448        }
449    }
450}
451
452/// The platform-local default storage path, used when neither
453/// `--storage-path` / `POND_STORAGE_PATH` nor `[storage].path` is set:
454/// `$XDG_DATA_HOME/pond`, then `$HOME/.local/share/pond`, then `.pond`.
455/// `xdg_data_home` is honored only if absolute, per the XDG base-directory
456/// spec.
457pub fn default_storage_path(xdg_data_home: Option<PathBuf>, home: Option<PathBuf>) -> Result<Url> {
458    if let Some(xdg) = xdg_data_home.filter(|path| path.is_absolute()) {
459        return url_for_path(xdg.join("pond"));
460    }
461    if let Some(home) = home {
462        return url_for_path(home.join(".local").join("share").join("pond"));
463    }
464    // No HOME and no usable XDG var - stay usable rather than panic.
465    url_for_path(PathBuf::from(".pond"))
466}
467
/// Default on-disk location of `config.toml`. URI-backed data dirs always
/// land here because the config file has to be local (it names the bucket
/// and any creds).
///
/// Resolution order:
/// 1. `<xdg_config_home>/pond/config.toml` - only when `xdg_config_home` is
///    absolute;
/// 2. `<home>/.config/pond/config.toml`;
/// 3. `.pond.toml`, relative to the current directory.
pub fn default_config_path(xdg_config_home: Option<PathBuf>, home: Option<PathBuf>) -> PathBuf {
    let xdg_root = xdg_config_home.filter(|dir| dir.is_absolute());
    match (xdg_root, home) {
        (Some(xdg), _) => xdg.join("pond").join("config.toml"),
        (None, Some(home)) => home.join(".config").join("pond").join("config.toml"),
        (None, None) => PathBuf::from(".pond.toml"),
    }
}
481
482impl Config {
483    /// Load `config.toml` from `path` (if it exists) layered under the
484    /// `POND_*` env mirror, and validate. A missing file yields the built-in
485    /// defaults - env vars alone are a complete config
486    /// (spec.md#storage-configless). On success the resolved embedding model
487    /// id + dim are installed into the process (`OnceLock`-backed; only the
488    /// first call per process sticks), so all downstream code paths see a
489    /// consistent pair without per-handler plumbing.
490    pub fn load(path: impl AsRef<Path>) -> Result<Self> {
491        Ok(Self::load_with_provenance(path)?.0)
492    }
493
494    /// [`Config::load`] over an in-memory TOML body (still layered under the
495    /// `POND_*` env mirror). `pond init` uses this to validate and resolve
496    /// the config it is composing BEFORE anything touches disk - the wizard
497    /// writes exactly once, at the end.
498    pub fn load_str(body: &str) -> Result<Self> {
499        let figment = Figment::new().merge(Toml::string(body)).merge(env_mirror());
500        let config: Self = figment
501            .extract_lossy()
502            .map_err(|error| anyhow!("failed to load config: {error}"))?;
503        config.embeddings.validate()?;
504        config.validate_creds()?;
505        Ok(config)
506    }
507
508    /// [`Config::load`] that also returns the figment, so `pond config show`
509    /// can attribute each value to its source layer (file / env / default).
510    pub fn load_with_provenance(path: impl AsRef<Path>) -> Result<(Self, Figment)> {
511        let path = path.as_ref();
512        let figment = Figment::new().merge(Toml::file(path)).merge(env_mirror());
513        // `extract_lossy`, not `extract`: env values parse TOML-ishly, so an
514        // all-digit secret would arrive as a number and fail the String field;
515        // lossy stringifies scalars instead.
516        let config: Self = figment.extract_lossy().map_err(|error| {
517            if let Some(recipe) = detect_legacy_storage(path) {
518                return anyhow!("{recipe}");
519            }
520            // Inline figment's message (it already names the failing key and
521            // source layer) so single-line error surfaces keep the detail.
522            anyhow!("failed to load config {}: {error}", path.display())
523        })?;
524        config.embeddings.validate()?;
525        config.validate_creds()?;
526        config.embeddings.install_runtime();
527        if let Some(threshold) = config.maintenance.index_lag_threshold {
528            crate::substrate::init_index_lag_threshold(threshold);
529        }
530        // Tilde expansion is per-adapter (inside each factory's `open()`):
531        // an API-backed adapter has no path to expand, and only the
532        // filesystem-shaped adapters need the helper. See `expand_home_under`.
533        Ok((config, figment))
534    }
535
536    /// `[creds.*]` structural rules (spec.md#creds-scope-match): set-name
537    /// charset, at most one variant per logical secret, at most one
538    /// scope-less set, no duplicate scopes. All parse-time so a misbinding
539    /// dies before any URL resolves against it.
540    fn validate_creds(&self) -> Result<()> {
541        let mut scopeless: Option<&str> = None;
542        let mut scopes: BTreeMap<String, &str> = BTreeMap::new();
543        for (name, set) in &self.creds {
544            // Lowercase alphanumeric only - load-bearing for the env mirror:
545            // it makes `POND_CREDS_<NAME>_<FIELD>` splittable at the first
546            // `_` after the name (field names contain underscores).
547            let mut chars = name.chars();
548            let head_ok = chars.next().is_some_and(|c| c.is_ascii_lowercase());
549            if !head_ok
550                || name.len() > 16
551                || !chars.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit())
552            {
553                bail!(
554                    "creds set name {name:?} must match [a-z][a-z0-9]{{0,15}} (lowercase alphanumeric, no separators)"
555                );
556            }
557            if set.access_key_id.is_some() && set.access_key_id_file.is_some() {
558                bail!("[creds.{name}] sets both access_key_id and access_key_id_file; pick one");
559            }
560            let secret_variants = [
561                set.secret_access_key.is_some(),
562                set.secret_access_key_file.is_some(),
563                set.secret_access_key_command.is_some(),
564            ]
565            .iter()
566            .filter(|present| **present)
567            .count();
568            if secret_variants > 1 {
569                bail!(
570                    "[creds.{name}] sets more than one of secret_access_key / secret_access_key_file / secret_access_key_command; pick one"
571                );
572            }
573            match set.scope.as_deref() {
574                None => {
575                    if let Some(other) = scopeless {
576                        bail!(
577                            "[creds.{other}] and [creds.{name}] are both scope-less; at most one catch-all set is allowed - add a `scope` to one"
578                        );
579                    }
580                    scopeless = Some(name);
581                }
582                Some(scope) => {
583                    // Duplicates are checked on the canonical form (incl.
584                    // trailing-slash trim, matching scope-match semantics),
585                    // so two spellings of one prefix can never tie at
586                    // resolve time.
587                    let canonical = crate::substrate::parse_scope(scope)
588                        .map(|url| url.as_str().trim_end_matches('/').to_owned())
589                        .with_context(|| {
590                            format!("[creds.{name}] scope {scope:?} is not a valid URL prefix")
591                        })?;
592                    if let Some(other) = scopes.insert(canonical, name) {
593                        bail!(
594                            "[creds.{other}] and [creds.{name}] declare the same scope {scope:?}; merge them or narrow one"
595                        );
596                    }
597                }
598            }
599        }
600        Ok(())
601    }
602
603    /// Resolve the `[sources.<adapter>]` entries to drive `pond sync`. Only
604    /// sections with `enabled = true` flow through; sections with
605    /// `enabled = false` (or absent) are treated as opt-out and the
606    /// per-adapter blob (minus `enabled`) is handed to the factory's
607    /// `open()`. With `adapter = None` returns every enabled entry; with
608    /// `Some(name)` returns just that one - and errors if it's not in
609    /// config OR if it's currently disabled (the caller should then
610    /// re-prompt or report).
611    pub fn resolve_sources(&self, adapter: Option<&str>) -> Result<Vec<(String, Value)>> {
612        match adapter {
613            None => Ok(self
614                .sources
615                .iter()
616                .filter_map(|(name, blob)| take_enabled(name, blob))
617                .collect()),
618            Some(name) => {
619                let blob = self
620                    .sources
621                    .get(name)
622                    .ok_or_else(|| anyhow!("no [sources.{name}] entry in config"))?;
623                take_enabled(name, blob).map(|entry| vec![entry]).ok_or_else(|| {
624                    anyhow!(
625                        "source [{name}] is disabled (enabled = false); run `pond sync {name}` to re-enable"
626                    )
627                })
628            }
629        }
630    }
631
632    /// Names that are configured but currently `enabled = false`. Used by
633    /// `pond sync` post-import to know not to re-probe an adapter the user
634    /// already declined (the decline persists; re-prompt only via the
635    /// positional override `pond sync <name>`).
636    pub fn disabled_source_names(&self) -> Vec<&str> {
637        self.sources
638            .iter()
639            .filter_map(|(name, blob)| {
640                let enabled = blob
641                    .get("enabled")
642                    .and_then(Value::as_bool)
643                    .unwrap_or(false);
644                if enabled { None } else { Some(name.as_str()) }
645            })
646            .collect()
647    }
648}
649
650/// The `POND_*` env mirror (spec.md#storage-env-mirror): `POND_STORAGE_PATH`
651/// -> `storage.path`, `POND_CREDS_<NAME>_<FIELD>` -> `creds.<name>.<field>`.
652/// Filtered to exactly those two shapes - clap owns its own `POND_*` vars
653/// (`POND_CONFIG`, `POND_HOST`, ...) and an unfiltered prefix would turn each
654/// of them into an unknown-field error here.
655fn env_mirror() -> Env {
656    // Keys reach these closures pre-lowercasing (`CREDS_...`), so compare on
657    // an ascii-lowered copy; `str::starts_with` is case-sensitive.
658    Env::prefixed("POND_")
659        .filter(|key| {
660            let key = key.as_str().to_ascii_lowercase();
661            // `extra` has no env form (spec.md#storage-env-mirror): the env
662            // grammar stays flat strings; structured options belong in the
663            // file (or URL query params).
664            key == "storage_path" || (key.starts_with("creds_") && !key.ends_with("_extra"))
665        })
666        .map(|key| {
667            // Set names are lowercase alphanumeric (validate_creds), so the
668            // first `_` after `creds` and the one after the name are the only
669            // separators; field names keep their underscores.
670            let key = key.as_str().to_ascii_lowercase();
671            let dots = if key.starts_with("creds_") { 2 } else { 1 };
672            key.replacen('_', ".", dots).into()
673        })
674}
675
/// The pre-redesign `[storage]` passthrough keys, by role (ENV-style
/// `object_store` aliases). Both the load-time error recipe
/// (`detect_legacy_storage`) and the `pond init` rewrite read these, so the
/// legacy vocabulary lives in one place - a new alias must not require
/// editing two detectors in lockstep.
///
/// Legacy spellings of the endpoint key.
pub const LEGACY_ENDPOINT_KEYS: &[&str] = &["aws_endpoint", "endpoint"];
/// Legacy spellings of the access-key-id key.
pub const LEGACY_ACCESS_KEY_KEYS: &[&str] = &["aws_access_key_id", "access_key_id"];
/// Legacy spellings of the secret-access-key key.
pub const LEGACY_SECRET_KEY_KEYS: &[&str] = &["aws_secret_access_key", "secret_access_key"];
684
685/// Recognize the pre-redesign `[storage]` passthrough map (ENV-style
686/// `object_store` keys) and return the exact rewrite onto `[storage].path` +
687/// `[creds.default]`. An error with a recipe, not a shim: old configs do not
688/// keep working.
689fn detect_legacy_storage(path: &Path) -> Option<String> {
690    let text = std::fs::read_to_string(path).ok()?;
691    let value: toml::Value = toml::from_str(&text).ok()?;
692    let storage = value.get("storage")?.as_table()?;
693    if storage.is_empty() || storage.keys().all(|key| key == "path") {
694        return None;
695    }
696    let get = |names: &[&str]| {
697        storage.iter().find_map(|(key, value)| {
698            names
699                .iter()
700                .any(|name| key.eq_ignore_ascii_case(name))
701                .then(|| value.as_str().unwrap_or_default().to_owned())
702        })
703    };
704    let endpoint = get(LEGACY_ENDPOINT_KEYS);
705    let host = endpoint
706        .as_deref()
707        .and_then(|e| e.split("://").nth(1))
708        .unwrap_or("<endpoint-host>");
709    // spec.md#storage-redaction: never echo credential values, even back to
710    // their owner - stderr lands in logs, scrollback, and pasted bug reports.
711    let mut recipe = format!(
712        "config {} uses the old [storage] passthrough map; rewrite it as:\n\n[storage]\npath = \"s3+https://{host}/<bucket>/<prefix>\"\n\n[creds.default]\n",
713        path.display(),
714    );
715    recipe.push_str("access_key_id     = \"...\"  # copy from the old [storage] section\n");
716    recipe.push_str("secret_access_key = \"...\"  # copy from the old [storage] section\n");
717    recipe.push_str(
718        "\n(the endpoint and bucket fold into the URL; allow_http is scheme-derived; virtual-hosted addressing defaults on; the region is autodetected - append ?region=<x> to the URL only if your store insists. `pond storage check` verifies the result end-to-end, and `pond init` can apply this rewrite for you)",
719    );
720    Some(recipe)
721}
722
723/// Inner helper: return `Some((name, blob))` when the source section is
724/// enabled, stripping the discriminator from the blob before handing it on;
725/// `None` when the section is missing `enabled` or has `enabled = false`.
726fn take_enabled(name: &str, blob: &Value) -> Option<(String, Value)> {
727    let enabled = blob
728        .get("enabled")
729        .and_then(Value::as_bool)
730        .unwrap_or(false);
731    if !enabled {
732        return None;
733    }
734    let mut clean = blob.clone();
735    if let Some(obj) = clean.as_object_mut() {
736        obj.remove("enabled");
737    }
738    Some((name.to_owned(), clean))
739}
740
741/// Expand `~` and `$VAR`/`${VAR}` in `path` against an explicit `home`.
742/// Filesystem-shaped adapters call this from inside their factory's `open()`.
743/// Tests use it directly to exercise the rule without mutating the
744/// process-wide `HOME` env var (`std::env::set_var` is `unsafe` under
745/// edition 2024 and pond forbids unsafe code). Unset vars and `~user` forms
746/// pass through unchanged - never guess.
747pub fn expand_home_under(path: &Path, home: &Path) -> PathBuf {
748    let Some(text) = path.to_str() else {
749        return path.to_path_buf();
750    };
751    let home_text = home.to_string_lossy();
752    let expanded = shellexpand::full_with_context_no_errors(
753        text,
754        || Some(home_text.clone()),
755        |var| std::env::var(var).ok(),
756    );
757    PathBuf::from(expanded.as_ref())
758}
759
/// The inverse of [`expand_home_under`] for display and config writes:
/// contract a `home` prefix back to `~` so user-facing surfaces (and the
/// paths `pond init` persists) stay portable and readable.
///
/// Matching is per path component, so `/srv/meow` is not treated as living
/// under `/srv/me`. Non-home paths pass through unchanged, and so does every
/// path when `home` is empty: `Path::strip_prefix` accepts an empty prefix
/// for any path, which would otherwise rewrite a relative `data/pond` into
/// the unrelated `~/data/pond`.
pub fn contract_home_under(path: &Path, home: &Path) -> PathBuf {
    if home.as_os_str().is_empty() {
        return path.to_path_buf();
    }
    match path.strip_prefix(home) {
        Ok(rest) if rest.as_os_str().is_empty() => PathBuf::from("~"),
        Ok(rest) => Path::new("~").join(rest),
        Err(_) => path.to_path_buf(),
    }
}
771
772/// [`contract_home_under`] against the process `HOME`. Returns the input
773/// rendered for humans; machine surfaces (JSON output, the wire) keep
774/// absolute paths.
775pub fn contract_home(path: &Path) -> PathBuf {
776    match std::env::var_os("HOME") {
777        Some(home) => contract_home_under(path, Path::new(&home)),
778        None => path.to_path_buf(),
779    }
780}
781
782impl EmbeddingsConfig {
783    /// Surface-level validation: model id non-empty and dim divisible by 8.
784    /// The dim/model mismatch is the load-time check inside `CandleEmbedder::load`,
785    /// which knows the model's `hidden_size`; what we can catch up front is the
786    /// IVF_PQ subspace stride (`dim / 8` in `embed::index_params`).
787    pub fn validate(&self) -> Result<()> {
788        if self.model.trim().is_empty() {
789            bail!("embeddings.model must be a non-empty HuggingFace model id");
790        }
791        if self.dim == 0 || !self.dim.is_multiple_of(8) {
792            bail!(
793                "embeddings.dim = {} must be a positive multiple of 8 (IVF_PQ subspace stride)",
794                self.dim,
795            );
796        }
797        Ok(())
798    }
799
800    /// Install model id + dim into the process. Idempotent: only the first
801    /// call sticks (matches `OnceLock` semantics in `embed::init_model_id` and
802    /// `sessions::init_embedding_dim`).
803    pub fn install_runtime(&self) {
804        crate::embed::init_model_id(self.model.clone());
805        crate::sessions::init_embedding_dim(self.dim);
806    }
807}
808
#[cfg(test)]
mod tests {
    // `result_large_err`: `figment::Jail` closures return `figment::Error`
    // by contract; the size is figment's, not ours.
    #![allow(clippy::expect_used, clippy::unwrap_used, clippy::result_large_err)]

    use super::*;
    use serde_json::Value;
    use tempfile::TempDir;

    // Every test that calls `Config::load` or touches env vars runs inside
    // `figment::Jail`, even when it sets no env vars itself: the env-mirror
    // test mutates process-global env mid-flight (including `POND_CREDS_*`
    // sets that only validate next to its own config file), and the Jail
    // lock is what serializes the other tests against it.

    #[test]
    fn validate_catches_empty_model_and_bad_dim() {
        assert!(EmbeddingsConfig::default().validate().is_ok());
        // Empty / whitespace-only model id is rejected: HuggingFace fetch
        // would fail far away from the config error.
        let bad_model = EmbeddingsConfig {
            model: "   ".to_owned(),
            dim: 768,
        };
        assert!(bad_model.validate().is_err());
        // Dim must be a multiple of 8 (PQ subspace stride in
        // `embed::index_params`).
        let bad_dim = EmbeddingsConfig {
            model: "intfloat/multilingual-e5-base".to_owned(),
            dim: 100,
        };
        assert!(bad_dim.validate().is_err());
        // Zero is rejected too (would divide-by-zero inside index_params).
        let zero_dim = EmbeddingsConfig {
            model: "intfloat/multilingual-e5-base".to_owned(),
            dim: 0,
        };
        assert!(zero_dim.validate().is_err());
    }

    #[test]
    fn config_load_missing_file_falls_back_to_builtin() {
        // Jail: `Config::load` reads the `POND_*` env mirror even without a
        // config file, so it must not overlap the env-mutating tests.
        figment::Jail::expect_with(|_jail| {
            let config = Config::load("/nonexistent/pond-config-xyz.toml").unwrap();
            assert_eq!(config.embeddings, EmbeddingsConfig::default());
            Ok(())
        });
    }

    #[test]
    fn default_config_toml_loads_to_the_builtin_defaults() {
        // Jail: serializes this load against the env-mutating tests.
        figment::Jail::expect_with(|_jail| {
            let dir = TempDir::new().unwrap();
            let path = dir.path().join("config.toml");
            std::fs::write(&path, DEFAULT_CONFIG_TOML).unwrap();
            // The shipped template is all comments, so it must load and validate as
            // the built-in defaults - a malformed template fails right here.
            let config = Config::load(&path).unwrap();
            assert_eq!(config.embeddings, EmbeddingsConfig::default());
            assert_eq!(config.embeddings.model, crate::embed::DEFAULT_MODEL_ID);
            assert_eq!(
                config.embeddings.dim,
                crate::sessions::DEFAULT_EMBEDDING_DIM
            );
            Ok(())
        });
    }

    #[test]
    fn default_storage_path_follows_xdg_then_home() {
        // An absolute XDG_DATA_HOME wins.
        let resolved =
            default_storage_path(Some(PathBuf::from("/xdg")), Some(PathBuf::from("/home")))
                .unwrap();
        assert!(is_local(&resolved));
        assert_eq!(local_path(&resolved).unwrap(), PathBuf::from("/xdg/pond"));

        // A relative XDG_DATA_HOME is ignored per the XDG spec; HOME is the fallback.
        let resolved = default_storage_path(
            Some(PathBuf::from("relative")),
            Some(PathBuf::from("/home")),
        )
        .unwrap();
        assert_eq!(
            local_path(&resolved).unwrap(),
            PathBuf::from("/home/.local/share/pond"),
        );

        // No XDG and no HOME - stays usable: returns the cwd-anchored `.pond`.
        // The result is absolute (Lance's URL conversion requires it), so we
        // just check that the URL ends with the relative path's components.
        let resolved = default_storage_path(None, None).unwrap();
        assert!(is_local(&resolved));
        assert!(
            local_path(&resolved).unwrap().ends_with(".pond"),
            "fallback path should end with .pond: {resolved}",
        );
    }

    #[test]
    fn expand_home_under_handles_tilde_forms() {
        let home = Path::new("/srv/me");
        assert_eq!(
            expand_home_under(Path::new("~"), home),
            PathBuf::from("/srv/me")
        );
        assert_eq!(
            expand_home_under(Path::new("~/.codex/sessions"), home),
            PathBuf::from("/srv/me/.codex/sessions"),
        );
        // Absolute paths pass through unchanged.
        assert_eq!(
            expand_home_under(Path::new("/etc/passwd"), home),
            PathBuf::from("/etc/passwd"),
        );
        // A leading `~something` (no slash) is not the home form - leave it.
        assert_eq!(
            expand_home_under(Path::new("~user/elsewhere"), home),
            PathBuf::from("~user/elsewhere"),
        );
    }

    #[test]
    fn expand_home_under_handles_env_vars() {
        // Jail serializes env mutation against the other env-touching tests.
        figment::Jail::expect_with(|jail| {
            jail.set_env("POND_TEST_EXPAND_DIR", "/srv/data");
            let home = Path::new("/srv/me");
            assert_eq!(
                expand_home_under(Path::new("$POND_TEST_EXPAND_DIR/pond"), home),
                PathBuf::from("/srv/data/pond"),
            );
            assert_eq!(
                expand_home_under(Path::new("${POND_TEST_EXPAND_DIR}/pond"), home),
                PathBuf::from("/srv/data/pond"),
            );
            // Unset vars pass through unchanged - never guess.
            assert_eq!(
                expand_home_under(Path::new("$POND_TEST_UNSET_VAR/x"), home),
                PathBuf::from("$POND_TEST_UNSET_VAR/x"),
            );
            Ok(())
        });
    }

    #[test]
    fn contract_home_under_inverts_expansion() {
        let home = Path::new("/srv/me");
        assert_eq!(
            contract_home_under(Path::new("/srv/me/.local/share/pond"), home),
            PathBuf::from("~/.local/share/pond"),
        );
        assert_eq!(
            contract_home_under(Path::new("/srv/me"), home),
            PathBuf::from("~")
        );
        // Non-home paths pass through unchanged.
        assert_eq!(
            contract_home_under(Path::new("/etc/passwd"), home),
            PathBuf::from("/etc/passwd"),
        );
    }

    #[test]
    fn resolve_sources_returns_one_or_all_or_errors() {
        // Jail: serializes this load against the env-mutating tests.
        figment::Jail::expect_with(|_jail| {
            let temp = TempDir::new().unwrap();
            let body = "\
[sources.claude-code]
enabled = true
path = \"/srv/claude\"

[sources.codex-cli]
enabled = true
path = \"/srv/codex\"

[sources.opencode]
enabled = false
";
            let path = temp.path().join("config.toml");
            std::fs::write(&path, body).expect("write config");
            let config = Config::load(&path).unwrap();

            // None -> only enabled entries
            let all = config.resolve_sources(None).unwrap();
            assert_eq!(all.len(), 2);
            let names: Vec<_> = all.iter().map(|(n, _)| n.as_str()).collect();
            assert!(names.contains(&"claude-code"));
            assert!(names.contains(&"codex-cli"));
            // The `enabled` discriminator never reaches the adapter blob.
            for (_, blob) in &all {
                assert!(blob.get("enabled").is_none(), "enabled should be stripped");
            }

            // Some(name) -> one entry, opaque JSON blob
            let one = config.resolve_sources(Some("codex-cli")).unwrap();
            assert_eq!(one.len(), 1);
            assert_eq!(one[0].0, "codex-cli");
            assert_eq!(
                one[0].1.get("path").and_then(Value::as_str),
                Some("/srv/codex"),
            );

            // Disabled positional -> errors with the recovery hint baked in.
            let disabled = config.resolve_sources(Some("opencode"));
            let err = disabled
                .expect_err("disabled adapter must error")
                .to_string();
            assert!(err.contains("enabled = false"), "got: {err}");
            assert!(err.contains("pond sync opencode"), "got: {err}");

            // Unknown -> error
            assert!(config.resolve_sources(Some("nope")).is_err());

            // disabled_source_names lists exactly the off ones.
            assert_eq!(config.disabled_source_names(), vec!["opencode"]);
            Ok(())
        });
    }

    #[test]
    fn memory_uri_is_classified_as_remote() {
        let url = Url::parse("memory:///pond-remote-test").expect("memory uri parses");
        assert!(
            !is_local(&url),
            "memory:// is not a local-filesystem URL: {url}",
        );
        assert!(
            local_path(&url).is_none(),
            "local_path must return None for non-file schemes",
        );
    }

    #[test]
    fn storage_and_creds_round_trip() {
        figment::Jail::expect_with(|jail| {
            jail.create_file(
                "config.toml",
                r#"
[storage]
path = "s3+https://nbg1.example.com/my-pond"

[creds.default]
access_key_id     = "AKIA123"
secret_access_key = "shh"

[creds.work]
scope             = "s3+https://fsn1.example.com/work-pond/"
access_key_id     = "AKIA456"
secret_access_key_command = "op read op://vault/pond/secret"
region            = "fsn1"
virtual_hosted_style_request = false
extra = { request_timeout = "60 seconds" }
"#,
            )?;
            let config = Config::load("config.toml").expect("config loads");
            assert_eq!(
                config.storage.path.as_deref(),
                Some("s3+https://nbg1.example.com/my-pond"),
            );
            assert_eq!(config.creds.len(), 2);
            let work = &config.creds["work"];
            assert_eq!(
                work.secret_access_key_command.as_deref(),
                Some("op read op://vault/pond/secret"),
            );
            assert_eq!(work.virtual_hosted_style_request, Some(false));
            assert_eq!(work.extra["request_timeout"], "60 seconds");
            Ok(())
        });
    }

    #[test]
    fn creds_validators_reject_bad_shapes() {
        let cases: &[(&str, &str)] = &[
            // Unknown key dies loudly (typos must not silently no-op).
            ("[creds.a]\nacces_key_id = \"x\"\n", "acces_key_id"),
            // Name charset: separators break the env-mirror grammar.
            ("[creds.my_set]\naccess_key_id = \"x\"\n", "[a-z][a-z0-9]"),
            ("[creds.A1]\naccess_key_id = \"x\"\n", "[a-z][a-z0-9]"),
            // One variant per logical secret.
            (
                "[creds.a]\nsecret_access_key = \"x\"\nsecret_access_key_command = \"cat\"\n",
                "more than one",
            ),
            (
                "[creds.a]\naccess_key_id = \"x\"\naccess_key_id_file = \"/k\"\n",
                "pick one",
            ),
            // At most one scope-less set.
            (
                "[creds.a]\naccess_key_id = \"x\"\n[creds.b]\naccess_key_id = \"y\"\n",
                "scope-less",
            ),
            // Duplicate scopes can never tie-break - checked canonicalized,
            // so two spellings of one prefix still collide.
            (
                "[creds.a]\nscope = \"s3+https://h:443/b/\"\naccess_key_id = \"x\"\n[creds.b]\nscope = \"s3+https://h/b\"\naccess_key_id = \"y\"\n",
                "same scope",
            ),
        ];
        figment::Jail::expect_with(|jail| {
            for (body, needle) in cases {
                jail.create_file("config.toml", body)?;
                let err = Config::load("config.toml").expect_err(body).to_string();
                assert!(
                    err.contains(needle),
                    "want {needle:?} in error for {body:?}, got: {err}",
                );
            }
            Ok(())
        });
    }

    #[test]
    fn legacy_storage_map_errors_with_the_rewrite_recipe() {
        figment::Jail::expect_with(|jail| {
            jail.create_file(
                "config.toml",
                r#"
[storage]
AWS_ACCESS_KEY_ID = "AKIA123"
AWS_SECRET_ACCESS_KEY = "shh"
AWS_REGION = "nbg1"
AWS_ENDPOINT = "https://ttq.nbg1.your-objectstorage.com"
aws_virtual_hosted_style_request = "true"
"#,
            )?;
            let err = Config::load("config.toml")
                .expect_err("legacy map must error")
                .to_string();
            // The error IS the migration: old keys mapped onto the new shape.
            assert!(err.contains("old [storage] passthrough map"), "got: {err}");
            assert!(
                err.contains("s3+https://ttq.nbg1.your-objectstorage.com/"),
                "endpoint host must fold into the URL recipe, got: {err}",
            );
            // spec.md#storage-redaction: the recipe must NOT echo the real
            // key values - placeholders plus a "copy from" pointer only.
            assert!(!err.contains("AKIA123"), "got: {err}");
            assert!(!err.contains("\"shh\""), "got: {err}");
            assert!(err.contains("access_key_id     = \"...\""), "got: {err}");
            // Region is autodetected (AWS) or defaulted (S3-compatible
            // endpoints ignore it): the recipe must not carry AWS_REGION
            // forward, only name the ?region= override.
            assert!(!err.contains("region            ="), "got: {err}");
            assert!(err.contains("?region="), "got: {err}");
            assert!(err.contains("pond storage check"), "got: {err}");
            Ok(())
        });
    }

    #[test]
    fn env_mirror_layers_over_file() {
        figment::Jail::expect_with(|jail| {
            jail.create_file(
                "config.toml",
                r#"
[storage]
path = "/from-file"

[creds.work]
scope         = "s3://file-bucket/"
access_key_id = "from-file"
region        = "file-region"
"#,
            )?;
            // Env beats file per field; untouched fields survive the merge.
            jail.set_env("POND_STORAGE_PATH", "/from-env");
            jail.set_env("POND_CREDS_WORK_ACCESS_KEY_ID", "from-env");
            // A purely-numeric env secret must stay a string (extract_lossy).
            jail.set_env("POND_CREDS_WORK_SECRET_ACCESS_KEY", "12345");
            // A set defined only in env is discovered by the prefix scan.
            jail.set_env("POND_CREDS_CI_ACCESS_KEY_ID", "ci-key");
            let config = Config::load("config.toml").expect("env+file config loads");
            assert_eq!(config.storage.path.as_deref(), Some("/from-env"));
            let work = &config.creds["work"];
            assert_eq!(work.access_key_id.as_deref(), Some("from-env"));
            assert_eq!(work.secret_access_key.as_deref(), Some("12345"));
            assert_eq!(work.region.as_deref(), Some("file-region"));
            assert_eq!(work.scope.as_deref(), Some("s3://file-bucket/"));
            assert_eq!(config.creds["ci"].access_key_id.as_deref(), Some("ci-key"));
            Ok(())
        });
    }
}