Skip to main content

pond/
config.rs

//! Configuration loading: the `[embeddings]`, `[search]`, `[maintenance]`,
//! `[runtime]`, `[sources]`, `[storage]`, and `[creds.*]` blocks.
3//!
4//! pond ships built-in defaults, so an instance with no `config.toml` still
5//! works. `pond config schema` emits [`DEFAULT_CONFIG_TOML`], the
6//! fully-annotated example. Loading layers `config.toml` under the `POND_*`
7//! env mirror via figment, so every command also works with no config file
8//! at all (spec.md#storage-configless) - URLs + env vars are sufficient.
9
10use std::{
11    collections::BTreeMap,
12    path::{Path, PathBuf},
13};
14
15use anyhow::{Context, Result, anyhow, bail};
16use figment::{
17    Figment,
18    providers::{Env, Format, Toml},
19};
20use serde::{Deserialize, Deserializer, Serialize, de};
21use serde_json::Value;
22use url::Url;
23
/// Parse a human-friendly byte size such as `"128 MiB"`, `"1 GiB"`,
/// `"500 KiB"`, or a bare byte count.
///
/// Accepts SI (`K`/`KB`, `M`/`MB`, `G`/`GB`) and binary
/// (`KiB`/`MiB`/`GiB`/`TiB`) suffixes, and treats the bare unit `"B"` and
/// unsuffixed numbers as raw bytes. Surrounding whitespace, whitespace between
/// the number and the unit, and unit case are all tolerated. Fractional values
/// (`"1.5 KiB"`) are allowed; the resulting byte count is truncated toward
/// zero. Whole-number inputs are computed with exact integer arithmetic, so
/// large byte counts do not lose precision through `f64`.
///
/// # Errors
///
/// Returns a human-readable message when the input is empty, the numeric part
/// does not parse, the value is negative or non-finite, the unit is not
/// recognized, or the result does not fit in `usize` (Lance's cache APIs take
/// `usize`).
fn parse_byte_size(raw: &str) -> Result<usize, String> {
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return Err("byte-size value is empty".to_owned());
    }

    // Everything before the first ASCII letter is the number; the rest is the unit.
    let split = trimmed
        .find(|c: char| c.is_ascii_alphabetic())
        .unwrap_or(trimmed.len());
    let (number, unit) = trimmed.split_at(split);
    let number = number.trim();

    // Validate through f64 first so the error order stays: not-a-number,
    // negative / non-finite, unknown unit, overflow.
    let approx: f64 = number
        .parse()
        .map_err(|_| format!("byte-size value {raw:?} is not a number"))?;
    if !approx.is_finite() || approx < 0.0 {
        return Err(format!("byte-size value {raw:?} must be non-negative"));
    }

    let multiplier: u64 = match unit.trim().to_ascii_lowercase().as_str() {
        "" | "b" => 1,
        "k" | "kb" => 1_000,
        "kib" => 1_024,
        "m" | "mb" => 1_000_000,
        "mib" => 1_048_576,
        "g" | "gb" => 1_000_000_000,
        "gib" => 1_073_741_824,
        "tib" => 1_099_511_627_776,
        other => {
            return Err(format!(
                "byte-size unit {other:?} not recognized (try MiB / GiB)"
            ));
        }
    };

    let overflow = || format!("byte-size value {raw:?} overflows usize");

    // Exact path: a whole number never needs to round-trip through f64.
    if let Ok(whole) = number.parse::<u64>() {
        return match whole.checked_mul(multiplier) {
            Some(bytes) => usize::try_from(bytes).map_err(|_| overflow()),
            None => Err(overflow()),
        };
    }

    // Fractional (or beyond-u64) path. `usize::MAX as f64` rounds up to 2^64
    // on 64-bit targets, so comparing against it lets 2^64 slip through and
    // saturate; widen to u128 (a saturating cast) and range-check exactly.
    let bytes = approx * multiplier as f64;
    if !bytes.is_finite() {
        return Err(overflow());
    }
    usize::try_from(bytes as u128).map_err(|_| overflow())
}
65
66/// Accept string / integer / float / bool and stringify. The env mirror
67/// parses values TOML-ishly, so `POND_CREDS_X_SECRET_ACCESS_KEY=12345`
68/// arrives as a number; these fields are strings no matter how they scan.
69fn lenient_string<'de, D>(deserializer: D) -> Result<Option<String>, D::Error>
70where
71    D: Deserializer<'de>,
72{
73    #[derive(Deserialize)]
74    #[serde(untagged)]
75    enum Repr {
76        Text(String),
77        Int(i64),
78        Float(f64),
79        Bool(bool),
80    }
81    Ok(
82        Option::<Repr>::deserialize(deserializer)?.map(|repr| match repr {
83            Repr::Text(value) => value,
84            Repr::Int(value) => value.to_string(),
85            Repr::Float(value) => value.to_string(),
86            Repr::Bool(value) => value.to_string(),
87        }),
88    )
89}
90
91fn deserialize_byte_size_opt<'de, D>(deserializer: D) -> Result<Option<usize>, D::Error>
92where
93    D: Deserializer<'de>,
94{
95    #[derive(Deserialize)]
96    #[serde(untagged)]
97    enum Repr {
98        Bytes(u64),
99        Text(String),
100    }
101    let repr: Option<Repr> = Option::deserialize(deserializer)?;
102    match repr {
103        None => Ok(None),
104        Some(Repr::Bytes(value)) => usize::try_from(value).map(Some).map_err(de::Error::custom),
105        Some(Repr::Text(value)) => parse_byte_size(&value).map(Some).map_err(de::Error::custom),
106    }
107}
108
109/// True when the URL is on the local filesystem. Mirrors Lance's
110/// `ObjectStore::is_local` (lance-io/src/object_store.rs:541): the `file` and
111/// `file+uring` schemes are local; everything else (incl. `memory://`) is not.
112pub fn is_local(url: &Url) -> bool {
113    matches!(url.scheme(), "file" | "file+uring")
114}
115
116/// Extract the filesystem `PathBuf` for local URLs. `None` for remote.
117pub fn local_path(url: &Url) -> Option<PathBuf> {
118    if is_local(url) {
119        url.to_file_path().ok()
120    } else {
121        None
122    }
123}
124
125/// URI string for a child of this location (typically one Lance dataset under
126/// the data dir). Trims a single trailing slash on the base, then concatenates
127/// with a `/` separator. This keeps `Dataset::open` / `Dataset::write` happy
128/// on both filesystem and object-store backends - they want the URI form, not
129/// a `url::Url`.
130pub fn child_uri(base: &Url, suffix: &str) -> String {
131    // For local URLs we strip the `file://` prefix so log lines and error
132    // messages render as plain paths (`/srv/pond/sessions.lance`), matching
133    // what pond used to emit before the URL migration.
134    if let Some(path) = local_path(base) {
135        return path.join(suffix).display().to_string();
136    }
137    format!("{}/{suffix}", base.as_str().trim_end_matches('/'))
138}
139
140/// Render a `Url` for human-readable log/diagnostic output: local URLs come
141/// back as plain paths (no `file://` prefix, `$HOME` contracted to `~`);
142/// remote URLs stay verbatim.
143pub fn display(url: &Url) -> String {
144    if let Some(path) = local_path(url) {
145        contract_home(&path).display().to_string()
146    } else {
147        url.to_string()
148    }
149}
150
151/// Build a `Url` from a filesystem path. Convenience for tests and for
152/// callers that hold a `PathBuf` already. The path must be
153/// absolute (`url::Url::from_file_path` is a hard requirement on Unix); a
154/// relative path gets canonicalized via `std::path::absolute` first.
155pub fn url_for_path(path: impl AsRef<Path>) -> Result<Url> {
156    let path = path.as_ref();
157    let absolute = if path.is_absolute() {
158        path.to_path_buf()
159    } else {
160        std::path::absolute(path)
161            .with_context(|| format!("failed to absolutize {}", path.display()))?
162    };
163    Url::from_file_path(&absolute).map_err(|()| {
164        anyhow!(
165            "failed to convert path {} into a file:// URL",
166            absolute.display()
167        )
168    })
169}
170
/// Default `config.toml` body emitted by `pond config schema`. Every
/// line is commented: pond ships built-in defaults, so the file is purely a
/// discoverable template and pond still works with no `config.toml` on disk.
///
/// The literal opens with a `\` line continuation, so the body starts directly
/// at `# pond configuration.` with no leading blank line. Double quotes inside
/// the template are backslash-escaped because this is a regular (non-raw)
/// string literal.
pub const DEFAULT_CONFIG_TOML: &str = "\
# pond configuration.
#
# pond ships built-in defaults, so every setting here is optional - delete this
# file and pond still works. Uncomment and edit to override.

# Where pond looks for source data to import. One entry per adapter type
# (`claude-code`, `codex-cli`, ...). `pond sync` with no arguments syncs every
# entry; `pond sync <adapter>` syncs just one. With an empty `[sources]`,
# `pond sync` runs an interactive discovery against the known default paths
# and writes the picks back here.
#
# Future wrap: pond is single-namespace in v1 (spec.md#wire-namespace-resolution); `[sources]` is
# flat here. When multi-namespace pond lands, source registration becomes
# per-tenant under `[namespaces.<ns>.sources.<adapter>]`. Pre-v1 the schema
# is breakable; the rename is operationally free until a real second tenant
# exists.
#
# [sources.claude-code]
# enabled = true
# path = \"~/.claude/projects\"
#
# [sources.codex-cli]
# enabled = true
# path = \"~/.codex/sessions\"
#
# Set `enabled = false` to keep the section but skip it on `pond sync`;
# re-enable via `pond sync <adapter>`.

# Embeddings. Search runs hybrid (vector + FTS) whenever the store has any
# vectors, and FTS-only otherwise - the model loads lazily on the first hybrid
# query, so there's no cost on FTS-only corpora. `model` selects the
# HuggingFace XLM-RoBERTa model; `dim` declares its output width and is baked
# into the messages.vector schema on table creation - it must equal the
# model's hidden_size and be a multiple of 8 (IVF_PQ subspace stride).
#
# Common pairings:
#   model = \"intfloat/multilingual-e5-small\"   dim = 384   (default)
#   model = \"intfloat/multilingual-e5-base\"    dim = 768
#   model = \"intfloat/multilingual-e5-large\"   dim = 1024
#
# A different-dim model needs a fresh data dir; pond enforces this at the
# schema boundary.
#
# [embeddings]
# model = \"intfloat/multilingual-e5-small\"
# dim = 384

# Search tuning. Leave unset for Lance defaults; set when tuning IVF_PQ recall
# against a corpus.
#
# [search]
# nprobes = 16
# refine_factor = 2

# Storage maintenance. Tunes the compaction + cleanup pass that runs inside
# `pond sync` and `pond index optimize`.
#
# - `compaction_fragment_cap` is the per-task fragment-count backstop: a
#   planned compaction task touching at least this many fragments always runs
#   even when the write-amplification veto would skip it. Default 64; 0
#   disables the veto and runs every task Lance plans.
# - `cleanup_older_than` is the manifest-retention window for the safe cleanup
#   pass. Accepts `Ns` / `Nm` / `Nh` / `Nd` (default `1d`, floor `1h` - it is
#   what protects in-flight readers). Versions older than this are reclaimed
#   by Lance's OCC-coordinated GC.
# - `index_lag_threshold` is the minimum unindexed-fragment count before a
#   per-intent append/rebuild runs in `pond index optimize`; the brute-force
#   fallback keeps queries correct while fragments accumulate. Default 4.
#
# [maintenance]
# compaction_fragment_cap = 64
# cleanup_older_than = \"1d\"
# index_lag_threshold = 4

# Long-running process caps. Both accept either a plain byte count or a
# humansize-style suffix (\"128 MiB\", \"1 GiB\"). Both are optional - leave
# unset to let pond pick the backend-aware default:
#   local FS  : index_cache = 256 MiB, metadata_cache = 128 MiB
#   remote    : index_cache = 2 GiB,   metadata_cache = 512 MiB
# Lance's library defaults (6 GiB / 1 GiB) are too generous for a per-session
# `pond mcp` process; tightening them is what keeps RSS under the 500 MiB target
# without measurable latency regressions on typical agent-history corpora.
#
# [runtime]
# index_cache_bytes    = \"256 MiB\"
# metadata_cache_bytes = \"128 MiB\"

# Storage address and credentials (spec.md#storage-url-grammar).
#
# `path` is the default destination used when `--storage-path` (env
# `POND_STORAGE_PATH`) is not passed. Absent = the platform-local data dir.
# Addresses are URLs; the `s3+https` form carries the endpoint, bucket, and
# prefix in one token:
#
#   /abs/path or ~/path                  local filesystem
#   s3://bucket/prefix                   AWS S3 (ambient credential chain)
#   s3+https://host/bucket/prefix        S3-compatible endpoint (Hetzner, R2, B2, MinIO)
#   gs://bucket/prefix                   Google Cloud Storage
#   az://account/container/prefix        Azure Blob
#
# Credentials live in `[creds.<name>]` sets and bind to URLs by `scope`
# prefix - longest match wins (spec.md#creds-scope-match); a set without
# `scope` matches any URL. With no matching set, the standard cloud SDK
# chain applies (AWS_* env, shared credentials file, instance metadata).
# Secrets never go in URLs or CLI flags; besides inline values,
# `access_key_id_file` / `secret_access_key_file` read a file and
# `secret_access_key_command` runs a command (e.g. `op read ...`). `extra`
# holds verbatim `object_store` options pond has not typed.
#
# Every field mirrors to env: `POND_STORAGE_PATH`, `POND_CREDS_<NAME>_<FIELD>`
# (set names are lowercase alphanumeric, so the env grammar is unambiguous).
# Precedence: CLI flag > POND_* env > this file > ambient cloud chain.
# Probe a destination end-to-end with `pond storage check`.
#
# Future wrap: pond is single-namespace in v1 (spec.md#wire-namespace-resolution);
# `[storage]` is flat here on the assumption of one bucket per pond. When
# multi-namespace pond lands this becomes `[namespaces.<ns>.storage]`.
#
# [storage]
# path = \"s3+https://nbg1.your-objectstorage.com/my-pond\"
#
# [creds.default]
# access_key_id     = \"...\"
# secret_access_key = \"...\"
";
300
/// Top-level `config.toml` shape.
///
/// Every table is optional (`#[serde(default)]` on each field), so an empty or
/// missing file deserializes to the built-in defaults. Unknown top-level keys
/// are rejected (`deny_unknown_fields`).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct Config {
    /// `[embeddings]`: embedding model id and vector dimension.
    #[serde(default)]
    pub embeddings: EmbeddingsConfig,
    /// `[search]`: optional vector-query tuning knobs.
    #[serde(default)]
    pub search: SearchConfig,
    /// `[maintenance]`: compaction / cleanup / index-lag knobs.
    #[serde(default)]
    pub maintenance: MaintenanceConfig,
    /// `[runtime]`: cache-size caps for long-running processes.
    #[serde(default)]
    pub runtime: RuntimeConfig,
    /// `[sources.<adapter>]` map: per-adapter config blobs the matching
    /// factory deserializes inside its `open()`. The shape is adapter-defined
    /// (filesystem adapters expect `{ path = "..." }`; API-backed adapters
    /// expect endpoint + auth keys), so this layer stays opaque. Empty by
    /// default; `pond sync` runs discovery into this map on first use.
    #[serde(default)]
    pub sources: BTreeMap<String, Value>,
    /// `[storage]`: the default destination URL (spec.md#storage-url-grammar).
    /// `None` = the platform-local data dir.
    #[serde(default)]
    pub storage: StorageConfig,
    /// `[creds.<name>]`: URL-scoped credential sets. Every storage URL
    /// resolves its own set by longest-prefix `scope` match
    /// (spec.md#creds-scope-match); the resolver lives in `pond::substrate`.
    #[serde(default)]
    pub creds: BTreeMap<String, CredsSet>,
}
330
/// `[storage]`: the single default destination. Typed so the legacy
/// passthrough map (ENV-style `object_store` keys) fails loudly with the
/// rewrite recipe instead of silently changing meaning: `deny_unknown_fields`
/// rejects any key other than `path`.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct StorageConfig {
    /// Default destination, kept as the raw string from the file or env
    /// (it is not parsed into a URL at this layer). `None` when unset.
    #[serde(default)]
    pub path: Option<String>,
}
340
/// One `[creds.<name>]` set. All fields optional; validation enforces at most
/// one variant per logical secret. `extra` carries verbatim `object_store`
/// options pond has not typed (redaction in `pond config show` still applies
/// to its keys by name).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct CredsSet {
    /// URL prefix this set binds to. `None` = the catch-all set (at most one).
    #[serde(default)]
    pub scope: Option<String>,
    // Key / region fields are `lenient_string`: the env mirror parses values
    // TOML-ishly, so an all-digit key or region arrives as a number and must
    // still land in these String fields.
    /// Inline access key id. Mutually exclusive with `access_key_id_file`.
    #[serde(default, deserialize_with = "lenient_string")]
    pub access_key_id: Option<String>,
    /// Path of a file holding the access key id. Mutually exclusive with
    /// `access_key_id`.
    #[serde(default)]
    pub access_key_id_file: Option<PathBuf>,
    /// Inline secret access key. At most one of this,
    /// `secret_access_key_file`, and `secret_access_key_command` may be set.
    #[serde(default, deserialize_with = "lenient_string")]
    pub secret_access_key: Option<String>,
    /// Path of a file holding the secret access key.
    #[serde(default)]
    pub secret_access_key_file: Option<PathBuf>,
    /// Command whose output supplies the secret access key.
    // NOTE(review): how the command is run and its output trimmed is decided
    // by the resolver, which is not in this file - confirm there.
    #[serde(default)]
    pub secret_access_key_command: Option<String>,
    /// Region, as a string even when the value scans as a number.
    #[serde(default, deserialize_with = "lenient_string")]
    pub region: Option<String>,
    /// Optional virtual-hosted-style toggle.
    // NOTE(review): presumably forwarded to `object_store` under the same
    // name - confirm against the resolver.
    #[serde(default)]
    pub virtual_hosted_style_request: Option<bool>,
    /// Verbatim `object_store` options pond has not typed. Empty by default.
    #[serde(default)]
    pub extra: BTreeMap<String, String>,
}
371
/// `[runtime]`: long-running process caps. Both knobs accept either a plain
/// byte count or a `humansize`-style suffix (`"128 MiB"`, `"1 GiB"`). Both are
/// optional - `None` lets `pond::substrate` pick the backend-aware default
/// (local FS gets a tight cap; object stores stay near Lance's defaults).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields, default)]
pub struct RuntimeConfig {
    /// Index-cache cap in bytes. Deserialized through
    /// `deserialize_byte_size_opt`, so an integer or a suffixed string both
    /// work.
    #[serde(default, deserialize_with = "deserialize_byte_size_opt")]
    pub index_cache_bytes: Option<usize>,
    /// Metadata-cache cap in bytes. Same accepted forms as
    /// `index_cache_bytes`.
    #[serde(default, deserialize_with = "deserialize_byte_size_opt")]
    pub metadata_cache_bytes: Option<usize>,
}
384
/// `[search]`: optional Lance vector-query tuning knobs. Both default to
/// `None`; unknown keys in the table are rejected.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct SearchConfig {
    /// `nprobes` override for vector queries.
    // NOTE(review): presumably the number of IVF partitions probed per
    // query - confirm against the search call site.
    #[serde(default)]
    pub nprobes: Option<usize>,
    /// `refine_factor` override for vector queries.
    // NOTE(review): presumably Lance's re-ranking multiplier - confirm
    // against the search call site.
    #[serde(default)]
    pub refine_factor: Option<u32>,
}
394
/// `[maintenance]`: storage-maintenance knobs shared by `pond sync` and
/// `pond index optimize`. All optional - omit and pond falls back to the
/// in-process defaults in `pond::substrate` (`DEFAULT_COMPACTION_FRAGMENT_CAP`,
/// `default_cleanup_older_than`, and the `index_lag_threshold` initializer).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct MaintenanceConfig {
    /// Sub-target fragment count past which the compaction phase runs (it also
    /// runs once those fragments hold a whole target fragment's worth of rows).
    /// Default 64 stops the automated sync re-compacting the trailing fragment
    /// every pass; 0 compacts every pass.
    #[serde(default)]
    pub compaction_fragment_cap: Option<usize>,
    /// Manifest-retention window for the safe cleanup pass. Accepts
    /// `Ns`/`Nm`/`Nh`/`Nd` (default `1d`). Versions older than this are
    /// reclaimed by Lance's OCC-coordinated GC (`delete_unverified=false`),
    /// which never races a concurrent writer on any backend.
    ///
    /// Stored here as the raw string; this struct does not parse the
    /// duration.
    #[serde(default)]
    pub cleanup_older_than: Option<String>,
    /// Minimum unindexed-fragment count below which `optimize_table_indices`
    /// skips the per-intent append/rebuild path; the brute-force fallback
    /// keeps queries correct while fragments accumulate. Default 4 trades a
    /// little query latency on cold fragments for far fewer remote index
    /// commits during high-rate ingest.
    #[serde(default)]
    pub index_lag_threshold: Option<usize>,
}
422
/// `[embeddings]`: model selector and vector dimension. There is no master
/// switch - the search path always runs hybrid when vectors exist in the
/// store and FTS-only when they don't (`has_embeddings()` is the only gate);
/// the candle/Metal model is `LazyEmbedder`-loaded on the first query that
/// actually needs it. `model` and `dim` are installed into the process at
/// startup via `embed::init_model_id` / `sessions::init_embedding_dim`, so
/// swapping models for a one-off experiment is a temporary config file - no
/// CLI flag and no per-call-site plumbing.
///
/// The container-level `#[serde(default)]` means a table that sets only one
/// of the two fields takes the other from this type's `Default` impl.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(deny_unknown_fields, default)]
pub struct EmbeddingsConfig {
    /// The embedding model id (spec.md#search): any XLM-RoBERTa model loadable
    /// by `candle-transformers`. Defaults to `intfloat/multilingual-e5-small`.
    pub model: String,
    /// Output dimension of `model`. Must equal the model's `hidden_size` and
    /// be divisible by 8 (the IVF_PQ subspace stride; see `embed::index_params`).
    /// Defaults to 384 (e5-small). Set to 768 for e5-base, 1024 for e5-large.
    pub dim: usize,
}
442
443impl Default for EmbeddingsConfig {
444    fn default() -> Self {
445        Self {
446            model: crate::embed::DEFAULT_MODEL_ID.to_owned(),
447            dim: crate::sessions::DEFAULT_EMBEDDING_DIM,
448        }
449    }
450}
451
452/// The platform-local default storage path, used when neither
453/// `--storage-path` / `POND_STORAGE_PATH` nor `[storage].path` is set:
454/// `$XDG_DATA_HOME/pond`, then `$HOME/.local/share/pond`, then `.pond`.
455/// `xdg_data_home` is honored only if absolute, per the XDG base-directory
456/// spec.
457pub fn default_storage_path(xdg_data_home: Option<PathBuf>, home: Option<PathBuf>) -> Result<Url> {
458    if let Some(xdg) = xdg_data_home.filter(|path| path.is_absolute()) {
459        return url_for_path(xdg.join("pond"));
460    }
461    if let Some(home) = home {
462        return url_for_path(home.join(".local").join("share").join("pond"));
463    }
464    // No HOME and no usable XDG var - stay usable rather than panic.
465    url_for_path(PathBuf::from(".pond"))
466}
467
/// Default local location of `config.toml`. URI-backed data dirs always land
/// here because the config file has to be local (it names the bucket and any
/// creds).
///
/// Resolution order:
/// 1. `<xdg_config_home>/pond/config.toml`, only when `xdg_config_home` is
///    absolute;
/// 2. `<home>/.config/pond/config.toml`;
/// 3. the relative path `.pond.toml` (i.e. in the current directory).
pub fn default_config_path(xdg_config_home: Option<PathBuf>, home: Option<PathBuf>) -> PathBuf {
    xdg_config_home
        .filter(|dir| dir.is_absolute())
        .map(|dir| dir.join("pond").join("config.toml"))
        .or_else(|| home.map(|dir| dir.join(".config").join("pond").join("config.toml")))
        .unwrap_or_else(|| PathBuf::from(".pond.toml"))
}
481
482impl Config {
483    /// Load `config.toml` from `path` (if it exists) layered under the
484    /// `POND_*` env mirror, and validate. A missing file yields the built-in
485    /// defaults - env vars alone are a complete config
486    /// (spec.md#storage-configless). On success the resolved embedding model
487    /// id + dim are installed into the process (`OnceLock`-backed; only the
488    /// first call per process sticks), so all downstream code paths see a
489    /// consistent pair without per-handler plumbing.
490    pub fn load(path: impl AsRef<Path>) -> Result<Self> {
491        Ok(Self::load_with_provenance(path)?.0)
492    }
493
494    /// [`Config::load`] over an in-memory TOML body (still layered under the
495    /// `POND_*` env mirror). `pond init` uses this to validate and resolve
496    /// the config it is composing BEFORE anything touches disk - the wizard
497    /// writes exactly once, at the end.
498    pub fn load_str(body: &str) -> Result<Self> {
499        let figment = Figment::new().merge(Toml::string(body)).merge(env_mirror());
500        let config: Self = figment
501            .extract_lossy()
502            .map_err(|error| anyhow!("failed to load config: {error}"))?;
503        config.embeddings.validate()?;
504        config.validate_creds()?;
505        Ok(config)
506    }
507
508    /// [`Config::load`] that also returns the figment, so `pond config show`
509    /// can attribute each value to its source layer (file / env / default).
510    pub fn load_with_provenance(path: impl AsRef<Path>) -> Result<(Self, Figment)> {
511        let path = path.as_ref();
512        let figment = Figment::new().merge(Toml::file(path)).merge(env_mirror());
513        // `extract_lossy`, not `extract`: env values parse TOML-ishly, so an
514        // all-digit secret would arrive as a number and fail the String field;
515        // lossy stringifies scalars instead.
516        let config: Self = figment.extract_lossy().map_err(|error| {
517            if let Some(recipe) = detect_legacy_storage(path) {
518                return anyhow!("{recipe}");
519            }
520            // Inline figment's message (it already names the failing key and
521            // source layer) so single-line error surfaces keep the detail.
522            anyhow!("failed to load config {}: {error}", path.display())
523        })?;
524        config.embeddings.validate()?;
525        config.validate_creds()?;
526        config.embeddings.install_runtime();
527        if let Some(threshold) = config.maintenance.index_lag_threshold {
528            crate::substrate::init_index_lag_threshold(threshold);
529        }
530        // Tilde expansion is per-adapter (inside each factory's `open()`):
531        // an API-backed adapter has no path to expand, and only the
532        // filesystem-shaped adapters need the helper. See `expand_home_under`.
533        Ok((config, figment))
534    }
535
536    /// `[creds.*]` structural rules (spec.md#creds-scope-match): set-name
537    /// charset, at most one variant per logical secret, at most one
538    /// scope-less set, no duplicate scopes. All parse-time so a misbinding
539    /// dies before any URL resolves against it.
540    fn validate_creds(&self) -> Result<()> {
541        let mut scopeless: Option<&str> = None;
542        let mut scopes: BTreeMap<String, &str> = BTreeMap::new();
543        for (name, set) in &self.creds {
544            // Lowercase alphanumeric only - load-bearing for the env mirror:
545            // it makes `POND_CREDS_<NAME>_<FIELD>` splittable at the first
546            // `_` after the name (field names contain underscores).
547            let mut chars = name.chars();
548            let head_ok = chars.next().is_some_and(|c| c.is_ascii_lowercase());
549            if !head_ok
550                || name.len() > 16
551                || !chars.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit())
552            {
553                bail!(
554                    "creds set name {name:?} must match [a-z][a-z0-9]{{0,15}} (lowercase alphanumeric, no separators)"
555                );
556            }
557            if set.access_key_id.is_some() && set.access_key_id_file.is_some() {
558                bail!("[creds.{name}] sets both access_key_id and access_key_id_file; pick one");
559            }
560            let secret_variants = [
561                set.secret_access_key.is_some(),
562                set.secret_access_key_file.is_some(),
563                set.secret_access_key_command.is_some(),
564            ]
565            .iter()
566            .filter(|present| **present)
567            .count();
568            if secret_variants > 1 {
569                bail!(
570                    "[creds.{name}] sets more than one of secret_access_key / secret_access_key_file / secret_access_key_command; pick one"
571                );
572            }
573            match set.scope.as_deref() {
574                None => {
575                    if let Some(other) = scopeless {
576                        bail!(
577                            "[creds.{other}] and [creds.{name}] are both scope-less; at most one catch-all set is allowed - add a `scope` to one"
578                        );
579                    }
580                    scopeless = Some(name);
581                }
582                Some(scope) => {
583                    // Duplicates are checked on the canonical form (incl.
584                    // trailing-slash trim, matching scope-match semantics),
585                    // so two spellings of one prefix can never tie at
586                    // resolve time.
587                    let canonical = crate::substrate::parse_scope(scope)
588                        .map(|url| url.as_str().trim_end_matches('/').to_owned())
589                        .with_context(|| {
590                            format!("[creds.{name}] scope {scope:?} is not a valid URL prefix")
591                        })?;
592                    if let Some(other) = scopes.insert(canonical, name) {
593                        bail!(
594                            "[creds.{other}] and [creds.{name}] declare the same scope {scope:?}; merge them or narrow one"
595                        );
596                    }
597                }
598            }
599        }
600        Ok(())
601    }
602
603    /// Resolve the `[sources.<adapter>]` entries to drive `pond sync`. Only
604    /// sections with `enabled = true` flow through; sections with
605    /// `enabled = false` (or absent) are treated as opt-out and the
606    /// per-adapter blob (minus `enabled`) is handed to the factory's
607    /// `open()`. With `adapter = None` returns every enabled entry; with
608    /// `Some(name)` returns just that one - and errors if it's not in
609    /// config OR if it's currently disabled (the caller should then
610    /// re-prompt or report).
611    pub fn resolve_sources(&self, adapter: Option<&str>) -> Result<Vec<(String, Value)>> {
612        match adapter {
613            None => Ok(self
614                .sources
615                .iter()
616                .filter_map(|(name, blob)| take_enabled(name, blob))
617                .collect()),
618            Some(name) => {
619                let blob = self
620                    .sources
621                    .get(name)
622                    .ok_or_else(|| anyhow!("no [sources.{name}] entry in config"))?;
623                take_enabled(name, blob).map(|entry| vec![entry]).ok_or_else(|| {
624                    anyhow!(
625                        "source [{name}] is disabled (enabled = false); run `pond sync {name}` to re-enable"
626                    )
627                })
628            }
629        }
630    }
631
632    /// Names that are configured but currently `enabled = false`. Used by
633    /// `pond sync` post-import to know not to re-probe an adapter the user
634    /// already declined (the decline persists; re-prompt only via the
635    /// positional override `pond sync <name>`).
636    pub fn disabled_source_names(&self) -> Vec<&str> {
637        self.sources
638            .iter()
639            .filter_map(|(name, blob)| {
640                let enabled = blob
641                    .get("enabled")
642                    .and_then(Value::as_bool)
643                    .unwrap_or(false);
644                if enabled { None } else { Some(name.as_str()) }
645            })
646            .collect()
647    }
648}
649
650/// The `POND_*` env mirror (spec.md#storage-env-mirror): `POND_STORAGE_PATH`
651/// -> `storage.path`, `POND_CREDS_<NAME>_<FIELD>` -> `creds.<name>.<field>`.
652/// Filtered to exactly those two shapes - clap owns its own `POND_*` vars
653/// (`POND_CONFIG`, `POND_HOST`, ...) and an unfiltered prefix would turn each
654/// of them into an unknown-field error here.
655fn env_mirror() -> Env {
656    // Keys reach these closures pre-lowercasing (`CREDS_...`), so compare on
657    // an ascii-lowered copy; `str::starts_with` is case-sensitive.
658    Env::prefixed("POND_")
659        .filter(|key| {
660            let key = key.as_str().to_ascii_lowercase();
661            // `extra` has no env form (spec.md#storage-env-mirror): the env
662            // grammar stays flat strings; structured options belong in the
663            // file (or URL query params).
664            key == "storage_path" || (key.starts_with("creds_") && !key.ends_with("_extra"))
665        })
666        .map(|key| {
667            // Set names are lowercase alphanumeric (validate_creds), so the
668            // first `_` after `creds` and the one after the name are the only
669            // separators; field names keep their underscores.
670            let key = key.as_str().to_ascii_lowercase();
671            let dots = if key.starts_with("creds_") { 2 } else { 1 };
672            key.replacen('_', ".", dots).into()
673        })
674}
675
/// The pre-redesign `[storage]` passthrough keys, by role (ENV-style
/// `object_store` aliases). Both the load-time error recipe
/// (`detect_legacy_storage`) and the `pond init` rewrite read these, so the
/// legacy vocabulary lives in one place - a new alias must not require
/// editing two detectors in lockstep.
///
/// `detect_legacy_storage` compares these names ASCII-case-insensitively.
///
/// Legacy spellings of the endpoint key.
pub const LEGACY_ENDPOINT_KEYS: &[&str] = &["aws_endpoint", "endpoint"];
/// Legacy spellings of the access-key-id key.
pub const LEGACY_ACCESS_KEY_KEYS: &[&str] = &["aws_access_key_id", "access_key_id"];
/// Legacy spellings of the secret-access-key key.
pub const LEGACY_SECRET_KEY_KEYS: &[&str] = &["aws_secret_access_key", "secret_access_key"];
/// Legacy spellings of the virtual-hosted-style toggle.
pub const LEGACY_VIRTUAL_HOSTED_KEYS: &[&str] = &[
    "aws_virtual_hosted_style_request",
    "virtual_hosted_style_request",
];
688
689/// Recognize the pre-redesign `[storage]` passthrough map (ENV-style
690/// `object_store` keys) and return the exact rewrite onto `[storage].path` +
691/// `[creds.default]`. An error with a recipe, not a shim: old configs do not
692/// keep working.
693fn detect_legacy_storage(path: &Path) -> Option<String> {
694    let text = std::fs::read_to_string(path).ok()?;
695    let value: toml::Value = toml::from_str(&text).ok()?;
696    let storage = value.get("storage")?.as_table()?;
697    if storage.is_empty() || storage.keys().all(|key| key == "path") {
698        return None;
699    }
700    let get = |names: &[&str]| {
701        storage.iter().find_map(|(key, value)| {
702            names
703                .iter()
704                .any(|name| key.eq_ignore_ascii_case(name))
705                .then(|| value.as_str().unwrap_or_default().to_owned())
706        })
707    };
708    let endpoint = get(LEGACY_ENDPOINT_KEYS);
709    let host = endpoint
710        .as_deref()
711        .and_then(|e| e.split("://").nth(1))
712        .unwrap_or("<endpoint-host>");
713    // Under the declared virtual-hosted style the endpoint host leads with
714    // the bucket; de-fold it, or following the recipe verbatim folds the
715    // bucket in twice (the new grammar re-applies virtual hosting).
716    let virtual_hosted = storage.iter().any(|(key, value)| {
717        LEGACY_VIRTUAL_HOSTED_KEYS
718            .iter()
719            .any(|name| key.eq_ignore_ascii_case(name))
720            && (value.as_bool().unwrap_or(false)
721                || value
722                    .as_str()
723                    .is_some_and(|text| text.eq_ignore_ascii_case("true") || text == "1"))
724    });
725    let path_recipe = match host.split_once('.') {
726        Some((bucket, rest)) if virtual_hosted && rest.contains('.') => {
727            format!("s3+https://{rest}/{bucket}/<prefix>")
728        }
729        _ => format!("s3+https://{host}/<bucket>/<prefix>"),
730    };
731    // spec.md#storage-redaction: never echo credential values, even back to
732    // their owner - stderr lands in logs, scrollback, and pasted bug reports.
733    let mut recipe = format!(
734        "config {} uses the old [storage] passthrough map; rewrite it as:\n\n[storage]\npath = \"{path_recipe}\"\n\n[creds.default]\n",
735        path.display(),
736    );
737    recipe.push_str("access_key_id     = \"...\"  # copy from the old [storage] section\n");
738    recipe.push_str("secret_access_key = \"...\"  # copy from the old [storage] section\n");
739    recipe.push_str(
740        "\n(the endpoint and bucket fold into the URL; allow_http is scheme-derived; virtual-hosted addressing defaults on; the region is autodetected - append ?region=<x> to the URL only if your store insists. `pond storage check` verifies the result end-to-end, and `pond init` can apply this rewrite for you)",
741    );
742    Some(recipe)
743}
744
745/// Inner helper: return `Some((name, blob))` when the source section is
746/// enabled, stripping the discriminator from the blob before handing it on;
747/// `None` when the section is missing `enabled` or has `enabled = false`.
748fn take_enabled(name: &str, blob: &Value) -> Option<(String, Value)> {
749    let enabled = blob
750        .get("enabled")
751        .and_then(Value::as_bool)
752        .unwrap_or(false);
753    if !enabled {
754        return None;
755    }
756    let mut clean = blob.clone();
757    if let Some(obj) = clean.as_object_mut() {
758        obj.remove("enabled");
759    }
760    Some((name.to_owned(), clean))
761}
762
763/// Expand `~` and `$VAR`/`${VAR}` in `path` against an explicit `home`.
764/// Filesystem-shaped adapters call this from inside their factory's `open()`.
765/// Tests use it directly to exercise the rule without mutating the
766/// process-wide `HOME` env var (`std::env::set_var` is `unsafe` under
767/// edition 2024 and pond forbids unsafe code). Unset vars and `~user` forms
768/// pass through unchanged - never guess.
769pub fn expand_home_under(path: &Path, home: &Path) -> PathBuf {
770    let Some(text) = path.to_str() else {
771        return path.to_path_buf();
772    };
773    let home_text = home.to_string_lossy();
774    let expanded = shellexpand::full_with_context_no_errors(
775        text,
776        || Some(home_text.clone()),
777        |var| std::env::var(var).ok(),
778    );
779    PathBuf::from(expanded.as_ref())
780}
781
/// The inverse of [`expand_home_under`] for display and config writes:
/// contract a `home` prefix back to `~` so user-facing surfaces (and the
/// paths `pond init` persists) stay portable and readable.
///
/// Matching is per path component, so `/srv/meow` is not under `/srv/me`.
/// Non-home paths pass through unchanged, and so does every path when
/// `home` is empty.
pub fn contract_home_under(path: &Path, home: &Path) -> PathBuf {
    // `strip_prefix` treats an empty base as a prefix of every path, which
    // would turn relative paths into `~/...` (and the empty path into `~`)
    // under e.g. `HOME=""`. With no home there is nothing to contract.
    if home.as_os_str().is_empty() {
        return path.to_path_buf();
    }
    match path.strip_prefix(home) {
        Ok(rest) if rest.as_os_str().is_empty() => PathBuf::from("~"),
        Ok(rest) => Path::new("~").join(rest),
        Err(_) => path.to_path_buf(),
    }
}
793
794/// [`contract_home_under`] against the process `HOME`. Returns the input
795/// rendered for humans; machine surfaces (JSON output, the wire) keep
796/// absolute paths.
797pub fn contract_home(path: &Path) -> PathBuf {
798    match std::env::var_os("HOME") {
799        Some(home) => contract_home_under(path, Path::new(&home)),
800        None => path.to_path_buf(),
801    }
802}
803
804impl EmbeddingsConfig {
805    /// Surface-level validation: model id non-empty and dim divisible by 8.
806    /// The dim/model mismatch is the load-time check inside `CandleEmbedder::load`,
807    /// which knows the model's `hidden_size`; what we can catch up front is the
808    /// IVF_PQ subspace stride (`dim / 8` in `embed::index_params`).
809    pub fn validate(&self) -> Result<()> {
810        if self.model.trim().is_empty() {
811            bail!("embeddings.model must be a non-empty HuggingFace model id");
812        }
813        if self.dim == 0 || !self.dim.is_multiple_of(8) {
814            bail!(
815                "embeddings.dim = {} must be a positive multiple of 8 (IVF_PQ subspace stride)",
816                self.dim,
817            );
818        }
819        Ok(())
820    }
821
822    /// Install model id + dim into the process. Idempotent: only the first
823    /// call sticks (matches `OnceLock` semantics in `embed::init_model_id` and
824    /// `sessions::init_embedding_dim`).
825    pub fn install_runtime(&self) {
826        crate::embed::init_model_id(self.model.clone());
827        crate::sessions::init_embedding_dim(self.dim);
828    }
829}
830
831#[cfg(test)]
832mod tests {
833    // `result_large_err`: `figment::Jail` closures return `figment::Error`
834    // by contract; the size is figment's, not ours.
835    #![allow(clippy::expect_used, clippy::unwrap_used, clippy::result_large_err)]
836
837    use super::*;
838    use serde_json::Value;
839    use tempfile::TempDir;
840
841    #[test]
842    fn validate_catches_empty_model_and_bad_dim() {
843        assert!(EmbeddingsConfig::default().validate().is_ok());
844        // Empty / whitespace-only model id is rejected: HuggingFace fetch
845        // would fail far away from the config error.
846        let bad_model = EmbeddingsConfig {
847            model: "   ".to_owned(),
848            dim: 768,
849        };
850        assert!(bad_model.validate().is_err());
851        // Dim must divide 8 (PQ subspace stride in `embed::index_params`).
852        let bad_dim = EmbeddingsConfig {
853            model: "intfloat/multilingual-e5-base".to_owned(),
854            dim: 100,
855        };
856        assert!(bad_dim.validate().is_err());
857        // Zero is rejected too (would divide-by-zero inside index_params).
858        let zero_dim = EmbeddingsConfig {
859            model: "intfloat/multilingual-e5-base".to_owned(),
860            dim: 0,
861        };
862        assert!(zero_dim.validate().is_err());
863    }
864
865    #[test]
866    fn config_load_missing_file_falls_back_to_builtin() {
867        let config = Config::load("/nonexistent/pond-config-xyz.toml").unwrap();
868        assert_eq!(config.embeddings, EmbeddingsConfig::default());
869    }
870
871    #[test]
872    fn default_config_toml_loads_to_the_builtin_defaults() {
873        let dir = TempDir::new().unwrap();
874        let path = dir.path().join("config.toml");
875        std::fs::write(&path, DEFAULT_CONFIG_TOML).unwrap();
876        // The shipped template is all comments, so it must load and validate as
877        // the built-in defaults - a malformed template fails right here.
878        let config = Config::load(&path).unwrap();
879        assert_eq!(config.embeddings, EmbeddingsConfig::default());
880        assert_eq!(config.embeddings.model, crate::embed::DEFAULT_MODEL_ID);
881        assert_eq!(
882            config.embeddings.dim,
883            crate::sessions::DEFAULT_EMBEDDING_DIM
884        );
885    }
886
887    #[test]
888    fn default_storage_path_follows_xdg_then_home() {
889        // An absolute XDG_DATA_HOME wins.
890        let resolved =
891            default_storage_path(Some(PathBuf::from("/xdg")), Some(PathBuf::from("/home")))
892                .unwrap();
893        assert!(is_local(&resolved));
894        assert_eq!(local_path(&resolved).unwrap(), PathBuf::from("/xdg/pond"));
895
896        // A relative XDG_DATA_HOME is ignored per the XDG spec; HOME is the fallback.
897        let resolved = default_storage_path(
898            Some(PathBuf::from("relative")),
899            Some(PathBuf::from("/home")),
900        )
901        .unwrap();
902        assert_eq!(
903            local_path(&resolved).unwrap(),
904            PathBuf::from("/home/.local/share/pond"),
905        );
906
907        // No XDG and no HOME - stays usable: returns the cwd-anchored `.pond`.
908        // The result is absolute (Lance's URL conversion requires it), so we
909        // just check that the URL ends with the relative path's components.
910        let resolved = default_storage_path(None, None).unwrap();
911        assert!(is_local(&resolved));
912        assert!(
913            local_path(&resolved).unwrap().ends_with(".pond"),
914            "fallback path should end with .pond: {resolved}",
915        );
916    }
917
918    #[test]
919    fn expand_home_under_handles_tilde_forms() {
920        let home = Path::new("/srv/me");
921        assert_eq!(
922            expand_home_under(Path::new("~"), home),
923            PathBuf::from("/srv/me")
924        );
925        assert_eq!(
926            expand_home_under(Path::new("~/.codex/sessions"), home),
927            PathBuf::from("/srv/me/.codex/sessions"),
928        );
929        // Absolute paths pass through unchanged.
930        assert_eq!(
931            expand_home_under(Path::new("/etc/passwd"), home),
932            PathBuf::from("/etc/passwd"),
933        );
934        // A leading `~something` (no slash) is not the home form - leave it.
935        assert_eq!(
936            expand_home_under(Path::new("~user/elsewhere"), home),
937            PathBuf::from("~user/elsewhere"),
938        );
939    }
940
941    #[test]
942    fn expand_home_under_handles_env_vars() {
943        // Jail serializes env mutation against the other env-touching tests.
944        figment::Jail::expect_with(|jail| {
945            jail.set_env("POND_TEST_EXPAND_DIR", "/srv/data");
946            let home = Path::new("/srv/me");
947            assert_eq!(
948                expand_home_under(Path::new("$POND_TEST_EXPAND_DIR/pond"), home),
949                PathBuf::from("/srv/data/pond"),
950            );
951            assert_eq!(
952                expand_home_under(Path::new("${POND_TEST_EXPAND_DIR}/pond"), home),
953                PathBuf::from("/srv/data/pond"),
954            );
955            // Unset vars pass through unchanged - never guess.
956            assert_eq!(
957                expand_home_under(Path::new("$POND_TEST_UNSET_VAR/x"), home),
958                PathBuf::from("$POND_TEST_UNSET_VAR/x"),
959            );
960            Ok(())
961        });
962    }
963
964    #[test]
965    fn contract_home_under_inverts_expansion() {
966        let home = Path::new("/srv/me");
967        assert_eq!(
968            contract_home_under(Path::new("/srv/me/.local/share/pond"), home),
969            PathBuf::from("~/.local/share/pond"),
970        );
971        assert_eq!(
972            contract_home_under(Path::new("/srv/me"), home),
973            PathBuf::from("~")
974        );
975        // Non-home paths pass through unchanged.
976        assert_eq!(
977            contract_home_under(Path::new("/etc/passwd"), home),
978            PathBuf::from("/etc/passwd"),
979        );
980    }
981
982    #[test]
983    fn resolve_sources_returns_one_or_all_or_errors() {
984        let temp = TempDir::new().unwrap();
985        let body = "\
986[sources.claude-code]
987enabled = true
988path = \"/srv/claude\"
989
990[sources.codex-cli]
991enabled = true
992path = \"/srv/codex\"
993
994[sources.opencode]
995enabled = false
996";
997        let path = temp.path().join("config.toml");
998        std::fs::write(&path, body).expect("write config");
999        let config = Config::load(&path).unwrap();
1000
1001        // None -> only enabled entries
1002        let all = config.resolve_sources(None).unwrap();
1003        assert_eq!(all.len(), 2);
1004        let names: Vec<_> = all.iter().map(|(n, _)| n.as_str()).collect();
1005        assert!(names.contains(&"claude-code"));
1006        assert!(names.contains(&"codex-cli"));
1007        // The `enabled` discriminator never reaches the adapter blob.
1008        for (_, blob) in &all {
1009            assert!(blob.get("enabled").is_none(), "enabled should be stripped");
1010        }
1011
1012        // Some(name) -> one entry, opaque JSON blob
1013        let one = config.resolve_sources(Some("codex-cli")).unwrap();
1014        assert_eq!(one.len(), 1);
1015        assert_eq!(one[0].0, "codex-cli");
1016        assert_eq!(
1017            one[0].1.get("path").and_then(Value::as_str),
1018            Some("/srv/codex"),
1019        );
1020
1021        // Disabled positional -> errors with the recovery hint baked in.
1022        let disabled = config.resolve_sources(Some("opencode"));
1023        let err = disabled
1024            .expect_err("disabled adapter must error")
1025            .to_string();
1026        assert!(err.contains("enabled = false"), "got: {err}");
1027        assert!(err.contains("pond sync opencode"), "got: {err}");
1028
1029        // Unknown -> error
1030        assert!(config.resolve_sources(Some("nope")).is_err());
1031
1032        // disabled_source_names lists exactly the off ones.
1033        assert_eq!(config.disabled_source_names(), vec!["opencode"]);
1034    }
1035
1036    #[test]
1037    fn memory_uri_is_classified_as_remote() {
1038        let url = Url::parse("memory:///pond-remote-test").expect("memory uri parses");
1039        assert!(
1040            !is_local(&url),
1041            "memory:// is not a local-filesystem URL: {url}",
1042        );
1043        assert!(
1044            local_path(&url).is_none(),
1045            "local_path must return None for non-file schemes",
1046        );
1047    }
1048
1049    // The storage/creds tests run inside `figment::Jail` even when they set
1050    // no env vars: the Jail-based env-mirror test mutates process-global env
1051    // mid-flight, and the Jail lock is what serializes them against it.
1052
1053    #[test]
1054    fn storage_and_creds_round_trip() {
1055        figment::Jail::expect_with(|jail| {
1056            jail.create_file(
1057                "config.toml",
1058                r#"
1059[storage]
1060path = "s3+https://nbg1.example.com/my-pond"
1061
1062[creds.default]
1063access_key_id     = "AKIA123"
1064secret_access_key = "shh"
1065
1066[creds.work]
1067scope             = "s3+https://fsn1.example.com/work-pond/"
1068access_key_id     = "AKIA456"
1069secret_access_key_command = "op read op://vault/pond/secret"
1070region            = "fsn1"
1071virtual_hosted_style_request = false
1072extra = { request_timeout = "60 seconds" }
1073"#,
1074            )?;
1075            let config = Config::load("config.toml").expect("config loads");
1076            assert_eq!(
1077                config.storage.path.as_deref(),
1078                Some("s3+https://nbg1.example.com/my-pond"),
1079            );
1080            assert_eq!(config.creds.len(), 2);
1081            let work = &config.creds["work"];
1082            assert_eq!(
1083                work.secret_access_key_command.as_deref(),
1084                Some("op read op://vault/pond/secret"),
1085            );
1086            assert_eq!(work.virtual_hosted_style_request, Some(false));
1087            assert_eq!(work.extra["request_timeout"], "60 seconds");
1088            Ok(())
1089        });
1090    }
1091
1092    #[test]
1093    fn creds_validators_reject_bad_shapes() {
1094        let cases: &[(&str, &str)] = &[
1095            // Unknown key dies loudly (typos must not silently no-op).
1096            ("[creds.a]\nacces_key_id = \"x\"\n", "acces_key_id"),
1097            // Name charset: separators break the env-mirror grammar.
1098            ("[creds.my_set]\naccess_key_id = \"x\"\n", "[a-z][a-z0-9]"),
1099            ("[creds.A1]\naccess_key_id = \"x\"\n", "[a-z][a-z0-9]"),
1100            // One variant per logical secret.
1101            (
1102                "[creds.a]\nsecret_access_key = \"x\"\nsecret_access_key_command = \"cat\"\n",
1103                "more than one",
1104            ),
1105            (
1106                "[creds.a]\naccess_key_id = \"x\"\naccess_key_id_file = \"/k\"\n",
1107                "pick one",
1108            ),
1109            // At most one scope-less set.
1110            (
1111                "[creds.a]\naccess_key_id = \"x\"\n[creds.b]\naccess_key_id = \"y\"\n",
1112                "scope-less",
1113            ),
1114            // Duplicate scopes can never tie-break - checked canonicalized,
1115            // so two spellings of one prefix still collide.
1116            (
1117                "[creds.a]\nscope = \"s3+https://h:443/b/\"\naccess_key_id = \"x\"\n[creds.b]\nscope = \"s3+https://h/b\"\naccess_key_id = \"y\"\n",
1118                "same scope",
1119            ),
1120        ];
1121        figment::Jail::expect_with(|jail| {
1122            for (body, needle) in cases {
1123                jail.create_file("config.toml", body)?;
1124                let err = Config::load("config.toml").expect_err(body).to_string();
1125                assert!(
1126                    err.contains(needle),
1127                    "want {needle:?} in error for {body:?}, got: {err}",
1128                );
1129            }
1130            Ok(())
1131        });
1132    }
1133
1134    #[test]
1135    fn legacy_storage_map_errors_with_the_rewrite_recipe() {
1136        figment::Jail::expect_with(|jail| {
1137            jail.create_file(
1138                "config.toml",
1139                r#"
1140[storage]
1141AWS_ACCESS_KEY_ID = "AKIA123"
1142AWS_SECRET_ACCESS_KEY = "shh"
1143AWS_REGION = "nbg1"
1144AWS_ENDPOINT = "https://ttq.nbg1.your-objectstorage.com"
1145aws_virtual_hosted_style_request = "true"
1146"#,
1147            )?;
1148            let err = Config::load("config.toml")
1149                .expect_err("legacy map must error")
1150                .to_string();
1151            // The error IS the migration: old keys mapped onto the new shape.
1152            assert!(err.contains("old [storage] passthrough map"), "got: {err}");
1153            // The declared virtual-hosted style pins the bucket as the leading
1154            // host label; the recipe must de-fold it, not repeat the folded
1155            // host (which the new grammar would fold again).
1156            assert!(
1157                err.contains("s3+https://nbg1.your-objectstorage.com/ttq/<prefix>"),
1158                "recipe must de-fold the virtual-hosted endpoint, got: {err}",
1159            );
1160            // spec.md#storage-redaction: the recipe must NOT echo the real
1161            // key values - placeholders plus a "copy from" pointer only.
1162            assert!(!err.contains("AKIA123"), "got: {err}");
1163            assert!(!err.contains("\"shh\""), "got: {err}");
1164            assert!(err.contains("access_key_id     = \"...\""), "got: {err}");
1165            // Region is autodetected (AWS) or defaulted (S3-compatible
1166            // endpoints ignore it): the recipe must not carry AWS_REGION
1167            // forward, only name the ?region= override.
1168            assert!(!err.contains("region            ="), "got: {err}");
1169            assert!(err.contains("?region="), "got: {err}");
1170            assert!(err.contains("pond storage check"), "got: {err}");
1171            // Without the addressing-style key the split is unknowable; the
1172            // recipe keeps the host verbatim with a <bucket> placeholder.
1173            jail.create_file(
1174                "config.toml",
1175                r#"
1176[storage]
1177AWS_ACCESS_KEY_ID = "AKIA123"
1178AWS_ENDPOINT = "https://ttq.nbg1.your-objectstorage.com"
1179"#,
1180            )?;
1181            let err = Config::load("config.toml")
1182                .expect_err("legacy map must error")
1183                .to_string();
1184            assert!(
1185                err.contains("s3+https://ttq.nbg1.your-objectstorage.com/<bucket>/<prefix>"),
1186                "got: {err}",
1187            );
1188            Ok(())
1189        });
1190    }
1191
1192    #[test]
1193    fn env_mirror_layers_over_file() {
1194        figment::Jail::expect_with(|jail| {
1195            jail.create_file(
1196                "config.toml",
1197                r#"
1198[storage]
1199path = "/from-file"
1200
1201[creds.work]
1202scope         = "s3://file-bucket/"
1203access_key_id = "from-file"
1204region        = "file-region"
1205"#,
1206            )?;
1207            // Env beats file per field; untouched fields survive the merge.
1208            jail.set_env("POND_STORAGE_PATH", "/from-env");
1209            jail.set_env("POND_CREDS_WORK_ACCESS_KEY_ID", "from-env");
1210            // A purely-numeric env secret must stay a string (extract_lossy).
1211            jail.set_env("POND_CREDS_WORK_SECRET_ACCESS_KEY", "12345");
1212            // A set defined only in env is discovered by the prefix scan.
1213            jail.set_env("POND_CREDS_CI_ACCESS_KEY_ID", "ci-key");
1214            let config = Config::load("config.toml").expect("env+file config loads");
1215            assert_eq!(config.storage.path.as_deref(), Some("/from-env"));
1216            let work = &config.creds["work"];
1217            assert_eq!(work.access_key_id.as_deref(), Some("from-env"));
1218            assert_eq!(work.secret_access_key.as_deref(), Some("12345"));
1219            assert_eq!(work.region.as_deref(), Some("file-region"));
1220            assert_eq!(work.scope.as_deref(), Some("s3://file-bucket/"));
1221            assert_eq!(config.creds["ci"].access_key_id.as_deref(), Some("ci-key"));
1222            Ok(())
1223        });
1224    }
1225}