Skip to main content

pond/
config.rs

1//! Configuration loading: the `[embeddings]`, `[adapters]`, `[storage]`, and
2//! `[creds.*]` blocks.
3//!
4//! pond ships built-in defaults, so an instance with no `config.toml` still
5//! works. `pond config schema` emits [`DEFAULT_CONFIG_TOML`], the
6//! fully-annotated example. Loading layers `config.toml` under the `POND_*`
7//! env mirror via figment, so every command also works with no config file
8//! at all (spec.md#storage-configless) - URLs + env vars are sufficient.
9
10use std::{
11    collections::BTreeMap,
12    path::{Path, PathBuf},
13};
14
15use anyhow::{Context, Result, anyhow, bail};
16use figment::{
17    Figment,
18    providers::{Env, Format, Toml},
19};
20use serde::{Deserialize, Deserializer, Serialize, de};
21use serde_json::Value;
22use url::Url;
23
/// Parse a human-readable byte size such as `"128 MiB"`, `"1 GiB"`,
/// `"500 KiB"`, or a bare byte count.
///
/// Accepted units (case-insensitive, optional whitespace before the unit):
/// SI `K`/`KB`, `M`/`MB`, `G`/`GB` (powers of 1000) and binary
/// `KiB`/`MiB`/`GiB`/`TiB` (powers of 1024). The bare unit `"B"` and
/// unsuffixed numbers are raw bytes. Fractional values are allowed
/// (`"1.5 GiB"`); any fractional byte left after scaling is truncated.
///
/// # Errors
///
/// Returns a human-readable message when the input is empty, the numeric part
/// does not parse, the value is negative, the unit is unknown, or the result
/// does not fit in `usize` (Lance's cache APIs take `usize`).
fn parse_byte_size(raw: &str) -> Result<usize, String> {
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return Err("byte-size value is empty".to_owned());
    }
    // Everything before the first ASCII letter is the number; the rest is the
    // unit. No letter at all means a bare byte count.
    let split = trimmed
        .find(|c: char| c.is_ascii_alphabetic())
        .unwrap_or(trimmed.len());
    let (number, unit) = trimmed.split_at(split);
    let number: f64 = number
        .trim()
        .parse()
        .map_err(|_| format!("byte-size value {raw:?} is not a number"))?;
    if !number.is_finite() || number < 0.0 {
        return Err(format!("byte-size value {raw:?} must be non-negative"));
    }
    let multiplier: f64 = match unit.trim().to_ascii_lowercase().as_str() {
        "" | "b" => 1.0,
        "k" | "kb" => 1_000.0,
        "kib" => 1_024.0,
        "m" | "mb" => 1_000_000.0,
        "mib" => 1_048_576.0,
        "g" | "gb" => 1_000_000_000.0,
        "gib" => 1_073_741_824.0,
        "tib" => 1_099_511_627_776.0,
        other => {
            return Err(format!(
                "byte-size unit {other:?} not recognized (try MiB / GiB)"
            ));
        }
    };
    let bytes = number * multiplier;
    // Exclusive upper bound 2^BITS, computed exactly. Comparing against
    // `usize::MAX as f64` is off by one on 64-bit targets: the cast rounds up
    // to 2^64, so a value of exactly 2^64 would pass a `>` check and the
    // float-to-int cast below would silently saturate to `usize::MAX`.
    let limit = (1u128 << usize::BITS) as f64;
    if !bytes.is_finite() || bytes >= limit {
        return Err(format!("byte-size value {raw:?} overflows usize"));
    }
    // In range by the check above; the cast only drops the fractional part.
    Ok(bytes as usize)
}
65
66/// Accept string / integer / float / bool and stringify. The env mirror
67/// parses values TOML-ishly, so `POND_CREDS_X_SECRET_ACCESS_KEY=12345`
68/// arrives as a number; these fields are strings no matter how they scan.
69fn lenient_string<'de, D>(deserializer: D) -> Result<Option<String>, D::Error>
70where
71    D: Deserializer<'de>,
72{
73    #[derive(Deserialize)]
74    #[serde(untagged)]
75    enum Repr {
76        Text(String),
77        Int(i64),
78        Float(f64),
79        Bool(bool),
80    }
81    Ok(
82        Option::<Repr>::deserialize(deserializer)?.map(|repr| match repr {
83            Repr::Text(value) => value,
84            Repr::Int(value) => value.to_string(),
85            Repr::Float(value) => value.to_string(),
86            Repr::Bool(value) => value.to_string(),
87        }),
88    )
89}
90
91fn deserialize_byte_size_opt<'de, D>(deserializer: D) -> Result<Option<usize>, D::Error>
92where
93    D: Deserializer<'de>,
94{
95    #[derive(Deserialize)]
96    #[serde(untagged)]
97    enum Repr {
98        Bytes(u64),
99        Text(String),
100    }
101    let repr: Option<Repr> = Option::deserialize(deserializer)?;
102    match repr {
103        None => Ok(None),
104        Some(Repr::Bytes(value)) => usize::try_from(value).map(Some).map_err(de::Error::custom),
105        Some(Repr::Text(value)) => parse_byte_size(&value).map(Some).map_err(de::Error::custom),
106    }
107}
108
109/// True when the URL is on the local filesystem. Mirrors Lance's
110/// `ObjectStore::is_local` (lance-io/src/object_store.rs:541): the `file` and
111/// `file+uring` schemes are local; everything else (incl. `memory://`) is not.
112pub fn is_local(url: &Url) -> bool {
113    matches!(url.scheme(), "file" | "file+uring")
114}
115
116/// Extract the filesystem `PathBuf` for local URLs. `None` for remote.
117pub fn local_path(url: &Url) -> Option<PathBuf> {
118    if is_local(url) {
119        url.to_file_path().ok()
120    } else {
121        None
122    }
123}
124
125/// URI string for a child of this location (typically one Lance dataset under
126/// the data dir). Trims a single trailing slash on the base, then concatenates
127/// with a `/` separator. This keeps `Dataset::open` / `Dataset::write` happy
128/// on both filesystem and object-store backends - they want the URI form, not
129/// a `url::Url`.
130pub fn child_uri(base: &Url, suffix: &str) -> String {
131    // For local URLs we strip the `file://` prefix so log lines and error
132    // messages render as plain paths (`/srv/pond/sessions.lance`), matching
133    // what pond used to emit before the URL migration.
134    if let Some(path) = local_path(base) {
135        return path.join(suffix).display().to_string();
136    }
137    format!("{}/{suffix}", base.as_str().trim_end_matches('/'))
138}
139
140/// Render a `Url` for human-readable log/diagnostic output: local URLs come
141/// back as plain paths (no `file://` prefix, `$HOME` contracted to `~`);
142/// remote URLs stay verbatim.
143pub fn display(url: &Url) -> String {
144    if let Some(path) = local_path(url) {
145        contract_home(&path).display().to_string()
146    } else {
147        url.to_string()
148    }
149}
150
151/// Build a `Url` from a filesystem path. Convenience for tests and for
152/// callers that hold a `PathBuf` already. The path must be
153/// absolute (`url::Url::from_file_path` is a hard requirement on Unix); a
154/// relative path gets canonicalized via `std::path::absolute` first.
155pub fn url_for_path(path: impl AsRef<Path>) -> Result<Url> {
156    let path = path.as_ref();
157    let absolute = if path.is_absolute() {
158        path.to_path_buf()
159    } else {
160        std::path::absolute(path)
161            .with_context(|| format!("failed to absolutize {}", path.display()))?
162    };
163    Url::from_file_path(&absolute).map_err(|()| {
164        anyhow!(
165            "failed to convert path {} into a file:// URL",
166            absolute.display()
167        )
168    })
169}
170
/// Default `config.toml` body emitted by `pond config schema`.
///
/// Every non-blank line of the body is a `#` comment, so the text is valid
/// TOML that sets nothing: pond ships built-in defaults, the file is purely a
/// discoverable, annotated template, and pond still works with no
/// `config.toml` on disk. Sections documented, in order: `[adapters.*]`,
/// `[embeddings]`, `[search]`, `[maintenance]`, `[runtime]`, `[storage]`, and
/// `[creds.*]`.
pub const DEFAULT_CONFIG_TOML: &str = "\
# pond configuration.
#
# pond ships built-in defaults, so every setting here is optional - delete this
# file and pond still works. Uncomment and edit to override.

# Where pond looks for adapter data to import. One entry per adapter type
# (`claude-code`, `codex-cli`, ...). `pond sync` with no arguments syncs every
# entry; `pond sync <adapter>` syncs just one. With an empty `[adapters]`,
# `pond sync` runs an interactive discovery against the known default paths
# and writes the picks back here.
#
# Future wrap: pond is single-namespace in v1 (spec.md#wire-namespace-resolution); `[adapters]` is
# flat here. When multi-namespace pond lands, adapter registration becomes
# per-tenant under `[namespaces.<ns>.adapters.<adapter>]`. Pre-v1 the schema
# is breakable; the rename is operationally free until a real second tenant
# exists.
#
# [adapters.claude-code]
# enabled = true
# path = \"~/.claude/projects\"
#
# [adapters.codex-cli]
# enabled = true
# path = \"~/.codex/sessions\"
#
# Set `enabled = false` to keep the section but skip it on `pond sync`;
# re-enable via `pond adapters enable <adapter>`.

# Embeddings. Search defaults to the vector arm (matching on meaning) when the
# store has any vectors, falling back to FTS otherwise - the model loads lazily
# on the first vector query, so there's no cost on FTS-only corpora. `model`
# selects the HuggingFace XLM-RoBERTa model; `dim` declares its output width and
# is baked into the messages.vector schema on table creation - it must equal the
# model's hidden_size.
#
# Common pairings:
#   model = \"intfloat/multilingual-e5-small\"   dim = 384   (default)
#   model = \"intfloat/multilingual-e5-base\"    dim = 768
#   model = \"intfloat/multilingual-e5-large\"   dim = 1024
#
# A different-dim model needs a fresh data dir; pond enforces this at the
# schema boundary.
#
# [embeddings]
# model = \"intfloat/multilingual-e5-small\"
# dim = 384

# Search tuning. Leave unset for Lance defaults; set when tuning vector recall
# against a corpus.
#
# [search]
# nprobes = 16

# Storage maintenance. Tunes the compaction + cleanup pass that runs inside
# `pond sync` and `pond optimize`.
#
# - `compaction_fragment_cap` is the per-task fragment-count backstop: a
#   planned compaction task touching at least this many fragments always runs
#   even when the write-amplification veto would skip it. Default 64; 0
#   disables the veto and runs every task Lance plans.
# - `cleanup_older_than` is the manifest-retention window for the safe cleanup
#   pass. Accepts `Ns` / `Nm` / `Nh` / `Nd` (default `1d`, floor `1h` - it is
#   what protects in-flight readers). Versions older than this are reclaimed
#   by Lance's OCC-coordinated GC.
#
# [maintenance]
# compaction_fragment_cap = 64
# cleanup_older_than = \"1d\"

# Long-running process caps. Both accept either a plain byte count or a
# humansize-style suffix (\"128 MiB\", \"1 GiB\"). Both are optional - leave
# unset to let pond pick the backend-aware default:
#   local FS  : index_cache = 256 MiB, metadata_cache = 128 MiB
#   remote    : index_cache = 2 GiB,   metadata_cache = 512 MiB
# Lance's library defaults (6 GiB / 1 GiB) are too generous for a per-session
# `pond mcp` process; tightening them is what keeps RSS under the 500 MiB target
# without measurable latency regressions on typical agent-history corpora.
#
# [runtime]
# index_cache_bytes    = \"256 MiB\"
# metadata_cache_bytes = \"128 MiB\"

# Storage address and credentials (spec.md#storage-url-grammar).
#
# `path` is the default destination used when `--storage-path` (env
# `POND_STORAGE_PATH`) is not passed. Absent = the platform-local data dir.
# Addresses are URLs; the `s3+https` form carries the endpoint, bucket, and
# prefix in one token:
#
#   /abs/path or ~/path                  local filesystem
#   s3://bucket/prefix                   AWS S3 (ambient credential chain)
#   s3+https://host/bucket/prefix        S3-compatible endpoint (Hetzner, R2, B2, MinIO)
#   gs://bucket/prefix                   Google Cloud Storage
#   az://account/container/prefix        Azure Blob
#
# Credentials live in `[creds.<name>]` sets and bind to URLs by `scope`
# prefix - longest match wins (spec.md#creds-scope-match); a set without
# `scope` matches any URL. With no matching set, the standard cloud SDK
# chain applies (AWS_* env, shared credentials file, instance metadata).
# Secrets never go in URLs or CLI flags; besides inline values,
# `access_key_id_file` / `secret_access_key_file` read a file and
# `secret_access_key_command` runs a command (e.g. `op read ...`). `extra`
# holds verbatim `object_store` options pond has not typed.
#
# Every field mirrors to env: `POND_STORAGE_PATH`, `POND_CREDS_<NAME>_<FIELD>`
# (set names are lowercase alphanumeric, so the env grammar is unambiguous).
# Precedence: CLI flag > POND_* env > this file > ambient cloud chain.
# Probe a destination end-to-end with `pond storage check`.
#
# Future wrap: pond is single-namespace in v1 (spec.md#wire-namespace-resolution);
# `[storage]` is flat here on the assumption of one bucket per pond. When
# multi-namespace pond lands this becomes `[namespaces.<ns>.storage]`.
#
# [storage]
# path = \"s3+https://nbg1.your-objectstorage.com/my-pond\"
#
# [creds.default]
# access_key_id     = \"...\"
# secret_access_key = \"...\"
";
295
/// Top-level `config.toml` shape.
///
/// Every section carries `#[serde(default)]`, so a section that is missing
/// from the input falls back to its `Default`. Unknown top-level keys are
/// rejected (`deny_unknown_fields`) rather than silently ignored.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct Config {
    /// `[embeddings]`: embedding model id and vector dimension.
    #[serde(default)]
    pub embeddings: EmbeddingsConfig,
    /// `[search]`: optional vector-query tuning knobs.
    #[serde(default)]
    pub search: SearchConfig,
    /// `[maintenance]`: compaction / cleanup knobs.
    #[serde(default)]
    pub maintenance: MaintenanceConfig,
    /// `[runtime]`: cache-size caps for long-running processes.
    #[serde(default)]
    pub runtime: RuntimeConfig,
    /// `[adapters.<adapter>]` map: per-adapter config blobs the matching
    /// factory deserializes inside its `open()`. The shape is adapter-defined
    /// (filesystem adapters expect `{ path = "..." }`; API-backed adapters
    /// expect endpoint + auth keys), so this layer stays opaque. Empty by
    /// default; `pond sync` runs discovery into this map on first use.
    #[serde(default)]
    pub adapters: BTreeMap<String, Value>,
    /// `[storage]`: the default destination URL (spec.md#storage-url-grammar).
    /// A `path` of `None` means the platform-local data dir.
    #[serde(default)]
    pub storage: StorageConfig,
    /// `[creds.<name>]`: URL-scoped credential sets. Every storage URL
    /// resolves its own set by longest-prefix `scope` match
    /// (spec.md#creds-scope-match); the resolver lives in `pond::substrate`.
    #[serde(default)]
    pub creds: BTreeMap<String, CredsSet>,
}
325
/// `[storage]`: the single default destination. Typed (with
/// `deny_unknown_fields`) so the legacy passthrough map (ENV-style
/// `object_store` keys) fails loudly with the rewrite recipe instead of
/// silently changing meaning.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct StorageConfig {
    /// Default destination, used when `--storage-path` / `POND_STORAGE_PATH`
    /// is not passed. `None` means the platform-local data dir. Kept as the
    /// raw string here; this struct does not parse it.
    #[serde(default)]
    pub path: Option<String>,
}
335
/// One `[creds.<name>]` set. All fields optional; validation enforces at most
/// one variant per logical secret. `extra` carries verbatim `object_store`
/// options pond has not typed (redaction in `pond config show` still applies
/// to its keys by name).
///
/// NOTE(review): `Debug` is derived, so `{:?}` on this struct prints
/// `secret_access_key` verbatim - avoid debug-logging a `CredsSet`.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct CredsSet {
    /// URL prefix this set binds to. `None` = the catch-all set (at most one).
    #[serde(default)]
    pub scope: Option<String>,
    // Key / region fields are `lenient_string`: the env mirror parses values
    // TOML-ishly, so an all-digit key or region arrives as a number and must
    // still land in these String fields.
    /// Inline access key id. Mutually exclusive with `access_key_id_file`.
    #[serde(default, deserialize_with = "lenient_string")]
    pub access_key_id: Option<String>,
    /// File to read the access key id from, instead of an inline value.
    #[serde(default)]
    pub access_key_id_file: Option<PathBuf>,
    /// Inline secret access key. At most one of this,
    /// `secret_access_key_file`, and `secret_access_key_command` may be set.
    #[serde(default, deserialize_with = "lenient_string")]
    pub secret_access_key: Option<String>,
    /// File to read the secret access key from.
    #[serde(default)]
    pub secret_access_key_file: Option<PathBuf>,
    /// Command to run to obtain the secret access key (e.g. `op read ...`).
    #[serde(default)]
    pub secret_access_key_command: Option<String>,
    /// Region for this set - presumably forwarded to the object store
    /// client; confirm against the resolver in `pond::substrate`.
    #[serde(default, deserialize_with = "lenient_string")]
    pub region: Option<String>,
    /// Virtual-hosted-style addressing toggle - presumably forwarded to the
    /// object store client; confirm against the resolver.
    #[serde(default)]
    pub virtual_hosted_style_request: Option<bool>,
    /// Verbatim `object_store` options pond has not typed.
    #[serde(default)]
    pub extra: BTreeMap<String, String>,
}
366
/// Whether `name` is a legal `[creds.<name>]` set name, i.e. matches
/// `[a-z][a-z0-9]{0,15}` (spec.md#storage-env-mirror).
///
/// The lowercase-alphanumeric charset keeps `POND_CREDS_<NAME>_<FIELD>`
/// splittable at the first `_` after the name. Shared by config validation
/// and `pond creds`.
pub fn valid_creds_set_name(name: &str) -> bool {
    // Byte-wise is sufficient: every byte of a non-ASCII character is >= 0x80
    // and therefore fails both ASCII class checks.
    match name.as_bytes() {
        [] => false,
        [first, rest @ ..] => {
            first.is_ascii_lowercase()
                && name.len() <= 16
                && rest
                    .iter()
                    .all(|byte| byte.is_ascii_lowercase() || byte.is_ascii_digit())
        }
    }
}
376
/// Rejection message for a set name that fails [`valid_creds_set_name`].
///
/// Shared by config validation and `pond creds` so the rule and its wording
/// never drift. `name` is rendered with `{:?}` (quoted and escaped).
pub fn creds_set_name_error(name: &str) -> String {
    const PATTERN: &str = "[a-z][a-z0-9]{0,15}";
    const HINT: &str = "lowercase alphanumeric, no separators";
    format!(
        "creds set name {name:?} must match {pattern} ({hint})",
        pattern = PATTERN,
        hint = HINT
    )
}
384
/// `[runtime]`: long-running process caps. Both knobs accept either a plain
/// byte count or a `humansize`-style suffix (`"128 MiB"`, `"1 GiB"`). Both are
/// optional - `None` lets `pond::substrate` pick the backend-aware default
/// (local FS gets a tight cap; object stores stay near Lance's defaults).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields, default)]
pub struct RuntimeConfig {
    /// Index cache cap in bytes, parsed by `deserialize_byte_size_opt`.
    /// `None` = use the backend-aware default.
    #[serde(default, deserialize_with = "deserialize_byte_size_opt")]
    pub index_cache_bytes: Option<usize>,
    /// Metadata cache cap in bytes, parsed by `deserialize_byte_size_opt`.
    /// `None` = use the backend-aware default.
    #[serde(default, deserialize_with = "deserialize_byte_size_opt")]
    pub metadata_cache_bytes: Option<usize>,
}
397
/// `[search]`: optional Lance vector-query tuning knobs. Unknown keys are
/// rejected (`deny_unknown_fields`).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct SearchConfig {
    /// Vector-query `nprobes` override. `None` leaves the Lance default in
    /// place; set it when tuning vector recall against a corpus.
    #[serde(default)]
    pub nprobes: Option<usize>,
}
405
/// `[maintenance]`: storage-maintenance knobs shared by `pond sync` and
/// `pond optimize`. All optional - omit and pond falls back to the
/// in-process defaults in `pond::substrate` (`DEFAULT_COMPACTION_FRAGMENT_CAP`,
/// `default_cleanup_older_than`). Unknown keys are rejected
/// (`deny_unknown_fields`).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct MaintenanceConfig {
    /// Sub-target fragment count past which the compaction phase runs (it also
    /// runs once those fragments hold a whole target fragment's worth of rows).
    /// Default 64 stops the automated sync re-compacting the trailing fragment
    /// every pass; 0 compacts every pass.
    #[serde(default)]
    pub compaction_fragment_cap: Option<usize>,
    /// Manifest-retention window for the safe cleanup pass. Accepts
    /// `Ns`/`Nm`/`Nh`/`Nd` (default `1d`). Versions older than this are
    /// reclaimed by Lance's OCC-coordinated GC (`delete_unverified=false`),
    /// which never races a concurrent writer on any backend.
    ///
    /// Stored as the raw string; this struct does not parse or
    /// range-check it.
    #[serde(default)]
    pub cleanup_older_than: Option<String>,
}
426
/// `[embeddings]`: model selector and vector dimension. There is no master
/// switch - a `vector` search degrades to FTS when no vectors exist in the
/// store (`has_embeddings()` is the only gate); the candle/Metal model is
/// `LazyEmbedder`-loaded on the first query that actually needs it. `model`
/// and `dim` are installed into the process at startup via
/// `embed::init_model_id` / `sessions::init_embedding_dim`, so swapping models
/// for a one-off experiment is a temporary config file - no CLI flag and no
/// per-call-site plumbing.
///
/// The struct-level `#[serde(default)]` means a partially specified section
/// fills the missing field from [`Default`]; unknown keys are rejected.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(deny_unknown_fields, default)]
pub struct EmbeddingsConfig {
    /// The embedding model id (spec.md#search): any XLM-RoBERTa model loadable
    /// by `candle-transformers`. Defaults to `intfloat/multilingual-e5-small`.
    pub model: String,
    /// Output dimension of `model`. Must equal the model's `hidden_size`.
    /// Defaults to 384 (e5-small). Set to 768 for e5-base, 1024 for e5-large.
    pub dim: usize,
}
445
446impl Default for EmbeddingsConfig {
447    fn default() -> Self {
448        Self {
449            model: crate::embed::DEFAULT_MODEL_ID.to_owned(),
450            dim: crate::sessions::DEFAULT_EMBEDDING_DIM,
451        }
452    }
453}
454
455/// The platform-local default storage path, used when neither
456/// `--storage-path` / `POND_STORAGE_PATH` nor `[storage].path` is set:
457/// `$XDG_DATA_HOME/pond`, then `$HOME/.local/share/pond`, then `.pond`.
458/// `xdg_data_home` is honored only if absolute, per the XDG base-directory
459/// spec.
460pub fn default_storage_path(xdg_data_home: Option<PathBuf>, home: Option<PathBuf>) -> Result<Url> {
461    if let Some(xdg) = xdg_data_home.filter(|path| path.is_absolute()) {
462        return url_for_path(xdg.join("pond"));
463    }
464    if let Some(home) = home {
465        return url_for_path(home.join(".local").join("share").join("pond"));
466    }
467    // No HOME and no usable XDG var - stay usable rather than panic.
468    url_for_path(PathBuf::from(".pond"))
469}
470
/// Cache directory for rebuildable artifacts (the search row meta map): the
/// XDG-cache analog of `default_storage_path`. It is a separate root because
/// the contents are regenerated from the store, not durable data.
///
/// Resolution order: `$XDG_CACHE_HOME/pond` (only when that variable is an
/// absolute path), then `$HOME/.cache/pond`, then the relative `.pond-cache`.
pub fn default_cache_path(xdg_cache_home: Option<PathBuf>, home: Option<PathBuf>) -> PathBuf {
    xdg_cache_home
        .filter(|dir| dir.is_absolute())
        .map(|dir| dir.join("pond"))
        .or_else(|| home.map(|dir| dir.join(".cache").join("pond")))
        .unwrap_or_else(|| PathBuf::from(".pond-cache"))
}
483
/// Local default location of `config.toml`.
///
/// URI-backed data dirs always land here because the config file has to be
/// local (it names the bucket and any creds). Resolution order:
/// `$XDG_CONFIG_HOME/pond/config.toml` (only when that variable is an
/// absolute path), then `$HOME/.config/pond/config.toml`, then `.pond.toml`
/// in the current directory.
pub fn default_config_path(xdg_config_home: Option<PathBuf>, home: Option<PathBuf>) -> PathBuf {
    let config_root = match (xdg_config_home, home) {
        (Some(xdg), _) if xdg.is_absolute() => xdg,
        (_, Some(home)) => home.join(".config"),
        (_, None) => return PathBuf::from(".pond.toml"),
    };
    config_root.join("pond").join("config.toml")
}
497
498impl Config {
499    /// Load `config.toml` from `path` (if it exists) layered under the
500    /// `POND_*` env mirror, and validate. A missing file yields the built-in
501    /// defaults - env vars alone are a complete config
502    /// (spec.md#storage-configless). On success the resolved embedding model
503    /// id + dim are installed into the process (`OnceLock`-backed; only the
504    /// first call per process sticks), so all downstream code paths see a
505    /// consistent pair without per-handler plumbing.
506    pub fn load(path: impl AsRef<Path>) -> Result<Self> {
507        Ok(Self::load_with_provenance(path)?.0)
508    }
509
510    /// [`Config::load`] over an in-memory TOML body (still layered under the
511    /// `POND_*` env mirror). `pond init` uses this to validate and resolve
512    /// the config it is composing BEFORE anything touches disk - the wizard
513    /// writes exactly once, at the end.
514    pub fn load_str(body: &str) -> Result<Self> {
515        let figment = Figment::new().merge(Toml::string(body)).merge(env_mirror());
516        let config: Self = figment
517            .extract_lossy()
518            .map_err(|error| anyhow!("failed to load config: {error}"))?;
519        config.embeddings.validate()?;
520        config.validate_creds()?;
521        Ok(config)
522    }
523
524    /// [`Config::load`] that also returns the figment, so `pond config show`
525    /// can attribute each value to its source layer (file / env / default).
526    pub fn load_with_provenance(path: impl AsRef<Path>) -> Result<(Self, Figment)> {
527        let path = path.as_ref();
528        let figment = Figment::new().merge(Toml::file(path)).merge(env_mirror());
529        // `extract_lossy`, not `extract`: env values parse TOML-ishly, so an
530        // all-digit secret would arrive as a number and fail the String field;
531        // lossy stringifies scalars instead.
532        let config: Self = figment.extract_lossy().map_err(|error| {
533            if let Some(recipe) = detect_legacy_storage(path) {
534                return anyhow!("{recipe}");
535            }
536            if let Some(recipe) = detect_legacy_sources(path) {
537                return anyhow!("{recipe}");
538            }
539            // Inline figment's message (it already names the failing key and
540            // source layer) so single-line error surfaces keep the detail.
541            anyhow!("failed to load config {}: {error}", path.display())
542        })?;
543        config.embeddings.validate()?;
544        config.validate_creds()?;
545        config.embeddings.install_runtime();
546        // Tilde expansion is per-adapter (inside each factory's `open()`):
547        // an API-backed adapter has no path to expand, and only the
548        // filesystem-shaped adapters need the helper. See `expand_home_under`.
549        Ok((config, figment))
550    }
551
552    /// `[creds.*]` structural rules (spec.md#creds-scope-match): set-name
553    /// charset, at most one variant per logical secret, at most one
554    /// scope-less set, no duplicate scopes. All parse-time so a misbinding
555    /// dies before any URL resolves against it.
556    fn validate_creds(&self) -> Result<()> {
557        let mut scopeless: Option<&str> = None;
558        let mut scopes: BTreeMap<String, &str> = BTreeMap::new();
559        for (name, set) in &self.creds {
560            if !valid_creds_set_name(name) {
561                bail!(creds_set_name_error(name));
562            }
563            if set.access_key_id.is_some() && set.access_key_id_file.is_some() {
564                bail!("[creds.{name}] sets both access_key_id and access_key_id_file; pick one");
565            }
566            let secret_variants = [
567                set.secret_access_key.is_some(),
568                set.secret_access_key_file.is_some(),
569                set.secret_access_key_command.is_some(),
570            ]
571            .iter()
572            .filter(|present| **present)
573            .count();
574            if secret_variants > 1 {
575                bail!(
576                    "[creds.{name}] sets more than one of secret_access_key / secret_access_key_file / secret_access_key_command; pick one"
577                );
578            }
579            match set.scope.as_deref() {
580                None => {
581                    if let Some(other) = scopeless {
582                        bail!(
583                            "[creds.{other}] and [creds.{name}] are both scope-less; at most one catch-all set is allowed - add a `scope` to one"
584                        );
585                    }
586                    scopeless = Some(name);
587                }
588                Some(scope) => {
589                    // Duplicates are checked on the canonical form (incl.
590                    // trailing-slash trim, matching scope-match semantics),
591                    // so two spellings of one prefix can never tie at
592                    // resolve time.
593                    let canonical = crate::substrate::parse_scope(scope)
594                        .map(|url| url.as_str().trim_end_matches('/').to_owned())
595                        .with_context(|| {
596                            format!("[creds.{name}] scope {scope:?} is not a valid URL prefix")
597                        })?;
598                    if let Some(other) = scopes.insert(canonical, name) {
599                        bail!(
600                            "[creds.{other}] and [creds.{name}] declare the same scope {scope:?}; merge them or narrow one"
601                        );
602                    }
603                }
604            }
605        }
606        Ok(())
607    }
608
609    /// Resolve the `[adapters.<adapter>]` entries to drive `pond sync`. Only
610    /// sections with `enabled = true` flow through; sections with
611    /// `enabled = false` (or absent) are treated as opt-out and the
612    /// per-adapter blob (minus `enabled`) is handed to the factory's
613    /// `open()`. With `adapter = None` returns every enabled entry; with
614    /// `Some(name)` returns just that one - and errors if it's not in
615    /// config OR if it's currently disabled (the caller should then
616    /// re-prompt or report).
617    pub fn resolve_adapters(&self, adapter: Option<&str>) -> Result<Vec<(String, Value)>> {
618        match adapter {
619            None => Ok(self
620                .adapters
621                .iter()
622                .filter_map(|(name, blob)| take_enabled(name, blob))
623                .collect()),
624            Some(name) => {
625                let blob = self
626                    .adapters
627                    .get(name)
628                    .ok_or_else(|| anyhow!("no [adapters.{name}] entry in config"))?;
629                take_enabled(name, blob).map(|entry| vec![entry]).ok_or_else(|| {
630                    anyhow!(
631                        "adapter [{name}] is disabled (enabled = false); run `pond adapters enable {name}` to re-enable, then `pond sync {name}`"
632                    )
633                })
634            }
635        }
636    }
637
638    /// Names that are configured but currently `enabled = false`. Used by
639    /// `pond sync` post-import to know not to re-probe an adapter the user
640    /// already declined (the decline persists; re-prompt only via the
641    /// positional override `pond sync <name>`).
642    pub fn disabled_adapter_names(&self) -> Vec<&str> {
643        self.adapters
644            .iter()
645            .filter_map(|(name, blob)| {
646                let enabled = blob
647                    .get("enabled")
648                    .and_then(Value::as_bool)
649                    .unwrap_or(false);
650                if enabled { None } else { Some(name.as_str()) }
651            })
652            .collect()
653    }
654}
655
656/// The `POND_*` env mirror (spec.md#storage-env-mirror): `POND_STORAGE_PATH`
657/// -> `storage.path`, `POND_CREDS_<NAME>_<FIELD>` -> `creds.<name>.<field>`.
658/// Filtered to exactly those two shapes - clap owns its own `POND_*` vars
659/// (`POND_CONFIG_FILE`, `POND_HOST`, ...) and an unfiltered prefix would turn each
660/// of them into an unknown-field error here.
661fn env_mirror() -> Env {
662    // Keys reach these closures pre-lowercasing (`CREDS_...`), so compare on
663    // an ascii-lowered copy; `str::starts_with` is case-sensitive.
664    Env::prefixed("POND_")
665        .filter(|key| {
666            let key = key.as_str().to_ascii_lowercase();
667            // `extra` has no env form (spec.md#storage-env-mirror): the env
668            // grammar stays flat strings; structured options belong in the
669            // file (or URL query params).
670            key == "storage_path" || (key.starts_with("creds_") && !key.ends_with("_extra"))
671        })
672        .map(|key| {
673            // Set names are lowercase alphanumeric (validate_creds), so the
674            // first `_` after `creds` and the one after the name are the only
675            // separators; field names keep their underscores.
676            let key = key.as_str().to_ascii_lowercase();
677            let dots = if key.starts_with("creds_") { 2 } else { 1 };
678            key.replacen('_', ".", dots).into()
679        })
680}
681
/// The pre-redesign `[storage]` passthrough keys, by role (ENV-style
/// `object_store` aliases). Both the load-time error recipe
/// (`detect_legacy_storage`) and the `pond init` rewrite read these, so the
/// legacy vocabulary lives in one place - a new alias must not require
/// editing two detectors in lockstep.
///
/// This constant lists the aliases for the endpoint URL.
pub const LEGACY_ENDPOINT_KEYS: &[&str] = &["aws_endpoint", "endpoint"];
/// Legacy `[storage]` aliases for the access key id.
pub const LEGACY_ACCESS_KEY_KEYS: &[&str] = &["aws_access_key_id", "access_key_id"];
/// Legacy `[storage]` aliases for the secret access key.
pub const LEGACY_SECRET_KEY_KEYS: &[&str] = &["aws_secret_access_key", "secret_access_key"];
/// Legacy `[storage]` aliases for the virtual-hosted-style toggle.
pub const LEGACY_VIRTUAL_HOSTED_KEYS: &[&str] = &[
    "aws_virtual_hosted_style_request",
    "virtual_hosted_style_request",
];
694
695/// Recognize the pre-redesign `[storage]` passthrough map (ENV-style
696/// `object_store` keys) and return the exact rewrite onto `[storage].path` +
697/// `[creds.default]`. An error with a recipe, not a shim: old configs do not
698/// keep working.
699fn detect_legacy_storage(path: &Path) -> Option<String> {
700    let text = std::fs::read_to_string(path).ok()?;
701    let value: toml::Value = toml::from_str(&text).ok()?;
702    let storage = value.get("storage")?.as_table()?;
703    if storage.is_empty() || storage.keys().all(|key| key == "path") {
704        return None;
705    }
706    let get = |names: &[&str]| {
707        storage.iter().find_map(|(key, value)| {
708            names
709                .iter()
710                .any(|name| key.eq_ignore_ascii_case(name))
711                .then(|| value.as_str().unwrap_or_default().to_owned())
712        })
713    };
714    let endpoint = get(LEGACY_ENDPOINT_KEYS);
715    let host = endpoint
716        .as_deref()
717        .and_then(|e| e.split("://").nth(1))
718        .unwrap_or("<endpoint-host>");
719    // Under the declared virtual-hosted style the endpoint host leads with
720    // the bucket; de-fold it, or following the recipe verbatim folds the
721    // bucket in twice (the new grammar re-applies virtual hosting).
722    let virtual_hosted = storage.iter().any(|(key, value)| {
723        LEGACY_VIRTUAL_HOSTED_KEYS
724            .iter()
725            .any(|name| key.eq_ignore_ascii_case(name))
726            && (value.as_bool().unwrap_or(false)
727                || value
728                    .as_str()
729                    .is_some_and(|text| text.eq_ignore_ascii_case("true") || text == "1"))
730    });
731    let path_recipe = match host.split_once('.') {
732        Some((bucket, rest)) if virtual_hosted && rest.contains('.') => {
733            format!("s3+https://{rest}/{bucket}/<prefix>")
734        }
735        _ => format!("s3+https://{host}/<bucket>/<prefix>"),
736    };
737    // spec.md#storage-redaction: never echo credential values, even back to
738    // their owner - stderr lands in logs, scrollback, and pasted bug reports.
739    let mut recipe = format!(
740        "config {} uses the old [storage] passthrough map; rewrite it as:\n\n[storage]\npath = \"{path_recipe}\"\n\n[creds.default]\n",
741        path.display(),
742    );
743    recipe.push_str("access_key_id     = \"...\"  # copy from the old [storage] section\n");
744    recipe.push_str("secret_access_key = \"...\"  # copy from the old [storage] section\n");
745    recipe.push_str(
746        "\n(the endpoint and bucket fold into the URL; allow_http is scheme-derived; virtual-hosted addressing defaults on; the region is autodetected - append ?region=<x> to the URL only if your store insists. `pond storage check` verifies the result end-to-end, and `pond init` can apply this rewrite for you)",
747    );
748    Some(recipe)
749}
750
751/// Recognize a pre-rename `[sources.<name>]` config block (the adapter map was
752/// renamed `sources` -> `adapters`) and return a one-line recipe pointing at
753/// `pond init`. An error with a recipe, not a shim: old configs do not silently
754/// keep working. Transitional - delete once live configs have migrated.
755fn detect_legacy_sources(path: &Path) -> Option<String> {
756    let text = std::fs::read_to_string(path).ok()?;
757    let value: toml::Value = toml::from_str(&text).ok()?;
758    value.get("sources")?.as_table()?;
759    Some(format!(
760        "config {} uses a [sources.*] block; the adapter map was renamed to [adapters.*]. Run `pond init` to migrate it, or rename each `[sources.<name>]` header to `[adapters.<name>]` by hand.",
761        path.display(),
762    ))
763}
764
765/// Inner helper: return `Some((name, blob))` when the adapter section is
766/// enabled, stripping the discriminator from the blob before handing it on;
767/// `None` when the section is missing `enabled` or has `enabled = false`.
768fn take_enabled(name: &str, blob: &Value) -> Option<(String, Value)> {
769    let enabled = blob
770        .get("enabled")
771        .and_then(Value::as_bool)
772        .unwrap_or(false);
773    if !enabled {
774        return None;
775    }
776    let mut clean = blob.clone();
777    if let Some(obj) = clean.as_object_mut() {
778        obj.remove("enabled");
779    }
780    Some((name.to_owned(), clean))
781}
782
783/// Expand `~` and `$VAR`/`${VAR}` in `path` against an explicit `home`.
784/// Filesystem-shaped adapters call this from inside their factory's `open()`.
785/// Tests use it directly to exercise the rule without mutating the
786/// process-wide `HOME` env var (`std::env::set_var` is `unsafe` under
787/// edition 2024 and pond forbids unsafe code). Unset vars and `~user` forms
788/// pass through unchanged - never guess.
789pub fn expand_home_under(path: &Path, home: &Path) -> PathBuf {
790    let Some(text) = path.to_str() else {
791        return path.to_path_buf();
792    };
793    let home_text = home.to_string_lossy();
794    let expanded = shellexpand::full_with_context_no_errors(
795        text,
796        || Some(home_text.clone()),
797        |var| std::env::var(var).ok(),
798    );
799    PathBuf::from(expanded.as_ref())
800}
801
/// The inverse of [`expand_home_under`] for display and config writes:
/// contract a `home` prefix back to `~` so user-facing surfaces (and the
/// paths `pond init` persists) stay portable and readable. Non-home paths
/// pass through unchanged.
///
/// An empty `home` identifies no directory, so it never contracts anything:
/// `path` is returned as-is.
pub fn contract_home_under(path: &Path, home: &Path) -> PathBuf {
    // `Path::strip_prefix` accepts an empty base for every path, which would
    // rewrite a relative `data/pond` into `~/data/pond` when `HOME=""`.
    if home.as_os_str().is_empty() {
        return path.to_path_buf();
    }
    match path.strip_prefix(home) {
        // `path` is the home directory itself.
        Ok(rest) if rest.as_os_str().is_empty() => PathBuf::from("~"),
        Ok(rest) => Path::new("~").join(rest),
        Err(_) => path.to_path_buf(),
    }
}
813
814/// [`contract_home_under`] against the process `HOME`. Returns the input
815/// rendered for humans; machine surfaces (JSON output, the wire) keep
816/// absolute paths.
817pub fn contract_home(path: &Path) -> PathBuf {
818    match std::env::var_os("HOME") {
819        Some(home) => contract_home_under(path, Path::new(&home)),
820        None => path.to_path_buf(),
821    }
822}
823
824impl EmbeddingsConfig {
825    /// Surface-level validation: model id non-empty and dim positive. The
826    /// dim/model mismatch is the load-time check inside `CandleEmbedder::load`,
827    /// which knows the model's `hidden_size`.
828    pub fn validate(&self) -> Result<()> {
829        if self.model.trim().is_empty() {
830            bail!("embeddings.model must be a non-empty HuggingFace model id");
831        }
832        if self.dim == 0 {
833            bail!("embeddings.dim must be positive; got {}", self.dim);
834        }
835        Ok(())
836    }
837
838    /// Install model id + dim into the process. Idempotent: only the first
839    /// call sticks (matches `OnceLock` semantics in `embed::init_model_id` and
840    /// `sessions::init_embedding_dim`).
841    pub fn install_runtime(&self) {
842        crate::embed::init_model_id(self.model.clone());
843        crate::sessions::init_embedding_dim(self.dim);
844    }
845}
846
847/// Write `config.toml` with owner-only perms (0600). The file can carry a
848/// plaintext `secret_access_key` (inline `[creds.*]`), so it must never be
849/// group/world-readable - matching the AWS CLI's 0600 on its credentials file.
850/// Unix only; Windows is out of v1 scope. Order is truncate -> chmod -> write,
851/// so the secret is only ever written once perms are already 0600, even when
852/// repairing a pre-existing 0644 file.
853pub fn write_config_file(path: &Path, contents: &str) -> Result<()> {
854    #[cfg(unix)]
855    {
856        use std::io::Write as _;
857        use std::os::unix::fs::{OpenOptionsExt as _, PermissionsExt as _};
858        let mut file = std::fs::OpenOptions::new()
859            .write(true)
860            .create(true)
861            .truncate(true)
862            .mode(0o600)
863            .open(path)
864            .with_context(|| format!("failed to write {}", path.display()))?;
865        // `.mode()` applies only on creation; chmod also repairs a pre-existing file.
866        file.set_permissions(std::fs::Permissions::from_mode(0o600))
867            .with_context(|| format!("failed to chmod 0600 {}", path.display()))?;
868        file.write_all(contents.as_bytes())
869            .with_context(|| format!("failed to write {}", path.display()))?;
870    }
871    #[cfg(not(unix))]
872    {
873        std::fs::write(path, contents)
874            .with_context(|| format!("failed to write {}", path.display()))?;
875    }
876    Ok(())
877}
878
879#[cfg(test)]
880mod tests {
881    // `result_large_err`: `figment::Jail` closures return `figment::Error`
882    // by contract; the size is figment's, not ours.
883    #![allow(clippy::expect_used, clippy::unwrap_used, clippy::result_large_err)]
884
885    use super::*;
886    use serde_json::Value;
887    use tempfile::TempDir;
888
889    #[cfg(unix)]
890    #[test]
891    fn write_config_file_is_owner_only_0600() {
892        use std::os::unix::fs::PermissionsExt;
893        let dir = TempDir::new().unwrap();
894        let path = dir.path().join("config.toml");
895        // A pre-existing world-readable file must be repaired, not left at 0644.
896        std::fs::write(&path, "old").unwrap();
897        std::fs::set_permissions(&path, std::fs::Permissions::from_mode(0o644)).unwrap();
898        write_config_file(&path, "[creds.default]\nsecret_access_key = \"x\"\n").unwrap();
899        let mode = std::fs::metadata(&path).unwrap().permissions().mode() & 0o777;
900        assert_eq!(mode, 0o600, "config with secrets must be owner-only");
901        assert!(
902            std::fs::read_to_string(&path)
903                .unwrap()
904                .contains("secret_access_key")
905        );
906    }
907
908    #[test]
909    fn validate_catches_empty_model_and_bad_dim() {
910        assert!(EmbeddingsConfig::default().validate().is_ok());
911        // Empty / whitespace-only model id is rejected: HuggingFace fetch
912        // would fail far away from the config error.
913        let bad_model = EmbeddingsConfig {
914            model: "   ".to_owned(),
915            dim: 768,
916        };
917        assert!(bad_model.validate().is_err());
918        // Non-multiple-of-8 dims are accepted now: IVF_SQ has no subspace
919        // stride, so the old `dim % 8` requirement is gone.
920        let odd_dim = EmbeddingsConfig {
921            model: "intfloat/multilingual-e5-base".to_owned(),
922            dim: 100,
923        };
924        assert!(odd_dim.validate().is_ok());
925        // Zero is still rejected.
926        let zero_dim = EmbeddingsConfig {
927            model: "intfloat/multilingual-e5-base".to_owned(),
928            dim: 0,
929        };
930        assert!(zero_dim.validate().is_err());
931    }
932
933    #[test]
934    fn config_load_missing_file_falls_back_to_builtin() {
935        let config = Config::load("/nonexistent/pond-config-xyz.toml").unwrap();
936        assert_eq!(config.embeddings, EmbeddingsConfig::default());
937    }
938
939    #[test]
940    fn default_config_toml_loads_to_the_builtin_defaults() {
941        let dir = TempDir::new().unwrap();
942        let path = dir.path().join("config.toml");
943        std::fs::write(&path, DEFAULT_CONFIG_TOML).unwrap();
944        // The shipped template is all comments, so it must load and validate as
945        // the built-in defaults - a malformed template fails right here.
946        let config = Config::load(&path).unwrap();
947        assert_eq!(config.embeddings, EmbeddingsConfig::default());
948        assert_eq!(config.embeddings.model, crate::embed::DEFAULT_MODEL_ID);
949        assert_eq!(
950            config.embeddings.dim,
951            crate::sessions::DEFAULT_EMBEDDING_DIM
952        );
953    }
954
955    #[test]
956    fn default_storage_path_follows_xdg_then_home() {
957        // An absolute XDG_DATA_HOME wins.
958        let resolved =
959            default_storage_path(Some(PathBuf::from("/xdg")), Some(PathBuf::from("/home")))
960                .unwrap();
961        assert!(is_local(&resolved));
962        assert_eq!(local_path(&resolved).unwrap(), PathBuf::from("/xdg/pond"));
963
964        // A relative XDG_DATA_HOME is ignored per the XDG spec; HOME is the fallback.
965        let resolved = default_storage_path(
966            Some(PathBuf::from("relative")),
967            Some(PathBuf::from("/home")),
968        )
969        .unwrap();
970        assert_eq!(
971            local_path(&resolved).unwrap(),
972            PathBuf::from("/home/.local/share/pond"),
973        );
974
975        // No XDG and no HOME - stays usable: returns the cwd-anchored `.pond`.
976        // The result is absolute (Lance's URL conversion requires it), so we
977        // just check that the URL ends with the relative path's components.
978        let resolved = default_storage_path(None, None).unwrap();
979        assert!(is_local(&resolved));
980        assert!(
981            local_path(&resolved).unwrap().ends_with(".pond"),
982            "fallback path should end with .pond: {resolved}",
983        );
984    }
985
986    #[test]
987    fn expand_home_under_handles_tilde_forms() {
988        let home = Path::new("/srv/me");
989        assert_eq!(
990            expand_home_under(Path::new("~"), home),
991            PathBuf::from("/srv/me")
992        );
993        assert_eq!(
994            expand_home_under(Path::new("~/.codex/sessions"), home),
995            PathBuf::from("/srv/me/.codex/sessions"),
996        );
997        // Absolute paths pass through unchanged.
998        assert_eq!(
999            expand_home_under(Path::new("/etc/passwd"), home),
1000            PathBuf::from("/etc/passwd"),
1001        );
1002        // A leading `~something` (no slash) is not the home form - leave it.
1003        assert_eq!(
1004            expand_home_under(Path::new("~user/elsewhere"), home),
1005            PathBuf::from("~user/elsewhere"),
1006        );
1007    }
1008
1009    #[test]
1010    fn expand_home_under_handles_env_vars() {
1011        // Jail serializes env mutation against the other env-touching tests.
1012        figment::Jail::expect_with(|jail| {
1013            jail.set_env("POND_TEST_EXPAND_DIR", "/srv/data");
1014            let home = Path::new("/srv/me");
1015            assert_eq!(
1016                expand_home_under(Path::new("$POND_TEST_EXPAND_DIR/pond"), home),
1017                PathBuf::from("/srv/data/pond"),
1018            );
1019            assert_eq!(
1020                expand_home_under(Path::new("${POND_TEST_EXPAND_DIR}/pond"), home),
1021                PathBuf::from("/srv/data/pond"),
1022            );
1023            // Unset vars pass through unchanged - never guess.
1024            assert_eq!(
1025                expand_home_under(Path::new("$POND_TEST_UNSET_VAR/x"), home),
1026                PathBuf::from("$POND_TEST_UNSET_VAR/x"),
1027            );
1028            Ok(())
1029        });
1030    }
1031
1032    #[test]
1033    fn contract_home_under_inverts_expansion() {
1034        let home = Path::new("/srv/me");
1035        assert_eq!(
1036            contract_home_under(Path::new("/srv/me/.local/share/pond"), home),
1037            PathBuf::from("~/.local/share/pond"),
1038        );
1039        assert_eq!(
1040            contract_home_under(Path::new("/srv/me"), home),
1041            PathBuf::from("~")
1042        );
1043        // Non-home paths pass through unchanged.
1044        assert_eq!(
1045            contract_home_under(Path::new("/etc/passwd"), home),
1046            PathBuf::from("/etc/passwd"),
1047        );
1048    }
1049
1050    #[test]
1051    fn resolve_adapters_returns_one_or_all_or_errors() {
1052        let temp = TempDir::new().unwrap();
1053        let body = "\
1054[adapters.claude-code]
1055enabled = true
1056path = \"/srv/claude\"
1057
1058[adapters.codex-cli]
1059enabled = true
1060path = \"/srv/codex\"
1061
1062[adapters.opencode]
1063enabled = false
1064";
1065        let path = temp.path().join("config.toml");
1066        std::fs::write(&path, body).expect("write config");
1067        let config = Config::load(&path).unwrap();
1068
1069        // None -> only enabled entries
1070        let all = config.resolve_adapters(None).unwrap();
1071        assert_eq!(all.len(), 2);
1072        let names: Vec<_> = all.iter().map(|(n, _)| n.as_str()).collect();
1073        assert!(names.contains(&"claude-code"));
1074        assert!(names.contains(&"codex-cli"));
1075        // The `enabled` discriminator never reaches the adapter blob.
1076        for (_, blob) in &all {
1077            assert!(blob.get("enabled").is_none(), "enabled should be stripped");
1078        }
1079
1080        // Some(name) -> one entry, opaque JSON blob
1081        let one = config.resolve_adapters(Some("codex-cli")).unwrap();
1082        assert_eq!(one.len(), 1);
1083        assert_eq!(one[0].0, "codex-cli");
1084        assert_eq!(
1085            one[0].1.get("path").and_then(Value::as_str),
1086            Some("/srv/codex"),
1087        );
1088
1089        // Disabled positional -> errors with the recovery hint baked in.
1090        let disabled = config.resolve_adapters(Some("opencode"));
1091        let err = disabled
1092            .expect_err("disabled adapter must error")
1093            .to_string();
1094        assert!(err.contains("enabled = false"), "got: {err}");
1095        assert!(err.contains("pond sync opencode"), "got: {err}");
1096
1097        // Unknown -> error
1098        assert!(config.resolve_adapters(Some("nope")).is_err());
1099
1100        // disabled_adapter_names lists exactly the off ones.
1101        assert_eq!(config.disabled_adapter_names(), vec!["opencode"]);
1102    }
1103
1104    #[test]
1105    fn memory_uri_is_classified_as_remote() {
1106        let url = Url::parse("memory:///pond-remote-test").expect("memory uri parses");
1107        assert!(
1108            !is_local(&url),
1109            "memory:// is not a local-filesystem URL: {url}",
1110        );
1111        assert!(
1112            local_path(&url).is_none(),
1113            "local_path must return None for non-file schemes",
1114        );
1115    }
1116
1117    // The storage/creds tests run inside `figment::Jail` even when they set
1118    // no env vars: the Jail-based env-mirror test mutates process-global env
1119    // mid-flight, and the Jail lock is what serializes them against it.
1120
1121    #[test]
1122    fn storage_and_creds_round_trip() {
1123        figment::Jail::expect_with(|jail| {
1124            jail.create_file(
1125                "config.toml",
1126                r#"
1127[storage]
1128path = "s3+https://nbg1.example.com/my-pond"
1129
1130[creds.default]
1131access_key_id     = "AKIA123"
1132secret_access_key = "shh"
1133
1134[creds.work]
1135scope             = "s3+https://fsn1.example.com/work-pond/"
1136access_key_id     = "AKIA456"
1137secret_access_key_command = "op read op://vault/pond/secret"
1138region            = "fsn1"
1139virtual_hosted_style_request = false
1140extra = { request_timeout = "60 seconds" }
1141"#,
1142            )?;
1143            let config = Config::load("config.toml").expect("config loads");
1144            assert_eq!(
1145                config.storage.path.as_deref(),
1146                Some("s3+https://nbg1.example.com/my-pond"),
1147            );
1148            assert_eq!(config.creds.len(), 2);
1149            let work = &config.creds["work"];
1150            assert_eq!(
1151                work.secret_access_key_command.as_deref(),
1152                Some("op read op://vault/pond/secret"),
1153            );
1154            assert_eq!(work.virtual_hosted_style_request, Some(false));
1155            assert_eq!(work.extra["request_timeout"], "60 seconds");
1156            Ok(())
1157        });
1158    }
1159
1160    #[test]
1161    fn creds_validators_reject_bad_shapes() {
1162        let cases: &[(&str, &str)] = &[
1163            // Unknown key dies loudly (typos must not silently no-op).
1164            ("[creds.a]\nacces_key_id = \"x\"\n", "acces_key_id"),
1165            // Name charset: separators break the env-mirror grammar.
1166            ("[creds.my_set]\naccess_key_id = \"x\"\n", "[a-z][a-z0-9]"),
1167            ("[creds.A1]\naccess_key_id = \"x\"\n", "[a-z][a-z0-9]"),
1168            // One variant per logical secret.
1169            (
1170                "[creds.a]\nsecret_access_key = \"x\"\nsecret_access_key_command = \"cat\"\n",
1171                "more than one",
1172            ),
1173            (
1174                "[creds.a]\naccess_key_id = \"x\"\naccess_key_id_file = \"/k\"\n",
1175                "pick one",
1176            ),
1177            // At most one scope-less set.
1178            (
1179                "[creds.a]\naccess_key_id = \"x\"\n[creds.b]\naccess_key_id = \"y\"\n",
1180                "scope-less",
1181            ),
1182            // Duplicate scopes can never tie-break - checked canonicalized,
1183            // so two spellings of one prefix still collide.
1184            (
1185                "[creds.a]\nscope = \"s3+https://h:443/b/\"\naccess_key_id = \"x\"\n[creds.b]\nscope = \"s3+https://h/b\"\naccess_key_id = \"y\"\n",
1186                "same scope",
1187            ),
1188        ];
1189        figment::Jail::expect_with(|jail| {
1190            for (body, needle) in cases {
1191                jail.create_file("config.toml", body)?;
1192                let err = Config::load("config.toml").expect_err(body).to_string();
1193                assert!(
1194                    err.contains(needle),
1195                    "want {needle:?} in error for {body:?}, got: {err}",
1196                );
1197            }
1198            Ok(())
1199        });
1200    }
1201
1202    #[test]
1203    fn valid_creds_set_name_matches_env_mirror_charset() {
1204        for ok in ["default", "work", "work2", "a", "abcdefghij123456"] {
1205            assert!(valid_creds_set_name(ok), "{ok:?} should be valid");
1206        }
1207        for bad in ["", "Work", "my_set", "2fast", "abcdefghij1234567", "set-1"] {
1208            assert!(!valid_creds_set_name(bad), "{bad:?} should be invalid");
1209        }
1210    }
1211
1212    #[test]
1213    fn legacy_storage_map_errors_with_the_rewrite_recipe() {
1214        figment::Jail::expect_with(|jail| {
1215            jail.create_file(
1216                "config.toml",
1217                r#"
1218[storage]
1219AWS_ACCESS_KEY_ID = "AKIA123"
1220AWS_SECRET_ACCESS_KEY = "shh"
1221AWS_REGION = "nbg1"
1222AWS_ENDPOINT = "https://ttq.nbg1.your-objectstorage.com"
1223aws_virtual_hosted_style_request = "true"
1224"#,
1225            )?;
1226            let err = Config::load("config.toml")
1227                .expect_err("legacy map must error")
1228                .to_string();
1229            // The error IS the migration: old keys mapped onto the new shape.
1230            assert!(err.contains("old [storage] passthrough map"), "got: {err}");
1231            // The declared virtual-hosted style pins the bucket as the leading
1232            // host label; the recipe must de-fold it, not repeat the folded
1233            // host (which the new grammar would fold again).
1234            assert!(
1235                err.contains("s3+https://nbg1.your-objectstorage.com/ttq/<prefix>"),
1236                "recipe must de-fold the virtual-hosted endpoint, got: {err}",
1237            );
1238            // spec.md#storage-redaction: the recipe must NOT echo the real
1239            // key values - placeholders plus a "copy from" pointer only.
1240            assert!(!err.contains("AKIA123"), "got: {err}");
1241            assert!(!err.contains("\"shh\""), "got: {err}");
1242            assert!(err.contains("access_key_id     = \"...\""), "got: {err}");
1243            // Region is autodetected (AWS) or defaulted (S3-compatible
1244            // endpoints ignore it): the recipe must not carry AWS_REGION
1245            // forward, only name the ?region= override.
1246            assert!(!err.contains("region            ="), "got: {err}");
1247            assert!(err.contains("?region="), "got: {err}");
1248            assert!(err.contains("pond storage check"), "got: {err}");
1249            // Without the addressing-style key the split is unknowable; the
1250            // recipe keeps the host verbatim with a <bucket> placeholder.
1251            jail.create_file(
1252                "config.toml",
1253                r#"
1254[storage]
1255AWS_ACCESS_KEY_ID = "AKIA123"
1256AWS_ENDPOINT = "https://ttq.nbg1.your-objectstorage.com"
1257"#,
1258            )?;
1259            let err = Config::load("config.toml")
1260                .expect_err("legacy map must error")
1261                .to_string();
1262            assert!(
1263                err.contains("s3+https://ttq.nbg1.your-objectstorage.com/<bucket>/<prefix>"),
1264                "got: {err}",
1265            );
1266            Ok(())
1267        });
1268    }
1269
1270    #[test]
1271    fn legacy_sources_block_errors_with_the_adapters_recipe() {
1272        figment::Jail::expect_with(|jail| {
1273            jail.create_file(
1274                "config.toml",
1275                "[sources.claude-code]\nenabled = true\npath = \"/srv/claude\"\n",
1276            )?;
1277            let err = Config::load("config.toml")
1278                .expect_err("legacy [sources.*] must error")
1279                .to_string();
1280            assert!(err.contains("[adapters.*]"), "names the new key: {err}");
1281            assert!(err.contains("pond init"), "points at the fix: {err}");
1282            Ok(())
1283        });
1284    }
1285
1286    #[test]
1287    fn env_mirror_layers_over_file() {
1288        figment::Jail::expect_with(|jail| {
1289            jail.create_file(
1290                "config.toml",
1291                r#"
1292[storage]
1293path = "/from-file"
1294
1295[creds.work]
1296scope         = "s3://file-bucket/"
1297access_key_id = "from-file"
1298region        = "file-region"
1299"#,
1300            )?;
1301            // Env beats file per field; untouched fields survive the merge.
1302            jail.set_env("POND_STORAGE_PATH", "/from-env");
1303            jail.set_env("POND_CREDS_WORK_ACCESS_KEY_ID", "from-env");
1304            // A purely-numeric env secret must stay a string (extract_lossy).
1305            jail.set_env("POND_CREDS_WORK_SECRET_ACCESS_KEY", "12345");
1306            // A set defined only in env is discovered by the prefix scan.
1307            jail.set_env("POND_CREDS_CI_ACCESS_KEY_ID", "ci-key");
1308            let config = Config::load("config.toml").expect("env+file config loads");
1309            assert_eq!(config.storage.path.as_deref(), Some("/from-env"));
1310            let work = &config.creds["work"];
1311            assert_eq!(work.access_key_id.as_deref(), Some("from-env"));
1312            assert_eq!(work.secret_access_key.as_deref(), Some("12345"));
1313            assert_eq!(work.region.as_deref(), Some("file-region"));
1314            assert_eq!(work.scope.as_deref(), Some("s3://file-bucket/"));
1315            assert_eq!(config.creds["ci"].access_key_id.as_deref(), Some("ci-key"));
1316            Ok(())
1317        });
1318    }
1319}