Skip to main content

pond/
config.rs

//! Configuration loading: the `[embeddings]`, `[search]`, `[maintenance]`,
//! `[runtime]`, `[sources]`, and `[storage]` blocks.
3//!
4//! pond ships built-in defaults, so an instance with no `config.toml` still
5//! works. `pond config --print-schema` emits [`DEFAULT_CONFIG_TOML`], the
6//! fully-annotated example.
7
8use std::{
9    collections::BTreeMap,
10    path::{Path, PathBuf},
11};
12
13use anyhow::{Context, Result, anyhow, bail};
14use lance_io::object_store::uri_to_url;
15use serde::{Deserialize, Deserializer, Serialize, de};
16use serde_json::Value;
17use url::Url;
18
/// Parse a human-readable byte size such as `"128 MiB"`, `"1 GiB"`,
/// `"500 KiB"`, or a bare byte count.
///
/// The input is split at the first ASCII letter: everything before it is the
/// number (parsed as `f64`, so `"1.5 GiB"` works), everything from it on is
/// the unit. Because of that split, exponent notation (`"1e3"`) is not
/// supported - the `e` is read as the start of a unit. Units are
/// case-insensitive and surrounding whitespace is ignored:
/// - decimal `K`/`KB`, `M`/`MB`, `G`/`GB`, `T`/`TB` are powers of 1000;
/// - binary `KiB`, `MiB`, `GiB`, `TiB` are powers of 1024;
/// - `B` and an unsuffixed number are raw bytes.
///
/// Fractional byte counts truncate toward zero.
///
/// # Errors
///
/// Returns a message when the input is empty, the numeric part does not
/// parse, the value is negative or non-finite, the unit is unknown, or the
/// result does not fit in `usize` (Lance's cache APIs take `usize`).
fn parse_byte_size(raw: &str) -> Result<usize, String> {
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return Err("byte-size value is empty".to_owned());
    }
    let split = trimmed
        .find(|c: char| c.is_ascii_alphabetic())
        .unwrap_or(trimmed.len());
    let (number, unit) = trimmed.split_at(split);
    let number: f64 = number
        .trim()
        .parse()
        .map_err(|_| format!("byte-size value {raw:?} is not a number"))?;
    if !number.is_finite() || number < 0.0 {
        return Err(format!("byte-size value {raw:?} must be non-negative"));
    }
    let multiplier: f64 = match unit.trim().to_ascii_lowercase().as_str() {
        "" | "b" => 1.0,
        "k" | "kb" => 1_000.0,
        "kib" => 1_024.0,
        "m" | "mb" => 1_000_000.0,
        "mib" => 1_048_576.0,
        "g" | "gb" => 1_000_000_000.0,
        "gib" => 1_073_741_824.0,
        "t" | "tb" => 1_000_000_000_000.0,
        "tib" => 1_099_511_627_776.0,
        other => {
            return Err(format!(
                "byte-size unit {other:?} not recognized (try MiB / GiB)"
            ));
        }
    };
    let bytes = number * multiplier;
    // First value that no longer fits: 2^BITS. On 64-bit targets
    // `usize::MAX as f64` already rounds up to 2^64 (and adding 1.0 leaves it
    // there); on 32-bit targets the cast is exact and the `+ 1.0` yields 2^32.
    // Comparing with `>=` therefore rejects the boundary value instead of
    // letting the saturating `as usize` cast quietly clamp it to `usize::MAX`.
    let limit = usize::MAX as f64 + 1.0;
    if !bytes.is_finite() || bytes >= limit {
        return Err(format!("byte-size value {raw:?} overflows usize"));
    }
    Ok(bytes as usize)
}
60
61fn deserialize_byte_size_opt<'de, D>(deserializer: D) -> Result<Option<usize>, D::Error>
62where
63    D: Deserializer<'de>,
64{
65    #[derive(Deserialize)]
66    #[serde(untagged)]
67    enum Repr {
68        Bytes(u64),
69        Text(String),
70    }
71    let repr: Option<Repr> = Option::deserialize(deserializer)?;
72    match repr {
73        None => Ok(None),
74        Some(Repr::Bytes(value)) => usize::try_from(value).map(Some).map_err(de::Error::custom),
75        Some(Repr::Text(value)) => parse_byte_size(&value).map(Some).map_err(de::Error::custom),
76    }
77}
78
79/// Parse a CLI / env `--data-dir` argument into a `Url`. Delegates to Lance's
80/// own `uri_to_url`, which handles every form pond cares about:
81/// - bare paths like `/srv/pond` -> `file:///srv/pond`
82/// - explicit `file://...` URIs
83/// - object-store URIs (`s3://`, `gs://`, `az://`, ...)
84/// - tilde expansion (`~/...`)
85/// - Windows drive letters (we don't ship Windows, but the parser handles it)
86///
87/// Using Lance's parser keeps pond's CLI parse path identical to what Lance
88/// uses internally - no risk of pond accepting a string Lance later rejects.
89pub fn parse_data_dir(input: &str) -> Result<Url> {
90    uri_to_url(input).with_context(|| format!("invalid --data-dir {input:?}"))
91}
92
93/// True when the URL is on the local filesystem. Mirrors Lance's
94/// `ObjectStore::is_local` (lance-io/src/object_store.rs:541): the `file` and
95/// `file+uring` schemes are local; everything else (incl. `memory://`) is not.
96pub fn is_local(url: &Url) -> bool {
97    matches!(url.scheme(), "file" | "file+uring")
98}
99
100/// Extract the filesystem `PathBuf` for local URLs. `None` for remote.
101pub fn local_path(url: &Url) -> Option<PathBuf> {
102    if is_local(url) {
103        url.to_file_path().ok()
104    } else {
105        None
106    }
107}
108
109/// URI string for a child of this location (typically one Lance dataset under
110/// the data dir). Trims a single trailing slash on the base, then concatenates
111/// with a `/` separator. This keeps `Dataset::open` / `Dataset::write` happy
112/// on both filesystem and object-store backends - they want the URI form, not
113/// a `url::Url`.
114pub fn child_uri(base: &Url, suffix: &str) -> String {
115    // For local URLs we strip the `file://` prefix so log lines and error
116    // messages render as plain paths (`/srv/pond/sessions.lance`), matching
117    // what pond used to emit before the URL migration.
118    if let Some(path) = local_path(base) {
119        return path.join(suffix).display().to_string();
120    }
121    format!("{}/{suffix}", base.as_str().trim_end_matches('/'))
122}
123
124/// Render a `Url` for human-readable log/diagnostic output: local URLs come
125/// back as plain paths (no `file://` prefix); remote URLs stay verbatim.
126pub fn display(url: &Url) -> String {
127    if let Some(path) = local_path(url) {
128        path.display().to_string()
129    } else {
130        url.to_string()
131    }
132}
133
134/// Build a `Url` from a filesystem path. Convenience for tests and for
135/// `resolve_data_dir` callers that hold a `PathBuf` already. The path must be
136/// absolute (`url::Url::from_file_path` is a hard requirement on Unix); a
137/// relative path gets canonicalized via `std::path::absolute` first.
138pub fn url_for_path(path: impl AsRef<Path>) -> Result<Url> {
139    let path = path.as_ref();
140    let absolute = if path.is_absolute() {
141        path.to_path_buf()
142    } else {
143        std::path::absolute(path)
144            .with_context(|| format!("failed to absolutize {}", path.display()))?
145    };
146    Url::from_file_path(&absolute).map_err(|()| {
147        anyhow!(
148            "failed to convert path {} into a file:// URL",
149            absolute.display()
150        )
151    })
152}
153
/// Default `config.toml` body emitted by `pond config --print-schema`.
///
/// Every non-blank line is a TOML comment: pond ships built-in defaults, so
/// the file is purely a discoverable template and pond still works with no
/// `config.toml` on disk. Because it is all comments, parsing it yields
/// `Config::default()`.
///
/// NOTE(review): the `[embeddings]` section marks `multilingual-e5-small` /
/// `dim = 384` as "(default)", while the `EmbeddingsConfig` field docs name
/// `multilingual-e5-base` / 768. The real defaults come from
/// `crate::embed::DEFAULT_MODEL_ID` and
/// `crate::sessions::DEFAULT_EMBEDDING_DIM` - confirm which text is stale.
pub const DEFAULT_CONFIG_TOML: &str = "\
# pond configuration.
#
# pond ships built-in defaults, so every setting here is optional - delete this
# file and pond still works. Uncomment and edit to override.

# Where pond looks for source data to import. One entry per adapter type
# (`claude-code`, `codex-cli`, ...). `pond sync` with no arguments syncs every
# entry; `pond sync <adapter>` syncs just one. With an empty `[sources]`,
# `pond sync` runs an interactive discovery against the known default paths
# and writes the picks back here.
#
# Future wrap: pond is single-namespace in v1 (spec.md#wire-namespace-resolution); `[sources]` is
# flat here. When multi-namespace pond lands, source registration becomes
# per-tenant under `[namespaces.<ns>.sources.<adapter>]`. Pre-v1 the schema
# is breakable; the rename is operationally free until a real second tenant
# exists.
#
# [sources.claude-code]
# enabled = true
# path = \"~/.claude/projects\"
#
# [sources.codex-cli]
# enabled = true
# path = \"~/.codex/sessions\"
#
# Set `enabled = false` to keep the section but skip it on `pond sync`;
# re-enable via `pond sync <adapter>`.

# Embeddings. Search runs hybrid (vector + FTS) whenever the store has any
# vectors, and FTS-only otherwise - the model loads lazily on the first hybrid
# query, so there's no cost on FTS-only corpora. `model` selects the
# HuggingFace XLM-RoBERTa model; `dim` declares its output width and is baked
# into the messages.vector schema on table creation - it must equal the
# model's hidden_size and be a multiple of 8 (IVF_PQ subspace stride).
#
# Common pairings:
#   model = \"intfloat/multilingual-e5-small\"   dim = 384   (default)
#   model = \"intfloat/multilingual-e5-base\"    dim = 768
#   model = \"intfloat/multilingual-e5-large\"   dim = 1024
#
# A different-dim model needs a fresh data dir; pond enforces this at the
# schema boundary.
#
# [embeddings]
# model = \"intfloat/multilingual-e5-small\"
# dim = 384

# Search tuning. Leave unset for Lance defaults; set when tuning IVF_PQ recall
# against a corpus.
#
# [search]
# nprobes = 16
# refine_factor = 2

# Storage maintenance. Tunes the compaction + cleanup pass that runs inside
# `pond sync` and `pond index optimize`.
#
# - `compaction_fragment_cap` is the per-task fragment-count backstop: a
#   planned compaction task touching at least this many fragments always runs
#   even when the write-amplification veto would skip it. Default 64; 0
#   disables the veto and runs every task Lance plans.
# - `cleanup_older_than` is the manifest-retention window for the safe cleanup
#   pass. Accepts `Ns` / `Nm` / `Nh` / `Nd` (default `1d`, floor `1h` - it is
#   what protects in-flight readers). Versions older than this are reclaimed
#   by Lance's OCC-coordinated GC.
# - `index_lag_threshold` is the minimum unindexed-fragment count before a
#   per-intent append/rebuild runs in `pond index optimize`; the brute-force
#   fallback keeps queries correct while fragments accumulate. Default 4.
#
# [maintenance]
# compaction_fragment_cap = 64
# cleanup_older_than = \"1d\"
# index_lag_threshold = 4

# Long-running process caps. Both accept either a plain byte count or a
# humansize-style suffix (\"128 MiB\", \"1 GiB\"). Both are optional - leave
# unset to let pond pick the backend-aware default:
#   local FS  : index_cache = 256 MiB, metadata_cache = 128 MiB
#   remote    : index_cache = 2 GiB,   metadata_cache = 512 MiB
# Lance's library defaults (6 GiB / 1 GiB) are too generous for a per-session
# `pond mcp` process; tightening them is what keeps RSS under the 500 MiB target
# without measurable latency regressions on typical agent-history corpora.
#
# [runtime]
# index_cache_bytes    = \"256 MiB\"
# metadata_cache_bytes = \"128 MiB\"

# Object-store credentials and tuning, passed verbatim to Lance's
# `DatasetBuilder::with_storage_options`. Required only when `--data-dir` is
# an `s3://` / `gs://` / `az://` URI that needs auth or a non-default region.
# Keys follow the `object_store` crate's standard names. Environment
# variables of the same name are read by `object_store` automatically;
# values in this block override them. pond does not parse these.
#
# Future wrap: pond is single-namespace in v1 (spec.md#wire-namespace-resolution); `[storage]` is
# flat here on the assumption of one bucket per pond. When multi-namespace
# pond lands and tenants need separate buckets/regions, this becomes
# `[namespaces.<ns>.storage]`. Pre-v1 the schema is breakable; the rename is
# operationally free until a real second tenant exists.
#
# [storage]
# AWS_ACCESS_KEY_ID = \"...\"
# AWS_SECRET_ACCESS_KEY = \"...\"
# AWS_REGION = \"us-east-1\"
# AWS_ENDPOINT = \"https://minio.example.com\"  # for self-hosted MinIO
# allow_http = \"true\"                          # only for non-TLS endpoints
";
265
/// Top-level `config.toml` shape.
///
/// Every section is optional (`#[serde(default)]` on each field), so an empty
/// or all-comment file deserializes to `Config::default()`. Unknown top-level
/// keys are rejected (`deny_unknown_fields`) rather than silently ignored.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct Config {
    /// `[embeddings]`: embedding model id and vector dimension.
    #[serde(default)]
    pub embeddings: EmbeddingsConfig,
    /// `[search]`: optional vector-query tuning knobs.
    #[serde(default)]
    pub search: SearchConfig,
    /// `[maintenance]`: compaction / cleanup / index-lag knobs.
    #[serde(default)]
    pub maintenance: MaintenanceConfig,
    /// `[runtime]`: optional cache-size caps for long-running processes.
    #[serde(default)]
    pub runtime: RuntimeConfig,
    /// `[sources.<adapter>]` map: per-adapter config blobs the matching
    /// factory deserializes inside its `open()`. The shape is adapter-defined
    /// (filesystem adapters expect `{ path = "..." }`; API-backed adapters
    /// expect endpoint + auth keys), so this layer stays opaque. Empty by
    /// default; `pond sync` runs discovery into this map on first use.
    #[serde(default)]
    pub sources: BTreeMap<String, Value>,
    /// `[storage]` key=value pairs handed verbatim to Lance's
    /// `DatasetBuilder::with_storage_options` and `WriteParams.store_params`.
    /// Keys are the standard `object_store` config names
    /// (`AWS_ACCESS_KEY_ID`, `AWS_REGION`, `AWS_ENDPOINT`, etc.); see Lance's
    /// `DatasetBuilder::with_storage_options` doc for the per-scheme variants
    /// (S3 / GCS / Azure). pond does not parse or validate these; Lance does.
    /// Empty by default; required only when `--data-dir` is an object-store
    /// URI that needs credentials or a non-default region/endpoint. Values
    /// here override any matching environment variables.
    #[serde(default)]
    pub storage: BTreeMap<String, String>,
}
297
/// `[runtime]`: long-running process caps. Both knobs accept either a plain
/// byte count or a `humansize`-style suffix (`"128 MiB"`, `"1 GiB"`). Both are
/// optional - `None` lets `pond::substrate` pick the backend-aware default
/// (local FS gets a tight cap; object stores stay near Lance's defaults).
///
/// Both fields deserialize through `deserialize_byte_size_opt`, which is what
/// accepts the integer-or-string forms. Unknown keys in the section are
/// rejected (`deny_unknown_fields`).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields, default)]
pub struct RuntimeConfig {
    /// Cap for the index cache, in bytes. `None` = use the built-in default.
    #[serde(default, deserialize_with = "deserialize_byte_size_opt")]
    pub index_cache_bytes: Option<usize>,
    /// Cap for the metadata cache, in bytes. `None` = use the built-in default.
    #[serde(default, deserialize_with = "deserialize_byte_size_opt")]
    pub metadata_cache_bytes: Option<usize>,
}
310
/// `[search]`: optional Lance vector-query tuning knobs.
///
/// Both fields default to `None`; the shipped template describes an unset
/// value as "Lance defaults". Unknown keys in the section are rejected
/// (`deny_unknown_fields`).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct SearchConfig {
    /// Optional `nprobes` override for vector queries.
    /// NOTE(review): how this is applied to the query is not visible in this
    /// file - confirm against the search path.
    #[serde(default)]
    pub nprobes: Option<usize>,
    /// Optional `refine_factor` override for vector queries.
    /// NOTE(review): how this is applied to the query is not visible in this
    /// file - confirm against the search path.
    #[serde(default)]
    pub refine_factor: Option<u32>,
}
320
/// `[maintenance]`: storage-maintenance knobs shared by `pond sync` and
/// `pond index optimize`. All optional - omit and pond falls back to the
/// in-process defaults in `pond::substrate` (`DEFAULT_COMPACTION_FRAGMENT_CAP`,
/// `default_cleanup_older_than`, and the `index_lag_threshold` initializer).
///
/// Unknown keys in the section are rejected (`deny_unknown_fields`). Nothing
/// in this struct validates the values: `cleanup_older_than` is kept as the
/// raw string, so parsing it happens elsewhere.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct MaintenanceConfig {
    /// Sub-target fragment count past which the compaction phase runs (it also
    /// runs once those fragments hold a whole target fragment's worth of rows).
    /// Default 64 stops the automated sync re-compacting the trailing fragment
    /// every pass; 0 compacts every pass.
    #[serde(default)]
    pub compaction_fragment_cap: Option<usize>,
    /// Manifest-retention window for the safe cleanup pass. Accepts
    /// `Ns`/`Nm`/`Nh`/`Nd` (default `1d`). Versions older than this are
    /// reclaimed by Lance's OCC-coordinated GC (`delete_unverified=false`),
    /// which never races a concurrent writer on any backend.
    #[serde(default)]
    pub cleanup_older_than: Option<String>,
    /// Minimum unindexed-fragment count below which `optimize_table_indices`
    /// skips the per-intent append/rebuild path; the brute-force fallback
    /// keeps queries correct while fragments accumulate. Default 4 trades a
    /// little query latency on cold fragments for far fewer remote index
    /// commits during high-rate ingest.
    ///
    /// When set, `Config::load` forwards the value to
    /// `crate::substrate::init_index_lag_threshold`.
    #[serde(default)]
    pub index_lag_threshold: Option<usize>,
}
348
/// `[embeddings]`: model selector and vector dimension. There is no master
/// switch - the search path always runs hybrid when vectors exist in the
/// store and FTS-only when they don't (`has_embeddings()` is the only gate);
/// the candle/Metal model is `LazyEmbedder`-loaded on the first query that
/// actually needs it. `model` and `dim` are installed into the process at
/// startup via `embed::init_model_id` / `sessions::init_embedding_dim`, so
/// swapping models for a one-off experiment is a temporary config file - no
/// CLI flag and no per-call-site plumbing.
///
/// Missing keys fall back to the `Default` impl (struct-level
/// `#[serde(default)]`); unknown keys are rejected (`deny_unknown_fields`).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(deny_unknown_fields, default)]
pub struct EmbeddingsConfig {
    /// The embedding model id (spec.md#search): any XLM-RoBERTa model loadable
    /// by `candle-transformers`. Defaults to `crate::embed::DEFAULT_MODEL_ID`.
    /// NOTE(review): earlier docs here named `intfloat/multilingual-e5-base`,
    /// while `DEFAULT_CONFIG_TOML` marks e5-small as the default - confirm
    /// against the constant.
    pub model: String,
    /// Output dimension of `model`. Must equal the model's `hidden_size` and
    /// be divisible by 8 (the IVF_PQ subspace stride; see `embed::index_params`).
    /// Defaults to `crate::sessions::DEFAULT_EMBEDDING_DIM`. Known pairings:
    /// 384 for e5-small, 768 for e5-base, 1024 for e5-large.
    pub dim: usize,
}
368
369impl Default for EmbeddingsConfig {
370    fn default() -> Self {
371        Self {
372            model: crate::embed::DEFAULT_MODEL_ID.to_owned(),
373            dim: crate::sessions::DEFAULT_EMBEDDING_DIM,
374        }
375    }
376}
377
378/// Resolve pond's data directory. An explicit `--data-dir` / `POND_DATA_DIR`
379/// wins (and may carry an `s3://` / `gs://` / `az://` URI); otherwise the
380/// XDG-local fallback (`$XDG_DATA_HOME/pond`, then `$HOME/.local/share/pond`,
381/// then `.pond`). `xdg_data_home` is honored only if absolute, per the XDG
382/// base-directory spec.
383pub fn resolve_data_dir(
384    explicit: Option<Url>,
385    xdg_data_home: Option<PathBuf>,
386    home: Option<PathBuf>,
387) -> Result<Url> {
388    if let Some(location) = explicit {
389        return Ok(location);
390    }
391    if let Some(xdg) = xdg_data_home.filter(|path| path.is_absolute()) {
392        return url_for_path(xdg.join("pond"));
393    }
394    if let Some(home) = home {
395        return url_for_path(home.join(".local").join("share").join("pond"));
396    }
397    // No HOME and no usable XDG var - stay usable rather than panic.
398    url_for_path(PathBuf::from(".pond"))
399}
400
/// Default on-disk location of `config.toml`.
///
/// The config file is always local - even for URI-backed data dirs - because
/// it is what names the bucket and any credentials. Lookup order:
/// 1. `<xdg_config_home>/pond/config.toml`, when `xdg_config_home` is absolute;
/// 2. `<home>/.config/pond/config.toml`;
/// 3. `.pond.toml`, relative to the current directory.
pub fn default_config_path(xdg_config_home: Option<PathBuf>, home: Option<PathBuf>) -> PathBuf {
    xdg_config_home
        .filter(|dir| dir.is_absolute())
        .or_else(|| home.map(|dir| dir.join(".config")))
        .map(|base| base.join("pond").join("config.toml"))
        .unwrap_or_else(|| PathBuf::from(".pond.toml"))
}
414
415impl Config {
416    /// Load `config.toml` from `path` if it exists and validate it. A missing
417    /// file yields the built-in defaults. On success the resolved embedding
418    /// model id + dim are installed into the process (`OnceLock`-backed; only
419    /// the first call per process sticks), so all downstream code paths see a
420    /// consistent pair without per-handler plumbing.
421    pub fn load(path: impl AsRef<Path>) -> Result<Self> {
422        let path = path.as_ref();
423        let config = if path.exists() {
424            let text = std::fs::read_to_string(path)
425                .with_context(|| format!("failed to read config {}", path.display()))?;
426            toml::from_str::<Self>(&text)
427                .with_context(|| format!("failed to parse config {}", path.display()))?
428        } else {
429            Self::default()
430        };
431        config.embeddings.validate()?;
432        config.embeddings.install_runtime();
433        if let Some(threshold) = config.maintenance.index_lag_threshold {
434            crate::substrate::init_index_lag_threshold(threshold);
435        }
436        // Tilde expansion is per-adapter (inside each factory's `open()`):
437        // an API-backed adapter has no path to expand, and only the
438        // filesystem-shaped adapters need the helper. See `expand_home_under`.
439        Ok(config)
440    }
441
442    /// Resolve the `[sources.<adapter>]` entries to drive `pond sync`. Only
443    /// sections with `enabled = true` flow through; sections with
444    /// `enabled = false` (or absent) are treated as opt-out and the
445    /// per-adapter blob (minus `enabled`) is handed to the factory's
446    /// `open()`. With `adapter = None` returns every enabled entry; with
447    /// `Some(name)` returns just that one - and errors if it's not in
448    /// config OR if it's currently disabled (the caller should then
449    /// re-prompt or report).
450    pub fn resolve_sources(&self, adapter: Option<&str>) -> Result<Vec<(String, Value)>> {
451        match adapter {
452            None => Ok(self
453                .sources
454                .iter()
455                .filter_map(|(name, blob)| take_enabled(name, blob))
456                .collect()),
457            Some(name) => {
458                let blob = self
459                    .sources
460                    .get(name)
461                    .ok_or_else(|| anyhow!("no [sources.{name}] entry in config"))?;
462                take_enabled(name, blob).map(|entry| vec![entry]).ok_or_else(|| {
463                    anyhow!(
464                        "source [{name}] is disabled (enabled = false); run `pond sync {name}` to re-enable"
465                    )
466                })
467            }
468        }
469    }
470
471    /// Names that are configured but currently `enabled = false`. Used by
472    /// `pond sync` post-import to know not to re-probe an adapter the user
473    /// already declined (the decline persists; re-prompt only via the
474    /// positional override `pond sync <name>`).
475    pub fn disabled_source_names(&self) -> Vec<&str> {
476        self.sources
477            .iter()
478            .filter_map(|(name, blob)| {
479                let enabled = blob
480                    .get("enabled")
481                    .and_then(Value::as_bool)
482                    .unwrap_or(false);
483                if enabled { None } else { Some(name.as_str()) }
484            })
485            .collect()
486    }
487}
488
489/// Inner helper: return `Some((name, blob))` when the source section is
490/// enabled, stripping the discriminator from the blob before handing it on;
491/// `None` when the section is missing `enabled` or has `enabled = false`.
492fn take_enabled(name: &str, blob: &Value) -> Option<(String, Value)> {
493    let enabled = blob
494        .get("enabled")
495        .and_then(Value::as_bool)
496        .unwrap_or(false);
497    if !enabled {
498        return None;
499    }
500    let mut clean = blob.clone();
501    if let Some(obj) = clean.as_object_mut() {
502        obj.remove("enabled");
503    }
504    Some((name.to_owned(), clean))
505}
506
/// Tilde-expand `path` against an explicit `home`. Filesystem-shaped adapters
/// call this from inside their factory's `open()`. Tests use it directly to
/// exercise the rule without mutating the process-wide `HOME` env var
/// (`std::env::set_var` is `unsafe` under edition 2024 and pond forbids
/// unsafe code).
///
/// Matching is done on path components rather than on the string form:
/// - `~` alone (or `~/`) becomes `home`;
/// - `~/rest` becomes `home/rest`, with repeated separators collapsed so
///   `~//rest` cannot turn into the absolute path `/rest`;
/// - anything else - absolute paths, `~user/...`, paths where `~` is not the
///   first component - is returned unchanged.
///
/// Because no UTF-8 conversion is involved, paths with non-UTF-8 bytes after
/// the `~` are expanded too.
pub fn expand_home_under(path: &Path, home: &Path) -> PathBuf {
    match path.strip_prefix("~") {
        // Bare `~`: avoid `home.join("")`, which would append a trailing slash.
        Ok(rest) if rest.as_os_str().is_empty() => home.to_path_buf(),
        Ok(rest) => home.join(rest),
        Err(_) => path.to_path_buf(),
    }
}
524
525impl EmbeddingsConfig {
526    /// Surface-level validation: model id non-empty and dim divisible by 8.
527    /// The dim/model mismatch is the load-time check inside `CandleEmbedder::load`,
528    /// which knows the model's `hidden_size`; what we can catch up front is the
529    /// IVF_PQ subspace stride (`dim / 8` in `embed::index_params`).
530    pub fn validate(&self) -> Result<()> {
531        if self.model.trim().is_empty() {
532            bail!("embeddings.model must be a non-empty HuggingFace model id");
533        }
534        if self.dim == 0 || !self.dim.is_multiple_of(8) {
535            bail!(
536                "embeddings.dim = {} must be a positive multiple of 8 (IVF_PQ subspace stride)",
537                self.dim,
538            );
539        }
540        Ok(())
541    }
542
543    /// Install model id + dim into the process. Idempotent: only the first
544    /// call sticks (matches `OnceLock` semantics in `embed::init_model_id` and
545    /// `sessions::init_embedding_dim`).
546    pub fn install_runtime(&self) {
547        crate::embed::init_model_id(self.model.clone());
548        crate::sessions::init_embedding_dim(self.dim);
549    }
550}
551
// Unit tests for config loading, data-dir / config-path resolution, tilde
// expansion, and `[sources]` resolution.
#[cfg(test)]
mod tests {
    #![allow(clippy::expect_used, clippy::unwrap_used)]

    use super::*;
    use serde_json::Value;
    use tempfile::TempDir;

    // `EmbeddingsConfig::validate` accepts the defaults and rejects a blank
    // model id, a dim that is not a multiple of 8, and a zero dim.
    #[test]
    fn validate_catches_empty_model_and_bad_dim() {
        assert!(EmbeddingsConfig::default().validate().is_ok());
        // Empty / whitespace-only model id is rejected: HuggingFace fetch
        // would fail far away from the config error.
        let bad_model = EmbeddingsConfig {
            model: "   ".to_owned(),
            dim: 768,
        };
        assert!(bad_model.validate().is_err());
        // Dim must be divisible by 8 (PQ subspace stride in `embed::index_params`).
        let bad_dim = EmbeddingsConfig {
            model: "intfloat/multilingual-e5-base".to_owned(),
            dim: 100,
        };
        assert!(bad_dim.validate().is_err());
        // Zero is rejected too (would divide-by-zero inside index_params).
        let zero_dim = EmbeddingsConfig {
            model: "intfloat/multilingual-e5-base".to_owned(),
            dim: 0,
        };
        assert!(zero_dim.validate().is_err());
    }

    // A path that does not exist loads as the built-in defaults.
    #[test]
    fn config_load_missing_file_falls_back_to_builtin() {
        let config = Config::load("/nonexistent/pond-config-xyz.toml").unwrap();
        assert_eq!(config.embeddings, EmbeddingsConfig::default());
    }

    // The shipped template must parse and equal the built-in defaults.
    #[test]
    fn default_config_toml_loads_to_the_builtin_defaults() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("config.toml");
        std::fs::write(&path, DEFAULT_CONFIG_TOML).unwrap();
        // The shipped template is all comments, so it must load and validate as
        // the built-in defaults - a malformed template fails right here.
        let config = Config::load(&path).unwrap();
        assert_eq!(config.embeddings, EmbeddingsConfig::default());
        assert_eq!(config.embeddings.model, crate::embed::DEFAULT_MODEL_ID);
        assert_eq!(
            config.embeddings.dim,
            crate::sessions::DEFAULT_EMBEDDING_DIM
        );
    }

    // Walks the full precedence chain of `resolve_data_dir`.
    #[test]
    fn resolve_data_dir_follows_explicit_then_xdg_then_home() {
        // An explicit `--data-dir` / `POND_DATA_DIR` wins over everything. The
        // explicit value can carry any URI form Lance accepts; here we test the
        // local-path form (parsing is delegated to Lance's `uri_to_url`).
        let explicit = parse_data_dir("/explicit").unwrap();
        let resolved = resolve_data_dir(
            Some(explicit.clone()),
            Some(PathBuf::from("/xdg")),
            Some(PathBuf::from("/home")),
        )
        .unwrap();
        assert_eq!(resolved, explicit);

        // An absolute XDG_DATA_HOME is used next.
        let resolved = resolve_data_dir(
            None,
            Some(PathBuf::from("/xdg")),
            Some(PathBuf::from("/home")),
        )
        .unwrap();
        assert!(is_local(&resolved));
        assert_eq!(local_path(&resolved).unwrap(), PathBuf::from("/xdg/pond"));

        // A relative XDG_DATA_HOME is ignored per the XDG spec; HOME is the fallback.
        let resolved = resolve_data_dir(
            None,
            Some(PathBuf::from("relative")),
            Some(PathBuf::from("/home")),
        )
        .unwrap();
        assert_eq!(
            local_path(&resolved).unwrap(),
            PathBuf::from("/home/.local/share/pond"),
        );

        // No XDG and no HOME - stays usable: returns the cwd-anchored `.pond`.
        // The result is absolute (Lance's URL conversion requires it), so we
        // just check that the URL ends with the relative path's components.
        let resolved = resolve_data_dir(None, None, None).unwrap();
        assert!(is_local(&resolved));
        assert!(
            local_path(&resolved).unwrap().ends_with(".pond"),
            "fallback path should end with .pond: {resolved}",
        );
    }

    // `~` and `~/...` expand against the given home; everything else passes
    // through untouched.
    #[test]
    fn expand_home_under_handles_tilde_forms() {
        let home = Path::new("/srv/me");
        assert_eq!(
            expand_home_under(Path::new("~"), home),
            PathBuf::from("/srv/me")
        );
        assert_eq!(
            expand_home_under(Path::new("~/.codex/sessions"), home),
            PathBuf::from("/srv/me/.codex/sessions"),
        );
        // Absolute paths pass through unchanged.
        assert_eq!(
            expand_home_under(Path::new("/etc/passwd"), home),
            PathBuf::from("/etc/passwd"),
        );
        // A leading `~something` (no slash) is not the home form - leave it.
        assert_eq!(
            expand_home_under(Path::new("~user/elsewhere"), home),
            PathBuf::from("~user/elsewhere"),
        );
    }

    // Covers `resolve_sources` for "all", "one", "disabled", and "unknown",
    // plus `disabled_source_names`.
    #[test]
    fn resolve_sources_returns_one_or_all_or_errors() {
        let temp = TempDir::new().unwrap();
        let body = "\
[sources.claude-code]
enabled = true
path = \"/srv/claude\"

[sources.codex-cli]
enabled = true
path = \"/srv/codex\"

[sources.opencode]
enabled = false
";
        let path = temp.path().join("config.toml");
        std::fs::write(&path, body).expect("write config");
        let config = Config::load(&path).unwrap();

        // None -> only enabled entries
        let all = config.resolve_sources(None).unwrap();
        assert_eq!(all.len(), 2);
        let names: Vec<_> = all.iter().map(|(n, _)| n.as_str()).collect();
        assert!(names.contains(&"claude-code"));
        assert!(names.contains(&"codex-cli"));
        // The `enabled` discriminator never reaches the adapter blob.
        for (_, blob) in &all {
            assert!(blob.get("enabled").is_none(), "enabled should be stripped");
        }

        // Some(name) -> one entry, opaque JSON blob
        let one = config.resolve_sources(Some("codex-cli")).unwrap();
        assert_eq!(one.len(), 1);
        assert_eq!(one[0].0, "codex-cli");
        assert_eq!(
            one[0].1.get("path").and_then(Value::as_str),
            Some("/srv/codex"),
        );

        // Disabled positional -> errors with the recovery hint baked in.
        let disabled = config.resolve_sources(Some("opencode"));
        let err = disabled
            .expect_err("disabled adapter must error")
            .to_string();
        assert!(err.contains("enabled = false"), "got: {err}");
        assert!(err.contains("pond sync opencode"), "got: {err}");

        // Unknown -> error
        assert!(config.resolve_sources(Some("nope")).is_err());

        // disabled_source_names lists exactly the off ones.
        assert_eq!(config.disabled_source_names(), vec!["opencode"]);
    }

    // `memory://` parses, but is neither local nor convertible to a path.
    #[test]
    fn memory_uri_is_classified_as_remote() {
        let url = parse_data_dir("memory:///pond-remote-test").expect("memory uri parses");
        assert!(
            !is_local(&url),
            "memory:// is not a local-filesystem URL: {url}",
        );
        assert!(
            local_path(&url).is_none(),
            "local_path must return None for non-file schemes",
        );
    }
}