1use std::{
9 collections::BTreeMap,
10 path::{Path, PathBuf},
11};
12
13use anyhow::{Context, Result, anyhow, bail};
14use lance_io::object_store::uri_to_url;
15use serde::{Deserialize, Deserializer, Serialize, de};
16use serde_json::Value;
17use url::Url;
18
/// Parses a human-readable byte size such as `"128 MiB"`, `"1.5kb"` or `"4096"`.
///
/// The input is a non-negative decimal number optionally followed by a unit.
/// Units are matched case-insensitively and surrounding whitespace is ignored.
/// Decimal units (`k`/`kb`, `m`/`mb`, `g`/`gb`, `t`/`tb`) are powers of 1000,
/// binary units (`kib`, `mib`, `gib`, `tib`) are powers of 1024, and a missing
/// unit or `b` means plain bytes. Fractional byte counts are truncated toward
/// zero.
///
/// # Errors
///
/// Returns a human-readable message when the input is empty, the numeric part
/// does not parse, the value is negative or non-finite, the unit is unknown,
/// or the result does not fit in `usize`.
fn parse_byte_size(raw: &str) -> Result<usize, String> {
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return Err("byte-size value is empty".to_owned());
    }
    // The unit starts at the first ASCII letter; everything before it is the number.
    let split = trimmed
        .find(|c: char| c.is_ascii_alphabetic())
        .unwrap_or(trimmed.len());
    let (number, unit) = trimmed.split_at(split);
    let number: f64 = number
        .trim()
        .parse()
        .map_err(|_| format!("byte-size value {raw:?} is not a number"))?;
    if !number.is_finite() || number < 0.0 {
        return Err(format!("byte-size value {raw:?} must be non-negative"));
    }
    let multiplier: f64 = match unit.trim().to_ascii_lowercase().as_str() {
        "" | "b" => 1.0,
        "k" | "kb" => 1_000.0,
        "kib" => 1_024.0,
        "m" | "mb" => 1_000_000.0,
        "mib" => 1_048_576.0,
        "g" | "gb" => 1_000_000_000.0,
        "gib" => 1_073_741_824.0,
        "t" | "tb" => 1_000_000_000_000.0,
        "tib" => 1_099_511_627_776.0,
        other => {
            return Err(format!(
                "byte-size unit {other:?} not recognized (try MiB / GiB)"
            ));
        }
    };
    let bytes = number * multiplier;
    // `usize::MAX as f64` rounds up to 2^N, which is itself out of range, so
    // the bound has to be inclusive; otherwise the `as` cast below would
    // silently saturate instead of reporting the overflow.
    if !bytes.is_finite() || bytes >= usize::MAX as f64 {
        return Err(format!("byte-size value {raw:?} overflows usize"));
    }
    Ok(bytes as usize)
}
60
61fn deserialize_byte_size_opt<'de, D>(deserializer: D) -> Result<Option<usize>, D::Error>
62where
63 D: Deserializer<'de>,
64{
65 #[derive(Deserialize)]
66 #[serde(untagged)]
67 enum Repr {
68 Bytes(u64),
69 Text(String),
70 }
71 let repr: Option<Repr> = Option::deserialize(deserializer)?;
72 match repr {
73 None => Ok(None),
74 Some(Repr::Bytes(value)) => usize::try_from(value).map(Some).map_err(de::Error::custom),
75 Some(Repr::Text(value)) => parse_byte_size(&value).map(Some).map_err(de::Error::custom),
76 }
77}
78
79pub fn parse_data_dir(input: &str) -> Result<Url> {
90 uri_to_url(input).with_context(|| format!("invalid --data-dir {input:?}"))
91}
92
93pub fn is_local(url: &Url) -> bool {
97 matches!(url.scheme(), "file" | "file+uring")
98}
99
100pub fn local_path(url: &Url) -> Option<PathBuf> {
102 if is_local(url) {
103 url.to_file_path().ok()
104 } else {
105 None
106 }
107}
108
109pub fn child_uri(base: &Url, suffix: &str) -> String {
115 if let Some(path) = local_path(base) {
119 return path.join(suffix).display().to_string();
120 }
121 format!("{}/{suffix}", base.as_str().trim_end_matches('/'))
122}
123
124pub fn display(url: &Url) -> String {
127 if let Some(path) = local_path(url) {
128 path.display().to_string()
129 } else {
130 url.to_string()
131 }
132}
133
134pub fn url_for_path(path: impl AsRef<Path>) -> Result<Url> {
139 let path = path.as_ref();
140 let absolute = if path.is_absolute() {
141 path.to_path_buf()
142 } else {
143 std::path::absolute(path)
144 .with_context(|| format!("failed to absolutize {}", path.display()))?
145 };
146 Url::from_file_path(&absolute).map_err(|()| {
147 anyhow!(
148 "failed to convert path {} into a file:// URL",
149 absolute.display()
150 )
151 })
152}
153
/// Starter `config.toml` template.
///
/// Every non-blank line is a TOML comment, so parsing this text yields the
/// same values as an empty config, i.e. the built-in defaults. The leading
/// `\` line continuation keeps the string from starting with a newline.
pub const DEFAULT_CONFIG_TOML: &str = "\
# pond configuration.
#
# pond ships built-in defaults, so every setting here is optional - delete this
# file and pond still works. Uncomment and edit to override.

# Where pond looks for source data to import. One entry per adapter type
# (`claude-code`, `codex-cli`, ...). `pond sync` with no arguments syncs every
# entry; `pond sync <adapter>` syncs just one. With an empty `[sources]`,
# `pond sync` runs an interactive discovery against the known default paths
# and writes the picks back here.
#
# Future wrap: pond is single-namespace in v1 (spec.md#wire-namespace-resolution); `[sources]` is
# flat here. When multi-namespace pond lands, source registration becomes
# per-tenant under `[namespaces.<ns>.sources.<adapter>]`. Pre-v1 the schema
# is breakable; the rename is operationally free until a real second tenant
# exists.
#
# [sources.claude-code]
# enabled = true
# path = \"~/.claude/projects\"
#
# [sources.codex-cli]
# enabled = true
# path = \"~/.codex/sessions\"
#
# Set `enabled = false` to keep the section but skip it on `pond sync`;
# re-enable via `pond sync <adapter>`.

# Embeddings. Search runs hybrid (vector + FTS) whenever the store has any
# vectors, and FTS-only otherwise - the model loads lazily on the first hybrid
# query, so there's no cost on FTS-only corpora. `model` selects the
# HuggingFace XLM-RoBERTa model; `dim` declares its output width and is baked
# into the messages.vector schema on table creation - it must equal the
# model's hidden_size and be a multiple of 8 (IVF_PQ subspace stride).
#
# Common pairings:
# model = \"intfloat/multilingual-e5-small\" dim = 384 (default)
# model = \"intfloat/multilingual-e5-base\" dim = 768
# model = \"intfloat/multilingual-e5-large\" dim = 1024
#
# A different-dim model needs a fresh data dir; pond enforces this at the
# schema boundary.
#
# [embeddings]
# model = \"intfloat/multilingual-e5-small\"
# dim = 384

# Search tuning. Leave unset for Lance defaults; set when tuning IVF_PQ recall
# against a corpus.
#
# [search]
# nprobes = 16
# refine_factor = 2

# Storage maintenance. Tunes the compaction + cleanup pass that runs inside
# `pond sync` and `pond index optimize`.
#
# - `compaction_fragment_cap` is the per-task fragment-count backstop: a
# planned compaction task touching at least this many fragments always runs
# even when the write-amplification veto would skip it. Default 64; 0
# disables the veto and runs every task Lance plans.
# - `cleanup_older_than` is the manifest-retention window for the safe cleanup
# pass. Accepts `Ns` / `Nm` / `Nh` / `Nd` (default `1d`, floor `1h` - it is
# what protects in-flight readers). Versions older than this are reclaimed
# by Lance's OCC-coordinated GC.
# - `index_lag_threshold` is the minimum unindexed-fragment count before a
# per-intent append/rebuild runs in `pond index optimize`; the brute-force
# fallback keeps queries correct while fragments accumulate. Default 4.
#
# [maintenance]
# compaction_fragment_cap = 64
# cleanup_older_than = \"1d\"
# index_lag_threshold = 4

# Long-running process caps. Both accept either a plain byte count or a
# humansize-style suffix (\"128 MiB\", \"1 GiB\"). Both are optional - leave
# unset to let pond pick the backend-aware default:
# local FS : index_cache = 256 MiB, metadata_cache = 128 MiB
# remote : index_cache = 2 GiB, metadata_cache = 512 MiB
# Lance's library defaults (6 GiB / 1 GiB) are too generous for a per-session
# `pond mcp` process; tightening them is what keeps RSS under the 500 MiB target
# without measurable latency regressions on typical agent-history corpora.
#
# [runtime]
# index_cache_bytes = \"256 MiB\"
# metadata_cache_bytes = \"128 MiB\"

# Object-store credentials and tuning, passed verbatim to Lance's
# `DatasetBuilder::with_storage_options`. Required only when `--data-dir` is
# an `s3://` / `gs://` / `az://` URI that needs auth or a non-default region.
# Keys follow the `object_store` crate's standard names. Environment
# variables of the same name are read by `object_store` automatically;
# values in this block override them. pond does not parse these.
#
# Future wrap: pond is single-namespace in v1 (spec.md#wire-namespace-resolution); `[storage]` is
# flat here on the assumption of one bucket per pond. When multi-namespace
# pond lands and tenants need separate buckets/regions, this becomes
# `[namespaces.<ns>.storage]`. Pre-v1 the schema is breakable; the rename is
# operationally free until a real second tenant exists.
#
# [storage]
# AWS_ACCESS_KEY_ID = \"...\"
# AWS_SECRET_ACCESS_KEY = \"...\"
# AWS_REGION = \"us-east-1\"
# AWS_ENDPOINT = \"https://minio.example.com\" # for self-hosted MinIO
# allow_http = \"true\" # only for non-TLS endpoints
";
265
/// Top-level shape of pond's `config.toml`.
///
/// Every section is optional (`#[serde(default)]`), and unknown top-level
/// keys are rejected by `deny_unknown_fields`.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct Config {
    /// `[embeddings]`: embedding model id and vector width.
    #[serde(default)]
    pub embeddings: EmbeddingsConfig,
    /// `[search]`: optional search tuning knobs.
    #[serde(default)]
    pub search: SearchConfig,
    /// `[maintenance]`: optional storage-maintenance knobs.
    #[serde(default)]
    pub maintenance: MaintenanceConfig,
    /// `[runtime]`: optional cache-size caps.
    #[serde(default)]
    pub runtime: RuntimeConfig,
    /// `[sources.<adapter>]`: one free-form JSON blob per adapter name. Within
    /// this file only the `enabled` key of each blob is interpreted.
    #[serde(default)]
    pub sources: BTreeMap<String, Value>,
    /// `[storage]`: string key/value pairs. The shipped config template
    /// describes them as object-store options handed through to Lance.
    #[serde(default)]
    pub storage: BTreeMap<String, String>,
}
297
/// `[runtime]` section: optional byte caps for two caches.
///
/// Both fields accept either an integer byte count or a size string such as
/// `"128 MiB"` (see `deserialize_byte_size_opt`). `None` means the value was
/// not set in the config file.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields, default)]
pub struct RuntimeConfig {
    /// Index cache cap in bytes, if configured.
    #[serde(default, deserialize_with = "deserialize_byte_size_opt")]
    pub index_cache_bytes: Option<usize>,
    /// Metadata cache cap in bytes, if configured.
    #[serde(default, deserialize_with = "deserialize_byte_size_opt")]
    pub metadata_cache_bytes: Option<usize>,
}
310
/// `[search]` section: optional search tuning.
///
/// Both fields default to `None` when absent; how an unset value is treated
/// is decided by the consumer, which is outside this file.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct SearchConfig {
    /// `nprobes` override, if configured.
    #[serde(default)]
    pub nprobes: Option<usize>,
    /// `refine_factor` override, if configured.
    #[serde(default)]
    pub refine_factor: Option<u32>,
}
320
/// `[maintenance]` section: optional storage-maintenance tuning.
///
/// All fields default to `None` when absent.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct MaintenanceConfig {
    /// Fragment-count backstop for compaction tasks, if configured.
    #[serde(default)]
    pub compaction_fragment_cap: Option<usize>,
    /// Retention window for cleanup, kept as the raw string from the config
    /// (the template documents forms like `"1d"`); it is not parsed here.
    #[serde(default)]
    pub cleanup_older_than: Option<String>,
    /// Unindexed-fragment threshold. When set, `Config::load` forwards it to
    /// `crate::substrate::init_index_lag_threshold`.
    #[serde(default)]
    pub index_lag_threshold: Option<usize>,
}
348
/// `[embeddings]` section: which embedding model to use and its vector width.
///
/// With the container-level `default`, any field missing from the config is
/// taken from this type's `Default` impl. `validate` checks both fields.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(deny_unknown_fields, default)]
pub struct EmbeddingsConfig {
    /// Model identifier; `validate` rejects blank values.
    pub model: String,
    /// Embedding width; `validate` requires a positive multiple of 8.
    pub dim: usize,
}
368
369impl Default for EmbeddingsConfig {
370 fn default() -> Self {
371 Self {
372 model: crate::embed::DEFAULT_MODEL_ID.to_owned(),
373 dim: crate::sessions::DEFAULT_EMBEDDING_DIM,
374 }
375 }
376}
377
378pub fn resolve_data_dir(
384 explicit: Option<Url>,
385 xdg_data_home: Option<PathBuf>,
386 home: Option<PathBuf>,
387) -> Result<Url> {
388 if let Some(location) = explicit {
389 return Ok(location);
390 }
391 if let Some(xdg) = xdg_data_home.filter(|path| path.is_absolute()) {
392 return url_for_path(xdg.join("pond"));
393 }
394 if let Some(home) = home {
395 return url_for_path(home.join(".local").join("share").join("pond"));
396 }
397 url_for_path(PathBuf::from(".pond"))
399}
400
/// Picks the default location of `config.toml`.
///
/// Uses `<xdg_config_home>/pond/config.toml` when `xdg_config_home` is an
/// absolute path, otherwise `<home>/.config/pond/config.toml`, and finally the
/// relative `.pond.toml` when neither directory is available.
pub fn default_config_path(xdg_config_home: Option<PathBuf>, home: Option<PathBuf>) -> PathBuf {
    let xdg_base = xdg_config_home.filter(|dir| dir.is_absolute());
    let home_base = || home.map(|dir| dir.join(".config"));
    match xdg_base.or_else(home_base) {
        Some(base) => base.join("pond").join("config.toml"),
        None => PathBuf::from(".pond.toml"),
    }
}
414
415impl Config {
416 pub fn load(path: impl AsRef<Path>) -> Result<Self> {
422 let path = path.as_ref();
423 let config = if path.exists() {
424 let text = std::fs::read_to_string(path)
425 .with_context(|| format!("failed to read config {}", path.display()))?;
426 toml::from_str::<Self>(&text)
427 .with_context(|| format!("failed to parse config {}", path.display()))?
428 } else {
429 Self::default()
430 };
431 config.embeddings.validate()?;
432 config.embeddings.install_runtime();
433 if let Some(threshold) = config.maintenance.index_lag_threshold {
434 crate::substrate::init_index_lag_threshold(threshold);
435 }
436 Ok(config)
440 }
441
442 pub fn resolve_sources(&self, adapter: Option<&str>) -> Result<Vec<(String, Value)>> {
451 match adapter {
452 None => Ok(self
453 .sources
454 .iter()
455 .filter_map(|(name, blob)| take_enabled(name, blob))
456 .collect()),
457 Some(name) => {
458 let blob = self
459 .sources
460 .get(name)
461 .ok_or_else(|| anyhow!("no [sources.{name}] entry in config"))?;
462 take_enabled(name, blob).map(|entry| vec![entry]).ok_or_else(|| {
463 anyhow!(
464 "source [{name}] is disabled (enabled = false); run `pond sync {name}` to re-enable"
465 )
466 })
467 }
468 }
469 }
470
471 pub fn disabled_source_names(&self) -> Vec<&str> {
476 self.sources
477 .iter()
478 .filter_map(|(name, blob)| {
479 let enabled = blob
480 .get("enabled")
481 .and_then(Value::as_bool)
482 .unwrap_or(false);
483 if enabled { None } else { Some(name.as_str()) }
484 })
485 .collect()
486 }
487}
488
489fn take_enabled(name: &str, blob: &Value) -> Option<(String, Value)> {
493 let enabled = blob
494 .get("enabled")
495 .and_then(Value::as_bool)
496 .unwrap_or(false);
497 if !enabled {
498 return None;
499 }
500 let mut clean = blob.clone();
501 if let Some(obj) = clean.as_object_mut() {
502 obj.remove("enabled");
503 }
504 Some((name.to_owned(), clean))
505}
506
/// Expands a leading `~` path component to `home`.
///
/// `~` alone becomes `home`, and `~/rest` becomes `home/rest`. Anything else,
/// including `~user/...` forms and absolute paths, is returned unchanged.
///
/// The match is done on path components rather than on text. That way a
/// doubled separator such as `~//etc/passwd` still resolves underneath `home`
/// (joining an absolute remainder would discard `home` entirely), and paths
/// that are not valid UTF-8 are expanded too.
pub fn expand_home_under(path: &Path, home: &Path) -> PathBuf {
    match path.strip_prefix("~") {
        // Bare `~` (or `~/`): avoid joining an empty remainder, which would
        // append a trailing separator.
        Ok(rest) if rest.as_os_str().is_empty() => home.to_path_buf(),
        Ok(rest) => home.join(rest),
        Err(_) => path.to_path_buf(),
    }
}
524
525impl EmbeddingsConfig {
526 pub fn validate(&self) -> Result<()> {
531 if self.model.trim().is_empty() {
532 bail!("embeddings.model must be a non-empty HuggingFace model id");
533 }
534 if self.dim == 0 || !self.dim.is_multiple_of(8) {
535 bail!(
536 "embeddings.dim = {} must be a positive multiple of 8 (IVF_PQ subspace stride)",
537 self.dim,
538 );
539 }
540 Ok(())
541 }
542
543 pub fn install_runtime(&self) {
547 crate::embed::init_model_id(self.model.clone());
548 crate::sessions::init_embedding_dim(self.dim);
549 }
550}
551
/// Unit tests for config loading, data-dir resolution, `~` expansion and
/// source selection.
#[cfg(test)]
mod tests {
    // Tests may unwrap/expect freely: a panic is the failure signal.
    #![allow(clippy::expect_used, clippy::unwrap_used)]

    use super::*;
    use serde_json::Value;
    use tempfile::TempDir;

    /// `validate` accepts the defaults and rejects a blank model, a dim that
    /// is not a multiple of 8, and a zero dim.
    #[test]
    fn validate_catches_empty_model_and_bad_dim() {
        assert!(EmbeddingsConfig::default().validate().is_ok());
        let bad_model = EmbeddingsConfig {
            model: " ".to_owned(),
            dim: 768,
        };
        assert!(bad_model.validate().is_err());
        let bad_dim = EmbeddingsConfig {
            model: "intfloat/multilingual-e5-base".to_owned(),
            dim: 100,
        };
        assert!(bad_dim.validate().is_err());
        let zero_dim = EmbeddingsConfig {
            model: "intfloat/multilingual-e5-base".to_owned(),
            dim: 0,
        };
        assert!(zero_dim.validate().is_err());
    }

    /// A missing config file is not an error; the defaults are used.
    #[test]
    fn config_load_missing_file_falls_back_to_builtin() {
        let config = Config::load("/nonexistent/pond-config-xyz.toml").unwrap();
        assert_eq!(config.embeddings, EmbeddingsConfig::default());
    }

    /// The shipped template is fully commented out, so loading it must give
    /// the same embeddings settings as the built-in defaults.
    #[test]
    fn default_config_toml_loads_to_the_builtin_defaults() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("config.toml");
        std::fs::write(&path, DEFAULT_CONFIG_TOML).unwrap();
        let config = Config::load(&path).unwrap();
        assert_eq!(config.embeddings, EmbeddingsConfig::default());
        assert_eq!(config.embeddings.model, crate::embed::DEFAULT_MODEL_ID);
        assert_eq!(
            config.embeddings.dim,
            crate::sessions::DEFAULT_EMBEDDING_DIM
        );
    }

    /// Walks the precedence chain: explicit URL, absolute XDG dir, home dir,
    /// then the relative `.pond` fallback.
    #[test]
    fn resolve_data_dir_follows_explicit_then_xdg_then_home() {
        let explicit = parse_data_dir("/explicit").unwrap();
        // An explicit location wins over everything else.
        let resolved = resolve_data_dir(
            Some(explicit.clone()),
            Some(PathBuf::from("/xdg")),
            Some(PathBuf::from("/home")),
        )
        .unwrap();
        assert_eq!(resolved, explicit);

        // Without an explicit location, an absolute XDG dir is used.
        let resolved = resolve_data_dir(
            None,
            Some(PathBuf::from("/xdg")),
            Some(PathBuf::from("/home")),
        )
        .unwrap();
        assert!(is_local(&resolved));
        assert_eq!(local_path(&resolved).unwrap(), PathBuf::from("/xdg/pond"));

        // A relative XDG dir is ignored in favour of the home dir.
        let resolved = resolve_data_dir(
            None,
            Some(PathBuf::from("relative")),
            Some(PathBuf::from("/home")),
        )
        .unwrap();
        assert_eq!(
            local_path(&resolved).unwrap(),
            PathBuf::from("/home/.local/share/pond"),
        );

        // With nothing available the result is a local path ending in `.pond`.
        let resolved = resolve_data_dir(None, None, None).unwrap();
        assert!(is_local(&resolved));
        assert!(
            local_path(&resolved).unwrap().ends_with(".pond"),
            "fallback path should end with .pond: {resolved}",
        );
    }

    /// `~` and `~/...` expand under the given home; absolute paths and
    /// `~user/...` forms pass through unchanged.
    #[test]
    fn expand_home_under_handles_tilde_forms() {
        let home = Path::new("/srv/me");
        assert_eq!(
            expand_home_under(Path::new("~"), home),
            PathBuf::from("/srv/me")
        );
        assert_eq!(
            expand_home_under(Path::new("~/.codex/sessions"), home),
            PathBuf::from("/srv/me/.codex/sessions"),
        );
        assert_eq!(
            expand_home_under(Path::new("/etc/passwd"), home),
            PathBuf::from("/etc/passwd"),
        );
        assert_eq!(
            expand_home_under(Path::new("~user/elsewhere"), home),
            PathBuf::from("~user/elsewhere"),
        );
    }

    /// Covers `resolve_sources` for all adapters, one adapter, a disabled
    /// adapter and an unknown adapter, plus `disabled_source_names`.
    #[test]
    fn resolve_sources_returns_one_or_all_or_errors() {
        let temp = TempDir::new().unwrap();
        let body = "\
[sources.claude-code]
enabled = true
path = \"/srv/claude\"

[sources.codex-cli]
enabled = true
path = \"/srv/codex\"

[sources.opencode]
enabled = false
";
        let path = temp.path().join("config.toml");
        std::fs::write(&path, body).expect("write config");
        let config = Config::load(&path).unwrap();

        // No adapter given: only the two enabled entries come back.
        let all = config.resolve_sources(None).unwrap();
        assert_eq!(all.len(), 2);
        let names: Vec<_> = all.iter().map(|(n, _)| n.as_str()).collect();
        assert!(names.contains(&"claude-code"));
        assert!(names.contains(&"codex-cli"));
        for (_, blob) in &all {
            assert!(blob.get("enabled").is_none(), "enabled should be stripped");
        }

        // A named, enabled adapter comes back alone with its payload intact.
        let one = config.resolve_sources(Some("codex-cli")).unwrap();
        assert_eq!(one.len(), 1);
        assert_eq!(one[0].0, "codex-cli");
        assert_eq!(
            one[0].1.get("path").and_then(Value::as_str),
            Some("/srv/codex"),
        );

        // A named, disabled adapter is an error that explains how to re-enable.
        let disabled = config.resolve_sources(Some("opencode"));
        let err = disabled
            .expect_err("disabled adapter must error")
            .to_string();
        assert!(err.contains("enabled = false"), "got: {err}");
        assert!(err.contains("pond sync opencode"), "got: {err}");

        // An adapter with no config entry is an error too.
        assert!(config.resolve_sources(Some("nope")).is_err());

        assert_eq!(config.disabled_source_names(), vec!["opencode"]);
    }

    /// `memory://` URLs are not local: `is_local` is false and `local_path`
    /// yields `None`.
    #[test]
    fn memory_uri_is_classified_as_remote() {
        let url = parse_data_dir("memory:///pond-remote-test").expect("memory uri parses");
        assert!(
            !is_local(&url),
            "memory:// is not a local-filesystem URL: {url}",
        );
        assert!(
            local_path(&url).is_none(),
            "local_path must return None for non-file schemes",
        );
    }
}