Skip to main content

codemem_core/
config.rs

1//! Persistent configuration for Codemem.
2//!
3//! Loads/saves a TOML config at `~/.codemem/config.toml`.
4
5use crate::{CodememError, GraphConfig, ScoringWeights, VectorConfig};
6use serde::{Deserialize, Serialize};
7use std::path::{Path, PathBuf};
8
9/// Top-level Codemem configuration.
10#[derive(Debug, Clone, Default, Serialize, Deserialize)]
11#[serde(default)]
12pub struct CodememConfig {
13    pub scoring: ScoringWeights,
14    pub vector: VectorConfig,
15    pub graph: GraphConfig,
16    pub embedding: EmbeddingConfig,
17    pub storage: StorageConfig,
18    pub chunking: ChunkingConfig,
19    pub enrichment: EnrichmentConfig,
20    pub scip: ScipConfig,
21    pub memory: MemoryConfig,
22}
23
24impl CodememConfig {
25    /// Load configuration from the given path. Validates after loading.
26    pub fn load(path: &Path) -> Result<Self, CodememError> {
27        let content = std::fs::read_to_string(path)?;
28        let config: Self =
29            toml::from_str(&content).map_err(|e| CodememError::Config(e.to_string()))?;
30        config.validate()?;
31        Ok(config)
32    }
33
34    /// Validate configuration values.
35    ///
36    /// Checks that scoring weights are non-negative, dimensions and cache sizes
37    /// are positive, chunk size bounds are consistent, and dedup threshold is
38    /// in the valid range.
39    pub fn validate(&self) -> Result<(), CodememError> {
40        // M5: Scoring weights must be finite and non-negative.
41        // Check is_finite() first to reject NaN/Inf, then < 0.0 for negatives.
42        let w = &self.scoring;
43        let weights = [
44            w.vector_similarity,
45            w.graph_strength,
46            w.token_overlap,
47            w.temporal,
48            w.tag_matching,
49            w.importance,
50            w.confidence,
51            w.recency,
52        ];
53        if weights.iter().any(|v| !v.is_finite() || *v < 0.0) {
54            return Err(CodememError::Config(
55                "All scoring weights must be finite and non-negative".to_string(),
56            ));
57        }
58
59        // Embedding dimensions must be positive
60        if self.embedding.dimensions == 0 {
61            return Err(CodememError::Config(
62                "Embedding dimensions must be > 0".to_string(),
63            ));
64        }
65
66        // Vector dimensions must be positive
67        if self.vector.dimensions == 0 {
68            return Err(CodememError::Config(
69                "Vector dimensions must be > 0".to_string(),
70            ));
71        }
72
73        // Cache capacity must be positive
74        if self.embedding.cache_capacity == 0 {
75            return Err(CodememError::Config(
76                "Embedding cache capacity must be > 0".to_string(),
77            ));
78        }
79
80        // Batch size must be positive
81        if self.embedding.batch_size == 0 {
82            return Err(CodememError::Config(
83                "Embedding batch size must be > 0".to_string(),
84            ));
85        }
86
87        // Chunk size bounds
88        if self.chunking.min_chunk_size >= self.chunking.max_chunk_size {
89            return Err(CodememError::Config(
90                "min_chunk_size must be less than max_chunk_size".to_string(),
91            ));
92        }
93
94        // Dedup threshold in [0.0, 1.0] (also rejects NaN via range check)
95        if !(0.0..=1.0).contains(&self.enrichment.dedup_similarity_threshold) {
96            return Err(CodememError::Config(
97                "dedup_similarity_threshold must be between 0.0 and 1.0".to_string(),
98            ));
99        }
100
101        // Enrichment confidence in [0.0, 1.0]
102        if !(0.0..=1.0).contains(&self.enrichment.insight_confidence) {
103            return Err(CodememError::Config(
104                "insight_confidence must be between 0.0 and 1.0".to_string(),
105            ));
106        }
107
108        // SCIP max_references_per_symbol must be positive
109        if self.scip.max_references_per_symbol == 0 {
110            return Err(CodememError::Config(
111                "scip.max_references_per_symbol must be > 0".to_string(),
112            ));
113        }
114
115        // Chunking score thresholds in [0.0, 1.0]
116        let thresholds = [
117            (
118                self.chunking.min_chunk_score_threshold,
119                "min_chunk_score_threshold",
120            ),
121            (
122                self.chunking.min_symbol_score_threshold,
123                "min_symbol_score_threshold",
124            ),
125        ];
126        for (val, name) in &thresholds {
127            if !(0.0..=1.0).contains(val) {
128                return Err(CodememError::Config(format!(
129                    "{name} must be between 0.0 and 1.0"
130                )));
131            }
132        }
133
134        Ok(())
135    }
136
137    /// Save configuration to the given path. Validates before saving.
138    pub fn save(&self, path: &Path) -> Result<(), CodememError> {
139        // M5: Validate before saving to prevent persisting invalid config.
140        self.validate()?;
141        let content =
142            toml::to_string_pretty(self).map_err(|e| CodememError::Config(e.to_string()))?;
143        if let Some(parent) = path.parent() {
144            std::fs::create_dir_all(parent)?;
145        }
146        std::fs::write(path, content)?;
147        Ok(())
148    }
149
150    /// Load from the default path, or return defaults if the file doesn't exist.
151    pub fn load_or_default() -> Self {
152        let path = Self::default_path();
153        if path.exists() {
154            match Self::load(&path) {
155                Ok(config) => config,
156                Err(e) => {
157                    tracing::warn!("Failed to load config: {e}, using defaults");
158                    CodememConfig::default()
159                }
160            }
161        } else {
162            Self::default()
163        }
164    }
165
166    /// Default config path: `~/.codemem/config.toml`.
167    pub fn default_path() -> PathBuf {
168        dirs::home_dir()
169            .unwrap_or_else(|| PathBuf::from("."))
170            .join(".codemem")
171            .join("config.toml")
172    }
173}
174
175/// Embedding provider configuration.
176#[derive(Debug, Clone, Serialize, Deserialize)]
177#[serde(default)]
178pub struct EmbeddingConfig {
179    /// Provider name: "candle" (default), "ollama", or "openai".
180    pub provider: String,
181    /// Model name (provider-specific). For Candle: HF repo ID (e.g. "BAAI/bge-base-en-v1.5").
182    pub model: String,
183    /// API URL for remote providers.
184    pub url: String,
185    /// Embedding dimensions for remote providers (Ollama/OpenAI).
186    /// Ignored by Candle — reads `hidden_size` from model's config.json.
187    pub dimensions: usize,
188    /// LRU cache capacity.
189    pub cache_capacity: usize,
190    /// Batch size for embedding forward passes (GPU memory trade-off).
191    pub batch_size: usize,
192    /// Weight dtype: "f32" (default), "f16" (half precision), "bf16".
193    /// F16 halves memory and is faster on Metal GPU.
194    pub dtype: String,
195}
196
197impl Default for EmbeddingConfig {
198    fn default() -> Self {
199        Self {
200            provider: "candle".to_string(),
201            model: "BAAI/bge-base-en-v1.5".to_string(),
202            url: String::new(),
203            dimensions: 768,
204            cache_capacity: 10_000,
205            batch_size: 16,
206            dtype: "f32".to_string(),
207        }
208    }
209}
210
211/// Storage configuration.
212#[derive(Debug, Clone, Serialize, Deserialize)]
213#[serde(default)]
214pub struct StorageConfig {
215    /// Backend type: "sqlite" (default) or "postgres".
216    #[serde(default = "default_storage_backend")]
217    pub backend: String,
218    /// Connection URL for remote backends (e.g., "postgres://user:pass@host/db").
219    #[serde(default)]
220    pub url: Option<String>,
221    /// SQLite cache size in MB.
222    pub cache_size_mb: u32,
223    /// SQLite busy timeout in seconds.
224    pub busy_timeout_secs: u64,
225}
226
/// Name of the backend used when none is configured: `"sqlite"`.
fn default_storage_backend() -> String {
    String::from("sqlite")
}
230
231impl Default for StorageConfig {
232    fn default() -> Self {
233        Self {
234            backend: default_storage_backend(),
235            url: None,
236            cache_size_mb: 64,
237            busy_timeout_secs: 5,
238        }
239    }
240}
241
242/// CST-aware code chunking configuration.
243#[derive(Debug, Clone, Serialize, Deserialize)]
244#[serde(default)]
245pub struct ChunkingConfig {
246    /// Whether chunking is enabled during indexing.
247    pub enabled: bool,
248    /// Maximum chunk size in non-whitespace characters.
249    pub max_chunk_size: usize,
250    /// Minimum chunk size in non-whitespace characters.
251    pub min_chunk_size: usize,
252    /// Whether to auto-compact the graph after indexing.
253    pub auto_compact: bool,
254    /// Maximum number of retained chunk graph-nodes per file after compaction.
255    pub max_retained_chunks_per_file: usize,
256    /// Minimum chunk score (0.0–1.0) to survive compaction.
257    pub min_chunk_score_threshold: f64,
258    /// Maximum number of retained symbol graph-nodes per file after compaction.
259    pub max_retained_symbols_per_file: usize,
260    /// Minimum symbol score (0.0–1.0) to survive compaction.
261    pub min_symbol_score_threshold: f64,
262}
263
264impl Default for ChunkingConfig {
265    fn default() -> Self {
266        Self {
267            enabled: true,
268            max_chunk_size: 1500,
269            min_chunk_size: 50,
270            auto_compact: true,
271            max_retained_chunks_per_file: 10,
272            min_chunk_score_threshold: 0.2,
273            max_retained_symbols_per_file: 15,
274            min_symbol_score_threshold: 0.15,
275        }
276    }
277}
278
279/// SCIP integration configuration.
280#[derive(Debug, Clone, Serialize, Deserialize)]
281#[serde(default)]
282pub struct ScipConfig {
283    /// Master switch for SCIP integration.
284    pub enabled: bool,
285    /// Check PATH for available indexers.
286    pub auto_detect_indexers: bool,
287    /// Cache .scip files between runs.
288    pub cache_index: bool,
289    /// Re-index if cache older than this many hours.
290    pub cache_ttl_hours: u64,
291    /// Create ext: nodes for dependency symbols.
292    pub create_external_nodes: bool,
293    /// Skip utility symbols with excessive fan-out (fallback for kinds without per-kind limits).
294    pub max_references_per_symbol: usize,
295    /// Attach hover docs as memories to nodes.
296    pub store_docs_as_memories: bool,
297    /// Build nested containment tree from SCIP descriptor chains.
298    /// When true: file→module→class→method. When false: flat file→symbol.
299    pub hierarchical_containment: bool,
300    /// Collapse intra-class edges into parent metadata.
301    pub collapse_intra_class_edges: bool,
302    /// Per-kind fan-out limits (0 = use max_references_per_symbol fallback).
303    pub fan_out_limits: FanOutLimits,
304    /// Per-language indexer command overrides.
305    pub indexers: ScipIndexersConfig,
306}
307
308/// Per-kind inbound reference limits. A module can be widely imported; a function less so.
309#[derive(Debug, Clone, Serialize, Deserialize)]
310#[serde(default)]
311pub struct FanOutLimits {
312    pub module: usize,
313    pub function: usize,
314    pub method: usize,
315    pub class: usize,
316}
317
318impl Default for FanOutLimits {
319    fn default() -> Self {
320        Self {
321            module: 200,
322            function: 30,
323            method: 30,
324            class: 50,
325        }
326    }
327}
328
329impl Default for ScipConfig {
330    fn default() -> Self {
331        Self {
332            enabled: true,
333            auto_detect_indexers: true,
334            cache_index: true,
335            cache_ttl_hours: 24,
336            create_external_nodes: true,
337            max_references_per_symbol: 100,
338            store_docs_as_memories: true,
339            hierarchical_containment: true,
340            collapse_intra_class_edges: true,
341            fan_out_limits: FanOutLimits::default(),
342            indexers: ScipIndexersConfig::default(),
343        }
344    }
345}
346
347/// Per-language SCIP indexer command overrides. Empty string means auto-detect from PATH.
348///
349/// Commands are split on whitespace — paths with spaces are **not** supported.
350/// Use symlinks or PATH entries for indexers in directories with spaces.
351#[derive(Debug, Clone, Default, Serialize, Deserialize)]
352#[serde(default)]
353pub struct ScipIndexersConfig {
354    pub rust: String,
355    pub typescript: String,
356    pub python: String,
357    pub java: String,
358    pub go: String,
359}
360
361/// Enrichment pipeline configuration for controlling insight generation thresholds.
362#[derive(Debug, Clone, Serialize, Deserialize)]
363#[serde(default)]
364pub struct EnrichmentConfig {
365    /// Minimum commit count for a file to generate a high-activity insight.
366    pub git_min_commit_count: usize,
367    /// Minimum co-change count for a file pair to generate a coupling insight.
368    pub git_min_co_change_count: usize,
369    /// Minimum coupling degree for a node to generate a high-coupling insight.
370    pub perf_min_coupling_degree: usize,
371    /// Minimum symbol count for a file to generate a complexity insight.
372    pub perf_min_symbol_count: usize,
373    /// Default confidence for auto-generated insights.
374    pub insight_confidence: f64,
375    /// Cosine similarity threshold for deduplicating insights.
376    pub dedup_similarity_threshold: f64,
377}
378
379impl Default for EnrichmentConfig {
380    fn default() -> Self {
381        Self {
382            git_min_commit_count: 25,
383            git_min_co_change_count: 5,
384            perf_min_coupling_degree: 25,
385            perf_min_symbol_count: 30,
386            insight_confidence: 0.5,
387            dedup_similarity_threshold: 0.90,
388        }
389    }
390}
391
392/// Memory expiration settings.
393#[derive(Debug, Clone, Serialize, Deserialize)]
394#[serde(default)]
395pub struct MemoryConfig {
396    /// Default TTL in hours for session-scoped memories (memories with a session_id).
397    /// Set to 0 to disable auto-expiry for session memories.
398    pub default_session_ttl_hours: u64,
399    /// Expire `static-analysis` tagged memories when the underlying file is re-indexed
400    /// with a changed content hash.
401    pub expire_enrichments_on_reindex: bool,
402}
403
404impl Default for MemoryConfig {
405    fn default() -> Self {
406        Self {
407            default_session_ttl_hours: 168, // 7 days
408            expire_enrichments_on_reindex: true,
409        }
410    }
411}
412
413#[cfg(test)]
414#[path = "tests/config_tests.rs"]
415mod tests;