codemem_core/
config.rs

1//! Persistent configuration for Codemem.
2//!
3//! Loads/saves a TOML config at `~/.codemem/config.toml`.
4
5use crate::{CodememError, GraphConfig, ScoringWeights, VectorConfig};
6use serde::{Deserialize, Serialize};
7use std::path::{Path, PathBuf};
8
9/// Top-level Codemem configuration.
10#[derive(Debug, Clone, Default, Serialize, Deserialize)]
11#[serde(default)]
12pub struct CodememConfig {
13    pub scoring: ScoringWeights,
14    pub vector: VectorConfig,
15    pub graph: GraphConfig,
16    pub embedding: EmbeddingConfig,
17    pub storage: StorageConfig,
18    pub chunking: ChunkingConfig,
19    pub enrichment: EnrichmentConfig,
20}
21
22impl CodememConfig {
23    /// Load configuration from the given path. Validates after loading.
24    pub fn load(path: &Path) -> Result<Self, CodememError> {
25        let content = std::fs::read_to_string(path)?;
26        let config: Self =
27            toml::from_str(&content).map_err(|e| CodememError::Config(e.to_string()))?;
28        config.validate()?;
29        Ok(config)
30    }
31
32    /// Validate configuration values.
33    ///
34    /// Checks that scoring weights are non-negative, dimensions and cache sizes
35    /// are positive, chunk size bounds are consistent, and dedup threshold is
36    /// in the valid range.
37    pub fn validate(&self) -> Result<(), CodememError> {
38        // M5: Scoring weights must be finite and non-negative.
39        // Check is_finite() first to reject NaN/Inf, then < 0.0 for negatives.
40        let w = &self.scoring;
41        let weights = [
42            w.vector_similarity,
43            w.graph_strength,
44            w.token_overlap,
45            w.temporal,
46            w.tag_matching,
47            w.importance,
48            w.confidence,
49            w.recency,
50        ];
51        if weights.iter().any(|v| !v.is_finite() || *v < 0.0) {
52            return Err(CodememError::Config(
53                "All scoring weights must be finite and non-negative".to_string(),
54            ));
55        }
56
57        // Embedding dimensions must be positive
58        if self.embedding.dimensions == 0 {
59            return Err(CodememError::Config(
60                "Embedding dimensions must be > 0".to_string(),
61            ));
62        }
63
64        // Vector dimensions must be positive
65        if self.vector.dimensions == 0 {
66            return Err(CodememError::Config(
67                "Vector dimensions must be > 0".to_string(),
68            ));
69        }
70
71        // Cache capacity must be positive
72        if self.embedding.cache_capacity == 0 {
73            return Err(CodememError::Config(
74                "Embedding cache capacity must be > 0".to_string(),
75            ));
76        }
77
78        // Batch size must be positive
79        if self.embedding.batch_size == 0 {
80            return Err(CodememError::Config(
81                "Embedding batch size must be > 0".to_string(),
82            ));
83        }
84
85        // Chunk size bounds
86        if self.chunking.min_chunk_size >= self.chunking.max_chunk_size {
87            return Err(CodememError::Config(
88                "min_chunk_size must be less than max_chunk_size".to_string(),
89            ));
90        }
91
92        // Dedup threshold in [0.0, 1.0] (also rejects NaN via range check)
93        if !(0.0..=1.0).contains(&self.enrichment.dedup_similarity_threshold) {
94            return Err(CodememError::Config(
95                "dedup_similarity_threshold must be between 0.0 and 1.0".to_string(),
96            ));
97        }
98
99        // Enrichment confidence in [0.0, 1.0]
100        if !(0.0..=1.0).contains(&self.enrichment.insight_confidence) {
101            return Err(CodememError::Config(
102                "insight_confidence must be between 0.0 and 1.0".to_string(),
103            ));
104        }
105
106        // Chunking score thresholds in [0.0, 1.0]
107        let thresholds = [
108            (
109                self.chunking.min_chunk_score_threshold,
110                "min_chunk_score_threshold",
111            ),
112            (
113                self.chunking.min_symbol_score_threshold,
114                "min_symbol_score_threshold",
115            ),
116        ];
117        for (val, name) in &thresholds {
118            if !(0.0..=1.0).contains(val) {
119                return Err(CodememError::Config(format!(
120                    "{name} must be between 0.0 and 1.0"
121                )));
122            }
123        }
124
125        Ok(())
126    }
127
128    /// Save configuration to the given path. Validates before saving.
129    pub fn save(&self, path: &Path) -> Result<(), CodememError> {
130        // M5: Validate before saving to prevent persisting invalid config.
131        self.validate()?;
132        let content =
133            toml::to_string_pretty(self).map_err(|e| CodememError::Config(e.to_string()))?;
134        if let Some(parent) = path.parent() {
135            std::fs::create_dir_all(parent)?;
136        }
137        std::fs::write(path, content)?;
138        Ok(())
139    }
140
141    /// Load from the default path, or return defaults if the file doesn't exist.
142    pub fn load_or_default() -> Self {
143        let path = Self::default_path();
144        if path.exists() {
145            match Self::load(&path) {
146                Ok(config) => config,
147                Err(e) => {
148                    tracing::warn!("Failed to load config: {e}, using defaults");
149                    CodememConfig::default()
150                }
151            }
152        } else {
153            Self::default()
154        }
155    }
156
157    /// Default config path: `~/.codemem/config.toml`.
158    pub fn default_path() -> PathBuf {
159        dirs::home_dir()
160            .unwrap_or_else(|| PathBuf::from("."))
161            .join(".codemem")
162            .join("config.toml")
163    }
164}
165
166/// Embedding provider configuration.
167#[derive(Debug, Clone, Serialize, Deserialize)]
168#[serde(default)]
169pub struct EmbeddingConfig {
170    /// Provider name: "candle" (default), "ollama", or "openai".
171    pub provider: String,
172    /// Model name (provider-specific). For Candle: HF repo ID (e.g. "BAAI/bge-base-en-v1.5").
173    pub model: String,
174    /// API URL for remote providers.
175    pub url: String,
176    /// Embedding dimensions for remote providers (Ollama/OpenAI).
177    /// Ignored by Candle — reads `hidden_size` from model's config.json.
178    pub dimensions: usize,
179    /// LRU cache capacity.
180    pub cache_capacity: usize,
181    /// Batch size for embedding forward passes (GPU memory trade-off).
182    pub batch_size: usize,
183    /// Weight dtype: "f32" (default), "f16" (half precision), "bf16".
184    /// F16 halves memory and is faster on Metal GPU.
185    pub dtype: String,
186}
187
188impl Default for EmbeddingConfig {
189    fn default() -> Self {
190        Self {
191            provider: "candle".to_string(),
192            model: "BAAI/bge-base-en-v1.5".to_string(),
193            url: String::new(),
194            dimensions: 768,
195            cache_capacity: 10_000,
196            batch_size: 16,
197            dtype: "f32".to_string(),
198        }
199    }
200}
201
202/// Storage configuration.
203#[derive(Debug, Clone, Serialize, Deserialize)]
204#[serde(default)]
205pub struct StorageConfig {
206    /// SQLite cache size in MB.
207    pub cache_size_mb: u32,
208    /// SQLite busy timeout in seconds.
209    pub busy_timeout_secs: u64,
210}
211
212impl Default for StorageConfig {
213    fn default() -> Self {
214        Self {
215            cache_size_mb: 64,
216            busy_timeout_secs: 5,
217        }
218    }
219}
220
221/// CST-aware code chunking configuration.
222#[derive(Debug, Clone, Serialize, Deserialize)]
223#[serde(default)]
224pub struct ChunkingConfig {
225    /// Whether chunking is enabled during indexing.
226    pub enabled: bool,
227    /// Maximum chunk size in non-whitespace characters.
228    pub max_chunk_size: usize,
229    /// Minimum chunk size in non-whitespace characters.
230    pub min_chunk_size: usize,
231    /// Whether to auto-compact the graph after indexing.
232    pub auto_compact: bool,
233    /// Maximum number of retained chunk graph-nodes per file after compaction.
234    pub max_retained_chunks_per_file: usize,
235    /// Minimum chunk score (0.0–1.0) to survive compaction.
236    pub min_chunk_score_threshold: f64,
237    /// Maximum number of retained symbol graph-nodes per file after compaction.
238    pub max_retained_symbols_per_file: usize,
239    /// Minimum symbol score (0.0–1.0) to survive compaction.
240    pub min_symbol_score_threshold: f64,
241}
242
243impl Default for ChunkingConfig {
244    fn default() -> Self {
245        Self {
246            enabled: true,
247            max_chunk_size: 1500,
248            min_chunk_size: 50,
249            auto_compact: true,
250            max_retained_chunks_per_file: 10,
251            min_chunk_score_threshold: 0.2,
252            max_retained_symbols_per_file: 15,
253            min_symbol_score_threshold: 0.15,
254        }
255    }
256}
257
258/// Enrichment pipeline configuration for controlling insight generation thresholds.
259#[derive(Debug, Clone, Serialize, Deserialize)]
260#[serde(default)]
261pub struct EnrichmentConfig {
262    /// Minimum commit count for a file to generate a high-activity insight.
263    pub git_min_commit_count: usize,
264    /// Minimum co-change count for a file pair to generate a coupling insight.
265    pub git_min_co_change_count: usize,
266    /// Minimum coupling degree for a node to generate a high-coupling insight.
267    pub perf_min_coupling_degree: usize,
268    /// Minimum symbol count for a file to generate a complexity insight.
269    pub perf_min_symbol_count: usize,
270    /// Default confidence for auto-generated insights.
271    pub insight_confidence: f64,
272    /// Cosine similarity threshold for deduplicating insights.
273    pub dedup_similarity_threshold: f64,
274}
275
276impl Default for EnrichmentConfig {
277    fn default() -> Self {
278        Self {
279            git_min_commit_count: 25,
280            git_min_co_change_count: 5,
281            perf_min_coupling_degree: 25,
282            perf_min_symbol_count: 30,
283            insight_confidence: 0.5,
284            dedup_similarity_threshold: 0.90,
285        }
286    }
287}
288
289#[cfg(test)]
290#[path = "tests/config_tests.rs"]
291mod tests;
codemem_core/config.rs

codemem_core/
config.rs