// codemem_core/config.rs
1//! Persistent configuration for Codemem.
2//!
3//! Loads/saves a TOML config at `~/.codemem/config.toml`.
4
5use crate::{CodememError, GraphConfig, ScoringWeights, VectorConfig};
6use serde::{Deserialize, Serialize};
7use std::path::{Path, PathBuf};
8
/// Top-level Codemem configuration.
///
/// With `#[serde(default)]`, any section missing from the TOML file falls
/// back to its `Default` value, so partial config files are accepted.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(default)]
pub struct CodememConfig {
    /// Scoring weights; `validate` requires each weight to be finite and non-negative.
    pub scoring: ScoringWeights,
    /// Vector store settings; `validate` requires `dimensions > 0`.
    pub vector: VectorConfig,
    /// Graph subsystem settings (see [`GraphConfig`]).
    pub graph: GraphConfig,
    /// Embedding provider settings (provider, model, dimensions, cache).
    pub embedding: EmbeddingConfig,
    /// Database/storage settings (path, cache size, busy timeout).
    pub storage: StorageConfig,
    /// CST-aware code chunking and compaction settings.
    pub chunking: ChunkingConfig,
    /// Insight-enrichment thresholds and confidence settings.
    pub enrichment: EnrichmentConfig,
}
21
22impl CodememConfig {
23    /// Load configuration from the given path. Validates after loading.
24    pub fn load(path: &Path) -> Result<Self, CodememError> {
25        let content = std::fs::read_to_string(path)?;
26        let config: Self =
27            toml::from_str(&content).map_err(|e| CodememError::Config(e.to_string()))?;
28        config.validate()?;
29        Ok(config)
30    }
31
32    /// Validate configuration values.
33    ///
34    /// Checks that scoring weights are non-negative, dimensions and cache sizes
35    /// are positive, chunk size bounds are consistent, and dedup threshold is
36    /// in the valid range.
37    pub fn validate(&self) -> Result<(), CodememError> {
38        // M5: Scoring weights must be finite and non-negative.
39        // Check is_finite() first to reject NaN/Inf, then < 0.0 for negatives.
40        let w = &self.scoring;
41        let weights = [
42            w.vector_similarity,
43            w.graph_strength,
44            w.token_overlap,
45            w.temporal,
46            w.tag_matching,
47            w.importance,
48            w.confidence,
49            w.recency,
50        ];
51        if weights.iter().any(|v| !v.is_finite() || *v < 0.0) {
52            return Err(CodememError::Config(
53                "All scoring weights must be finite and non-negative".to_string(),
54            ));
55        }
56
57        // Embedding dimensions must be positive
58        if self.embedding.dimensions == 0 {
59            return Err(CodememError::Config(
60                "Embedding dimensions must be > 0".to_string(),
61            ));
62        }
63
64        // Vector dimensions must be positive
65        if self.vector.dimensions == 0 {
66            return Err(CodememError::Config(
67                "Vector dimensions must be > 0".to_string(),
68            ));
69        }
70
71        // Cache capacity must be positive
72        if self.embedding.cache_capacity == 0 {
73            return Err(CodememError::Config(
74                "Embedding cache capacity must be > 0".to_string(),
75            ));
76        }
77
78        // Chunk size bounds
79        if self.chunking.min_chunk_size >= self.chunking.max_chunk_size {
80            return Err(CodememError::Config(
81                "min_chunk_size must be less than max_chunk_size".to_string(),
82            ));
83        }
84
85        // Dedup threshold in [0.0, 1.0] (also rejects NaN via range check)
86        if !(0.0..=1.0).contains(&self.enrichment.dedup_similarity_threshold) {
87            return Err(CodememError::Config(
88                "dedup_similarity_threshold must be between 0.0 and 1.0".to_string(),
89            ));
90        }
91
92        // Enrichment confidence in [0.0, 1.0]
93        if !(0.0..=1.0).contains(&self.enrichment.insight_confidence) {
94            return Err(CodememError::Config(
95                "insight_confidence must be between 0.0 and 1.0".to_string(),
96            ));
97        }
98
99        // Chunking score thresholds in [0.0, 1.0]
100        let thresholds = [
101            (
102                self.chunking.min_chunk_score_threshold,
103                "min_chunk_score_threshold",
104            ),
105            (
106                self.chunking.min_symbol_score_threshold,
107                "min_symbol_score_threshold",
108            ),
109        ];
110        for (val, name) in &thresholds {
111            if !(0.0..=1.0).contains(val) {
112                return Err(CodememError::Config(format!(
113                    "{name} must be between 0.0 and 1.0"
114                )));
115            }
116        }
117
118        Ok(())
119    }
120
121    /// Save configuration to the given path. Validates before saving.
122    pub fn save(&self, path: &Path) -> Result<(), CodememError> {
123        // M5: Validate before saving to prevent persisting invalid config.
124        self.validate()?;
125        let content =
126            toml::to_string_pretty(self).map_err(|e| CodememError::Config(e.to_string()))?;
127        if let Some(parent) = path.parent() {
128            std::fs::create_dir_all(parent)?;
129        }
130        std::fs::write(path, content)?;
131        Ok(())
132    }
133
134    /// Load from the default path, or return defaults if the file doesn't exist.
135    pub fn load_or_default() -> Self {
136        let path = Self::default_path();
137        if path.exists() {
138            match Self::load(&path) {
139                Ok(config) => config,
140                Err(e) => {
141                    tracing::warn!("Failed to load config: {e}, using defaults");
142                    CodememConfig::default()
143                }
144            }
145        } else {
146            Self::default()
147        }
148    }
149
150    /// Default config path: `~/.codemem/config.toml`.
151    pub fn default_path() -> PathBuf {
152        dirs::home_dir()
153            .unwrap_or_else(|| PathBuf::from("."))
154            .join(".codemem")
155            .join("config.toml")
156    }
157}
158
/// Embedding provider configuration.
///
/// Missing fields deserialize to their `Default` values via `#[serde(default)]`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct EmbeddingConfig {
    /// Provider name: "candle" (default), "ollama", or "openai".
    pub provider: String,
    /// Model name (provider-specific). Default: "BAAI/bge-base-en-v1.5".
    pub model: String,
    /// API URL for remote providers. Empty by default.
    pub url: String,
    /// Embedding dimensions. Must be > 0 (enforced by `CodememConfig::validate`). Default: 768.
    pub dimensions: usize,
    /// LRU cache capacity. Must be > 0 (enforced by `CodememConfig::validate`). Default: 10,000.
    pub cache_capacity: usize,
}
174
175impl Default for EmbeddingConfig {
176    fn default() -> Self {
177        Self {
178            provider: "candle".to_string(),
179            model: "BAAI/bge-base-en-v1.5".to_string(),
180            url: String::new(),
181            dimensions: 768,
182            cache_capacity: 10_000,
183        }
184    }
185}
186
/// Storage configuration.
///
/// Missing fields deserialize to their `Default` values via `#[serde(default)]`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct StorageConfig {
    /// Path to the database file. Default: `~/.codemem/codemem.db`.
    pub db_path: String,
    /// SQLite cache size in MB. Default: 64.
    pub cache_size_mb: u32,
    /// SQLite busy timeout in seconds. Default: 5.
    pub busy_timeout_secs: u64,
}
198
199impl Default for StorageConfig {
200    fn default() -> Self {
201        Self {
202            db_path: dirs::home_dir()
203                .unwrap_or_else(|| PathBuf::from("."))
204                .join(".codemem")
205                .join("codemem.db")
206                .to_string_lossy()
207                .into_owned(),
208            cache_size_mb: 64,
209            busy_timeout_secs: 5,
210        }
211    }
212}
213
/// CST-aware code chunking configuration.
///
/// Missing fields deserialize to their `Default` values via `#[serde(default)]`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct ChunkingConfig {
    /// Whether chunking is enabled during indexing. Default: true.
    pub enabled: bool,
    /// Maximum chunk size in non-whitespace characters. Default: 1500.
    /// `CodememConfig::validate` requires `min_chunk_size < max_chunk_size`.
    pub max_chunk_size: usize,
    /// Minimum chunk size in non-whitespace characters. Default: 50.
    pub min_chunk_size: usize,
    /// Whether to auto-compact the graph after indexing. Default: true.
    pub auto_compact: bool,
    /// Maximum number of retained chunk graph-nodes per file after compaction. Default: 10.
    pub max_retained_chunks_per_file: usize,
    /// Minimum chunk score (0.0–1.0) to survive compaction. Default: 0.2.
    pub min_chunk_score_threshold: f64,
    /// Maximum number of retained symbol graph-nodes per file after compaction. Default: 15.
    pub max_retained_symbols_per_file: usize,
    /// Minimum symbol score (0.0–1.0) to survive compaction. Default: 0.15.
    pub min_symbol_score_threshold: f64,
}
235
236impl Default for ChunkingConfig {
237    fn default() -> Self {
238        Self {
239            enabled: true,
240            max_chunk_size: 1500,
241            min_chunk_size: 50,
242            auto_compact: true,
243            max_retained_chunks_per_file: 10,
244            min_chunk_score_threshold: 0.2,
245            max_retained_symbols_per_file: 15,
246            min_symbol_score_threshold: 0.15,
247        }
248    }
249}
250
/// Enrichment pipeline configuration for controlling insight generation thresholds.
///
/// Missing fields deserialize to their `Default` values via `#[serde(default)]`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct EnrichmentConfig {
    /// Minimum commit count for a file to generate a high-activity insight. Default: 25.
    pub git_min_commit_count: usize,
    /// Minimum co-change count for a file pair to generate a coupling insight. Default: 5.
    pub git_min_co_change_count: usize,
    /// Minimum coupling degree for a node to generate a high-coupling insight. Default: 25.
    pub perf_min_coupling_degree: usize,
    /// Minimum symbol count for a file to generate a complexity insight. Default: 30.
    pub perf_min_symbol_count: usize,
    /// Default confidence for auto-generated insights. Must be in [0.0, 1.0]
    /// (enforced by `CodememConfig::validate`). Default: 0.5.
    pub insight_confidence: f64,
    /// Cosine similarity threshold for deduplicating insights. Must be in
    /// [0.0, 1.0] (enforced by `CodememConfig::validate`). Default: 0.90.
    pub dedup_similarity_threshold: f64,
}
268
269impl Default for EnrichmentConfig {
270    fn default() -> Self {
271        Self {
272            git_min_commit_count: 25,
273            git_min_co_change_count: 5,
274            perf_min_coupling_degree: 25,
275            perf_min_symbol_count: 30,
276            insight_confidence: 0.5,
277            dedup_similarity_threshold: 0.90,
278        }
279    }
280}
281
// Unit tests live in a sibling file to keep this module focused on config types.
#[cfg(test)]
#[path = "tests/config_tests.rs"]
mod tests;