Skip to main content

codemem_core/
config.rs

//! Persistent configuration for Codemem.
//!
//! Loads and saves a TOML config file; the default location is `~/.codemem/config.toml`.
4
5use crate::{CodememError, GraphConfig, ScoringWeights, VectorConfig};
6use serde::{Deserialize, Serialize};
7use std::path::{Path, PathBuf};
8
9/// Top-level Codemem configuration.
10#[derive(Debug, Clone, Default, Serialize, Deserialize)]
11#[serde(default)]
12pub struct CodememConfig {
13    pub scoring: ScoringWeights,
14    pub vector: VectorConfig,
15    pub graph: GraphConfig,
16    pub embedding: EmbeddingConfig,
17    pub storage: StorageConfig,
18    pub chunking: ChunkingConfig,
19    pub enrichment: EnrichmentConfig,
20    pub scip: ScipConfig,
21}
22
23impl CodememConfig {
24    /// Load configuration from the given path. Validates after loading.
25    pub fn load(path: &Path) -> Result<Self, CodememError> {
26        let content = std::fs::read_to_string(path)?;
27        let config: Self =
28            toml::from_str(&content).map_err(|e| CodememError::Config(e.to_string()))?;
29        config.validate()?;
30        Ok(config)
31    }
32
33    /// Validate configuration values.
34    ///
35    /// Checks that scoring weights are non-negative, dimensions and cache sizes
36    /// are positive, chunk size bounds are consistent, and dedup threshold is
37    /// in the valid range.
38    pub fn validate(&self) -> Result<(), CodememError> {
39        // M5: Scoring weights must be finite and non-negative.
40        // Check is_finite() first to reject NaN/Inf, then < 0.0 for negatives.
41        let w = &self.scoring;
42        let weights = [
43            w.vector_similarity,
44            w.graph_strength,
45            w.token_overlap,
46            w.temporal,
47            w.tag_matching,
48            w.importance,
49            w.confidence,
50            w.recency,
51        ];
52        if weights.iter().any(|v| !v.is_finite() || *v < 0.0) {
53            return Err(CodememError::Config(
54                "All scoring weights must be finite and non-negative".to_string(),
55            ));
56        }
57
58        // Embedding dimensions must be positive
59        if self.embedding.dimensions == 0 {
60            return Err(CodememError::Config(
61                "Embedding dimensions must be > 0".to_string(),
62            ));
63        }
64
65        // Vector dimensions must be positive
66        if self.vector.dimensions == 0 {
67            return Err(CodememError::Config(
68                "Vector dimensions must be > 0".to_string(),
69            ));
70        }
71
72        // Cache capacity must be positive
73        if self.embedding.cache_capacity == 0 {
74            return Err(CodememError::Config(
75                "Embedding cache capacity must be > 0".to_string(),
76            ));
77        }
78
79        // Batch size must be positive
80        if self.embedding.batch_size == 0 {
81            return Err(CodememError::Config(
82                "Embedding batch size must be > 0".to_string(),
83            ));
84        }
85
86        // Chunk size bounds
87        if self.chunking.min_chunk_size >= self.chunking.max_chunk_size {
88            return Err(CodememError::Config(
89                "min_chunk_size must be less than max_chunk_size".to_string(),
90            ));
91        }
92
93        // Dedup threshold in [0.0, 1.0] (also rejects NaN via range check)
94        if !(0.0..=1.0).contains(&self.enrichment.dedup_similarity_threshold) {
95            return Err(CodememError::Config(
96                "dedup_similarity_threshold must be between 0.0 and 1.0".to_string(),
97            ));
98        }
99
100        // Enrichment confidence in [0.0, 1.0]
101        if !(0.0..=1.0).contains(&self.enrichment.insight_confidence) {
102            return Err(CodememError::Config(
103                "insight_confidence must be between 0.0 and 1.0".to_string(),
104            ));
105        }
106
107        // SCIP max_references_per_symbol must be positive
108        if self.scip.max_references_per_symbol == 0 {
109            return Err(CodememError::Config(
110                "scip.max_references_per_symbol must be > 0".to_string(),
111            ));
112        }
113
114        // Chunking score thresholds in [0.0, 1.0]
115        let thresholds = [
116            (
117                self.chunking.min_chunk_score_threshold,
118                "min_chunk_score_threshold",
119            ),
120            (
121                self.chunking.min_symbol_score_threshold,
122                "min_symbol_score_threshold",
123            ),
124        ];
125        for (val, name) in &thresholds {
126            if !(0.0..=1.0).contains(val) {
127                return Err(CodememError::Config(format!(
128                    "{name} must be between 0.0 and 1.0"
129                )));
130            }
131        }
132
133        Ok(())
134    }
135
136    /// Save configuration to the given path. Validates before saving.
137    pub fn save(&self, path: &Path) -> Result<(), CodememError> {
138        // M5: Validate before saving to prevent persisting invalid config.
139        self.validate()?;
140        let content =
141            toml::to_string_pretty(self).map_err(|e| CodememError::Config(e.to_string()))?;
142        if let Some(parent) = path.parent() {
143            std::fs::create_dir_all(parent)?;
144        }
145        std::fs::write(path, content)?;
146        Ok(())
147    }
148
149    /// Load from the default path, or return defaults if the file doesn't exist.
150    pub fn load_or_default() -> Self {
151        let path = Self::default_path();
152        if path.exists() {
153            match Self::load(&path) {
154                Ok(config) => config,
155                Err(e) => {
156                    tracing::warn!("Failed to load config: {e}, using defaults");
157                    CodememConfig::default()
158                }
159            }
160        } else {
161            Self::default()
162        }
163    }
164
165    /// Default config path: `~/.codemem/config.toml`.
166    pub fn default_path() -> PathBuf {
167        dirs::home_dir()
168            .unwrap_or_else(|| PathBuf::from("."))
169            .join(".codemem")
170            .join("config.toml")
171    }
172}
173
174/// Embedding provider configuration.
175#[derive(Debug, Clone, Serialize, Deserialize)]
176#[serde(default)]
177pub struct EmbeddingConfig {
178    /// Provider name: "candle" (default), "ollama", or "openai".
179    pub provider: String,
180    /// Model name (provider-specific). For Candle: HF repo ID (e.g. "BAAI/bge-base-en-v1.5").
181    pub model: String,
182    /// API URL for remote providers.
183    pub url: String,
184    /// Embedding dimensions for remote providers (Ollama/OpenAI).
185    /// Ignored by Candle — reads `hidden_size` from model's config.json.
186    pub dimensions: usize,
187    /// LRU cache capacity.
188    pub cache_capacity: usize,
189    /// Batch size for embedding forward passes (GPU memory trade-off).
190    pub batch_size: usize,
191    /// Weight dtype: "f32" (default), "f16" (half precision), "bf16".
192    /// F16 halves memory and is faster on Metal GPU.
193    pub dtype: String,
194}
195
196impl Default for EmbeddingConfig {
197    fn default() -> Self {
198        Self {
199            provider: "candle".to_string(),
200            model: "BAAI/bge-base-en-v1.5".to_string(),
201            url: String::new(),
202            dimensions: 768,
203            cache_capacity: 10_000,
204            batch_size: 16,
205            dtype: "f32".to_string(),
206        }
207    }
208}
209
210/// Storage configuration.
211#[derive(Debug, Clone, Serialize, Deserialize)]
212#[serde(default)]
213pub struct StorageConfig {
214    /// SQLite cache size in MB.
215    pub cache_size_mb: u32,
216    /// SQLite busy timeout in seconds.
217    pub busy_timeout_secs: u64,
218}
219
220impl Default for StorageConfig {
221    fn default() -> Self {
222        Self {
223            cache_size_mb: 64,
224            busy_timeout_secs: 5,
225        }
226    }
227}
228
229/// CST-aware code chunking configuration.
230#[derive(Debug, Clone, Serialize, Deserialize)]
231#[serde(default)]
232pub struct ChunkingConfig {
233    /// Whether chunking is enabled during indexing.
234    pub enabled: bool,
235    /// Maximum chunk size in non-whitespace characters.
236    pub max_chunk_size: usize,
237    /// Minimum chunk size in non-whitespace characters.
238    pub min_chunk_size: usize,
239    /// Whether to auto-compact the graph after indexing.
240    pub auto_compact: bool,
241    /// Maximum number of retained chunk graph-nodes per file after compaction.
242    pub max_retained_chunks_per_file: usize,
243    /// Minimum chunk score (0.0–1.0) to survive compaction.
244    pub min_chunk_score_threshold: f64,
245    /// Maximum number of retained symbol graph-nodes per file after compaction.
246    pub max_retained_symbols_per_file: usize,
247    /// Minimum symbol score (0.0–1.0) to survive compaction.
248    pub min_symbol_score_threshold: f64,
249}
250
251impl Default for ChunkingConfig {
252    fn default() -> Self {
253        Self {
254            enabled: true,
255            max_chunk_size: 1500,
256            min_chunk_size: 50,
257            auto_compact: true,
258            max_retained_chunks_per_file: 10,
259            min_chunk_score_threshold: 0.2,
260            max_retained_symbols_per_file: 15,
261            min_symbol_score_threshold: 0.15,
262        }
263    }
264}
265
266/// SCIP integration configuration.
267#[derive(Debug, Clone, Serialize, Deserialize)]
268#[serde(default)]
269pub struct ScipConfig {
270    /// Master switch for SCIP integration.
271    pub enabled: bool,
272    /// Check PATH for available indexers.
273    pub auto_detect_indexers: bool,
274    /// Cache .scip files between runs.
275    pub cache_index: bool,
276    /// Re-index if cache older than this many hours.
277    pub cache_ttl_hours: u64,
278    /// Create ext: nodes for dependency symbols.
279    pub create_external_nodes: bool,
280    /// Skip utility symbols with excessive fan-out (fallback for kinds without per-kind limits).
281    pub max_references_per_symbol: usize,
282    /// Attach hover docs as memories to nodes.
283    pub store_docs_as_memories: bool,
284    /// Build nested containment tree from SCIP descriptor chains.
285    /// When true: file→module→class→method. When false: flat file→symbol.
286    pub hierarchical_containment: bool,
287    /// Collapse intra-class edges into parent metadata.
288    pub collapse_intra_class_edges: bool,
289    /// Per-kind fan-out limits (0 = use max_references_per_symbol fallback).
290    pub fan_out_limits: FanOutLimits,
291    /// Per-language indexer command overrides.
292    pub indexers: ScipIndexersConfig,
293}
294
295/// Per-kind inbound reference limits. A module can be widely imported; a function less so.
296#[derive(Debug, Clone, Serialize, Deserialize)]
297#[serde(default)]
298pub struct FanOutLimits {
299    pub module: usize,
300    pub function: usize,
301    pub method: usize,
302    pub class: usize,
303}
304
305impl Default for FanOutLimits {
306    fn default() -> Self {
307        Self {
308            module: 200,
309            function: 30,
310            method: 30,
311            class: 50,
312        }
313    }
314}
315
316impl Default for ScipConfig {
317    fn default() -> Self {
318        Self {
319            enabled: true,
320            auto_detect_indexers: true,
321            cache_index: true,
322            cache_ttl_hours: 24,
323            create_external_nodes: true,
324            max_references_per_symbol: 100,
325            store_docs_as_memories: true,
326            hierarchical_containment: true,
327            collapse_intra_class_edges: true,
328            fan_out_limits: FanOutLimits::default(),
329            indexers: ScipIndexersConfig::default(),
330        }
331    }
332}
333
334/// Per-language SCIP indexer command overrides. Empty string means auto-detect from PATH.
335///
336/// Commands are split on whitespace — paths with spaces are **not** supported.
337/// Use symlinks or PATH entries for indexers in directories with spaces.
338#[derive(Debug, Clone, Default, Serialize, Deserialize)]
339#[serde(default)]
340pub struct ScipIndexersConfig {
341    pub rust: String,
342    pub typescript: String,
343    pub python: String,
344    pub java: String,
345    pub go: String,
346}
347
348/// Enrichment pipeline configuration for controlling insight generation thresholds.
349#[derive(Debug, Clone, Serialize, Deserialize)]
350#[serde(default)]
351pub struct EnrichmentConfig {
352    /// Minimum commit count for a file to generate a high-activity insight.
353    pub git_min_commit_count: usize,
354    /// Minimum co-change count for a file pair to generate a coupling insight.
355    pub git_min_co_change_count: usize,
356    /// Minimum coupling degree for a node to generate a high-coupling insight.
357    pub perf_min_coupling_degree: usize,
358    /// Minimum symbol count for a file to generate a complexity insight.
359    pub perf_min_symbol_count: usize,
360    /// Default confidence for auto-generated insights.
361    pub insight_confidence: f64,
362    /// Cosine similarity threshold for deduplicating insights.
363    pub dedup_similarity_threshold: f64,
364}
365
366impl Default for EnrichmentConfig {
367    fn default() -> Self {
368        Self {
369            git_min_commit_count: 25,
370            git_min_co_change_count: 5,
371            perf_min_coupling_degree: 25,
372            perf_min_symbol_count: 30,
373            insight_confidence: 0.5,
374            dedup_similarity_threshold: 0.90,
375        }
376    }
377}
378
// Unit tests live in a separate file and are compiled only for test builds.
#[cfg(test)]
#[path = "tests/config_tests.rs"]
mod tests;