Skip to main content

codanna/documents/
config.rs

//! Configuration types for document chunking, search previews, and collections.
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5use std::path::PathBuf;
6
7/// Top-level configuration for the documents feature.
8#[derive(Debug, Clone, Default, Serialize, Deserialize)]
9pub struct DocumentsConfig {
10    /// Whether document indexing is enabled.
11    #[serde(default)]
12    pub enabled: bool,
13
14    /// Default chunking configuration (applies to all collections unless overridden).
15    #[serde(default)]
16    pub defaults: ChunkingConfig,
17
18    /// Search result display configuration.
19    #[serde(default)]
20    pub search: SearchConfig,
21
22    /// Named collections of documents.
23    #[serde(default)]
24    pub collections: HashMap<String, CollectionConfig>,
25}
26
27/// Configuration for search result display.
28#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct SearchConfig {
30    /// Preview mode: "full" shows entire chunk, "kwic" centers on keyword.
31    #[serde(default)]
32    pub preview_mode: PreviewMode,
33
34    /// Number of characters to show in preview (for kwic mode).
35    #[serde(default = "default_preview_chars")]
36    pub preview_chars: usize,
37
38    /// Whether to highlight matching keywords in preview.
39    #[serde(default = "default_highlight")]
40    pub highlight: bool,
41}
42
/// Serde fallback for `SearchConfig::preview_chars`: 600 characters.
fn default_preview_chars() -> usize {
    const PREVIEW_CHARS: usize = 600;
    PREVIEW_CHARS
}
46
/// Serde fallback for `SearchConfig::highlight`: highlighting is enabled.
fn default_highlight() -> bool {
    const HIGHLIGHT: bool = true;
    HIGHLIGHT
}
50
51impl Default for SearchConfig {
52    fn default() -> Self {
53        Self {
54            preview_mode: PreviewMode::default(),
55            preview_chars: default_preview_chars(),
56            highlight: default_highlight(),
57        }
58    }
59}
60
61/// Preview mode for search results.
62#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
63#[serde(rename_all = "lowercase")]
64pub enum PreviewMode {
65    /// Show entire chunk content.
66    Full,
67    /// Keyword In Context: center preview window around first match.
68    #[default]
69    Kwic,
70}
71
72/// Configuration for a single document collection.
73#[derive(Debug, Clone, Serialize, Deserialize)]
74pub struct CollectionConfig {
75    /// Paths to include (directories or individual files).
76    #[serde(default)]
77    pub paths: Vec<PathBuf>,
78
79    /// Glob patterns for file matching (e.g., "**/*.md").
80    #[serde(default)]
81    pub patterns: Vec<String>,
82
83    /// Chunking strategy (overrides defaults).
84    pub strategy: Option<ChunkingStrategy>,
85
86    /// Minimum chunk size in characters (overrides defaults).
87    pub min_chunk_chars: Option<usize>,
88
89    /// Maximum chunk size in characters (overrides defaults).
90    pub max_chunk_chars: Option<usize>,
91
92    /// Overlap between chunks in characters (overrides defaults).
93    pub overlap_chars: Option<usize>,
94}
95
96impl CollectionConfig {
97    /// Merge with default config to get effective chunking settings.
98    pub fn effective_chunking(&self, defaults: &ChunkingConfig) -> ChunkingConfig {
99        ChunkingConfig {
100            strategy: self.strategy.clone().unwrap_or(defaults.strategy.clone()),
101            min_chunk_chars: self.min_chunk_chars.unwrap_or(defaults.min_chunk_chars),
102            max_chunk_chars: self.max_chunk_chars.unwrap_or(defaults.max_chunk_chars),
103            overlap_chars: self.overlap_chars.unwrap_or(defaults.overlap_chars),
104        }
105    }
106
107    /// Get default patterns if none specified.
108    pub fn effective_patterns(&self) -> Vec<String> {
109        if self.patterns.is_empty() {
110            vec!["**/*.md".to_string(), "**/*.txt".to_string()]
111        } else {
112            self.patterns.clone()
113        }
114    }
115}
116
117impl Default for CollectionConfig {
118    fn default() -> Self {
119        Self {
120            paths: Vec::new(),
121            patterns: vec!["**/*.md".to_string()],
122            strategy: None,
123            min_chunk_chars: None,
124            max_chunk_chars: None,
125            overlap_chars: None,
126        }
127    }
128}
129
130/// Configuration for document chunking.
131#[derive(Debug, Clone, Serialize, Deserialize)]
132pub struct ChunkingConfig {
133    /// Chunking strategy to use.
134    #[serde(default)]
135    pub strategy: ChunkingStrategy,
136
137    /// Minimum chunk size in characters. Smaller chunks are merged.
138    #[serde(default = "default_min_chunk_chars")]
139    pub min_chunk_chars: usize,
140
141    /// Maximum chunk size in characters. Larger chunks are split.
142    #[serde(default = "default_max_chunk_chars")]
143    pub max_chunk_chars: usize,
144
145    /// Overlap between adjacent chunks in characters.
146    #[serde(default = "default_overlap_chars")]
147    pub overlap_chars: usize,
148}
149
/// Serde fallback for `ChunkingConfig::min_chunk_chars`: 200 characters.
fn default_min_chunk_chars() -> usize {
    const MIN_CHUNK_CHARS: usize = 200;
    MIN_CHUNK_CHARS
}
153
/// Serde fallback for `ChunkingConfig::max_chunk_chars`: 1500 characters.
fn default_max_chunk_chars() -> usize {
    const MAX_CHUNK_CHARS: usize = 1500;
    MAX_CHUNK_CHARS
}
157
/// Serde fallback for `ChunkingConfig::overlap_chars`: 100 characters.
fn default_overlap_chars() -> usize {
    const OVERLAP_CHARS: usize = 100;
    OVERLAP_CHARS
}
161
162impl Default for ChunkingConfig {
163    fn default() -> Self {
164        Self {
165            strategy: ChunkingStrategy::default(),
166            min_chunk_chars: default_min_chunk_chars(),
167            max_chunk_chars: default_max_chunk_chars(),
168            overlap_chars: default_overlap_chars(),
169        }
170    }
171}
172
173impl ChunkingConfig {
174    /// Validate configuration values.
175    pub fn validate(&self) -> Result<(), String> {
176        if self.min_chunk_chars >= self.max_chunk_chars {
177            return Err(format!(
178                "min_chunk_chars ({}) must be less than max_chunk_chars ({})",
179                self.min_chunk_chars, self.max_chunk_chars
180            ));
181        }
182
183        if self.overlap_chars >= self.min_chunk_chars {
184            return Err(format!(
185                "overlap_chars ({}) should be less than min_chunk_chars ({})",
186                self.overlap_chars, self.min_chunk_chars
187            ));
188        }
189
190        Ok(())
191    }
192}
193
194/// Strategy for splitting documents into chunks.
195#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
196#[serde(rename_all = "lowercase")]
197pub enum ChunkingStrategy {
198    /// Hybrid strategy: paragraph-based with size constraints.
199    /// Splits on double newlines, merges small chunks, splits large chunks with overlap.
200    #[default]
201    Hybrid,
202}
203
#[cfg(test)]
mod tests {
    use super::*;

    /// The built-in chunking defaults are 200 / 1500 / 100 characters with
    /// the hybrid strategy.
    #[test]
    fn test_chunking_config_defaults() {
        let ChunkingConfig {
            strategy,
            min_chunk_chars,
            max_chunk_chars,
            overlap_chars,
        } = ChunkingConfig::default();

        assert_eq!(min_chunk_chars, 200);
        assert_eq!(max_chunk_chars, 1500);
        assert_eq!(overlap_chars, 100);
        assert_eq!(strategy, ChunkingStrategy::Hybrid);
    }

    /// `validate` accepts the defaults and rejects each broken size relation.
    #[test]
    fn test_chunking_config_validation() {
        // The defaults form a valid configuration.
        let valid = ChunkingConfig::default();
        assert!(valid.validate().is_ok());

        // Rejected: min >= max.
        let min_too_large = ChunkingConfig {
            min_chunk_chars: 2000,
            ..ChunkingConfig::default()
        };
        assert!(min_too_large.validate().is_err());

        // Rejected: overlap >= min.
        let overlap_too_large = ChunkingConfig {
            min_chunk_chars: 200,
            overlap_chars: 300,
            ..ChunkingConfig::default()
        };
        assert!(overlap_too_large.validate().is_err());
    }

    /// Collection overrides replace only the fields they set.
    #[test]
    fn test_collection_effective_chunking() {
        let defaults = ChunkingConfig::default();

        // Without overrides the defaults pass straight through.
        let plain = CollectionConfig::default();
        assert_eq!(plain.effective_chunking(&defaults).max_chunk_chars, 1500);

        // A single override changes that field and nothing else.
        let overridden = CollectionConfig {
            max_chunk_chars: Some(2000),
            ..Default::default()
        };
        let merged = overridden.effective_chunking(&defaults);
        assert_eq!(merged.max_chunk_chars, 2000);
        assert_eq!(merged.min_chunk_chars, 200); // still the default
    }
}