use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;
/// Root of the document configuration: an on/off flag, shared chunking
/// defaults, search settings and a map of per-collection settings.
///
/// Every field carries `#[serde(default)]`, so a key that is missing from the
/// input deserializes to that field's `Default` value instead of failing.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DocumentsConfig {
/// On/off flag; `false` when omitted.
/// NOTE(review): presumably gates the whole document feature — confirm against callers.
#[serde(default)]
pub enabled: bool,
/// Chunking settings passed to `CollectionConfig::effective_chunking` as the
/// fallback for any value a collection does not override.
#[serde(default)]
pub defaults: ChunkingConfig,
/// Search/preview settings; falls back to `SearchConfig::default()`.
#[serde(default)]
pub search: SearchConfig,
/// Per-collection settings keyed by a string identifier
/// (presumably the collection name — confirm against callers). Empty when omitted.
#[serde(default)]
pub collections: HashMap<String, CollectionConfig>,
}
/// Settings that control how search results are previewed.
///
/// Missing keys fall back to the same values produced by
/// `SearchConfig::default()`: the default `PreviewMode`, 600 preview
/// characters and highlighting turned on.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchConfig {
/// Which preview style to produce; defaults to `PreviewMode::default()`.
#[serde(default)]
pub preview_mode: PreviewMode,
/// Preview size setting; defaults to 600 via `default_preview_chars`.
/// NOTE(review): presumably an upper bound on preview length in characters — confirm in the search code.
#[serde(default = "default_preview_chars")]
pub preview_chars: usize,
/// Highlight flag; defaults to `true` via `default_highlight`.
/// NOTE(review): presumably toggles match highlighting in previews — confirm in the search code.
#[serde(default = "default_highlight")]
pub highlight: bool,
}
/// Fallback for `SearchConfig::preview_chars`, used both by serde when the key
/// is absent and by `SearchConfig::default()`.
fn default_preview_chars() -> usize {
    const FALLBACK_PREVIEW_CHARS: usize = 600;
    FALLBACK_PREVIEW_CHARS
}
/// Fallback for `SearchConfig::highlight`, used both by serde when the key is
/// absent and by `SearchConfig::default()`.
fn default_highlight() -> bool {
    const HIGHLIGHT_BY_DEFAULT: bool = true;
    HIGHLIGHT_BY_DEFAULT
}
/// Hand-written so that `SearchConfig::default()` yields the same values serde
/// substitutes for missing keys (a derived `Default` would give `0` / `false`).
impl Default for SearchConfig {
    fn default() -> Self {
        let preview_mode = PreviewMode::default();
        let preview_chars = default_preview_chars();
        let highlight = default_highlight();

        Self {
            preview_mode,
            preview_chars,
            highlight,
        }
    }
}
/// Preview style selector for search results.
///
/// Serialized in lowercase (`"full"`, `"kwic"`) because of
/// `rename_all = "lowercase"`. `Kwic` is the default variant.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum PreviewMode {
/// NOTE(review): presumably returns the full matched text as the preview — confirm in the search code.
Full,
/// Default mode.
/// NOTE(review): presumably "keyword in context", i.e. a snippet around the match — confirm in the search code.
#[default]
Kwic,
}
/// Settings for a single document collection.
///
/// The four `Option` fields are per-collection overrides: when one is `None`,
/// `effective_chunking` takes the corresponding value from the defaults it is
/// given.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CollectionConfig {
/// Filesystem locations belonging to this collection; empty when omitted.
/// NOTE(review): presumably root directories to scan — confirm against the indexer.
#[serde(default)]
pub paths: Vec<PathBuf>,
/// File patterns for this collection (the built-in values look like globs,
/// e.g. `**/*.md`). Empty when omitted, in which case `effective_patterns`
/// substitutes its built-in fallback list.
#[serde(default)]
pub patterns: Vec<String>,
/// Override for the chunking strategy; `None` inherits the default.
pub strategy: Option<ChunkingStrategy>,
/// Override for `ChunkingConfig::min_chunk_chars`; `None` inherits the default.
pub min_chunk_chars: Option<usize>,
/// Override for `ChunkingConfig::max_chunk_chars`; `None` inherits the default.
pub max_chunk_chars: Option<usize>,
/// Override for `ChunkingConfig::overlap_chars`; `None` inherits the default.
pub overlap_chars: Option<usize>,
}
impl CollectionConfig {
    /// Patterns used when a collection does not list any of its own.
    ///
    /// This is the single source of truth for both `effective_patterns` and the
    /// `Default` impl. Previously `Default` seeded only `**/*.md` while an empty
    /// (e.g. deserialized) pattern list resolved to `**/*.md` + `**/*.txt`, so
    /// the two ways of building a "default" collection scanned different files.
    fn fallback_patterns() -> Vec<String> {
        vec!["**/*.md".to_string(), "**/*.txt".to_string()]
    }

    /// Resolves the chunking settings for this collection.
    ///
    /// Each field uses the collection's own override when it is `Some`, and the
    /// matching field of `defaults` otherwise.
    ///
    /// The merged result is **not** validated here; a collection override can
    /// produce a combination that `ChunkingConfig::validate` rejects, so callers
    /// that need a consistent configuration should validate the returned value.
    pub fn effective_chunking(&self, defaults: &ChunkingConfig) -> ChunkingConfig {
        ChunkingConfig {
            // Borrow first so only the value that is actually selected is cloned.
            strategy: self
                .strategy
                .as_ref()
                .unwrap_or(&defaults.strategy)
                .clone(),
            min_chunk_chars: self.min_chunk_chars.unwrap_or(defaults.min_chunk_chars),
            max_chunk_chars: self.max_chunk_chars.unwrap_or(defaults.max_chunk_chars),
            overlap_chars: self.overlap_chars.unwrap_or(defaults.overlap_chars),
        }
    }

    /// Returns the patterns to use for this collection.
    ///
    /// An explicit, non-empty `patterns` list is returned as-is (cloned); an
    /// empty list resolves to the built-in fallback (`**/*.md`, `**/*.txt`).
    pub fn effective_patterns(&self) -> Vec<String> {
        if self.patterns.is_empty() {
            Self::fallback_patterns()
        } else {
            self.patterns.clone()
        }
    }
}

/// A default collection has no paths, no chunking overrides, and the same
/// pattern list that `effective_patterns` falls back to for an empty list.
impl Default for CollectionConfig {
    fn default() -> Self {
        Self {
            paths: Vec::new(),
            patterns: Self::fallback_patterns(),
            strategy: None,
            min_chunk_chars: None,
            max_chunk_chars: None,
            overlap_chars: None,
        }
    }
}
/// Chunking parameters: a strategy plus three size settings.
///
/// Missing keys deserialize to the same values as `ChunkingConfig::default()`
/// (default strategy, 200 / 1500 / 100). Deserialization does not enforce any
/// relationship between the sizes; `validate` checks that
/// `overlap_chars < min_chunk_chars < max_chunk_chars`.
///
/// NOTE(review): the fields are named in "chars"; how the chunker actually
/// measures them is not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkingConfig {
/// Chunking strategy; defaults to `ChunkingStrategy::default()`.
#[serde(default)]
pub strategy: ChunkingStrategy,
/// Lower size setting; defaults to 200 via `default_min_chunk_chars`.
#[serde(default = "default_min_chunk_chars")]
pub min_chunk_chars: usize,
/// Upper size setting; defaults to 1500 via `default_max_chunk_chars`.
#[serde(default = "default_max_chunk_chars")]
pub max_chunk_chars: usize,
/// Overlap setting; defaults to 100 via `default_overlap_chars`.
/// NOTE(review): presumably the amount shared between adjacent chunks — confirm in the chunker.
#[serde(default = "default_overlap_chars")]
pub overlap_chars: usize,
}
/// Fallback for `ChunkingConfig::min_chunk_chars`, used by serde when the key
/// is absent and by `ChunkingConfig::default()`.
fn default_min_chunk_chars() -> usize {
    const FALLBACK_MIN_CHUNK_CHARS: usize = 200;
    FALLBACK_MIN_CHUNK_CHARS
}
/// Fallback for `ChunkingConfig::max_chunk_chars`, used by serde when the key
/// is absent and by `ChunkingConfig::default()`.
fn default_max_chunk_chars() -> usize {
    const FALLBACK_MAX_CHUNK_CHARS: usize = 1500;
    FALLBACK_MAX_CHUNK_CHARS
}
/// Fallback for `ChunkingConfig::overlap_chars`, used by serde when the key is
/// absent and by `ChunkingConfig::default()`.
fn default_overlap_chars() -> usize {
    const FALLBACK_OVERLAP_CHARS: usize = 100;
    FALLBACK_OVERLAP_CHARS
}
/// Hand-written so that `ChunkingConfig::default()` yields the same values
/// serde substitutes for missing keys (a derived `Default` would give zeros).
impl Default for ChunkingConfig {
    fn default() -> Self {
        let strategy = ChunkingStrategy::default();
        let min_chunk_chars = default_min_chunk_chars();
        let max_chunk_chars = default_max_chunk_chars();
        let overlap_chars = default_overlap_chars();

        Self {
            strategy,
            min_chunk_chars,
            max_chunk_chars,
            overlap_chars,
        }
    }
}
impl ChunkingConfig {
    /// Checks that the size settings are mutually consistent.
    ///
    /// Requires `overlap_chars < min_chunk_chars < max_chunk_chars` (both
    /// comparisons strict).
    ///
    /// # Errors
    ///
    /// Returns a human-readable message describing the first violated rule.
    /// The min/max relationship is checked before the overlap/min one, so when
    /// both are violated only the min/max message is reported.
    pub fn validate(&self) -> Result<(), String> {
        let (min, max, overlap) = (
            self.min_chunk_chars,
            self.max_chunk_chars,
            self.overlap_chars,
        );

        if min >= max {
            Err(format!(
                "min_chunk_chars ({}) must be less than max_chunk_chars ({})",
                min, max
            ))
        } else if overlap >= min {
            Err(format!(
                "overlap_chars ({}) should be less than min_chunk_chars ({})",
                overlap, min
            ))
        } else {
            Ok(())
        }
    }
}
/// Chunking strategy selector.
///
/// Serialized in lowercase (`"hybrid"`) because of `rename_all = "lowercase"`.
/// `Hybrid` is currently the only variant and therefore also the default.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum ChunkingStrategy {
/// Default (and only) strategy.
/// NOTE(review): what "hybrid" combines is not visible in this file — see the chunker implementation.
#[default]
Hybrid,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `ChunkingConfig::default()` exposes the documented fallback values.
    #[test]
    fn test_chunking_config_defaults() {
        let ChunkingConfig {
            strategy,
            min_chunk_chars,
            max_chunk_chars,
            overlap_chars,
        } = ChunkingConfig::default();

        assert_eq!(min_chunk_chars, 200);
        assert_eq!(max_chunk_chars, 1500);
        assert_eq!(overlap_chars, 100);
        assert_eq!(strategy, ChunkingStrategy::Hybrid);
    }

    /// Defaults validate; min above max and overlap above min are both rejected.
    #[test]
    fn test_chunking_config_validation() {
        let baseline = ChunkingConfig::default();
        assert!(baseline.validate().is_ok());

        let min_above_max = ChunkingConfig {
            min_chunk_chars: 2000,
            ..ChunkingConfig::default()
        };
        assert!(min_above_max.validate().is_err());

        let overlap_above_min = ChunkingConfig {
            min_chunk_chars: 200,
            overlap_chars: 300,
            ..ChunkingConfig::default()
        };
        assert!(overlap_above_min.validate().is_err());
    }

    /// Unset overrides inherit the defaults; set overrides replace only their own field.
    #[test]
    fn test_collection_effective_chunking() {
        let defaults = ChunkingConfig::default();

        let inherited = CollectionConfig::default().effective_chunking(&defaults);
        assert_eq!(inherited.max_chunk_chars, 1500);

        let with_override = CollectionConfig {
            max_chunk_chars: Some(2000),
            ..Default::default()
        };
        let merged = with_override.effective_chunking(&defaults);
        assert_eq!(merged.max_chunk_chars, 2000);
        assert_eq!(merged.min_chunk_chars, 200);
    }
}