//! Configuration validation (graphrag_core/config/validation.rs).

use crate::{GraphRAGError, Result};
use crate::config::{Config, SetConfig};
use std::path::Path;

/// Result of configuration validation
///
/// Collects errors, warnings and suggestions across a whole validation
/// pass instead of failing fast, so callers can report everything at once.
#[derive(Debug, Clone)]
pub struct ValidationResult {
    /// Whether the configuration is valid
    pub is_valid: bool,
    /// List of validation errors
    pub errors: Vec<String>,
    /// List of validation warnings
    pub warnings: Vec<String>,
    /// List of optimization suggestions
    pub suggestions: Vec<String>,
}

impl Default for ValidationResult {
    /// A fresh result starts out valid; `add_error` flips it to invalid.
    fn default() -> Self {
        Self {
            // Start optimistic: a result with no recorded errors is valid.
            // A derived `Default` would set this to `false` (bool's default),
            // so every error-free validation would wrongly report an invalid
            // configuration — hence the manual impl.
            is_valid: true,
            errors: Vec::new(),
            warnings: Vec::new(),
            suggestions: Vec::new(),
        }
    }
}

impl ValidationResult {
    /// Create a new validation result (valid until an error is added)
    pub fn new() -> Self {
        Self::default()
    }

    /// Add an error and mark validation as failed
    pub fn add_error(&mut self, error: String) {
        self.errors.push(error);
        self.is_valid = false;
    }

    /// Add a warning (doesn't affect validity)
    pub fn add_warning(&mut self, warning: String) {
        self.warnings.push(warning);
    }

    /// Add an optimization suggestion
    pub fn add_suggestion(&mut self, suggestion: String) {
        self.suggestions.push(suggestion);
    }
}
/// Trait for configuration validation
///
/// Implementors report problems through a [`ValidationResult`] instead of
/// returning early, so a single pass can collect every error, warning and
/// suggestion at once.
pub trait Validatable {
    /// Validate configuration with standard checks
    fn validate(&self) -> ValidationResult;
    /// Validate configuration with strict checks (includes warnings and suggestions)
    fn validate_strict(&self) -> ValidationResult;
}
49impl Validatable for Config {
50    fn validate(&self) -> ValidationResult {
51        let mut result = ValidationResult::new();
52
53        // Validate output directory
54        if self.output_dir.is_empty() {
55            result.add_error("Output directory cannot be empty".to_string());
56        }
57
58        // Validate chunk size
59        if self.chunk_size == 0 {
60            result.add_error("Chunk size must be greater than 0".to_string());
61        } else if self.chunk_size < 100 {
62            result.add_warning("Chunk size is very small (<100), this may affect performance".to_string());
63        } else if self.chunk_size > 10000 {
64            result.add_warning("Chunk size is very large (>10000), this may affect quality".to_string());
65        } else {
66            // Chunk size is in acceptable range (100-10000)
67        }
68
69        // Validate chunk overlap
70        if self.chunk_overlap >= self.chunk_size {
71            result.add_error("Chunk overlap must be less than chunk size".to_string());
72        } else if self.chunk_overlap > self.chunk_size / 2 {
73            result.add_warning("Chunk overlap is more than 50% of chunk size, this may be inefficient".to_string());
74        } else {
75            // Chunk overlap is in acceptable range
76        }
77
78        // Validate entity extraction settings
79        if let Some(max_entities) = self.max_entities_per_chunk {
80            if max_entities == 0 {
81                result.add_error("Max entities per chunk must be greater than 0".to_string());
82            } else if max_entities > 100 {
83                result.add_warning("Max entities per chunk is very high (>100)".to_string());
84            } else {
85                // Max entities is in acceptable range
86            }
87        }
88
89        // Validate retrieval settings
90        if let Some(top_k) = self.top_k_results {
91            if top_k == 0 {
92                result.add_error("Top-k results must be greater than 0".to_string());
93            } else if top_k > 100 {
94                result.add_warning("Top-k results is very high (>100), this may affect performance".to_string());
95            } else {
96                // Top-k is in acceptable range
97            }
98        }
99
100        // Validate similarity threshold
101        if let Some(threshold) = self.similarity_threshold {
102            if !(0.0..=1.0).contains(&threshold) {
103                result.add_error("Similarity threshold must be between 0.0 and 1.0".to_string());
104            } else if threshold < 0.1 {
105                result.add_warning("Similarity threshold is very low (<0.1), this may return irrelevant results".to_string());
106            } else if threshold > 0.9 {
107                result.add_warning("Similarity threshold is very high (>0.9), this may return too few results".to_string());
108            } else {
109                // Similarity threshold is in acceptable range (0.1-0.9)
110            }
111        }
112
113        // Add suggestions based on configuration
114        if self.chunk_size > 1000 && self.chunk_overlap < 100 {
115            result.add_suggestion("Consider increasing chunk overlap for better context preservation with large chunks".to_string());
116        }
117
118        result
119    }
120
121    fn validate_strict(&self) -> ValidationResult {
122        let mut result = self.validate();
123
124        // Additional strict validations
125
126        // Ensure all paths exist
127        let output_path = Path::new(&self.output_dir);
128        if !output_path.exists() {
129            result.add_warning(format!("Output directory does not exist: {}", self.output_dir));
130            result.add_suggestion("Directory will be created automatically".to_string());
131        }
132
133        // Validate feature compatibility
134        #[cfg(not(feature = "ollama"))]
135        {
136            result.add_warning("Ollama feature is not enabled, local LLM support unavailable".to_string());
137        }
138
139        #[cfg(not(feature = "parallel-processing"))]
140        {
141            result.add_warning("Parallel processing is not enabled, performance may be reduced".to_string());
142        }
143
144        // Check for optimal settings
145        let optimal_chunk_size = 800;
146        let optimal_overlap = 200;
147
148        if (self.chunk_size as i32 - optimal_chunk_size).abs() > 300 {
149            result.add_suggestion(format!(
150                "Consider using chunk size around {} for optimal performance",
151                optimal_chunk_size
152            ));
153        }
154
155        if (self.chunk_overlap as i32 - optimal_overlap).abs() > 100 {
156            result.add_suggestion(format!(
157                "Consider using chunk overlap around {} for optimal context preservation",
158                optimal_overlap
159            ));
160        }
161
162        result
163    }
164}
166/// Validate pipeline approach configuration (semantic/algorithmic/hybrid)
167fn validate_pipeline_approach(config: &SetConfig, result: &mut ValidationResult) {
168    let approach = &config.mode.approach;
169
170    // Validate approach value
171    match approach.as_str() {
172        "semantic" | "algorithmic" | "hybrid" => {},
173        invalid => {
174            result.add_error(format!("Invalid pipeline approach: '{}'. Must be 'semantic', 'algorithmic', or 'hybrid'", invalid));
175            return;
176        }
177    }
178
179    // Validate semantic pipeline
180    if approach == "semantic" {
181        match &config.semantic {
182            None => {
183                result.add_error("Semantic pipeline approach selected but [semantic] configuration is missing".to_string());
184            }
185            Some(semantic) => {
186                if !semantic.enabled {
187                    result.add_error("Semantic pipeline approach selected but semantic.enabled = false".to_string());
188                }
189
190                // Validate semantic embeddings
191                let valid_backends = ["huggingface", "openai", "voyage", "cohere", "jina", "mistral", "together", "ollama"];
192                if !valid_backends.contains(&semantic.embeddings.backend.as_str()) {
193                    result.add_error(format!(
194                        "Invalid semantic embedding backend: '{}'. Must be one of: {}",
195                        semantic.embeddings.backend,
196                        valid_backends.join(", ")
197                    ));
198                }
199
200                if semantic.embeddings.dimension == 0 {
201                    result.add_error("Semantic embedding dimension must be greater than 0".to_string());
202                }
203
204                // Validate semantic entity extraction
205                if semantic.entity_extraction.confidence_threshold < 0.0 || semantic.entity_extraction.confidence_threshold > 1.0 {
206                    result.add_error("Semantic entity extraction confidence threshold must be between 0.0 and 1.0".to_string());
207                }
208
209                if semantic.entity_extraction.temperature < 0.0 || semantic.entity_extraction.temperature > 2.0 {
210                    result.add_error("Semantic entity extraction temperature must be between 0.0 and 2.0".to_string());
211                }
212
213                // Validate semantic retrieval
214                if semantic.retrieval.similarity_threshold < 0.0 || semantic.retrieval.similarity_threshold > 1.0 {
215                    result.add_error("Semantic retrieval similarity threshold must be between 0.0 and 1.0".to_string());
216                }
217
218                if semantic.retrieval.top_k == 0 {
219                    result.add_error("Semantic retrieval top_k must be greater than 0".to_string());
220                }
221            }
222        }
223    }
224
225    // Validate algorithmic pipeline
226    if approach == "algorithmic" {
227        match &config.algorithmic {
228            None => {
229                result.add_error("Algorithmic pipeline approach selected but [algorithmic] configuration is missing".to_string());
230            }
231            Some(algorithmic) => {
232                if !algorithmic.enabled {
233                    result.add_error("Algorithmic pipeline approach selected but algorithmic.enabled = false".to_string());
234                }
235
236                // Validate algorithmic embeddings
237                if algorithmic.embeddings.backend != "hash" {
238                    result.add_warning(format!(
239                        "Algorithmic pipeline typically uses 'hash' backend, but '{}' is configured",
240                        algorithmic.embeddings.backend
241                    ));
242                }
243
244                if algorithmic.embeddings.dimension == 0 {
245                    result.add_error("Algorithmic embedding dimension must be greater than 0".to_string());
246                }
247
248                if algorithmic.embeddings.max_document_frequency < 0.0 || algorithmic.embeddings.max_document_frequency > 1.0 {
249                    result.add_error("Algorithmic max_document_frequency must be between 0.0 and 1.0".to_string());
250                }
251
252                // Validate algorithmic entity extraction
253                if algorithmic.entity_extraction.confidence_threshold < 0.0 || algorithmic.entity_extraction.confidence_threshold > 1.0 {
254                    result.add_error("Algorithmic entity extraction confidence threshold must be between 0.0 and 1.0".to_string());
255                }
256
257                // Validate algorithmic retrieval (BM25 parameters)
258                if algorithmic.retrieval.k1 < 0.0 {
259                    result.add_error("BM25 k1 parameter must be non-negative".to_string());
260                }
261
262                if algorithmic.retrieval.b < 0.0 || algorithmic.retrieval.b > 1.0 {
263                    result.add_error("BM25 b parameter must be between 0.0 and 1.0".to_string());
264                }
265
266                if algorithmic.retrieval.top_k == 0 {
267                    result.add_error("Algorithmic retrieval top_k must be greater than 0".to_string());
268                }
269            }
270        }
271    }
272
273    // Validate hybrid pipeline
274    if approach == "hybrid" {
275        match &config.hybrid {
276            None => {
277                result.add_error("Hybrid pipeline approach selected but [hybrid] configuration is missing".to_string());
278            }
279            Some(hybrid) => {
280                if !hybrid.enabled {
281                    result.add_error("Hybrid pipeline approach selected but hybrid.enabled = false".to_string());
282                }
283
284                // Validate hybrid weights
285                let weight_sum = hybrid.weights.semantic_weight + hybrid.weights.algorithmic_weight;
286                if (weight_sum - 1.0).abs() > 0.01 {
287                    result.add_warning(format!(
288                        "Hybrid weights should sum to 1.0 (currently: {:.2})",
289                        weight_sum
290                    ));
291                }
292
293                if hybrid.weights.semantic_weight < 0.0 || hybrid.weights.semantic_weight > 1.0 {
294                    result.add_error("Hybrid semantic_weight must be between 0.0 and 1.0".to_string());
295                }
296
297                if hybrid.weights.algorithmic_weight < 0.0 || hybrid.weights.algorithmic_weight > 1.0 {
298                    result.add_error("Hybrid algorithmic_weight must be between 0.0 and 1.0".to_string());
299                }
300
301                // Validate hybrid entity extraction weights
302                let entity_weight_sum = hybrid.entity_extraction.llm_weight + hybrid.entity_extraction.pattern_weight;
303                if (entity_weight_sum - 1.0).abs() > 0.01 {
304                    result.add_warning(format!(
305                        "Hybrid entity extraction weights should sum to 1.0 (currently: {:.2})",
306                        entity_weight_sum
307                    ));
308                }
309
310                // Validate hybrid retrieval weights
311                let retrieval_weight_sum = hybrid.retrieval.vector_weight + hybrid.retrieval.bm25_weight;
312                if (retrieval_weight_sum - 1.0).abs() > 0.01 {
313                    result.add_warning(format!(
314                        "Hybrid retrieval weights should sum to 1.0 (currently: {:.2})",
315                        retrieval_weight_sum
316                    ));
317                }
318
319                if hybrid.retrieval.rrf_constant == 0 {
320                    result.add_error("Hybrid RRF constant must be greater than 0 (typically 60)".to_string());
321                }
322
323                // Validate confidence boost
324                if hybrid.entity_extraction.confidence_boost < 0.0 || hybrid.entity_extraction.confidence_boost > 1.0 {
325                    result.add_warning("Hybrid confidence_boost should typically be between 0.0 and 1.0".to_string());
326                }
327            }
328        }
329    }
330
331    // Add suggestions based on approach
332    match approach.as_str() {
333        "semantic" => {
334            result.add_suggestion("Semantic pipeline uses neural embeddings and LLM-based extraction for high-quality results".to_string());
335            if config.ollama.enabled {
336                result.add_suggestion("Consider using 'llama3.1:8b' for entity extraction with gleaning enabled".to_string());
337            }
338        }
339        "algorithmic" => {
340            result.add_suggestion("Algorithmic pipeline uses pattern matching and TF-IDF for fast, resource-efficient processing".to_string());
341            result.add_suggestion("Algorithmic pipeline works well for structured documents and doesn't require an LLM".to_string());
342        }
343        "hybrid" => {
344            result.add_suggestion("Hybrid pipeline combines semantic and algorithmic approaches for balanced quality and performance".to_string());
345            result.add_suggestion("Fine-tune hybrid weights based on your specific use case and evaluation metrics".to_string());
346        }
347        _ => {}
348    }
349}
351impl Validatable for SetConfig {
352    fn validate(&self) -> ValidationResult {
353        let mut result = ValidationResult::new();
354
355        // Validate pipeline approach configuration
356        validate_pipeline_approach(self, &mut result);
357
358        // Validate general settings
359        if let Some(input_path) = &self.general.input_document_path {
360            if input_path.is_empty() {
361                result.add_error("Input document path cannot be empty".to_string());
362            } else {
363                let path = Path::new(input_path);
364                if !path.exists() {
365                    result.add_error(format!("Input document not found: {}", input_path));
366                } else if !path.is_file() {
367                    result.add_error(format!("Input path is not a file: {}", input_path));
368                } else {
369                    // Input path exists and is a valid file
370                }
371            }
372        } else {
373            result.add_error("Input document path is required".to_string());
374        }
375
376        if self.general.output_dir.is_empty() {
377            result.add_error("Output directory cannot be empty".to_string());
378        }
379
380        // Validate pipeline settings
381        let pipeline = &self.pipeline;
382        if pipeline.text_extraction.chunk_size == 0 {
383            result.add_error("Chunk size must be greater than 0".to_string());
384        }
385
386        if pipeline.text_extraction.chunk_overlap >= pipeline.text_extraction.chunk_size {
387            result.add_error("Chunk overlap must be less than chunk size".to_string());
388        }
389
390        // Validate Ollama settings if enabled
391        let ollama = &self.ollama;
392        if ollama.enabled {
393            if ollama.host.is_empty() {
394                result.add_error("Ollama host cannot be empty when enabled".to_string());
395            }
396
397            if ollama.port == 0 {
398                result.add_error("Ollama port must be valid".to_string());
399            }
400
401            if ollama.chat_model.is_empty() {
402                result.add_error("Ollama chat model must be specified".to_string());
403            }
404
405            if ollama.embedding_model.is_empty() {
406                result.add_error("Ollama embedding model must be specified".to_string());
407            }
408
409            // Suggest common models if using defaults
410            if ollama.chat_model == "llama2" {
411                result.add_suggestion("Consider using 'llama3.1:8b' for better performance".to_string());
412            }
413        }
414
415        // Validate storage settings
416        let storage = &self.storage;
417        match storage.database_type.as_str() {
418            "memory" | "file" | "sqlite" | "postgresql" | "neo4j" => {},
419            db_type => {
420                result.add_error(format!("Unknown database type: {}", db_type));
421                result.add_suggestion("Supported types: memory, file, sqlite, postgresql, neo4j".to_string());
422            }
423        }
424
425        result
426    }
427
428    fn validate_strict(&self) -> ValidationResult {
429        let mut result = self.validate();
430
431        // Additional strict checks
432        if !self.ollama.enabled {
433            result.add_warning("Ollama is not enabled, will use mock LLM".to_string());
434        }
435
436        result
437    }
438}
440/// Validate a TOML configuration file
441pub fn validate_config_file(path: &Path, strict: bool) -> Result<ValidationResult> {
442    let config_str = std::fs::read_to_string(path)?;
443    let config: SetConfig = toml::from_str(&config_str)
444        .map_err(|e| GraphRAGError::Config {
445            message: format!("Failed to parse TOML config: {}", e)
446        })?;
447
448    let result = if strict {
449        config.validate_strict()
450    } else {
451        config.validate()
452    };
453
454    Ok(result)
455}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_config_validation() {
        // A zero chunk size must be rejected outright.
        let cfg = Config {
            chunk_size: 0,
            ..Default::default()
        };

        let report = cfg.validate();
        assert!(!report.is_valid);
        assert!(!report.errors.is_empty());
    }

    #[test]
    fn test_chunk_overlap_validation() {
        // Overlap at least as large as the chunk itself is a hard error.
        let cfg = Config {
            chunk_size: 100,
            chunk_overlap: 150,
            ..Default::default()
        };

        let report = cfg.validate();
        assert!(!report.is_valid);
        assert!(report.errors.iter().any(|e| e.contains("overlap")));
    }

    #[test]
    fn test_suggestions() {
        // Large chunks with tiny overlap should trigger a suggestion.
        let cfg = Config {
            chunk_size: 2000,
            chunk_overlap: 50,
            ..Default::default()
        };

        let report = cfg.validate();
        assert!(!report.suggestions.is_empty());
    }
}